All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCHSET 0/9] perf: Improve cgroup profiling (v1)
@ 2019-08-28  7:31 Namhyung Kim
  2019-08-28  7:31 ` [PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event Namhyung Kim
                   ` (8 more replies)
  0 siblings, 9 replies; 43+ messages in thread
From: Namhyung Kim @ 2019-08-28  7:31 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo
  Cc: LKML, Jiri Olsa, Alexander Shishkin, Stephane Eranian, Tejun Heo,
	Li Zefan, Johannes Weiner, Adrian Hunter

Hello,

This work is to improve cgroup profiling in perf.  Currently it only
supports profiling tasks in a specific cgroup and there's no way to
identify which cgroup the current sample belongs to.  So I added
PERF_SAMPLE_CGROUP to add cgroup info into each sample.  It's a 64-bit
integer holding the inode number of the cgroup.  The kernel also generates
a PERF_RECORD_CGROUP event for new groups to correlate the inode number
and cgroup name (path in the cgroup filesystem).  The inode number can
be read from userspace easily so it can synthesize the CGROUP event
for existing groups.

Note that this only works for "perf_event" cgroups (obviously) so if
users are still using the cgroup-v1 interface, they need to have the same
hierarchy for the subsystem(s) they want to profile with it.

The testing result looks something like this:

  [root@qemu build]# ./perf record --all-cgroups ./cgtest
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.009 MB perf.data (150 samples) ]
  
  [root@qemu build]# ./perf report -s cgroup,comm --stdio
  # To display the perf.data header info, please use --header/--header-only options.
  #
  #
  # Total Lost Samples: 0
  #
  # Samples: 150  of event 'cpu-clock:pppH'
  # Event count (approx.): 37500000
  #
  # Overhead  cgroup      Command
  # ........  ..........  .......
  #
      32.00%  /sub/cgrp2  looper2
      28.00%  /sub/cgrp1  looper1
      25.33%  /sub        looper0
       4.00%  /           cgtest 
       4.00%  /sub        cgtest 
       3.33%  /sub/cgrp2  cgtest 
       2.67%  /sub/cgrp1  cgtest 
       0.67%  /           looper0


The test program (cgtest) follows.

Thanks,
Namhyung


Cc: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>


-------8<-----------------------------------------8<----------------
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sched.h>
#include <errno.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <sys/prctl.h>
#include <sys/mount.h>

/*
 * Mount point of the perf_event cgroup hierarchy.  Every cgroup path
 * used by the helpers below is appended directly to this prefix.
 */
char cgbase[] = "/sys/fs/cgroup/perf_event";

/*
 * Create the cgroup directory <cgbase><name> with mode 0755.
 *
 * @name: cgroup path relative to cgbase; it is concatenated without a
 *        separator, so it must carry its own leading '/'.
 *
 * A mkdir() failure is reported with perror() and otherwise ignored.
 */
void mkcgrp(char *name) {
  char path[256];
  int rc;

  snprintf(path, sizeof(path), "%s%s", cgbase, name);

  rc = mkdir(path, 0755);
  if (rc < 0)
    perror("mkdir");
}

/*
 * Remove the cgroup directory <cgbase><name>.
 *
 * @name: cgroup path relative to cgbase; it is concatenated without a
 *        separator, so it must carry its own leading '/'.
 *
 * An rmdir() failure is reported with perror() and otherwise ignored.
 */
void rmcgrp(char *name) {
  char path[256];

  snprintf(path, sizeof(path), "%s%s", cgbase, name);

  if (rmdir(path) >= 0)
    return;

  perror("rmdir");
}

/*
 * Write "0\n" to <cgbase><name>/tasks.
 *
 * NOTE(review): in the cgroup-v1 interface, writing PID 0 to a "tasks"
 * file is documented to attach the writing task itself; that is
 * presumably the intent here - confirm against the cgroup-v1 docs.
 *
 * @name: cgroup path relative to cgbase; it is concatenated without a
 *        separator, so it must carry its own leading '/'.
 *
 * Failures of open() and write() are reported with perror() and
 * otherwise ignored.
 */
void setcgrp(char *name) {
  char buf[256];
  int fd;

  snprintf(buf, sizeof(buf), "%s%s/tasks", cgbase, name);

  fd = open(buf, O_WRONLY);
  /* open() signals failure with a negative value; 0 is a valid descriptor */
  if (fd < 0) {
    perror("open");
    return;
  }

  if (write(fd, "0\n", 2) != 2)
    perror("write");
  close(fd);
}

/*
 * Create the worker cgroup "/sub/cgrp<idx>" (relative to cgbase) by
 * formatting its path and handing it to mkcgrp().
 */
void create_sub_cgroup(int idx) {
  char rel_path[128];

  snprintf(rel_path, sizeof rel_path, "/sub/cgrp%d", idx);

  mkcgrp(rel_path);
}

/*
 * Remove the worker cgroup "/sub/cgrp<idx>" (relative to cgbase) by
 * formatting its path and handing it to rmcgrp().
 */
void remove_sub_cgroup(int idx) {
  char rel_path[128];

  snprintf(rel_path, sizeof rel_path, "/sub/cgrp%d", idx);

  rmcgrp(rel_path);
}

/*
 * Format the worker cgroup path "/sub/cgrp<idx>" (relative to cgbase)
 * and pass it to setcgrp().
 */
void set_sub_cgroup(int idx) {
  char rel_path[128];

  snprintf(rel_path, sizeof rel_path, "/sub/cgrp%d", idx);

  setcgrp(rel_path);
}

/*
 * Rename the calling task to "looper<idx>" via prctl(PR_SET_NAME).
 *
 * @idx: number appended to the "looper" prefix.
 *
 * snprintf() bounds the name to the 16-byte buffer, so a very large
 * @idx is truncated rather than overflowing.  A prctl() failure is
 * reported with perror(), matching the other helpers in this program.
 */
void set_task_name(int idx) {
  char buf[16];

  snprintf(buf, sizeof(buf), "looper%d", idx);
  if (prctl(PR_SET_NAME, buf, 0, 0, 0) < 0)
    perror("prctl");
}

/*
 * Busy-loop to burn CPU time so the profiler has something to sample.
 *
 * @max: loop bound; the body runs max - 1 times.  A bound of 0 returns
 *       immediately instead of letting the counter wrap around and
 *       spin for roughly 2^32 iterations.
 *
 * The counter is volatile so the compiler cannot optimize the loop
 * away.  The "pause" instruction is only emitted on x86; elsewhere the
 * loop body is empty.  __asm__ is used because the plain asm keyword
 * is not available in strict ISO C modes.
 */
void loop(unsigned max) {
  volatile unsigned int count = 1;

  if (max == 0)
    return;

  while (count++ != max) {
#if defined(__i386__) || defined(__x86_64__)
    __asm__ __volatile__ ("pause");
#endif
  }
}

/*
 * Body of one worker process.
 *
 * Creates the cgroup "/sub/cgrp<idx>", calls set_sub_cgroup() for it,
 * optionally unshares the cgroup namespace, then forks a child that
 * renames itself to "looper<idx>" and spins in loop().  The worker
 * waits for that child before returning.
 *
 * @idx:    worker number, used for both the cgroup and the task name.
 * @cnt:    loop bound passed to loop() in the child.
 * @new_ns: when non-zero, call unshare(CLONE_NEWCGROUP) first.
 *
 * Failures of unshare() and fork() are reported with perror(); on a
 * fork() failure the worker returns without waiting.
 */
void worker(int idx, unsigned cnt, int new_ns) {
  pid_t pid;

  create_sub_cgroup(idx);
  set_sub_cgroup(idx);

  if (new_ns) {
    if (unshare(CLONE_NEWCGROUP) < 0)
      perror("unshare");

#if 0  /* FIXME */
    if (unshare(CLONE_NEWNS) < 0)
      perror("unshare");

    if (mount("none", "/sys", NULL, MS_REMOUNT | MS_REC | MS_SLAVE, NULL) < 0)
      perror("mount --make-rslave");

    sleep(1);
    if (umount("/sys/fs/cgroup/perf_event") < 0)
      perror("umount");

    if (mount("cgroup", "/sys/fs/cgroup/perf_event", "cgroup",
              MS_NODEV | MS_NOEXEC | MS_NOSUID, "perf_event") < 0)
      perror("mount again");
#endif
  }

  pid = fork();
  if (pid < 0) {
    /* no child was created, so there is nothing to wait for */
    perror("fork");
    return;
  }

  if (pid == 0) {
    set_task_name(idx);
    loop(cnt);
    exit(0);
  }
  wait(NULL);
}

/*
 * cgtest entry point.
 *
 * Usage: cgtest [nr [cnt [new_ns]]]
 *   nr      number of worker processes to fork (default 2)
 *   cnt     loop bound for every busy loop (default 1000000)
 *   new_ns  non-zero to make workers unshare the cgroup namespace
 *           (default 1)
 *
 * Creates the "/sub" cgroup and calls setcgrp() on it, forks the
 * workers, runs its own busy loop as "looper0", waits for the workers,
 * and finally removes the cgroups it created.
 */
int main(int argc, char *argv[])
{
  int i, nr = 2;
  int new_ns = 1;
  unsigned cnt = 1000000;

  if (argc > 1)
    nr = atoi(argv[1]);
  if (argc > 2)
    cnt = atoi(argv[2]);
  if (argc > 3)
    new_ns = atoi(argv[3]);

  mkcgrp("/sub");
  setcgrp("/sub");

  for (i = 0; i < nr; i++) {
    pid_t pid = fork();

    if (pid < 0) {
      /* report the failure and keep going with the remaining workers */
      perror("fork");
      continue;
    }
    if (pid == 0) {
      worker(i+1, cnt, new_ns);
      exit(0);
    }
  }

  set_task_name(0);
  loop(cnt);

  /* wait() returns immediately with an error once no children remain */
  for (i = 0; i < nr; i++)
    wait(NULL);

  for (i = 0; i < nr; i++)
    remove_sub_cgroup(i+1);

  /* call setcgrp() on the root before removing "/sub" */
  setcgrp("/");
  rmcgrp("/sub");

  return 0;
}
-------8<-----------------------------------------8<----------------


Namhyung Kim (9):
  perf/core: Add PERF_RECORD_CGROUP event
  perf/core: Add PERF_SAMPLE_CGROUP feature
  perf tools: Basic support for CGROUP event
  perf tools: Maintain cgroup hierarchy
  perf report: Add 'cgroup' sort key
  perf record: Support synthesizing cgroup events
  perf record: Add --all-cgroups option
  perf top: Add --all-cgroups option
  perf script: Add --show-cgroup-events option

 include/linux/perf_event.h               |   1 +
 include/uapi/linux/perf_event.h          |  18 ++-
 kernel/events/core.c                     | 128 ++++++++++++++++++++++
 tools/include/uapi/linux/perf_event.h    |  17 ++-
 tools/perf/Documentation/perf-record.txt |   5 +-
 tools/perf/Documentation/perf-report.txt |   1 +
 tools/perf/Documentation/perf-script.txt |   3 +
 tools/perf/Documentation/perf-top.txt    |   4 +
 tools/perf/builtin-diff.c                |   1 +
 tools/perf/builtin-record.c              |  10 ++
 tools/perf/builtin-report.c              |   1 +
 tools/perf/builtin-script.c              |  41 +++++++
 tools/perf/builtin-top.c                 |   9 ++
 tools/perf/lib/include/perf/event.h      |   7 ++
 tools/perf/util/cgroup.c                 |  75 ++++++++++++-
 tools/perf/util/cgroup.h                 |  16 ++-
 tools/perf/util/event.c                  | 133 +++++++++++++++++++++++
 tools/perf/util/event.h                  |  11 ++
 tools/perf/util/evsel.c                  |  12 ++
 tools/perf/util/hist.c                   |   7 ++
 tools/perf/util/hist.h                   |   1 +
 tools/perf/util/machine.c                |  19 ++++
 tools/perf/util/machine.h                |   3 +
 tools/perf/util/record.h                 |   1 +
 tools/perf/util/session.c                |   8 ++
 tools/perf/util/sort.c                   |  31 ++++++
 tools/perf/util/sort.h                   |   2 +
 tools/perf/util/tool.h                   |   2 +
 28 files changed, 556 insertions(+), 11 deletions(-)

-- 
2.23.0.187.g17f5b7556c-goog


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event
  2019-08-28  7:31 [PATCHSET 0/9] perf: Improve cgroup profiling (v1) Namhyung Kim
@ 2019-08-28  7:31 ` Namhyung Kim
  2019-08-28  9:44   ` Peter Zijlstra
                     ` (2 more replies)
  2019-08-28  7:31 ` [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature Namhyung Kim
                   ` (7 subsequent siblings)
  8 siblings, 3 replies; 43+ messages in thread
From: Namhyung Kim @ 2019-08-28  7:31 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo
  Cc: LKML, Jiri Olsa, Alexander Shishkin, Stephane Eranian, Tejun Heo,
	Li Zefan, Johannes Weiner, Adrian Hunter

To support cgroup tracking, add CGROUP event to save a link between
cgroup path and inode number.  The attr.cgroup bit was also added to
enable cgroup tracking from userspace.

This event will be generated when a new cgroup becomes active.
Userspace might need to synthesize those events for existing cgroups.

As aux_output change is also going on, I just added the bit here as
well to remove possible conflicts later.

Cc: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 include/uapi/linux/perf_event.h |  15 ++++-
 kernel/events/core.c            | 112 ++++++++++++++++++++++++++++++++
 2 files changed, 126 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 7198ddd0c6b1..cb07c24b715f 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -374,7 +374,9 @@ struct perf_event_attr {
 				namespaces     :  1, /* include namespaces data */
 				ksymbol        :  1, /* include ksymbol events */
 				bpf_event      :  1, /* include bpf events */
-				__reserved_1   : 33;
+				aux_output     :  1, /* generate AUX records instead of events */
+				cgroup         :  1, /* include cgroup events */
+				__reserved_1   : 31;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -999,6 +1001,17 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_BPF_EVENT			= 18,
 
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				ino;
+	 *	u64				path_len;
+	 *	char				path[];
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_CGROUP			= 19,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0463c1151bae..d898263db4fc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -386,6 +386,7 @@ static atomic_t nr_freq_events __read_mostly;
 static atomic_t nr_switch_events __read_mostly;
 static atomic_t nr_ksymbol_events __read_mostly;
 static atomic_t nr_bpf_events __read_mostly;
+static atomic_t nr_cgroup_events __read_mostly;
 
 static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
@@ -4314,6 +4315,8 @@ static void unaccount_event(struct perf_event *event)
 		atomic_dec(&nr_comm_events);
 	if (event->attr.namespaces)
 		atomic_dec(&nr_namespaces_events);
+	if (event->attr.cgroup)
+		atomic_dec(&nr_cgroup_events);
 	if (event->attr.task)
 		atomic_dec(&nr_task_events);
 	if (event->attr.freq)
@@ -7217,6 +7220,106 @@ void perf_event_namespaces(struct task_struct *task)
 			NULL);
 }
 
+/*
+ * cgroup tracking
+ */
+#ifdef CONFIG_CGROUPS
+
+struct perf_cgroup_event {
+	char				*path;
+	struct {
+		struct perf_event_header	header;
+		u64				ino;
+		u64				path_len;
+		char				path[];
+	} event_id;
+};
+
+static int perf_event_cgroup_match(struct perf_event *event)
+{
+	return event->attr.cgroup;
+}
+
+static void perf_event_cgroup_output(struct perf_event *event, void *data)
+{
+	struct perf_cgroup_event *cgroup_event = data;
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	u16 header_size = cgroup_event->event_id.header.size;
+	int ret;
+
+	if (!perf_event_cgroup_match(event))
+		return;
+
+	perf_event_header__init_id(&cgroup_event->event_id.header,
+				   &sample, event);
+	ret = perf_output_begin(&handle, event,
+				cgroup_event->event_id.header.size);
+	if (ret)
+		goto out;
+
+	perf_output_put(&handle, cgroup_event->event_id);
+	__output_copy(&handle, cgroup_event->path,
+		      cgroup_event->event_id.path_len);
+
+	perf_event__output_id_sample(event, &handle, &sample);
+
+	perf_output_end(&handle);
+out:
+	cgroup_event->event_id.header.size = header_size;
+}
+
+void perf_event_cgroup(struct cgroup *cgrp)
+{
+	struct perf_cgroup_event cgroup_event;
+	char path_enomem[16] = "//enomem";
+	char *pathname;
+	size_t size;
+
+	if (!atomic_read(&nr_cgroup_events))
+		return;
+
+	cgroup_event = (struct perf_cgroup_event){
+		.event_id  = {
+			.header = {
+				.type = PERF_RECORD_CGROUP,
+				.misc = 0,
+				.size = sizeof(cgroup_event.event_id),
+			},
+			.ino = cgrp->kn->id.ino,
+		},
+	};
+
+	pathname = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (pathname == NULL) {
+		cgroup_event.path = path_enomem;
+	} else {
+		/* just to be sure to have enough space for alignment */
+		cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
+		cgroup_event.path = pathname;
+	}
+
+	/*
+	 * Since our buffer works in 8 byte units we need to align our string
+	 * size to a multiple of 8. However, we must guarantee the tail end is
+	 * zero'd out to avoid leaking random bits to userspace.
+	 */
+	size = strlen(cgroup_event.path) + 1;
+	while (!IS_ALIGNED(size, sizeof(u64)))
+		cgroup_event.path[size++] = '\0';
+
+	cgroup_event.event_id.header.size += size;
+	cgroup_event.event_id.path_len = size;
+
+	perf_iterate_sb(perf_event_cgroup_output,
+			&cgroup_event,
+			NULL);
+
+	kfree(pathname);
+}
+
+#endif
+
 /*
  * mmap tracking
  */
@@ -10232,6 +10335,8 @@ static void account_event(struct perf_event *event)
 		atomic_inc(&nr_comm_events);
 	if (event->attr.namespaces)
 		atomic_inc(&nr_namespaces_events);
+	if (event->attr.cgroup)
+		atomic_inc(&nr_cgroup_events);
 	if (event->attr.task)
 		atomic_inc(&nr_task_events);
 	if (event->attr.freq)
@@ -12186,6 +12291,12 @@ static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
 	kfree(jc);
 }
 
+static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+	perf_event_cgroup(css->cgroup);
+	return 0;
+}
+
 static int __perf_cgroup_move(void *info)
 {
 	struct task_struct *task = info;
@@ -12207,6 +12318,7 @@ static void perf_cgroup_attach(struct cgroup_taskset *tset)
 struct cgroup_subsys perf_event_cgrp_subsys = {
 	.css_alloc	= perf_cgroup_css_alloc,
 	.css_free	= perf_cgroup_css_free,
+	.css_online	= perf_cgroup_css_online,
 	.attach		= perf_cgroup_attach,
 	/*
 	 * Implicitly enable on dfl hierarchy so that perf events can
-- 
2.23.0.187.g17f5b7556c-goog


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-08-28  7:31 [PATCHSET 0/9] perf: Improve cgroup profiling (v1) Namhyung Kim
  2019-08-28  7:31 ` [PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event Namhyung Kim
@ 2019-08-28  7:31 ` Namhyung Kim
  2019-08-28 14:49   ` Tejun Heo
  2019-08-28  7:31 ` [PATCH 3/9] perf tools: Basic support for CGROUP event Namhyung Kim
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 43+ messages in thread
From: Namhyung Kim @ 2019-08-28  7:31 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo
  Cc: LKML, Jiri Olsa, Alexander Shishkin, Stephane Eranian, Tejun Heo,
	Li Zefan, Johannes Weiner

The PERF_SAMPLE_CGROUP bit is to save (perf_event) cgroup information
in the sample.  It will add a 64-bit integer to identify the current
cgroup, which is the inode number in the cgroup file system.  Userspace
should use this information with the PERF_RECORD_CGROUP event to
determine which cgroup the sample belongs to.

Cc: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 include/linux/perf_event.h      |  1 +
 include/uapi/linux/perf_event.h |  3 ++-
 kernel/events/core.c            | 16 ++++++++++++++++
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e8ad3c590a23..2b9661aa4240 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -958,6 +958,7 @@ struct perf_sample_data {
 	u64				stack_user_size;
 
 	u64				phys_addr;
+	u64				cgroup;
 } ____cacheline_aligned;
 
 /* default value for data source */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index cb07c24b715f..91a552bf9611 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -141,8 +141,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
 	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
+	PERF_SAMPLE_CGROUP			= 1U << 20,
 
-	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
 
 	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
 };
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d898263db4fc..04ae6a42a98b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1754,6 +1754,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		size += sizeof(data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_CGROUP)
+		size += sizeof(data->cgroup);
+
 	event->header_size = size;
 }
 
@@ -6388,6 +6391,19 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		perf_output_put(handle, data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_CGROUP) {
+		u64 ino = 0;
+
+#ifdef CONFIG_CGROUP_PERF
+		struct cgroup *cgrp;
+
+		/* protected by RCU */
+		cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
+		ino = cgroup_ino(cgrp);
+#endif
+		perf_output_put(handle, ino);
+	}
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
-- 
2.23.0.187.g17f5b7556c-goog


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 3/9] perf tools: Basic support for CGROUP event
  2019-08-28  7:31 [PATCHSET 0/9] perf: Improve cgroup profiling (v1) Namhyung Kim
  2019-08-28  7:31 ` [PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event Namhyung Kim
  2019-08-28  7:31 ` [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature Namhyung Kim
@ 2019-08-28  7:31 ` Namhyung Kim
  2019-08-30 12:55   ` Arnaldo Carvalho de Melo
  2019-08-28  7:31 ` [PATCH 4/9] perf tools: Maintain cgroup hierarchy Namhyung Kim
                   ` (5 subsequent siblings)
  8 siblings, 1 reply; 43+ messages in thread
From: Namhyung Kim @ 2019-08-28  7:31 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo
  Cc: LKML, Jiri Olsa, Alexander Shishkin, Stephane Eranian, Adrian Hunter

Implement basic functionality to support cgroup tracking.  Each cgroup
can be identified by inode number which can be read from userspace
too.  The actual cgroup processing will come in the later patch.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/include/uapi/linux/perf_event.h | 17 +++++++++++++++--
 tools/perf/builtin-diff.c             |  1 +
 tools/perf/builtin-report.c           |  1 +
 tools/perf/lib/include/perf/event.h   |  7 +++++++
 tools/perf/util/event.c               | 18 ++++++++++++++++++
 tools/perf/util/event.h               |  7 +++++++
 tools/perf/util/evsel.c               |  7 +++++++
 tools/perf/util/machine.c             | 12 ++++++++++++
 tools/perf/util/machine.h             |  3 +++
 tools/perf/util/session.c             |  4 ++++
 tools/perf/util/tool.h                |  1 +
 11 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index bb7b271397a6..91a552bf9611 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -141,8 +141,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
 	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
+	PERF_SAMPLE_CGROUP			= 1U << 20,
 
-	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
 
 	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
 };
@@ -375,7 +376,8 @@ struct perf_event_attr {
 				ksymbol        :  1, /* include ksymbol events */
 				bpf_event      :  1, /* include bpf events */
 				aux_output     :  1, /* generate AUX records instead of events */
-				__reserved_1   : 32;
+				cgroup         :  1, /* include cgroup events */
+				__reserved_1   : 31;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -1000,6 +1002,17 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_BPF_EVENT			= 18,
 
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				ino;
+	 *	u64				path_len;
+	 *	char				path[];
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_CGROUP			= 19,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 51c37e53b3d8..ec639340bb65 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -445,6 +445,7 @@ static struct perf_diff pdiff = {
 		.fork	= perf_event__process_fork,
 		.lost	= perf_event__process_lost,
 		.namespaces = perf_event__process_namespaces,
+		.cgroup = perf_event__process_cgroup,
 		.ordered_events = true,
 		.ordering_requires_timestamps = true,
 	},
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 318b0b95c14c..c65a59fd2a94 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1033,6 +1033,7 @@ int cmd_report(int argc, const char **argv)
 			.mmap2		 = perf_event__process_mmap2,
 			.comm		 = perf_event__process_comm,
 			.namespaces	 = perf_event__process_namespaces,
+			.cgroup		 = perf_event__process_cgroup,
 			.exit		 = perf_event__process_exit,
 			.fork		 = perf_event__process_fork,
 			.lost		 = perf_event__process_lost,
diff --git a/tools/perf/lib/include/perf/event.h b/tools/perf/lib/include/perf/event.h
index 36ad3a4a79e6..ec112ad640a1 100644
--- a/tools/perf/lib/include/perf/event.h
+++ b/tools/perf/lib/include/perf/event.h
@@ -104,6 +104,13 @@ struct perf_record_bpf_event {
 	__u8			 tag[BPF_TAG_SIZE];  // prog tag
 };
 
+struct perf_record_cgroup {
+	struct perf_event_header header;
+	__u64			 ino;
+	__u64			 path_len;
+	char			 path[PATH_MAX];
+};
+
 struct perf_record_sample {
 	struct perf_event_header header;
 	__u64			 array[];
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 33616ea62a47..c19b00c1fc26 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -52,6 +52,7 @@ static const char *perf_event__names[] = {
 	[PERF_RECORD_NAMESPACES]		= "NAMESPACES",
 	[PERF_RECORD_KSYMBOL]			= "KSYMBOL",
 	[PERF_RECORD_BPF_EVENT]			= "BPF_EVENT",
+	[PERF_RECORD_CGROUP]			= "CGROUP",
 	[PERF_RECORD_HEADER_ATTR]		= "ATTR",
 	[PERF_RECORD_HEADER_EVENT_TYPE]		= "EVENT_TYPE",
 	[PERF_RECORD_HEADER_TRACING_DATA]	= "TRACING_DATA",
@@ -1279,6 +1280,12 @@ size_t perf_event__fprintf_namespaces(union perf_event *event, FILE *fp)
 	return ret;
 }
 
+size_t perf_event__fprintf_cgroup(union perf_event *event, FILE *fp)
+{
+	return fprintf(fp, " cgroup: %" PRI_lu64 " %s\n",
+		       event->cgroup.ino, event->cgroup.path);
+}
+
 int perf_event__process_comm(struct perf_tool *tool __maybe_unused,
 			     union perf_event *event,
 			     struct perf_sample *sample,
@@ -1295,6 +1302,14 @@ int perf_event__process_namespaces(struct perf_tool *tool __maybe_unused,
 	return machine__process_namespaces_event(machine, event, sample);
 }
 
+int perf_event__process_cgroup(struct perf_tool *tool __maybe_unused,
+			       union perf_event *event,
+			       struct perf_sample *sample,
+			       struct machine *machine)
+{
+	return machine__process_cgroup_event(machine, event, sample);
+}
+
 int perf_event__process_lost(struct perf_tool *tool __maybe_unused,
 			     union perf_event *event,
 			     struct perf_sample *sample,
@@ -1516,6 +1531,9 @@ size_t perf_event__fprintf(union perf_event *event, FILE *fp)
 	case PERF_RECORD_NAMESPACES:
 		ret += perf_event__fprintf_namespaces(event, fp);
 		break;
+	case PERF_RECORD_CGROUP:
+		ret += perf_event__fprintf_cgroup(event, fp);
+		break;
 	case PERF_RECORD_MMAP2:
 		ret += perf_event__fprintf_mmap2(event, fp);
 		break;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 429a3fe52d6c..0170435fd1e8 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -123,6 +123,7 @@ struct perf_sample {
 	u32 raw_size;
 	u64 data_src;
 	u64 phys_addr;
+	u64 cgroup;
 	u32 flags;
 	u16 insn_len;
 	u8  cpumode;
@@ -554,6 +555,7 @@ union perf_event {
 	struct perf_record_mmap2	mmap2;
 	struct perf_record_comm		comm;
 	struct perf_record_namespaces	namespaces;
+	struct perf_record_cgroup	cgroup;
 	struct perf_record_fork		fork;
 	struct perf_record_lost		lost;
 	struct perf_record_lost_samples	lost_samples;
@@ -663,6 +665,10 @@ int perf_event__process_namespaces(struct perf_tool *tool,
 				   union perf_event *event,
 				   struct perf_sample *sample,
 				   struct machine *machine);
+int perf_event__process_cgroup(struct perf_tool *tool,
+			       union perf_event *event,
+			       struct perf_sample *sample,
+			       struct machine *machine);
 int perf_event__process_mmap(struct perf_tool *tool,
 			     union perf_event *event,
 			     struct perf_sample *sample,
@@ -750,6 +756,7 @@ size_t perf_event__fprintf_switch(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_thread_map(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_cpu_map(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_namespaces(union perf_event *event, FILE *fp);
+size_t perf_event__fprintf_cgroup(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_ksymbol(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_bpf(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf(union perf_event *event, FILE *fp);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index fa676355559e..86a38679cad1 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1593,6 +1593,7 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
 	PRINT_ATTRf(ksymbol, p_unsigned);
 	PRINT_ATTRf(bpf_event, p_unsigned);
 	PRINT_ATTRf(aux_output, p_unsigned);
+	PRINT_ATTRf(cgroup, p_unsigned);
 
 	PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned);
 	PRINT_ATTRf(bp_type, p_unsigned);
@@ -2369,6 +2370,12 @@ int perf_evsel__parse_sample(struct evsel *evsel, union perf_event *event,
 		array++;
 	}
 
+	data->cgroup = 0;
+	if (type & PERF_SAMPLE_CGROUP) {
+		data->cgroup = *array;
+		array++;
+	}
+
 	return 0;
 }
 
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 93483f1764d3..61c35eef616b 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -642,6 +642,16 @@ int machine__process_namespaces_event(struct machine *machine __maybe_unused,
 	return err;
 }
 
+int machine__process_cgroup_event(struct machine *machine __maybe_unused,
+				  union perf_event *event,
+				  struct perf_sample *sample __maybe_unused)
+{
+	if (dump_trace)
+		perf_event__fprintf_cgroup(event, stdout);
+
+	return 0;
+}
+
 int machine__process_lost_event(struct machine *machine __maybe_unused,
 				union perf_event *event, struct perf_sample *sample __maybe_unused)
 {
@@ -1902,6 +1912,8 @@ int machine__process_event(struct machine *machine, union perf_event *event,
 		ret = machine__process_mmap_event(machine, event, sample); break;
 	case PERF_RECORD_NAMESPACES:
 		ret = machine__process_namespaces_event(machine, event, sample); break;
+	case PERF_RECORD_CGROUP:
+		ret = machine__process_cgroup_event(machine, event, sample); break;
 	case PERF_RECORD_MMAP2:
 		ret = machine__process_mmap2_event(machine, event, sample); break;
 	case PERF_RECORD_FORK:
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index 7d69119d0b5d..608813ced0cd 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -127,6 +127,9 @@ int machine__process_switch_event(struct machine *machine,
 int machine__process_namespaces_event(struct machine *machine,
 				      union perf_event *event,
 				      struct perf_sample *sample);
+int machine__process_cgroup_event(struct machine *machine,
+				  union perf_event *event,
+				  struct perf_sample *sample);
 int machine__process_mmap_event(struct machine *machine, union perf_event *event,
 				struct perf_sample *sample);
 int machine__process_mmap2_event(struct machine *machine, union perf_event *event,
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 5786e9c807c5..2cdce7ee228c 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -457,6 +457,8 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
 		tool->comm = process_event_stub;
 	if (tool->namespaces == NULL)
 		tool->namespaces = process_event_stub;
+	if (tool->cgroup == NULL)
+		tool->cgroup = process_event_stub;
 	if (tool->fork == NULL)
 		tool->fork = process_event_stub;
 	if (tool->exit == NULL)
@@ -1417,6 +1419,8 @@ static int machines__deliver_event(struct machines *machines,
 		return tool->comm(tool, event, sample, machine);
 	case PERF_RECORD_NAMESPACES:
 		return tool->namespaces(tool, event, sample, machine);
+	case PERF_RECORD_CGROUP:
+		return tool->cgroup(tool, event, sample, machine);
 	case PERF_RECORD_FORK:
 		return tool->fork(tool, event, sample, machine);
 	case PERF_RECORD_EXIT:
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index 2abbf668b8de..472ef5eb4068 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -46,6 +46,7 @@ struct perf_tool {
 			mmap2,
 			comm,
 			namespaces,
+			cgroup,
 			fork,
 			exit,
 			lost,
-- 
2.23.0.187.g17f5b7556c-goog


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 4/9] perf tools: Maintain cgroup hierarchy
  2019-08-28  7:31 [PATCHSET 0/9] perf: Improve cgroup profiling (v1) Namhyung Kim
                   ` (2 preceding siblings ...)
  2019-08-28  7:31 ` [PATCH 3/9] perf tools: Basic support for CGROUP event Namhyung Kim
@ 2019-08-28  7:31 ` Namhyung Kim
  2019-08-28  7:31 ` [PATCH 5/9] perf report: Add 'cgroup' sort key Namhyung Kim
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 43+ messages in thread
From: Namhyung Kim @ 2019-08-28  7:31 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo
  Cc: LKML, Jiri Olsa, Alexander Shishkin, Stephane Eranian

Each cgroup is kept in the global cgroup_tree sorted by the inode
number.  Hist entries that carry a cgroup inode number can compare it
directly, and later it can be used to find a group name using this tree.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/cgroup.c  | 72 +++++++++++++++++++++++++++++++++++++++
 tools/perf/util/cgroup.h  | 15 +++++---
 tools/perf/util/machine.c |  7 ++++
 tools/perf/util/session.c |  4 +++
 4 files changed, 94 insertions(+), 4 deletions(-)

diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c
index f73599f271ff..8e4c26ea5078 100644
--- a/tools/perf/util/cgroup.c
+++ b/tools/perf/util/cgroup.c
@@ -12,6 +12,8 @@
 
 int nr_cgroups;
 
+static struct rb_root cgroup_tree = RB_ROOT;
+
 static int
 cgroupfs_find_mountpoint(char *buf, size_t maxlen)
 {
@@ -249,3 +251,73 @@ int parse_cgroups(const struct option *opt, const char *str,
 	}
 	return 0;
 }
+
+struct cgroup *cgroup__findnew(uint64_t ino, const char *path)
+{
+	struct rb_node **p = &cgroup_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct cgroup *cgrp;
+
+	while (*p != NULL) {
+		parent = *p;
+		cgrp = rb_entry(parent, struct cgroup, node);
+
+		if (cgrp->ino == ino)
+			return cgrp;
+
+		if (cgrp->ino < ino)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	cgrp = malloc(sizeof(*cgrp));
+	if (cgrp == NULL)
+		return NULL;
+
+	cgrp->name = strdup(path);
+	if (cgrp->name == NULL) {
+		free(cgrp);
+		return NULL;
+	}
+
+	cgrp->fd = -1;
+	cgrp->ino = ino;
+	refcount_set(&cgrp->refcnt, 1);
+
+	rb_link_node(&cgrp->node, parent, p);
+	rb_insert_color(&cgrp->node, &cgroup_tree);
+
+	return cgrp;
+}
+
+struct cgroup *cgroup__find_by_path(const char *path)
+{
+	struct rb_node *node;
+
+	node = rb_first(&cgroup_tree);
+	while (node) {
+		struct cgroup *cgrp = rb_entry(node, struct cgroup, node);
+
+		if (!strcmp(cgrp->name, path))
+			return cgrp;
+
+		node = rb_next(&cgrp->node);
+	}
+
+	return NULL;
+}
+
+void destroy_cgroups(void)
+{
+	struct rb_node *node;
+	struct cgroup *cgrp;
+
+	while (!RB_EMPTY_ROOT(&cgroup_tree)) {
+		node = rb_first(&cgroup_tree);
+		cgrp = rb_entry(node, struct cgroup, node);
+
+		rb_erase(node, &cgroup_tree);
+		cgroup__put(cgrp);
+	}
+}
diff --git a/tools/perf/util/cgroup.h b/tools/perf/util/cgroup.h
index 2ec11f01090d..11a8b187ec09 100644
--- a/tools/perf/util/cgroup.h
+++ b/tools/perf/util/cgroup.h
@@ -3,16 +3,18 @@
 #define __CGROUP_H__
 
 #include <linux/refcount.h>
+#include <linux/rbtree.h>
 
 struct option;
 
 struct cgroup {
-	char *name;
-	int fd;
-	refcount_t refcnt;
+	struct rb_node		node;
+	u64			ino;
+	char			*name;
+	int			fd;
+	refcount_t		refcnt;
 };
 
-
 extern int nr_cgroups; /* number of explicit cgroups defined */
 
 struct cgroup *cgroup__get(struct cgroup *cgroup);
@@ -26,4 +28,9 @@ void evlist__set_default_cgroup(struct evlist *evlist, struct cgroup *cgroup);
 
 int parse_cgroups(const struct option *opt, const char *str, int unset);
 
+struct cgroup *cgroup__findnew(uint64_t ino, const char *path);
+struct cgroup *cgroup__find_by_path(const char *path);
+
+void destroy_cgroups(void);
+
 #endif /* __CGROUP_H__ */
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 61c35eef616b..33554e745e6b 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -26,6 +26,7 @@
 #include "linux/hash.h"
 #include "asm/bug.h"
 #include "bpf-event.h"
+#include "cgroup.h"
 
 #include <linux/ctype.h>
 #include <symbol/kallsyms.h>
@@ -646,9 +647,15 @@ int machine__process_cgroup_event(struct machine *machine __maybe_unused,
 				  union perf_event *event,
 				  struct perf_sample *sample __maybe_unused)
 {
+	struct cgroup *cgrp;
+
 	if (dump_trace)
 		perf_event__fprintf_cgroup(event, stdout);
 
+	cgrp = cgroup__findnew(event->cgroup.ino, event->cgroup.path);
+	if (cgrp == NULL)
+		return -ENOMEM;
+
 	return 0;
 }
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 2cdce7ee228c..ffdd956d0a89 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -275,6 +275,8 @@ static void perf_session__release_decomp_events(struct perf_session *session)
 	} while (1);
 }
 
+extern void destroy_cgroups(void);
+
 void perf_session__delete(struct perf_session *session)
 {
 	if (session == NULL)
@@ -289,6 +291,8 @@ void perf_session__delete(struct perf_session *session)
 	if (session->data)
 		perf_data__close(session->data);
 	free(session);
+
+	destroy_cgroups();
 }
 
 static int process_event_synth_tracing_data_stub(struct perf_session *session
-- 
2.23.0.187.g17f5b7556c-goog


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 5/9] perf report: Add 'cgroup' sort key
  2019-08-28  7:31 [PATCHSET 0/9] perf: Improve cgroup profiling (v1) Namhyung Kim
                   ` (3 preceding siblings ...)
  2019-08-28  7:31 ` [PATCH 4/9] perf tools: Maintain cgroup hierarchy Namhyung Kim
@ 2019-08-28  7:31 ` Namhyung Kim
  2019-08-28  7:31 ` [PATCH 6/9] perf record: Support synthesizing cgroup events Namhyung Kim
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 43+ messages in thread
From: Namhyung Kim @ 2019-08-28  7:31 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo
  Cc: LKML, Jiri Olsa, Alexander Shishkin, Stephane Eranian

The cgroup sort key shows the cgroup membership of each task.
Currently it shows the full path in the cgroupfs (not relative to the
root of the cgroup namespace) since that is more intuitive IMHO.
Otherwise the root cgroups of different namespaces would all show the
same name - "/".

The cgroup sort key must come before cgroup_id in the sort dimension
table; otherwise sort_dimension__add() would resolve "cgroup" to
cgroup_id, because it only matches on the given substring.

For example, it will look like the following.  Note that the record
patch adding the --all-cgroups option will come later.

  $ perf record -a --namespace --all-cgroups  cgtest
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.208 MB perf.data (4090 samples) ]

  $ perf report -s cgroup_id,cgroup,pid
  ...
  # Overhead  cgroup id (dev/inode)  cgroup          Pid:Command
  # ........  .....................  ..........  ...............
  #
      93.96%  0/0x0                  /                 0:swapper
       1.25%  3/0xeffffffb           /               278:looper0
       0.86%  3/0xf000015f           /sub/cgrp1      280:cgtest
       0.37%  3/0xf0000160           /sub/cgrp2      281:cgtest
       0.34%  3/0xf0000163           /sub/cgrp3      282:cgtest
       0.22%  3/0xeffffffb           /sub            278:looper0
       0.20%  3/0xeffffffb           /               280:cgtest
       0.15%  3/0xf0000163           /sub/cgrp3      285:looper3

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/Documentation/perf-report.txt |  1 +
 tools/perf/util/hist.c                   |  7 ++++++
 tools/perf/util/hist.h                   |  1 +
 tools/perf/util/sort.c                   | 31 ++++++++++++++++++++++++
 tools/perf/util/sort.h                   |  2 ++
 5 files changed, 42 insertions(+)

diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 7315f155803f..4c22146ae8fe 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -95,6 +95,7 @@ OPTIONS
 	abort cost. This is the global weight.
 	- local_weight: Local weight version of the weight above.
 	- cgroup_id: ID derived from cgroup namespace device and inode numbers.
+	- cgroup: cgroup pathname in the cgroupfs.
 	- transaction: Transaction abort flags.
 	- overhead: Overhead percentage of sample
 	- overhead_sys: Overhead percentage of sample running in system mode
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 33702675073c..b1f0a7d2a951 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -5,6 +5,7 @@
 #include "map.h"
 #include "session.h"
 #include "namespaces.h"
+#include "cgroup.h"
 #include "sort.h"
 #include "units.h"
 #include "evlist.h"
@@ -212,6 +213,11 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 
 	if (h->trace_output)
 		hists__new_col_len(hists, HISTC_TRACE, strlen(h->trace_output));
+
+	if (h->cgroup) {
+		struct cgroup *cgrp = cgroup__findnew(h->cgroup, "<unknown>");
+		hists__new_col_len(hists, HISTC_CGROUP, strlen(cgrp->name));
+	}
 }
 
 void hists__output_recalc_col_len(struct hists *hists, int max_rows)
@@ -681,6 +687,7 @@ __hists__add_entry(struct hists *hists,
 			.dev = ns ? ns->link_info[CGROUP_NS_INDEX].dev : 0,
 			.ino = ns ? ns->link_info[CGROUP_NS_INDEX].ino : 0,
 		},
+		.cgroup = sample->cgroup,
 		.ms = {
 			.map	= al->map,
 			.sym	= al->sym,
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 83d5fc15429c..1e5c4087391b 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -37,6 +37,7 @@ enum hist_column {
 	HISTC_THREAD,
 	HISTC_COMM,
 	HISTC_CGROUP_ID,
+	HISTC_CGROUP,
 	HISTC_PARENT,
 	HISTC_CPU,
 	HISTC_SOCKET,
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 83eb3fa6f941..5524b10d7d38 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -20,6 +20,7 @@
 #include "mem-events.h"
 #include "annotate.h"
 #include "time-utils.h"
+#include "cgroup.h"
 #include <linux/kernel.h>
 
 regex_t		parent_regex;
@@ -627,6 +628,35 @@ struct sort_entry sort_cgroup_id = {
 	.se_width_idx	= HISTC_CGROUP_ID,
 };
 
+/* --sort cgroup */
+
+static int64_t
+sort__cgroup_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return right->cgroup - left->cgroup;
+}
+
+static int hist_entry__cgroup_snprintf(struct hist_entry *he,
+				       char *bf, size_t size,
+				       unsigned int width __maybe_unused)
+{
+	const char *cgrp_name = "N/A";
+
+	if (he->cgroup) {
+		struct cgroup *cgrp = cgroup__findnew(he->cgroup, cgrp_name);
+		cgrp_name = cgrp->name;
+	}
+
+	return repsep_snprintf(bf, size, "%s", cgrp_name);
+}
+
+struct sort_entry sort_cgroup = {
+	.se_header      = "cgroup",
+	.se_cmp	        = sort__cgroup_cmp,
+	.se_snprintf    = hist_entry__cgroup_snprintf,
+	.se_width_idx	= HISTC_CGROUP,
+};
+
 /* --sort socket */
 
 static int64_t
@@ -1667,6 +1697,7 @@ static struct sort_dimension common_sort_dimensions[] = {
 	DIM(SORT_TRACE, "trace", sort_trace),
 	DIM(SORT_SYM_SIZE, "symbol_size", sort_sym_size),
 	DIM(SORT_DSO_SIZE, "dso_size", sort_dso_size),
+	DIM(SORT_CGROUP, "cgroup", sort_cgroup),
 	DIM(SORT_CGROUP_ID, "cgroup_id", sort_cgroup_id),
 	DIM(SORT_SYM_IPC_NULL, "ipc_null", sort_sym_ipc_null),
 	DIM(SORT_TIME, "time", sort_time),
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 3d7cef73924c..571092136e3b 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -98,6 +98,7 @@ struct hist_entry {
 	struct thread		*thread;
 	struct comm		*comm;
 	struct namespace_id	cgroup_id;
+	u64			cgroup;
 	u64			ip;
 	u64			transaction;
 	s32			socket;
@@ -219,6 +220,7 @@ enum sort_type {
 	SORT_TRACE,
 	SORT_SYM_SIZE,
 	SORT_DSO_SIZE,
+	SORT_CGROUP,
 	SORT_CGROUP_ID,
 	SORT_SYM_IPC_NULL,
 	SORT_TIME,
-- 
2.23.0.187.g17f5b7556c-goog


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 6/9] perf record: Support synthesizing cgroup events
  2019-08-28  7:31 [PATCHSET 0/9] perf: Improve cgroup profiling (v1) Namhyung Kim
                   ` (4 preceding siblings ...)
  2019-08-28  7:31 ` [PATCH 5/9] perf report: Add 'cgroup' sort key Namhyung Kim
@ 2019-08-28  7:31 ` Namhyung Kim
  2019-08-28  7:31 ` [PATCH 7/9] perf record: Add --all-cgroups option Namhyung Kim
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 43+ messages in thread
From: Namhyung Kim @ 2019-08-28  7:31 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo
  Cc: LKML, Jiri Olsa, Alexander Shishkin, Stephane Eranian

Synthesize cgroup events by iterating over the cgroup filesystem
directories.  Each cgroup event saves only the inode number and the
portion of the cgroup path after the mount point.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-record.c |   5 ++
 tools/perf/util/cgroup.c    |   3 +-
 tools/perf/util/cgroup.h    |   1 +
 tools/perf/util/event.c     | 115 ++++++++++++++++++++++++++++++++++++
 tools/perf/util/event.h     |   4 ++
 tools/perf/util/tool.h      |   1 +
 6 files changed, 127 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 359bb8f33e57..a6e3c4413b39 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1318,6 +1318,11 @@ static int record__synthesize(struct record *rec, bool tail)
 	if (err < 0)
 		pr_warning("Couldn't synthesize bpf events.\n");
 
+	err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
+					     machine);
+	if (err < 0)
+		pr_warning("Couldn't synthesize cgroup events.\n");
+
 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
 					    process_synthesized_event, opts->sample_address,
 					    1);
diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c
index 8e4c26ea5078..274f0f29c72d 100644
--- a/tools/perf/util/cgroup.c
+++ b/tools/perf/util/cgroup.c
@@ -14,8 +14,7 @@ int nr_cgroups;
 
 static struct rb_root cgroup_tree = RB_ROOT;
 
-static int
-cgroupfs_find_mountpoint(char *buf, size_t maxlen)
+int cgroupfs_find_mountpoint(char *buf, size_t maxlen)
 {
 	FILE *fp;
 	char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1];
diff --git a/tools/perf/util/cgroup.h b/tools/perf/util/cgroup.h
index 11a8b187ec09..755f9712eda4 100644
--- a/tools/perf/util/cgroup.h
+++ b/tools/perf/util/cgroup.h
@@ -17,6 +17,7 @@ struct cgroup {
 
 extern int nr_cgroups; /* number of explicit cgroups defined */
 
+int cgroupfs_find_mountpoint(char *buf, size_t maxlen);
 struct cgroup *cgroup__get(struct cgroup *cgroup);
 void cgroup__put(struct cgroup *cgroup);
 
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index c19b00c1fc26..9e71b9561f72 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -29,6 +29,7 @@
 #include "stat.h"
 #include "session.h"
 #include "bpf-event.h"
+#include "cgroup.h"
 
 #define DEFAULT_PROC_MAP_PARSE_TIMEOUT 500
 
@@ -296,6 +297,120 @@ int perf_event__synthesize_namespaces(struct perf_tool *tool,
 	return 0;
 }
 
+static int perf_event__synthesize_cgroup(struct perf_tool *tool,
+					 union perf_event *event,
+					 char *path, size_t mount_len,
+					 perf_event__handler_t process,
+					 struct machine *machine)
+{
+	size_t event_size = sizeof(event->cgroup) - sizeof(event->cgroup.path);
+	size_t path_len = strlen(path) - mount_len + 1;
+	struct stat64 stbuf;
+
+	while (path_len % sizeof(u64))
+		path[mount_len + path_len++] = '\0';
+
+	memset(&event->cgroup, 0, event_size);
+
+	event->cgroup.header.type = PERF_RECORD_CGROUP;
+	event->cgroup.header.size = event_size + path_len + machine->id_hdr_size;
+
+	if (stat64(path, &stbuf) < 0) {
+		pr_debug("stat failed: %s\n", path);
+		return -1;
+	}
+
+	event->cgroup.ino = stbuf.st_ino;
+	event->cgroup.path_len = path_len;
+	strncpy(event->cgroup.path, path + mount_len, path_len);
+	memset(event->cgroup.path + path_len, 0, machine->id_hdr_size);
+
+	if (perf_tool__process_synth_event(tool, event, machine, process) < 0) {
+		pr_debug("process synth event failed\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static int perf_event__walk_cgroup_tree(struct perf_tool *tool,
+					union perf_event *event,
+					char *path, size_t mount_len,
+					perf_event__handler_t process,
+					struct machine *machine)
+{
+	size_t pos = strlen(path);
+	DIR *d;
+	struct dirent *dent;
+	int ret = 0;
+
+	if (perf_event__synthesize_cgroup(tool, event, path, mount_len,
+					  process, machine) < 0)
+		return -1;
+
+	d = opendir(path);
+	if (d == NULL) {
+		pr_debug("failed to open directory: %s\n", path);
+		return -1;
+	}
+
+	while ((dent = readdir(d)) != NULL) {
+		if (dent->d_type != DT_DIR)
+			continue;
+		if (!strcmp(dent->d_name, ".") ||
+		    !strcmp(dent->d_name, ".."))
+			continue;
+
+		if (path[pos - 1] != '/')
+			strcat(path, "/");
+		strcat(path, dent->d_name);
+
+		ret = perf_event__walk_cgroup_tree(tool, event, path,
+						   mount_len, process, machine);
+		if (ret < 0)
+			break;
+
+		path[pos] = '\0';
+	}
+
+	closedir(d);
+	return ret;
+}
+
+int perf_event__synthesize_cgroups(struct perf_tool *tool,
+				   perf_event__handler_t process,
+				   struct machine *machine)
+{
+	union perf_event event;
+	char *cgrp_root;
+	size_t mount_len;  /* length of mount point in the path */
+	int ret = -1;
+
+	cgrp_root = malloc(PATH_MAX);
+	if (cgrp_root == NULL)
+		return -1;
+
+	if (cgroupfs_find_mountpoint(cgrp_root, PATH_MAX) < 0) {
+		pr_debug("cannot find cgroup mount point\n");
+		goto out;
+	}
+
+	mount_len = strlen(cgrp_root);
+	/* make sure the path starts with a slash (after mount point) */
+	strcat(cgrp_root, "/");
+
+	if (perf_event__walk_cgroup_tree(tool, &event, cgrp_root, mount_len,
+					 process, machine) < 0)
+		goto out;
+
+	ret = 0;
+
+out:
+	free(cgrp_root);
+
+	return ret;
+}
+
 static int perf_event__synthesize_fork(struct perf_tool *tool,
 				       union perf_event *event,
 				       pid_t pid, pid_t tgid, pid_t ppid,
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 0170435fd1e8..b4c4da69a771 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -735,6 +735,10 @@ int perf_event__synthesize_namespaces(struct perf_tool *tool,
 				      perf_event__handler_t process,
 				      struct machine *machine);
 
+int perf_event__synthesize_cgroups(struct perf_tool *tool,
+				   perf_event__handler_t process,
+				   struct machine *machine);
+
 int perf_event__synthesize_mmap_events(struct perf_tool *tool,
 				       union perf_event *event,
 				       pid_t pid, pid_t tgid,
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index 472ef5eb4068..3fb67bd31e4a 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -79,6 +79,7 @@ struct perf_tool {
 	bool		ordered_events;
 	bool		ordering_requires_timestamps;
 	bool		namespace_events;
+	bool		cgroup_events;
 	bool		no_warn;
 	enum show_feature_header show_feat_hdr;
 };
-- 
2.23.0.187.g17f5b7556c-goog


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 7/9] perf record: Add --all-cgroups option
  2019-08-28  7:31 [PATCHSET 0/9] perf: Improve cgroup profiling (v1) Namhyung Kim
                   ` (5 preceding siblings ...)
  2019-08-28  7:31 ` [PATCH 6/9] perf record: Support synthesizing cgroup events Namhyung Kim
@ 2019-08-28  7:31 ` Namhyung Kim
  2019-08-28  7:31 ` [PATCH 8/9] perf top: " Namhyung Kim
  2019-08-28  7:31 ` [PATCH 9/9] perf script: Add --show-cgroup-events option Namhyung Kim
  8 siblings, 0 replies; 43+ messages in thread
From: Namhyung Kim @ 2019-08-28  7:31 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo
  Cc: LKML, Jiri Olsa, Alexander Shishkin, Stephane Eranian

The --all-cgroups option enables cgroup profiling support.  It tells
the kernel to record CGROUP events in the ring buffer so that perf
report can identify the task/cgroup association later.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/Documentation/perf-record.txt | 5 ++++-
 tools/perf/builtin-record.c              | 5 +++++
 tools/perf/util/evsel.c                  | 5 +++++
 tools/perf/util/record.h                 | 1 +
 4 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index c6f9f31b6039..fa2e83f72461 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -382,7 +382,10 @@ displayed with the weight and local_weight sort keys.  This currently works for
 abort events and some memory events in precise mode on modern Intel CPUs.
 
 --namespaces::
-Record events of type PERF_RECORD_NAMESPACES.
+Record events of type PERF_RECORD_NAMESPACES.  This enables 'cgroup_id' sort key.
+
+--all-cgroups::
+Record events of type PERF_RECORD_CGROUP.  This enables 'cgroup' sort key.
 
 --transaction::
 Record transaction flags for transaction related events.
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index a6e3c4413b39..918af5e9f05d 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1354,6 +1354,9 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 	if (rec->opts.record_namespaces)
 		tool->namespace_events = true;
 
+	if (rec->opts.record_cgroup)
+		tool->cgroup_events = true;
+
 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
 		signal(SIGUSR2, snapshot_sig_handler);
 		if (rec->opts.auxtrace_snapshot_mode)
@@ -2215,6 +2218,8 @@ static struct option __record_options[] = {
 			"per thread proc mmap processing timeout in ms"),
 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
 		    "Record namespaces events"),
+	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
+		    "Record cgroup events"),
 	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
 		    "Record context switch events"),
 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 86a38679cad1..390ecb554446 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1077,6 +1077,11 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts,
 	if (opts->record_namespaces)
 		attr->namespaces  = track;
 
+	if (opts->record_cgroup) {
+		attr->cgroup = track;
+		perf_evsel__set_sample_bit(evsel, CGROUP);
+	}
+
 	if (opts->record_switch_events)
 		attr->context_switch = track;
 
diff --git a/tools/perf/util/record.h b/tools/perf/util/record.h
index 00275afc524d..740d110fc770 100644
--- a/tools/perf/util/record.h
+++ b/tools/perf/util/record.h
@@ -33,6 +33,7 @@ struct record_opts {
 	bool	      auxtrace_snapshot_mode;
 	bool	      auxtrace_snapshot_on_exit;
 	bool	      record_namespaces;
+	bool	      record_cgroup;
 	bool	      record_switch_events;
 	bool	      all_kernel;
 	bool	      all_user;
-- 
2.23.0.187.g17f5b7556c-goog


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 8/9] perf top: Add --all-cgroups option
  2019-08-28  7:31 [PATCHSET 0/9] perf: Improve cgroup profiling (v1) Namhyung Kim
                   ` (6 preceding siblings ...)
  2019-08-28  7:31 ` [PATCH 7/9] perf record: Add --all-cgroups option Namhyung Kim
@ 2019-08-28  7:31 ` Namhyung Kim
  2019-08-28  7:31 ` [PATCH 9/9] perf script: Add --show-cgroup-events option Namhyung Kim
  8 siblings, 0 replies; 43+ messages in thread
From: Namhyung Kim @ 2019-08-28  7:31 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo
  Cc: LKML, Jiri Olsa, Alexander Shishkin, Stephane Eranian

The --all-cgroups option enables cgroup profiling support.  It tells
the kernel to record CGROUP events in the ring buffer so that perf
top can identify the task/cgroup association.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/Documentation/perf-top.txt | 4 ++++
 tools/perf/builtin-top.c              | 9 +++++++++
 2 files changed, 13 insertions(+)

diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index 5596129a71cf..c75507f50071 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -266,6 +266,10 @@ Default is to monitor all CPUS.
 	Record events of type PERF_RECORD_NAMESPACES and display it with the
 	'cgroup_id' sort key.
 
+--all-cgroups::
+	Record events of type PERF_RECORD_CGROUP and display it with the
+	'cgroup' sort key.
+
 --switch-on EVENT_NAME::
 	Only consider events after this event is found.
 
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 5970723cd55a..f07b43c12461 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1242,6 +1242,8 @@ static int __cmd_top(struct perf_top *top)
 
 	if (opts->record_namespaces)
 		top->tool.namespace_events = true;
+	if (opts->record_cgroup)
+		top->tool.cgroup_events = true;
 
 	ret = perf_event__synthesize_bpf_events(top->session, perf_event__process,
 						&top->session->machines.host,
@@ -1249,6 +1251,11 @@ static int __cmd_top(struct perf_top *top)
 	if (ret < 0)
 		pr_debug("Couldn't synthesize BPF events: Pre-existing BPF programs won't have symbols resolved.\n");
 
+	ret = perf_event__synthesize_cgroups(&top->tool, perf_event__process,
+					     &top->session->machines.host);
+	if (ret < 0)
+		pr_debug("Couldn't synthesize cgroup events.\n");
+
 	machine__synthesize_threads(&top->session->machines.host, &opts->target,
 				    top->evlist->core.threads, false,
 				    top->nr_threads_synthesize);
@@ -1537,6 +1544,8 @@ int cmd_top(int argc, const char **argv)
 			"number of thread to run event synthesize"),
 	OPT_BOOLEAN(0, "namespaces", &opts->record_namespaces,
 		    "Record namespaces events"),
+	OPT_BOOLEAN(0, "all-cgroups", &opts->record_cgroup,
+		    "Record cgroup events"),
 	OPTS_EVSWITCH(&top.evswitch),
 	OPT_END()
 	};
-- 
2.23.0.187.g17f5b7556c-goog


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 9/9] perf script: Add --show-cgroup-events option
  2019-08-28  7:31 [PATCHSET 0/9] perf: Improve cgroup profiling (v1) Namhyung Kim
                   ` (7 preceding siblings ...)
  2019-08-28  7:31 ` [PATCH 8/9] perf top: " Namhyung Kim
@ 2019-08-28  7:31 ` Namhyung Kim
  8 siblings, 0 replies; 43+ messages in thread
From: Namhyung Kim @ 2019-08-28  7:31 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo
  Cc: LKML, Jiri Olsa, Alexander Shishkin, Stephane Eranian

The --show-cgroup-events option prints CGROUP events in the output,
like the other --show-*-events options do for their event types.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/Documentation/perf-script.txt |  3 ++
 tools/perf/builtin-script.c              | 41 ++++++++++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 2599b057e47b..3dd297600427 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -319,6 +319,9 @@ OPTIONS
 --show-bpf-events
 	Display bpf events i.e. events of type PERF_RECORD_KSYMBOL and PERF_RECORD_BPF_EVENT.
 
+--show-cgroup-events
+	Display cgroup events i.e. events of type PERF_RECORD_CGROUP.
+
 --demangle::
 	Demangle symbol names to human readable form. It's enabled by default,
 	disable with --no-demangle.
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 51e7e6d0eee6..9017889084bd 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -1629,6 +1629,7 @@ struct perf_script {
 	bool			show_lost_events;
 	bool			show_round_events;
 	bool			show_bpf_events;
+	bool			show_cgroup_events;
 	bool			allocated;
 	bool			per_event_dump;
 	struct evswitch		evswitch;
@@ -2146,6 +2147,41 @@ static int process_namespaces_event(struct perf_tool *tool,
 	return ret;
 }
 
+static int process_cgroup_event(struct perf_tool *tool,
+				union perf_event *event,
+				struct perf_sample *sample,
+				struct machine *machine)
+{
+	struct thread *thread;
+	struct perf_script *script = container_of(tool, struct perf_script, tool);
+	struct perf_session *session = script->session;
+	struct evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id);
+	int ret = -1;
+
+	thread = machine__findnew_thread(machine, sample->pid, sample->tid);
+	if (thread == NULL) {
+		pr_debug("problem processing CGROUP event, skipping it.\n");
+		return -1;
+	}
+
+	if (perf_event__process_cgroup(tool, event, sample, machine) < 0)
+		goto out;
+
+	if (!evsel->core.attr.sample_id_all) {
+		sample->cpu = 0;
+		sample->time = 0;
+	}
+	if (!filter_cpu(sample)) {
+		perf_sample__fprintf_start(sample, thread, evsel,
+					   PERF_RECORD_CGROUP, stdout);
+		perf_event__fprintf(event, stdout);
+	}
+	ret = 0;
+out:
+	thread__put(thread);
+	return ret;
+}
+
 static int process_fork_event(struct perf_tool *tool,
 			      union perf_event *event,
 			      struct perf_sample *sample,
@@ -2485,6 +2521,8 @@ static int __cmd_script(struct perf_script *script)
 		script->tool.context_switch = process_switch_event;
 	if (script->show_namespace_events)
 		script->tool.namespaces = process_namespaces_event;
+	if (script->show_cgroup_events)
+		script->tool.cgroup = process_cgroup_event;
 	if (script->show_lost_events)
 		script->tool.lost = process_lost_event;
 	if (script->show_round_events) {
@@ -3410,6 +3448,7 @@ int cmd_script(int argc, const char **argv)
 			.mmap2		 = perf_event__process_mmap2,
 			.comm		 = perf_event__process_comm,
 			.namespaces	 = perf_event__process_namespaces,
+			.cgroup		 = perf_event__process_cgroup,
 			.exit		 = perf_event__process_exit,
 			.fork		 = perf_event__process_fork,
 			.attr		 = process_attr,
@@ -3510,6 +3549,8 @@ int cmd_script(int argc, const char **argv)
 		    "Show context switch events (if recorded)"),
 	OPT_BOOLEAN('\0', "show-namespace-events", &script.show_namespace_events,
 		    "Show namespace events (if recorded)"),
+	OPT_BOOLEAN('\0', "show-cgroup-events", &script.show_cgroup_events,
+		    "Show cgroup events (if recorded)"),
 	OPT_BOOLEAN('\0', "show-lost-events", &script.show_lost_events,
 		    "Show lost events (if recorded)"),
 	OPT_BOOLEAN('\0', "show-round-events", &script.show_round_events,
-- 
2.23.0.187.g17f5b7556c-goog


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event
  2019-08-28  7:31 ` [PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event Namhyung Kim
@ 2019-08-28  9:44   ` Peter Zijlstra
  2019-08-28  9:44   ` Peter Zijlstra
  2019-08-28 14:48   ` Tejun Heo
  2 siblings, 0 replies; 43+ messages in thread
From: Peter Zijlstra @ 2019-08-28  9:44 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, LKML, Jiri Olsa,
	Alexander Shishkin, Stephane Eranian, Tejun Heo, Li Zefan,
	Johannes Weiner, Adrian Hunter

On Wed, Aug 28, 2019 at 04:31:22PM +0900, Namhyung Kim wrote:
> To support cgroup tracking, add CGROUP event to save a link between
> cgroup path and inode number.  The attr.cgroup bit was also added to
> enable cgroup tracking from userspace.
> 
> This event will be generated when a new cgroup becomes active.
> Userspace might need to synthesize those events for existing cgroups.
> 
> As aux_output change is also going on, I just added the bit here as
> well to remove possible conflicts later.
> 
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Li Zefan <lizefan@huawei.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Adrian Hunter <adrian.hunter@intel.com>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
>  include/uapi/linux/perf_event.h |  15 ++++-
>  kernel/events/core.c            | 112 ++++++++++++++++++++++++++++++++
>  2 files changed, 126 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 7198ddd0c6b1..cb07c24b715f 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -374,7 +374,9 @@ struct perf_event_attr {
>  				namespaces     :  1, /* include namespaces data */
>  				ksymbol        :  1, /* include ksymbol events */
>  				bpf_event      :  1, /* include bpf events */
> -				__reserved_1   : 33;
> +				aux_output     :  1, /* generate AUX records instead of events */
> +				cgroup         :  1, /* include cgroup events */
> +				__reserved_1   : 31;

That looks like a rebase fail, aux_output is not from these patches.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event
  2019-08-28  7:31 ` [PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event Namhyung Kim
  2019-08-28  9:44   ` Peter Zijlstra
@ 2019-08-28  9:44   ` Peter Zijlstra
  2019-08-28 13:13     ` Arnaldo Carvalho de Melo
  2019-08-30  3:46     ` Namhyung Kim
  2019-08-28 14:48   ` Tejun Heo
  2 siblings, 2 replies; 43+ messages in thread
From: Peter Zijlstra @ 2019-08-28  9:44 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, LKML, Jiri Olsa,
	Alexander Shishkin, Stephane Eranian, Tejun Heo, Li Zefan,
	Johannes Weiner, Adrian Hunter

On Wed, Aug 28, 2019 at 04:31:22PM +0900, Namhyung Kim wrote:
> To support cgroup tracking, add CGROUP event to save a link between
> cgroup path and inode number.  The attr.cgroup bit was also added to
> enable cgroup tracking from userspace.
> 
> This event will be generated when a new cgroup becomes active.
> Userspace might need to synthesize those events for existing cgroups.
> 
> As aux_output change is also going on, I just added the bit here as
> well to remove possible conflicts later.

Why do we want this?


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event
  2019-08-28  9:44   ` Peter Zijlstra
@ 2019-08-28 13:13     ` Arnaldo Carvalho de Melo
  2019-08-30  3:46     ` Namhyung Kim
  1 sibling, 0 replies; 43+ messages in thread
From: Arnaldo Carvalho de Melo @ 2019-08-28 13:13 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Namhyung Kim, Ingo Molnar, LKML, Jiri Olsa, Alexander Shishkin,
	Stephane Eranian, Tejun Heo, Li Zefan, Johannes Weiner,
	Adrian Hunter

Em Wed, Aug 28, 2019 at 11:44:59AM +0200, Peter Zijlstra escreveu:
> On Wed, Aug 28, 2019 at 04:31:22PM +0900, Namhyung Kim wrote:
> > To support cgroup tracking, add CGROUP event to save a link between
> > cgroup path and inode number.  The attr.cgroup bit was also added to
> > enable cgroup tracking from userspace.
> > 
> > This event will be generated when a new cgroup becomes active.
> > Userspace might need to synthesize those events for existing cgroups.
> > 
> > As aux_output change is also going on, I just added the bit here as
> > well to remove possible conflicts later.
> 
> Why do we want this?

Yeah, I'd resolve the conflict later, to avoid mixing things like this.

- Arnaldo

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event
  2019-08-28  7:31 ` [PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event Namhyung Kim
  2019-08-28  9:44   ` Peter Zijlstra
  2019-08-28  9:44   ` Peter Zijlstra
@ 2019-08-28 14:48   ` Tejun Heo
  2019-08-30  3:56     ` Namhyung Kim
  2 siblings, 1 reply; 43+ messages in thread
From: Tejun Heo @ 2019-08-28 14:48 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo, LKML,
	Jiri Olsa, Alexander Shishkin, Stephane Eranian, Li Zefan,
	Johannes Weiner, Adrian Hunter

Hello, Namhyung.

On Wed, Aug 28, 2019 at 04:31:22PM +0900, Namhyung Kim wrote:
> +	 * struct {
> +	 *	struct perf_event_header	header;
> +	 *	u64				ino;
> +	 *	u64				path_len;
> +	 *	char				path[];

ino and path aren't great identifiers for cgroups because both can get
recycled pretty quickly.  Can you please take a look at
KERNFS_ROOT_SUPPORT_EXPORTOP?  That's the fhandle that cgroup uses,
currently the standard ino+gen which isn't ideal but good enough.
Another benefit is that the path can also be resolved efficiently from
userspace given the handle.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-08-28  7:31 ` [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature Namhyung Kim
@ 2019-08-28 14:49   ` Tejun Heo
  2019-08-31  3:03     ` Namhyung Kim
  0 siblings, 1 reply; 43+ messages in thread
From: Tejun Heo @ 2019-08-28 14:49 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo, LKML,
	Jiri Olsa, Alexander Shishkin, Stephane Eranian, Li Zefan,
	Johannes Weiner

On Wed, Aug 28, 2019 at 04:31:23PM +0900, Namhyung Kim wrote:
> @@ -958,6 +958,7 @@ struct perf_sample_data {
>  	u64				stack_user_size;
>  
>  	u64				phys_addr;
> +	u64				cgroup;

Ditto, please use fhandle as the identifier.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event
  2019-08-28  9:44   ` Peter Zijlstra
  2019-08-28 13:13     ` Arnaldo Carvalho de Melo
@ 2019-08-30  3:46     ` Namhyung Kim
  2019-08-30  7:34       ` Peter Zijlstra
  1 sibling, 1 reply; 43+ messages in thread
From: Namhyung Kim @ 2019-08-30  3:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, LKML, Jiri Olsa,
	Alexander Shishkin, Stephane Eranian, Tejun Heo, Li Zefan,
	Johannes Weiner, Adrian Hunter

Hi Peter,

On Wed, Aug 28, 2019 at 6:45 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Aug 28, 2019 at 04:31:22PM +0900, Namhyung Kim wrote:
> > To support cgroup tracking, add CGROUP event to save a link between
> > cgroup path and inode number.  The attr.cgroup bit was also added to
> > enable cgroup tracking from userspace.
> >
> > This event will be generated when a new cgroup becomes active.
> > Userspace might need to synthesize those events for existing cgroups.
> >
> > As aux_output change is also going on, I just added the bit here as
> > well to remove possible conflicts later.
>
> Why do we want this?

I saw [1] below and thought you had the patch that introduces aux_output
and that it was going to be merged soon.
Also the tooling patches are already in the acme/perf/core
so I just wanted to avoid conflicts.

Anyway, I'm ok with changing it.  Will remove in v2.

Thanks
Namhyung

[1] https://lkml.org/lkml/2019/8/6/586

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event
  2019-08-28 14:48   ` Tejun Heo
@ 2019-08-30  3:56     ` Namhyung Kim
  0 siblings, 0 replies; 43+ messages in thread
From: Namhyung Kim @ 2019-08-30  3:56 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo, LKML,
	Jiri Olsa, Alexander Shishkin, Stephane Eranian, Li Zefan,
	Johannes Weiner, Adrian Hunter

Hi Tejun,

On Wed, Aug 28, 2019 at 11:48 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello, Namhyung.
>
> On Wed, Aug 28, 2019 at 04:31:22PM +0900, Namhyung Kim wrote:
> > +      * struct {
> > +      *      struct perf_event_header        header;
> > +      *      u64                             ino;
> > +      *      u64                             path_len;
> > +      *      char                            path[];
>
> ino and path aren't great identifers for cgroups because both can get
> recycled pretty quickly.  Can you please take a look at
> KERNFS_ROOT_SUPPORT_EXPORTOP?  That's the fhandle that cgroup uses,
> currently the standard ino+gen which isn't ideal but good enough.
> Another benefit is that the path can also be resolved efficiently from
> userspace given the handle.

Thanks for the info, I'll take a look at it.
Namhyung

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event
  2019-08-30  3:46     ` Namhyung Kim
@ 2019-08-30  7:34       ` Peter Zijlstra
  2019-08-30 22:49         ` Namhyung Kim
  0 siblings, 1 reply; 43+ messages in thread
From: Peter Zijlstra @ 2019-08-30  7:34 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, LKML, Jiri Olsa,
	Alexander Shishkin, Stephane Eranian, Tejun Heo, Li Zefan,
	Johannes Weiner, Adrian Hunter

On Fri, Aug 30, 2019 at 12:46:51PM +0900, Namhyung Kim wrote:
> Hi Peter,
> 
> On Wed, Aug 28, 2019 at 6:45 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Wed, Aug 28, 2019 at 04:31:22PM +0900, Namhyung Kim wrote:
> > > To support cgroup tracking, add CGROUP event to save a link between
> > > cgroup path and inode number.  The attr.cgroup bit was also added to
> > > enable cgroup tracking from userspace.
> > >
> > > This event will be generated when a new cgroup becomes active.
> > > Userspace might need to synthesize those events for existing cgroups.
> > >
> > > As aux_output change is also going on, I just added the bit here as
> > > well to remove possible conflicts later.
> >
> > Why do we want this?
> 
> I saw below [1] and thought you have the patch introduced aux_output
> and it's gonna to be merged soon.
> Also the tooling patches are already in the acme/perf/core
> so I just wanted to avoid conflicts.
> 
> Anyway, I'm ok with changing it.  Will remove in v2.

I seem to have confused both you and Arnaldo with this. This email
questions the Changelog as a whole, not just the aux thing (I sent a
separate email for that).

This Changelog utterly fails to explain to me _why_ we need/want cgroup
tracking. So why do I want to review and possibly merge this? Changelog
needs to answer this.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 3/9] perf tools: Basic support for CGROUP event
  2019-08-28  7:31 ` [PATCH 3/9] perf tools: Basic support for CGROUP event Namhyung Kim
@ 2019-08-30 12:55   ` Arnaldo Carvalho de Melo
  2019-08-30 22:51     ` Namhyung Kim
  0 siblings, 1 reply; 43+ messages in thread
From: Arnaldo Carvalho de Melo @ 2019-08-30 12:55 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Ingo Molnar, Peter Zijlstra, LKML, Jiri Olsa, Alexander Shishkin,
	Stephane Eranian, Adrian Hunter

Em Wed, Aug 28, 2019 at 04:31:24PM +0900, Namhyung Kim escreveu:
> Implement basic functionality to support cgroup tracking.  Each cgroup
> can be identified by inode number which can be read from userspace
> too.  The actual cgroup processing will come in the later patch.
> 
> Cc: Adrian Hunter <adrian.hunter@intel.com>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
>  tools/include/uapi/linux/perf_event.h | 17 +++++++++++++++--

Please try to have the synchronization of tools/include/ with the kernel
done in a separate patch.  There are a number of projects, such as
libbpf and in time libperf, that will have a mirror repo outside the
kernel tree and that will get just the needed csets.

- Arnaldo

>  tools/perf/builtin-diff.c             |  1 +
>  tools/perf/builtin-report.c           |  1 +
>  tools/perf/lib/include/perf/event.h   |  7 +++++++
>  tools/perf/util/event.c               | 18 ++++++++++++++++++
>  tools/perf/util/event.h               |  7 +++++++
>  tools/perf/util/evsel.c               |  7 +++++++
>  tools/perf/util/machine.c             | 12 ++++++++++++
>  tools/perf/util/machine.h             |  3 +++
>  tools/perf/util/session.c             |  4 ++++
>  tools/perf/util/tool.h                |  1 +
>  11 files changed, 76 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
> index bb7b271397a6..91a552bf9611 100644
> --- a/tools/include/uapi/linux/perf_event.h
> +++ b/tools/include/uapi/linux/perf_event.h
> @@ -141,8 +141,9 @@ enum perf_event_sample_format {
>  	PERF_SAMPLE_TRANSACTION			= 1U << 17,
>  	PERF_SAMPLE_REGS_INTR			= 1U << 18,
>  	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
> +	PERF_SAMPLE_CGROUP			= 1U << 20,
>  
> -	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
> +	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
>  
>  	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
>  };
> @@ -375,7 +376,8 @@ struct perf_event_attr {
>  				ksymbol        :  1, /* include ksymbol events */
>  				bpf_event      :  1, /* include bpf events */
>  				aux_output     :  1, /* generate AUX records instead of events */
> -				__reserved_1   : 32;
> +				cgroup         :  1, /* include cgroup events */
> +				__reserved_1   : 31;
>  
>  	union {
>  		__u32		wakeup_events;	  /* wakeup every n events */
> @@ -1000,6 +1002,17 @@ enum perf_event_type {
>  	 */
>  	PERF_RECORD_BPF_EVENT			= 18,
>  
> +	/*
> +	 * struct {
> +	 *	struct perf_event_header	header;
> +	 *	u64				ino;
> +	 *	u64				path_len;
> +	 *	char				path[];
> +	 *	struct sample_id		sample_id;
> +	 * };
> +	 */
> +	PERF_RECORD_CGROUP			= 19,
> +
>  	PERF_RECORD_MAX,			/* non-ABI */
>  };
>  
> diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
> index 51c37e53b3d8..ec639340bb65 100644
> --- a/tools/perf/builtin-diff.c
> +++ b/tools/perf/builtin-diff.c
> @@ -445,6 +445,7 @@ static struct perf_diff pdiff = {
>  		.fork	= perf_event__process_fork,
>  		.lost	= perf_event__process_lost,
>  		.namespaces = perf_event__process_namespaces,
> +		.cgroup = perf_event__process_cgroup,
>  		.ordered_events = true,
>  		.ordering_requires_timestamps = true,
>  	},
> diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> index 318b0b95c14c..c65a59fd2a94 100644
> --- a/tools/perf/builtin-report.c
> +++ b/tools/perf/builtin-report.c
> @@ -1033,6 +1033,7 @@ int cmd_report(int argc, const char **argv)
>  			.mmap2		 = perf_event__process_mmap2,
>  			.comm		 = perf_event__process_comm,
>  			.namespaces	 = perf_event__process_namespaces,
> +			.cgroup		 = perf_event__process_cgroup,
>  			.exit		 = perf_event__process_exit,
>  			.fork		 = perf_event__process_fork,
>  			.lost		 = perf_event__process_lost,
> diff --git a/tools/perf/lib/include/perf/event.h b/tools/perf/lib/include/perf/event.h
> index 36ad3a4a79e6..ec112ad640a1 100644
> --- a/tools/perf/lib/include/perf/event.h
> +++ b/tools/perf/lib/include/perf/event.h
> @@ -104,6 +104,13 @@ struct perf_record_bpf_event {
>  	__u8			 tag[BPF_TAG_SIZE];  // prog tag
>  };
>  
> +struct perf_record_cgroup {
> +	struct perf_event_header header;
> +	__u64			 ino;
> +	__u64			 path_len;
> +	char			 path[PATH_MAX];
> +};
> +
>  struct perf_record_sample {
>  	struct perf_event_header header;
>  	__u64			 array[];
> diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
> index 33616ea62a47..c19b00c1fc26 100644
> --- a/tools/perf/util/event.c
> +++ b/tools/perf/util/event.c
> @@ -52,6 +52,7 @@ static const char *perf_event__names[] = {
>  	[PERF_RECORD_NAMESPACES]		= "NAMESPACES",
>  	[PERF_RECORD_KSYMBOL]			= "KSYMBOL",
>  	[PERF_RECORD_BPF_EVENT]			= "BPF_EVENT",
> +	[PERF_RECORD_CGROUP]			= "CGROUP",
>  	[PERF_RECORD_HEADER_ATTR]		= "ATTR",
>  	[PERF_RECORD_HEADER_EVENT_TYPE]		= "EVENT_TYPE",
>  	[PERF_RECORD_HEADER_TRACING_DATA]	= "TRACING_DATA",
> @@ -1279,6 +1280,12 @@ size_t perf_event__fprintf_namespaces(union perf_event *event, FILE *fp)
>  	return ret;
>  }
>  
> +size_t perf_event__fprintf_cgroup(union perf_event *event, FILE *fp)
> +{
> +	return fprintf(fp, " cgroup: %" PRI_lu64 " %s\n",
> +		       event->cgroup.ino, event->cgroup.path);
> +}
> +
>  int perf_event__process_comm(struct perf_tool *tool __maybe_unused,
>  			     union perf_event *event,
>  			     struct perf_sample *sample,
> @@ -1295,6 +1302,14 @@ int perf_event__process_namespaces(struct perf_tool *tool __maybe_unused,
>  	return machine__process_namespaces_event(machine, event, sample);
>  }
>  
> +int perf_event__process_cgroup(struct perf_tool *tool __maybe_unused,
> +			       union perf_event *event,
> +			       struct perf_sample *sample,
> +			       struct machine *machine)
> +{
> +	return machine__process_cgroup_event(machine, event, sample);
> +}
> +
>  int perf_event__process_lost(struct perf_tool *tool __maybe_unused,
>  			     union perf_event *event,
>  			     struct perf_sample *sample,
> @@ -1516,6 +1531,9 @@ size_t perf_event__fprintf(union perf_event *event, FILE *fp)
>  	case PERF_RECORD_NAMESPACES:
>  		ret += perf_event__fprintf_namespaces(event, fp);
>  		break;
> +	case PERF_RECORD_CGROUP:
> +		ret += perf_event__fprintf_cgroup(event, fp);
> +		break;
>  	case PERF_RECORD_MMAP2:
>  		ret += perf_event__fprintf_mmap2(event, fp);
>  		break;
> diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
> index 429a3fe52d6c..0170435fd1e8 100644
> --- a/tools/perf/util/event.h
> +++ b/tools/perf/util/event.h
> @@ -123,6 +123,7 @@ struct perf_sample {
>  	u32 raw_size;
>  	u64 data_src;
>  	u64 phys_addr;
> +	u64 cgroup;
>  	u32 flags;
>  	u16 insn_len;
>  	u8  cpumode;
> @@ -554,6 +555,7 @@ union perf_event {
>  	struct perf_record_mmap2	mmap2;
>  	struct perf_record_comm		comm;
>  	struct perf_record_namespaces	namespaces;
> +	struct perf_record_cgroup	cgroup;
>  	struct perf_record_fork		fork;
>  	struct perf_record_lost		lost;
>  	struct perf_record_lost_samples	lost_samples;
> @@ -663,6 +665,10 @@ int perf_event__process_namespaces(struct perf_tool *tool,
>  				   union perf_event *event,
>  				   struct perf_sample *sample,
>  				   struct machine *machine);
> +int perf_event__process_cgroup(struct perf_tool *tool,
> +			       union perf_event *event,
> +			       struct perf_sample *sample,
> +			       struct machine *machine);
>  int perf_event__process_mmap(struct perf_tool *tool,
>  			     union perf_event *event,
>  			     struct perf_sample *sample,
> @@ -750,6 +756,7 @@ size_t perf_event__fprintf_switch(union perf_event *event, FILE *fp);
>  size_t perf_event__fprintf_thread_map(union perf_event *event, FILE *fp);
>  size_t perf_event__fprintf_cpu_map(union perf_event *event, FILE *fp);
>  size_t perf_event__fprintf_namespaces(union perf_event *event, FILE *fp);
> +size_t perf_event__fprintf_cgroup(union perf_event *event, FILE *fp);
>  size_t perf_event__fprintf_ksymbol(union perf_event *event, FILE *fp);
>  size_t perf_event__fprintf_bpf(union perf_event *event, FILE *fp);
>  size_t perf_event__fprintf(union perf_event *event, FILE *fp);
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index fa676355559e..86a38679cad1 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -1593,6 +1593,7 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
>  	PRINT_ATTRf(ksymbol, p_unsigned);
>  	PRINT_ATTRf(bpf_event, p_unsigned);
>  	PRINT_ATTRf(aux_output, p_unsigned);
> +	PRINT_ATTRf(cgroup, p_unsigned);
>  
>  	PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned);
>  	PRINT_ATTRf(bp_type, p_unsigned);
> @@ -2369,6 +2370,12 @@ int perf_evsel__parse_sample(struct evsel *evsel, union perf_event *event,
>  		array++;
>  	}
>  
> +	data->cgroup = 0;
> +	if (type & PERF_SAMPLE_CGROUP) {
> +		data->cgroup = *array;
> +		array++;
> +	}
> +
>  	return 0;
>  }
>  
> diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> index 93483f1764d3..61c35eef616b 100644
> --- a/tools/perf/util/machine.c
> +++ b/tools/perf/util/machine.c
> @@ -642,6 +642,16 @@ int machine__process_namespaces_event(struct machine *machine __maybe_unused,
>  	return err;
>  }
>  
> +int machine__process_cgroup_event(struct machine *machine __maybe_unused,
> +				  union perf_event *event,
> +				  struct perf_sample *sample __maybe_unused)
> +{
> +	if (dump_trace)
> +		perf_event__fprintf_cgroup(event, stdout);
> +
> +	return 0;
> +}
> +
>  int machine__process_lost_event(struct machine *machine __maybe_unused,
>  				union perf_event *event, struct perf_sample *sample __maybe_unused)
>  {
> @@ -1902,6 +1912,8 @@ int machine__process_event(struct machine *machine, union perf_event *event,
>  		ret = machine__process_mmap_event(machine, event, sample); break;
>  	case PERF_RECORD_NAMESPACES:
>  		ret = machine__process_namespaces_event(machine, event, sample); break;
> +	case PERF_RECORD_CGROUP:
> +		ret = machine__process_cgroup_event(machine, event, sample); break;
>  	case PERF_RECORD_MMAP2:
>  		ret = machine__process_mmap2_event(machine, event, sample); break;
>  	case PERF_RECORD_FORK:
> diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
> index 7d69119d0b5d..608813ced0cd 100644
> --- a/tools/perf/util/machine.h
> +++ b/tools/perf/util/machine.h
> @@ -127,6 +127,9 @@ int machine__process_switch_event(struct machine *machine,
>  int machine__process_namespaces_event(struct machine *machine,
>  				      union perf_event *event,
>  				      struct perf_sample *sample);
> +int machine__process_cgroup_event(struct machine *machine,
> +				  union perf_event *event,
> +				  struct perf_sample *sample);
>  int machine__process_mmap_event(struct machine *machine, union perf_event *event,
>  				struct perf_sample *sample);
>  int machine__process_mmap2_event(struct machine *machine, union perf_event *event,
> diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
> index 5786e9c807c5..2cdce7ee228c 100644
> --- a/tools/perf/util/session.c
> +++ b/tools/perf/util/session.c
> @@ -457,6 +457,8 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
>  		tool->comm = process_event_stub;
>  	if (tool->namespaces == NULL)
>  		tool->namespaces = process_event_stub;
> +	if (tool->cgroup == NULL)
> +		tool->cgroup = process_event_stub;
>  	if (tool->fork == NULL)
>  		tool->fork = process_event_stub;
>  	if (tool->exit == NULL)
> @@ -1417,6 +1419,8 @@ static int machines__deliver_event(struct machines *machines,
>  		return tool->comm(tool, event, sample, machine);
>  	case PERF_RECORD_NAMESPACES:
>  		return tool->namespaces(tool, event, sample, machine);
> +	case PERF_RECORD_CGROUP:
> +		return tool->cgroup(tool, event, sample, machine);
>  	case PERF_RECORD_FORK:
>  		return tool->fork(tool, event, sample, machine);
>  	case PERF_RECORD_EXIT:
> diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
> index 2abbf668b8de..472ef5eb4068 100644
> --- a/tools/perf/util/tool.h
> +++ b/tools/perf/util/tool.h
> @@ -46,6 +46,7 @@ struct perf_tool {
>  			mmap2,
>  			comm,
>  			namespaces,
> +			cgroup,
>  			fork,
>  			exit,
>  			lost,
> -- 
> 2.23.0.187.g17f5b7556c-goog

-- 

- Arnaldo

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event
  2019-08-30  7:34       ` Peter Zijlstra
@ 2019-08-30 22:49         ` Namhyung Kim
  2019-08-30 23:52           ` Stephane Eranian
  0 siblings, 1 reply; 43+ messages in thread
From: Namhyung Kim @ 2019-08-30 22:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, LKML, Jiri Olsa,
	Alexander Shishkin, Stephane Eranian, Tejun Heo, Li Zefan,
	Johannes Weiner, Adrian Hunter

On Fri, Aug 30, 2019 at 4:35 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Aug 30, 2019 at 12:46:51PM +0900, Namhyung Kim wrote:
> > Hi Peter,
> >
> > On Wed, Aug 28, 2019 at 6:45 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Wed, Aug 28, 2019 at 04:31:22PM +0900, Namhyung Kim wrote:
> > > > To support cgroup tracking, add CGROUP event to save a link between
> > > > cgroup path and inode number.  The attr.cgroup bit was also added to
> > > > enable cgroup tracking from userspace.
> > > >
> > > > This event will be generated when a new cgroup becomes active.
> > > > Userspace might need to synthesize those events for existing cgroups.
> > > >
> > > > As aux_output change is also going on, I just added the bit here as
> > > > well to remove possible conflicts later.
> > >
> > > Why do we want this?
> >
> > I saw below [1] and thought you have the patch introduced aux_output
> > and it's gonna to be merged soon.
> > Also the tooling patches are already in the acme/perf/core
> > so I just wanted to avoid conflicts.
> >
> > Anyway, I'm ok with changing it.  Will remove in v2.
>
> I seem to have confused both you and Arnaldo with this. This email
> questions the Changelog as a whole, not just the aux thing (I send a
> separate email for that).
>
> This Changelog utterly fails to explain to me _why_ we need/want cgroup
> tracking. So why do I want to review and possibly merge this? Changelog
> needs to answer this.

OK.  How about this?

Systems running a large number of jobs in different cgroups want to
profile such jobs precisely.  This includes container hosting systems
widely used today.  Currently perf supports namespace tracking but
the systems may not use (cgroup) namespace for their jobs.  Also
it'd be more intuitive to see cgroup names (as they're given by user
or sysadmin) rather than numeric cgroup/namespace id even if they
use the namespaces.

Thanks,
Namhyung

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 3/9] perf tools: Basic support for CGROUP event
  2019-08-30 12:55   ` Arnaldo Carvalho de Melo
@ 2019-08-30 22:51     ` Namhyung Kim
  0 siblings, 0 replies; 43+ messages in thread
From: Namhyung Kim @ 2019-08-30 22:51 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Ingo Molnar, Peter Zijlstra, LKML, Jiri Olsa, Alexander Shishkin,
	Stephane Eranian, Adrian Hunter

Hi Arnaldo,

On Fri, Aug 30, 2019 at 9:55 PM Arnaldo Carvalho de Melo
<acme@kernel.org> wrote:
>
> Em Wed, Aug 28, 2019 at 04:31:24PM +0900, Namhyung Kim escreveu:
> > Implement basic functionality to support cgroup tracking.  Each cgroup
> > can be identified by inode number which can be read from userspace
> > too.  The actual cgroup processing will come in the later patch.
> >
> > Cc: Adrian Hunter <adrian.hunter@intel.com>
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > ---
> >  tools/include/uapi/linux/perf_event.h | 17 +++++++++++++++--
>
> Try to have the synchronization of tools/include/ with the kernel to be
> done in a separate patch, please, there is a number of projects such as
> libbpf and in time libperf that will have a mirror repo outside the
> kernel tree and that will get just the needed csets.

OK, will do that in v2.

Thanks,
Namhyung

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event
  2019-08-30 22:49         ` Namhyung Kim
@ 2019-08-30 23:52           ` Stephane Eranian
  0 siblings, 0 replies; 43+ messages in thread
From: Stephane Eranian @ 2019-08-30 23:52 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo, LKML,
	Jiri Olsa, Alexander Shishkin, Tejun Heo, Li Zefan,
	Johannes Weiner, Adrian Hunter

On Fri, Aug 30, 2019 at 3:49 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> On Fri, Aug 30, 2019 at 4:35 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Fri, Aug 30, 2019 at 12:46:51PM +0900, Namhyung Kim wrote:
> > > Hi Peter,
> > >
> > > On Wed, Aug 28, 2019 at 6:45 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > > >
> > > > On Wed, Aug 28, 2019 at 04:31:22PM +0900, Namhyung Kim wrote:
> > > > > To support cgroup tracking, add CGROUP event to save a link between
> > > > > cgroup path and inode number.  The attr.cgroup bit was also added to
> > > > > enable cgroup tracking from userspace.
> > > > >
> > > > > This event will be generated when a new cgroup becomes active.
> > > > > Userspace might need to synthesize those events for existing cgroups.
> > > > >
> > > > > As aux_output change is also going on, I just added the bit here as
> > > > > well to remove possible conflicts later.
> > > >
> > > > Why do we want this?
> > >
> > > I saw below [1] and thought you have the patch introduced aux_output
> > > and it's gonna to be merged soon.
> > > Also the tooling patches are already in the acme/perf/core
> > > so I just wanted to avoid conflicts.
> > >
> > > Anyway, I'm ok with changing it.  Will remove in v2.
> >
> > I seem to have confused both you and Arnaldo with this. This email
> > questions the Changelog as a whole, not just the aux thing (I send a
> > separate email for that).
> >
> > This Changelog utterly fails to explain to me _why_ we need/want cgroup
> > tracking. So why do I want to review and possibly merge this? Changelog
> > needs to answer this.
>
> OK.  How about this?
>
> Systems running a large number of jobs in different cgroups want to
> profiling such jobs precisely.  This includes container hosting systems
> widely used today.  Currently perf supports namespace tracking but
> the systems may not use (cgroup) namespace for their jobs.  Also
> it'd be more intuitive to see cgroup names (as they're given by user
> or sysadmin) rather than numeric cgroup/namespace id even if they
> use the namespaces.
>

In data centers you care about attributing samples to a job, not so
much to a process.  A job may have multiple processes which may come
and go.  The cgroup, on the other hand, stays around for the entire
lifetime of the job.  It is much easier to map a cgroup name to a
particular job than it is to map a pid back to a job name, especially
for offline post-processing.

Hope this clarifies why we would like this feature upstream.


>
> Thanks,
> Namhyung

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-08-28 14:49   ` Tejun Heo
@ 2019-08-31  3:03     ` Namhyung Kim
  2019-08-31  4:58       ` Tejun Heo
  0 siblings, 1 reply; 43+ messages in thread
From: Namhyung Kim @ 2019-08-31  3:03 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo, LKML,
	Jiri Olsa, Alexander Shishkin, Stephane Eranian, Li Zefan,
	Johannes Weiner

Hi Tejun,

On Wed, Aug 28, 2019 at 07:49:11AM -0700, Tejun Heo wrote:
> On Wed, Aug 28, 2019 at 04:31:23PM +0900, Namhyung Kim wrote:
> > @@ -958,6 +958,7 @@ struct perf_sample_data {
> >  	u64				stack_user_size;
> >  
> >  	u64				phys_addr;
> > +	u64				cgroup;
> 
> Ditto, please use fhandle as the identifier.

Hmm.. it looks hard to use fhandle as the identifier since perf
sampling is done in NMI context.  AFAICS the encode_fh part seems ok
but getting dentry/inode from a kernfs_node seems not.

I assume kernfs_node_id's ino and gen are the same as its inode's.  Then
we might use kernfs_node for encoding, but I'm not sure you'd like it ;-)

Thanks
Namhyung

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-08-31  3:03     ` Namhyung Kim
@ 2019-08-31  4:58       ` Tejun Heo
  2019-09-03  2:13         ` Namhyung Kim
  2019-09-14 14:02         ` Song Liu
  0 siblings, 2 replies; 43+ messages in thread
From: Tejun Heo @ 2019-08-31  4:58 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo, LKML,
	Jiri Olsa, Alexander Shishkin, Stephane Eranian, Li Zefan,
	Johannes Weiner

Hello,

On Sat, Aug 31, 2019 at 12:03:26PM +0900, Namhyung Kim wrote:
> Hmm.. it looks hard to use fhandle as the identifier since perf
> sampling is done in NMI context.  AFAICS the encode_fh part seems ok
> but getting dentry/inode from a kernfs_node seems not.
> 
> I assume kernfs_node_id's ino and gen are same to its inode's.  Then
> we might use kernfs_node for encoding but not sure you like it ;-)

Oh yeah, the whole cgroup id situation is kinda shitty and it's likely
that it needs to be cleaned up a bit for this to be used widely.  The
issues are...

* As identifiers, paths suck.  They're too big and unwieldy and can be
  rapidly reused for different instances.

* ino is compact but can't be easily mapped to path from userland and
  is also not unique.

* The fhandle identifier - currently ino+gen - is better in that it's
  finite sized and compact and can be efficiently mapped to path from
  userspace.  It's also mostly unique.  However, the way gen is
  currently generated still has some chance of the same ID getting
  reused and it isn't easily accessible from inside the kernel right
  now.

Eventually, where we wanna be at is having a single 64bit identifier
which can be easily used everywhere.  It should be pretty straight
forward on 64bit machines - we can just use monotonically increasing
id and use it for everything - ino, fhandle and internal cgroup id.
On 32bit, it gets a bit complicated because ino is 32bit, so it'll
need a custom allocator which bumps gen when the lower 32bit wraps and
skips in-use inos.  Once we have that, we can use that for cgrp->id
and fhandle and derive ino from it.

This is on the to-do list but obviously hasn't happened yet.  If you
wanna take on it, great, but, otherwise, what can be done now is
either moving gen+ino generation into cgroup and tell kernfs to use it
or copy gen+ino into cgroup for easier access.  The former likely is
the better approach given that it brings us closer to where we wanna
be eventually.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-08-31  4:58       ` Tejun Heo
@ 2019-09-03  2:13         ` Namhyung Kim
  2019-09-05 16:56           ` Tejun Heo
  2019-09-14 14:02         ` Song Liu
  1 sibling, 1 reply; 43+ messages in thread
From: Namhyung Kim @ 2019-09-03  2:13 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo, LKML,
	Jiri Olsa, Alexander Shishkin, Stephane Eranian, Li Zefan,
	Johannes Weiner

Hi Tejun,

Sorry for the late reply.


On Fri, Aug 30, 2019 at 09:58:15PM -0700, Tejun Heo wrote:
> Hello,
> 
> On Sat, Aug 31, 2019 at 12:03:26PM +0900, Namhyung Kim wrote:
> > Hmm.. it looks hard to use fhandle as the identifier since perf
> > sampling is done in NMI context.  AFAICS the encode_fh part seems ok
> > but getting dentry/inode from a kernfs_node seems not.
> > 
> > I assume kernfs_node_id's ino and gen are same to its inode's.  Then
> > we might use kernfs_node for encoding but not sure you like it ;-)
> 
> Oh yeah, the whole cgroup id situation is kinda shitty and it's likely
> that it needs to be cleaned up a bit for this to be used widely.  The
> issues are...
> 
> * As identifiers, paths sucks.  It's too big and unwieldy and can be
>   rapidly reused for different instances.
> 
> * ino is compact but can't be easily mapped to path from userland and
>   also not unique.
> 
> * The fhandle identifier - currently ino+gen - is better in that it's
>   finite sized and compact and can be efficiently mapped to path from
>   userspace.  It's also mostly unique.  However, the way gen is
>   currently generated still has some chance of the same ID getting
>   reused and it isn't easily accessible from inside the kernel right
>   now.
> 
> Eventually, where we wanna be at is having a single 64bit identifier
> which can be easily used everywhere.  It should be pretty straight
> forward on 64bit machines - we can just use monotonically increasing
> id and use it for everything - ino, fhandle and internal cgroup id.
> On 32bit, it gets a bit complicated because ino is 32bit, so it'll
> need a custom allocator which bumps gen when the lower 32bit wraps and
> skips in-use inos.  Once we have that, we can use that for cgrp->id
> and fhandle and derive ino from it.
> 
> This is on the to-do list but obviously hasn't happened yet.  If you
> wanna take on it, great, but, otherwise, what can be done now is
> either moving gen+ino generation into cgroup and tell kernfs to use it
> or copy gen+ino into cgroup for easier access.  The former likely is
> the better approach given that it brings us closer to where we wanna
> be eventually.

So is my understanding below correct?

 * currently kernfs ino+gen is different than inode's ino+gen
 * but it'd be better to make them same
 * so move (generic?) inode's ino+gen logic to cgroup
 * and kernfs node use the same logic (and number)
 * so perf sampling code (NMI) just access kernfs node
 * and userspace can use file handle for comparison

Thanks,
Namhyung

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-09-03  2:13         ` Namhyung Kim
@ 2019-09-05 16:56           ` Tejun Heo
  2019-09-08 13:28             ` Namhyung Kim
  0 siblings, 1 reply; 43+ messages in thread
From: Tejun Heo @ 2019-09-05 16:56 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo, LKML,
	Jiri Olsa, Alexander Shishkin, Stephane Eranian, Li Zefan,
	Johannes Weiner

Hello, Namhyung.

On Tue, Sep 03, 2019 at 10:13:08AM +0800, Namhyung Kim wrote:
> So is my understanding below correct?
> 
>  * currently kernfs ino+gen is different than inode's ino+gen

They're the same.  It's just that cgroup has other less useful IDs
too.

>  * but it'd be better to make them same
>  * so move (generic?) inode's ino+gen logic to cgroup
>  * and kernfs node use the same logic (and number)
>  * so perf sampling code (NMI) just access kernfs node
>  * and userspace can use file handle for comparison

The rest, yes, pretty much.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-09-05 16:56           ` Tejun Heo
@ 2019-09-08 13:28             ` Namhyung Kim
  0 siblings, 0 replies; 43+ messages in thread
From: Namhyung Kim @ 2019-09-08 13:28 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo, LKML,
	Jiri Olsa, Alexander Shishkin, Stephane Eranian, Li Zefan,
	Johannes Weiner

Hi Tejun,

On Thu, Sep 05, 2019 at 09:56:55AM -0700, Tejun Heo wrote:
> Hello, Namhyung.
> 
> On Tue, Sep 03, 2019 at 10:13:08AM +0800, Namhyung Kim wrote:
> > So is my understanding below correct?
> > 
> >  * currently kernfs ino+gen is different than inode's ino+gen
> 
> They're the same.  It's just that cgroup has other less useful IDs
> too.

Ah, ok.

> 
> >  * but it'd be better to make them same
> >  * so move (generic?) inode's ino+gen logic to cgroup
> >  * and kernfs node use the same logic (and number)
> >  * so perf sampling code (NMI) just access kernfs node
> >  * and userspace can use file handle for comparison
> 
> The rest, yes, pretty much.

Thanks for the clarification.  I'll take a look at it.

Thanks
Namhyung

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-08-31  4:58       ` Tejun Heo
  2019-09-03  2:13         ` Namhyung Kim
@ 2019-09-14 14:02         ` Song Liu
  2019-09-16 15:23           ` Tejun Heo
  1 sibling, 1 reply; 43+ messages in thread
From: Song Liu @ 2019-09-14 14:02 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Namhyung Kim, Ingo Molnar, Peter Zijlstra,
	Arnaldo Carvalho de Melo, LKML, Jiri Olsa, Alexander Shishkin,
	Stephane Eranian, Li Zefan, Johannes Weiner

Hi Tejun,

On Sat, Aug 31, 2019 at 6:01 AM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Sat, Aug 31, 2019 at 12:03:26PM +0900, Namhyung Kim wrote:
> > Hmm.. it looks hard to use fhandle as the identifier since perf
> > sampling is done in NMI context.  AFAICS the encode_fh part seems ok
> > but getting dentry/inode from a kernfs_node seems not.
> >
> > I assume kernfs_node_id's ino and gen are same to its inode's.  Then
> > we might use kernfs_node for encoding but not sure you like it ;-)
>
> Oh yeah, the whole cgroup id situation is kinda shitty and it's likely
> that it needs to be cleaned up a bit for this to be used widely.  The
> issues are...

Here are my 2 cents about this.

I think we don't need a perfect identifier in this case. IIUC, the goal of
this patchset is to map each sample with a cgroup name (or full path).
To achieve this, we need

1. PERF_RECORD_CGROUP, that maps
           "64-bit number" => cgroup name/path
2. PERF_SAMPLE_CGROUP, that adds "64-bit number" to each sample.

I call the id a "64-bit number" because it is not required to be a globally
unique id. As long as it is consistent within the same perf-record session,
we won't get any confusion. Since we add PERF_RECORD_CGROUP
for each cgroup creation, we will map most of samples correctly even
when the  "64-bit number" is recycled within the same perf-record session.

At the moment, I think ino is good enough for the "64-bit number" even
for 32-bit systems. If we don't call it "ino" (just call it "cgroup_tag" or
"cgroup_id"), we can change it when kernfs provides a better 64-bit id.

About full path name: The user names the full path here. If the user gives
two different workloads the same name/path, we really cannot change that.
Reasonable users would be able to make sense from the full path.

Thanks,
Song

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-09-14 14:02         ` Song Liu
@ 2019-09-16 15:23           ` Tejun Heo
  2019-09-19  6:42             ` Song Liu
  0 siblings, 1 reply; 43+ messages in thread
From: Tejun Heo @ 2019-09-16 15:23 UTC (permalink / raw)
  To: Song Liu
  Cc: Namhyung Kim, Ingo Molnar, Peter Zijlstra,
	Arnaldo Carvalho de Melo, LKML, Jiri Olsa, Alexander Shishkin,
	Stephane Eranian, Li Zefan, Johannes Weiner

Hello, Song.

On Sat, Sep 14, 2019 at 03:02:51PM +0100, Song Liu wrote:
> I think we don't need a perfect identifier in this case. IIUC, the goal of

I really don't want different versions of imperfect identifiers
proliferating.

> this patchset is to map each sample with a cgroup name (or full path).
> To achieve this, we need
> 
> 1. PERF_RECORD_CGROUP, that maps
>            "64-bit number" => cgroup name/path
> 2. PERF_SAMPLE_CGROUP, that adds "64-bit number" to each sample.
> 
> I call the id a "64-bit number" because it is not required to be a globally
> unique id. As long as it is consistent within the same perf-record session,
> we won't get any confusion. Since we add PERF_RECORD_CGROUP
> for each cgroup creation, we will map most of samples correctly even
> when the  "64-bit number" is recycled within the same perf-record session.
> 
> At the moment, I think ino is good enough for the "64-bit number" even
> for 32-bit systems. If we don't call it "ino" (just call it "cgroup_tag" or
> "cgroup_id", we can change it when kernfs provides a better 64-bit id.

So, a firm nack on this direction.

> About full path name: The user names the full path here. If the user gives
> two different workloads the same name/path, we really cannot change that.
> Reasonable users would be able to make sense from the full path.

I don't see why we wanna be causing this avoidable problem to users.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-09-16 15:23           ` Tejun Heo
@ 2019-09-19  6:42             ` Song Liu
  2019-09-20  8:47               ` Namhyung Kim
  0 siblings, 1 reply; 43+ messages in thread
From: Song Liu @ 2019-09-19  6:42 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Namhyung Kim, Ingo Molnar, Peter Zijlstra,
	Arnaldo Carvalho de Melo, LKML, Jiri Olsa, Alexander Shishkin,
	Stephane Eranian, Li Zefan, Johannes Weiner

On Mon, Sep 16, 2019 at 4:23 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello, Song.
>
> On Sat, Sep 14, 2019 at 03:02:51PM +0100, Song Liu wrote:
> > I think we don't need a perfect identifier in this case. IIUC, the goal of
>
> I really don't want different versions of imperfect identifiers
> proliferating.
>
> > this patchset is to map each sample with a cgroup name (or full path).
> > To achieve this, we need
> >
> > 1. PERF_RECORD_CGROUP, that maps
> >            "64-bit number" => cgroup name/path
> > 2. PERF_SAMPLE_CGROUP, that adds "64-bit number" to each sample.
> >
> > I call the id a "64-bit number" because it is not required to be a globally
> > unique id. As long as it is consistent within the same perf-record session,
> > we won't get any confusion. Since we add PERF_RECORD_CGROUP
> > for each cgroup creation, we will map most of samples correctly even
> > when the  "64-bit number" is recycled within the same perf-record session.
> >
> > At the moment, I think ino is good enough for the "64-bit number" even
> > for 32-bit systems. If we don't call it "ino" (just call it "cgroup_tag" or
> > "cgroup_id", we can change it when kernfs provides a better 64-bit id.
>
> So, a firm nack on this direction.
>
> > About full path name: The user names the full path here. If the user gives
> > two different workloads the same name/path, we really cannot change that.
> > Reasonable users would be able to make sense from the full path.
>
> I don't see why we wanna be causing this avoidable problem to users.

Sharing some offline discussions with Tejun.

ino in current kernfs is not a good unique ID for cgroup, because it doesn't
increase monotonically. So we need to improve kernfs.

For 64-bit, we can make the ino monotonic, and use it as the ID.
For 32-bit, we need to make the ino monotonic, and use <ino> and <gen>
as the 64-bit ID.

Thanks,
Song

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-09-19  6:42             ` Song Liu
@ 2019-09-20  8:47               ` Namhyung Kim
  2019-09-20 16:13                 ` Song Liu
  2019-09-20 21:04                 ` Tejun Heo
  0 siblings, 2 replies; 43+ messages in thread
From: Namhyung Kim @ 2019-09-20  8:47 UTC (permalink / raw)
  To: Song Liu
  Cc: Tejun Heo, Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo,
	LKML, Jiri Olsa, Alexander Shishkin, Stephane Eranian, Li Zefan,
	Johannes Weiner

Hello Song,

On Thu, Sep 19, 2019 at 3:43 PM Song Liu <liu.song.a23@gmail.com> wrote:

> Sharing some offline discussions with Tejun.
>
> ino in current kernfs is not a good unique ID for cgroup, because it doesn't
> increase monotonically. So we need to improve kernfs.
>
> For 64-bit, we can make the ino monotonic, and use it as the ID.
> For 32-bit, we need to make the ino monotonic. and use <ino> and <gen>
> as the 64-bit ID.

Thanks for sharing the information!  For 32-bit, while the ino itself is not
monotonic, (gen << 32) + ino is monotonic, right?  I think we can use the
same logic as the kernfs id allocation, but I'm not sure what the problem
Tejun mentioned before is.

Thanks,
Namhyung

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-09-20  8:47               ` Namhyung Kim
@ 2019-09-20 16:13                 ` Song Liu
  2019-09-20 21:04                 ` Tejun Heo
  1 sibling, 0 replies; 43+ messages in thread
From: Song Liu @ 2019-09-20 16:13 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Tejun Heo, Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo,
	LKML, Jiri Olsa, Alexander Shishkin, Stephane Eranian, Li Zefan,
	Johannes Weiner

Hi Namhyung,

On Fri, Sep 20, 2019 at 1:47 AM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Hello Song,
>
> On Thu, Sep 19, 2019 at 3:43 PM Song Liu <liu.song.a23@gmail.com> wrote:
>
> > Sharing some offline discussions with Tejun.
> >
> > ino in current kernfs is not a good unique ID for cgroup, because it doesn't
> > increase monotonically. So we need to improve kernfs.
> >
> > For 64-bit, we can make the ino monotonic, and use it as the ID.
> > For 32-bit, we need to make the ino monotonic. and use <ino> and <gen>
> > as the 64-bit ID.
>
> Thanks for the sharing information!  For 32-bit, while the ino itself is not
> monotonic, gen << 32 + ino is monotonic right?  I think we can use the
> same logic of kernfs id allocation, but not sure what the problem Tejun
> mentioned before is.

How would we manage gen here? One way that works is:
1. make ino monotonic,
2. increase gen when 32-bit ino overflows

I think current kernfs id is not monotonic, so it can be reused before overflow.

Thanks,
Song

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-09-20  8:47               ` Namhyung Kim
  2019-09-20 16:13                 ` Song Liu
@ 2019-09-20 21:04                 ` Tejun Heo
  2019-10-02  6:28                   ` Namhyung Kim
  1 sibling, 1 reply; 43+ messages in thread
From: Tejun Heo @ 2019-09-20 21:04 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Song Liu, Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo,
	LKML, Jiri Olsa, Alexander Shishkin, Stephane Eranian, Li Zefan,
	Johannes Weiner

On Fri, Sep 20, 2019 at 05:47:45PM +0900, Namhyung Kim wrote:
> Thanks for the sharing information!  For 32-bit, while the ino itself is not
> monotonic, gen << 32 + ino is monotonic right?  I think we can use the

It's not.  gen gets incremented on every allocation, so it has a
not-high but still realistic chance of collisions.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-09-20 21:04                 ` Tejun Heo
@ 2019-10-02  6:28                   ` Namhyung Kim
  2019-10-07 14:16                     ` Tejun Heo
  0 siblings, 1 reply; 43+ messages in thread
From: Namhyung Kim @ 2019-10-02  6:28 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Song Liu, Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo,
	LKML, Jiri Olsa, Alexander Shishkin, Stephane Eranian, Li Zefan,
	Johannes Weiner

Hi Tejun,

Sorry for the late reply.

On Sat, Sep 21, 2019 at 6:04 AM Tejun Heo <tj@kernel.org> wrote:
>
> On Fri, Sep 20, 2019 at 05:47:45PM +0900, Namhyung Kim wrote:
> > Thanks for the sharing information!  For 32-bit, while the ino itself is not
> > monotonic, gen << 32 + ino is monotonic right?  I think we can use the
>
> It's not.  gen gets incremented on every allocation, so it has not
> high but still realistic chance of collisions.

In __kernfs_new_node(), gen gets increased only if idr_alloc_cyclic()
returns a value lower than the cursor...  I'm not sure that's what you meant.

Thanks
Namhyung

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-10-02  6:28                   ` Namhyung Kim
@ 2019-10-07 14:16                     ` Tejun Heo
  0 siblings, 0 replies; 43+ messages in thread
From: Tejun Heo @ 2019-10-07 14:16 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Song Liu, Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo,
	LKML, Jiri Olsa, Alexander Shishkin, Stephane Eranian, Li Zefan,
	Johannes Weiner

Hello,

On Wed, Oct 02, 2019 at 03:28:00PM +0900, Namhyung Kim wrote:
> On Sat, Sep 21, 2019 at 6:04 AM Tejun Heo <tj@kernel.org> wrote:
> >
> > On Fri, Sep 20, 2019 at 05:47:45PM +0900, Namhyung Kim wrote:
> > > Thanks for the sharing information!  For 32-bit, while the ino itself is not
> > > monotonic, gen << 32 + ino is monotonic right?  I think we can use the
> >
> > It's not.  gen gets incremented on every allocation, so it has not
> > high but still realistic chance of collisions.
> 
> In __kernfs_new_node(), gen gets increased only if idr_alloc_cyclic()
> returns lower than the cursor...  I'm not sure you talked about it.

Ah, I forgot that it's using cyclic idr, so yeah, it's not as bad in
terms of recycling although cyclic allocation on idr is pretty
inefficient.  I still think it'd be better to switch to rbtree so
that 64bit can simply use monotonically increasing numbers but that
definitely isn't a must and we can just continue with the current
allocation method.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2020-03-25 12:45 [PATCHSET 0/9] perf: Improve cgroup profiling (v6) Namhyung Kim
@ 2020-03-25 12:45 ` Namhyung Kim
  0 siblings, 0 replies; 43+ messages in thread
From: Namhyung Kim @ 2020-03-25 12:45 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Jiri Olsa, Peter Zijlstra, Ingo Molnar, LKML, Mark Rutland,
	Alexander Shishkin, Li Zefan, Johannes Weiner, Tejun Heo,
	Peter Zijlstra

The PERF_SAMPLE_CGROUP bit is to save (perf_event) cgroup information
in the sample.  It will add a 64-bit id to identify current cgroup and
it's the file handle in the cgroup file system.  Userspace should use
this information with the PERF_RECORD_CGROUP event to determine which
cgroup the sample belongs to.

I put it before PERF_SAMPLE_AUX for simplicity since it just needs a
64-bit word.  But if we want bigger samples, I can work on that
direction too.

Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 include/linux/perf_event.h      |  1 +
 include/uapi/linux/perf_event.h |  3 ++-
 init/Kconfig                    |  3 ++-
 kernel/events/core.c            | 22 ++++++++++++++++++++++
 4 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8768a39b5258..9c3e7619c929 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1020,6 +1020,7 @@ struct perf_sample_data {
 	u64				stack_user_size;
 
 	u64				phys_addr;
+	u64				cgroup;
 } ____cacheline_aligned;
 
 /* default value for data source */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index de95f6c7b273..7b2d6fc9e6ed 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -142,8 +142,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
 	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
 	PERF_SAMPLE_AUX				= 1U << 20,
+	PERF_SAMPLE_CGROUP			= 1U << 21,
 
-	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 22,		/* non-ABI */
 
 	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
 };
diff --git a/init/Kconfig b/init/Kconfig
index 20a6ac33761c..7766b06a0038 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1027,7 +1027,8 @@ config CGROUP_PERF
 	help
 	  This option extends the perf per-cpu mode to restrict monitoring
 	  to threads which belong to the cgroup specified and run on the
-	  designated cpu.
+	  designated cpu.  Or this can be used to have cgroup ID in samples
+	  so that it can monitor performance events among cgroups.
 
 	  Say N if unsure.
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 994932d5e474..1569979c8912 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1862,6 +1862,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		size += sizeof(data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_CGROUP)
+		size += sizeof(data->cgroup);
+
 	event->header_size = size;
 }
 
@@ -6867,6 +6870,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		perf_output_put(handle, data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_CGROUP)
+		perf_output_put(handle, data->cgroup);
+
 	if (sample_type & PERF_SAMPLE_AUX) {
 		perf_output_put(handle, data->aux_size);
 
@@ -7066,6 +7072,16 @@ void perf_prepare_sample(struct perf_event_header *header,
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		data->phys_addr = perf_virt_to_phys(data->addr);
 
+#ifdef CONFIG_CGROUP_PERF
+	if (sample_type & PERF_SAMPLE_CGROUP) {
+		struct cgroup *cgrp;
+
+		/* protected by RCU */
+		cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
+		data->cgroup = cgroup_id(cgrp);
+	}
+#endif
+
 	if (sample_type & PERF_SAMPLE_AUX) {
 		u64 size;
 
@@ -11264,6 +11280,12 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 
 	if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
 		ret = perf_reg_validate(attr->sample_regs_intr);
+
+#ifndef CONFIG_CGROUP_PERF
+	if (attr->sample_type & PERF_SAMPLE_CGROUP)
+		return -EINVAL;
+#endif
+
 out:
 	return ret;
 
-- 
2.25.1.696.g5e7596f4ac-goog


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2020-01-07 13:34 [PATCHSET 0/9] perf: Improve cgroup profiling (v4) Namhyung Kim
@ 2020-01-07 13:34 ` Namhyung Kim
  0 siblings, 0 replies; 43+ messages in thread
From: Namhyung Kim @ 2020-01-07 13:34 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo
  Cc: Jiri Olsa, Alexander Shishkin, Mark Rutland, Stephane Eranian,
	LKML, linux-perf-users, Tejun Heo, Li Zefan, Johannes Weiner

The PERF_SAMPLE_CGROUP bit is to save (perf_event) cgroup information
in the sample.  It will add a 64-bit id to identify current cgroup and
it's the file handle in the cgroup file system.  Userspace should use
this information with the PERF_RECORD_CGROUP event to determine which
cgroup the sample belongs to.

I put it before PERF_SAMPLE_AUX for simplicity since it just needs a
64-bit word.  But if we want bigger samples, I can work on that
direction too.

Cc: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 include/linux/perf_event.h      |  1 +
 include/uapi/linux/perf_event.h |  3 ++-
 init/Kconfig                    |  3 ++-
 kernel/events/core.c            | 22 ++++++++++++++++++++++
 4 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6d4c22aee384..17b5bff045a6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1001,6 +1001,7 @@ struct perf_sample_data {
 	u64				stack_user_size;
 
 	u64				phys_addr;
+	u64				cgroup;
 } ____cacheline_aligned;
 
 /* default value for data source */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index de2ab87ca92c..3a81e9806cb1 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -142,8 +142,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
 	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
 	PERF_SAMPLE_AUX				= 1U << 20,
+	PERF_SAMPLE_CGROUP			= 1U << 21,
 
-	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 22,		/* non-ABI */
 
 	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
 };
diff --git a/init/Kconfig b/init/Kconfig
index a34064a031a5..403ed2448b22 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1026,7 +1026,8 @@ config CGROUP_PERF
 	help
 	  This option extends the perf per-cpu mode to restrict monitoring
 	  to threads which belong to the cgroup specified and run on the
-	  designated cpu.
+	  designated cpu.  Or this can be used to have cgroup ID in samples
+	  so that it can monitor performance events among cgroups.
 
 	  Say N if unsure.
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dbc46b884505..773480527194 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1754,6 +1754,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		size += sizeof(data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_CGROUP)
+		size += sizeof(data->cgroup);
+
 	event->header_size = size;
 }
 
@@ -6699,6 +6702,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		perf_output_put(handle, data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_CGROUP)
+		perf_output_put(handle, data->cgroup);
+
 	if (sample_type & PERF_SAMPLE_AUX) {
 		perf_output_put(handle, data->aux_size);
 
@@ -6895,6 +6901,16 @@ void perf_prepare_sample(struct perf_event_header *header,
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		data->phys_addr = perf_virt_to_phys(data->addr);
 
+#ifdef CONFIG_CGROUP_PERF
+	if (sample_type & PERF_SAMPLE_CGROUP) {
+		struct cgroup *cgrp;
+
+		/* protected by RCU */
+		cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
+		data->cgroup = cgroup_id(cgrp);
+	}
+#endif
+
 	if (sample_type & PERF_SAMPLE_AUX) {
 		u64 size;
 
@@ -11090,6 +11106,12 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 
 	if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
 		ret = perf_reg_validate(attr->sample_regs_intr);
+
+#ifndef CONFIG_CGROUP_PERF
+	if (attr->sample_type & PERF_SAMPLE_CGROUP)
+		return -EINVAL;
+#endif
+
 out:
 	return ret;
 
-- 
2.24.1.735.g03f4e72817-goog


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-12-23  6:07 [PATCHSET 0/9] perf: Improve cgroup profiling (v3) Namhyung Kim
@ 2019-12-23  6:07 ` Namhyung Kim
  0 siblings, 0 replies; 43+ messages in thread
From: Namhyung Kim @ 2019-12-23  6:07 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo
  Cc: Jiri Olsa, Alexander Shishkin, Mark Rutland, Stephane Eranian,
	LKML, linux-perf-users, Tejun Heo, Li Zefan, Johannes Weiner

The PERF_SAMPLE_CGROUP bit is to save (perf_event) cgroup information
in the sample.  It will add a 64-bit id to identify current cgroup and
it's the file handle in the cgroup file system.  Userspace should use
this information with the PERF_RECORD_CGROUP event to determine which
cgroup the sample belongs to.

I put it before PERF_SAMPLE_AUX for simplicity since it just needs a
64-bit word.  But if we want bigger samples, I can work on that
direction too.

Cc: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 include/linux/perf_event.h      |  1 +
 include/uapi/linux/perf_event.h |  3 ++-
 init/Kconfig                    |  3 ++-
 kernel/events/core.c            | 22 ++++++++++++++++++++++
 4 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6d4c22aee384..17b5bff045a6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1001,6 +1001,7 @@ struct perf_sample_data {
 	u64				stack_user_size;
 
 	u64				phys_addr;
+	u64				cgroup;
 } ____cacheline_aligned;
 
 /* default value for data source */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index de2ab87ca92c..3a81e9806cb1 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -142,8 +142,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
 	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
 	PERF_SAMPLE_AUX				= 1U << 20,
+	PERF_SAMPLE_CGROUP			= 1U << 21,
 
-	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 22,		/* non-ABI */
 
 	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
 };
diff --git a/init/Kconfig b/init/Kconfig
index 128b68a16951..fedd7b503bf3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1046,7 +1046,8 @@ config CGROUP_PERF
 	help
 	  This option extends the perf per-cpu mode to restrict monitoring
 	  to threads which belong to the cgroup specified and run on the
-	  designated cpu.
+	  designated cpu.  Or this can be used to have cgroup ID in samples
+	  so that it can monitor performance events among cgroups.
 
 	  Say N if unsure.
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b0aa1b921769..db04ef695a33 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1754,6 +1754,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		size += sizeof(data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_CGROUP)
+		size += sizeof(data->cgroup);
+
 	event->header_size = size;
 }
 
@@ -6699,6 +6702,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		perf_output_put(handle, data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_CGROUP)
+		perf_output_put(handle, data->cgroup);
+
 	if (sample_type & PERF_SAMPLE_AUX) {
 		perf_output_put(handle, data->aux_size);
 
@@ -6895,6 +6901,16 @@ void perf_prepare_sample(struct perf_event_header *header,
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		data->phys_addr = perf_virt_to_phys(data->addr);
 
+#ifdef CONFIG_CGROUP_PERF
+	if (sample_type & PERF_SAMPLE_CGROUP) {
+		struct cgroup *cgrp;
+
+		/* protected by RCU */
+		cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
+		data->cgroup = cgroup_id(cgrp);
+	}
+#endif
+
 	if (sample_type & PERF_SAMPLE_AUX) {
 		u64 size;
 
@@ -11090,6 +11106,12 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 
 	if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
 		ret = perf_reg_validate(attr->sample_regs_intr);
+
+#ifndef CONFIG_CGROUP_PERF
+	if (attr->sample_type & PERF_SAMPLE_CGROUP)
+		return -EINVAL;
+#endif
+
 out:
 	return ret;
 
-- 
2.24.1.735.g03f4e72817-goog


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-12-20 16:48     ` Peter Zijlstra
@ 2019-12-20 16:59       ` Tejun Heo
  0 siblings, 0 replies; 43+ messages in thread
From: Tejun Heo @ 2019-12-20 16:59 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Namhyung Kim, Ingo Molnar, Arnaldo Carvalho de Melo, Jiri Olsa,
	Alexander Shishkin, Mark Rutland, Stephane Eranian, LKML,
	linux-perf-users, Li Zefan, Johannes Weiner

On Fri, Dec 20, 2019 at 05:48:02PM +0100, Peter Zijlstra wrote:
> That assumes the cgroup is still in existence, which might not be the
> case I presume.

Yeah, if the cgroup is already gone by the time the first event is
looked up, this wouldn't work.

-- 
tejun

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-12-20 15:23   ` Tejun Heo
@ 2019-12-20 16:48     ` Peter Zijlstra
  2019-12-20 16:59       ` Tejun Heo
  0 siblings, 1 reply; 43+ messages in thread
From: Peter Zijlstra @ 2019-12-20 16:48 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Namhyung Kim, Ingo Molnar, Arnaldo Carvalho de Melo, Jiri Olsa,
	Alexander Shishkin, Mark Rutland, Stephane Eranian, LKML,
	linux-perf-users, Li Zefan, Johannes Weiner

On Fri, Dec 20, 2019 at 07:23:24AM -0800, Tejun Heo wrote:
> On Fri, Dec 20, 2019 at 01:32:46PM +0900, Namhyung Kim wrote:
> > The PERF_SAMPLE_CGROUP bit is to save (perf_event) cgroup information
> > in the sample.  It will add a 64-bit id to identify current cgroup and
> > it's the file handle in the cgroup file system.  Userspace should use
> > this information with PERF_RECORD_CGROUP event to match which cgroup
> > it belongs.
> 
> You don't need PERF_RECORD_CGROUP for that.  Something like the
> following should work.
> 
> 	struct {
> 		struct file_handle fh;
> 		char stor[MAX_HANDLE_SZ];
> 	} fh_store;
> 	struct file_handle *fh = &fh_store;
> 
> 	fh->handle_type = 0xfe; // FILEID_KERNFS
> 	fh->handle_bytes = sizeof(u64);
> 	*(u64 *)fh->f_handle = cgrp_id;
> 
> 	mnt_fd = open('/sys/fs/cgroup', O_RDONLY);
> 	fd = open_by_handle_at(mnt_fd, fh, O_RDONLY);
> 
> 	snprintf(proc_path, PATH_MAX, "/proc/self/fd/%d", fd);
> 	readlink(proc_path, cgrp_path, PATH_MAX);

That assumes the cgroup is still in existence, which might not be the
case I presume.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-12-20  4:32 ` [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature Namhyung Kim
  2019-12-20  9:36   ` Peter Zijlstra
@ 2019-12-20 15:23   ` Tejun Heo
  2019-12-20 16:48     ` Peter Zijlstra
  1 sibling, 1 reply; 43+ messages in thread
From: Tejun Heo @ 2019-12-20 15:23 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo, Jiri Olsa,
	Alexander Shishkin, Mark Rutland, Stephane Eranian, LKML,
	linux-perf-users, Li Zefan, Johannes Weiner

On Fri, Dec 20, 2019 at 01:32:46PM +0900, Namhyung Kim wrote:
> The PERF_SAMPLE_CGROUP bit is to save (perf_event) cgroup information
> in the sample.  It will add a 64-bit id to identify current cgroup and
> it's the file handle in the cgroup file system.  Userspace should use
> this information with PERF_RECORD_CGROUP event to match which cgroup
> it belongs.

You don't need PERF_RECORD_CGROUP for that.  Something like the
following should work.

	struct {
		struct file_handle fh;
		char stor[MAX_HANDLE_SZ];
	} fh_store;
	struct file_handle *fh = &fh_store.fh;

	fh->handle_type = 0xfe; // FILEID_KERNFS
	fh->handle_bytes = sizeof(u64);
	*(u64 *)fh->f_handle = cgrp_id;

	mnt_fd = open("/sys/fs/cgroup", O_RDONLY);
	fd = open_by_handle_at(mnt_fd, fh, O_RDONLY);

	snprintf(proc_path, PATH_MAX, "/proc/self/fd/%d", fd);
	readlink(proc_path, cgrp_path, PATH_MAX);

-- 
tejun

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-12-20  4:32 ` [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature Namhyung Kim
@ 2019-12-20  9:36   ` Peter Zijlstra
  2019-12-20 15:23   ` Tejun Heo
  1 sibling, 0 replies; 43+ messages in thread
From: Peter Zijlstra @ 2019-12-20  9:36 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Jiri Olsa,
	Alexander Shishkin, Mark Rutland, Stephane Eranian, LKML,
	linux-perf-users, Tejun Heo, Li Zefan, Johannes Weiner

On Fri, Dec 20, 2019 at 01:32:46PM +0900, Namhyung Kim wrote:
> The PERF_SAMPLE_CGROUP bit is to save (perf_event) cgroup information
> in the sample.  It will add a 64-bit id to identify current cgroup and
> it's the file handle in the cgroup file system.  Userspace should use
> this information with PERF_RECORD_CGROUP event to match which cgroup
> it belongs.
> 
> I put it before PERF_SAMPLE_AUX for simplicity since it just needs a
> 64-bit word.  But if we want bigger samples, I can work on that
> direction too.
> 
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Li Zefan <lizefan@huawei.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---

> @@ -6895,6 +6901,18 @@ void perf_prepare_sample(struct perf_event_header *header,
>  	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
>  		data->phys_addr = perf_virt_to_phys(data->addr);
>  
> +	if (sample_type & PERF_SAMPLE_CGROUP) {
> +		u64 cgrp_id = 0;
> +#ifdef CONFIG_CGROUP_PERF
> +		struct cgroup *cgrp;
> +
> +		/* protected by RCU */
> +		cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
> +		cgrp_id = cgroup_id(cgrp);
> +#endif
> +		data->cgroup = cgrp_id;
> +	}

Would it make more sense to refuse SAMPLE_CGROUP if !CGROUP_PERF?

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature
  2019-12-20  4:32 [PATCHSET 0/9] perf: Improve cgroup profiling (v2) Namhyung Kim
@ 2019-12-20  4:32 ` Namhyung Kim
  2019-12-20  9:36   ` Peter Zijlstra
  2019-12-20 15:23   ` Tejun Heo
  0 siblings, 2 replies; 43+ messages in thread
From: Namhyung Kim @ 2019-12-20  4:32 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo
  Cc: Jiri Olsa, Alexander Shishkin, Mark Rutland, Stephane Eranian,
	LKML, linux-perf-users, Tejun Heo, Li Zefan, Johannes Weiner

The PERF_SAMPLE_CGROUP bit saves (perf_event) cgroup information in
the sample.  It adds a 64-bit id that identifies the current cgroup;
the id is the file handle in the cgroup file system.  Userspace should
use this id together with the PERF_RECORD_CGROUP event to determine
which cgroup the sample belongs to.

I put it before PERF_SAMPLE_AUX for simplicity since it just needs a
64-bit word.  But if we want bigger samples, I can work in that
direction too.

Cc: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 include/linux/perf_event.h      |  1 +
 include/uapi/linux/perf_event.h |  3 ++-
 kernel/events/core.c            | 18 ++++++++++++++++++
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6d4c22aee384..17b5bff045a6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1001,6 +1001,7 @@ struct perf_sample_data {
 	u64				stack_user_size;
 
 	u64				phys_addr;
+	u64				cgroup;
 } ____cacheline_aligned;
 
 /* default value for data source */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 7bae2d3380a6..e59c89b0286b 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -142,8 +142,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
 	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
 	PERF_SAMPLE_AUX				= 1U << 20,
+	PERF_SAMPLE_CGROUP			= 1U << 21,
 
-	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 22,		/* non-ABI */
 
 	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
 };
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9bcb2b552acc..d19f04ecfa14 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1754,6 +1754,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		size += sizeof(data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_CGROUP)
+		size += sizeof(data->cgroup);
+
 	event->header_size = size;
 }
 
@@ -6699,6 +6702,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		perf_output_put(handle, data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_CGROUP)
+		perf_output_put(handle, data->cgroup);
+
 	if (sample_type & PERF_SAMPLE_AUX) {
 		perf_output_put(handle, data->aux_size);
 
@@ -6895,6 +6901,18 @@ void perf_prepare_sample(struct perf_event_header *header,
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		data->phys_addr = perf_virt_to_phys(data->addr);
 
+	if (sample_type & PERF_SAMPLE_CGROUP) {
+		u64 cgrp_id = 0;
+#ifdef CONFIG_CGROUP_PERF
+		struct cgroup *cgrp;
+
+		/* protected by RCU */
+		cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
+		cgrp_id = cgroup_id(cgrp);
+#endif
+		data->cgroup = cgrp_id;
+	}
+
 	if (sample_type & PERF_SAMPLE_AUX) {
 		u64 size;
 
-- 
2.24.1.735.g03f4e72817-goog


^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2020-03-25 12:45 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-28  7:31 [PATCHSET 0/9] perf: Improve cgroup profiling (v1) Namhyung Kim
2019-08-28  7:31 ` [PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event Namhyung Kim
2019-08-28  9:44   ` Peter Zijlstra
2019-08-28  9:44   ` Peter Zijlstra
2019-08-28 13:13     ` Arnaldo Carvalho de Melo
2019-08-30  3:46     ` Namhyung Kim
2019-08-30  7:34       ` Peter Zijlstra
2019-08-30 22:49         ` Namhyung Kim
2019-08-30 23:52           ` Stephane Eranian
2019-08-28 14:48   ` Tejun Heo
2019-08-30  3:56     ` Namhyung Kim
2019-08-28  7:31 ` [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature Namhyung Kim
2019-08-28 14:49   ` Tejun Heo
2019-08-31  3:03     ` Namhyung Kim
2019-08-31  4:58       ` Tejun Heo
2019-09-03  2:13         ` Namhyung Kim
2019-09-05 16:56           ` Tejun Heo
2019-09-08 13:28             ` Namhyung Kim
2019-09-14 14:02         ` Song Liu
2019-09-16 15:23           ` Tejun Heo
2019-09-19  6:42             ` Song Liu
2019-09-20  8:47               ` Namhyung Kim
2019-09-20 16:13                 ` Song Liu
2019-09-20 21:04                 ` Tejun Heo
2019-10-02  6:28                   ` Namhyung Kim
2019-10-07 14:16                     ` Tejun Heo
2019-08-28  7:31 ` [PATCH 3/9] perf tools: Basic support for CGROUP event Namhyung Kim
2019-08-30 12:55   ` Arnaldo Carvalho de Melo
2019-08-30 22:51     ` Namhyung Kim
2019-08-28  7:31 ` [PATCH 4/9] perf tools: Maintain cgroup hierarchy Namhyung Kim
2019-08-28  7:31 ` [PATCH 5/9] perf report: Add 'cgroup' sort key Namhyung Kim
2019-08-28  7:31 ` [PATCH 6/9] perf record: Support synthesizing cgroup events Namhyung Kim
2019-08-28  7:31 ` [PATCH 7/9] perf record: Add --all-cgroups option Namhyung Kim
2019-08-28  7:31 ` [PATCH 8/9] perf top: " Namhyung Kim
2019-08-28  7:31 ` [PATCH 9/9] perf script: Add --show-cgroup-events option Namhyung Kim
2019-12-20  4:32 [PATCHSET 0/9] perf: Improve cgroup profiling (v2) Namhyung Kim
2019-12-20  4:32 ` [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature Namhyung Kim
2019-12-20  9:36   ` Peter Zijlstra
2019-12-20 15:23   ` Tejun Heo
2019-12-20 16:48     ` Peter Zijlstra
2019-12-20 16:59       ` Tejun Heo
2019-12-23  6:07 [PATCHSET 0/9] perf: Improve cgroup profiling (v3) Namhyung Kim
2019-12-23  6:07 ` [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature Namhyung Kim
2020-01-07 13:34 [PATCHSET 0/9] perf: Improve cgroup profiling (v4) Namhyung Kim
2020-01-07 13:34 ` [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature Namhyung Kim
2020-03-25 12:45 [PATCHSET 0/9] perf: Improve cgroup profiling (v6) Namhyung Kim
2020-03-25 12:45 ` [PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature Namhyung Kim

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.