bpf.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] perf lock contention: Account contending locks too
@ 2024-02-28  5:33 Namhyung Kim
  2024-02-28  6:32 ` Ian Rogers
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Namhyung Kim @ 2024-02-28  5:33 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo, Ian Rogers
  Cc: Jiri Olsa, Adrian Hunter, Peter Zijlstra, Ingo Molnar, LKML,
	linux-perf-users, Song Liu, bpf

Currently it accounts the contention using delta between timestamps in
lock:contention_begin and lock:contention_end tracepoints.  But it means
the lock should see the both events during the monitoring period.

Actually there are 4 cases that happen with the monitoring:

                monitoring period
            /                       \
            |                       |
 1:  B------+-----------------------+--------E
 2:    B----+-------------E         |
 3:         |           B-----------+----E
 4:         |     B-------------E   |
            |                       |
            t0                      t1

where B and E mean contention BEGIN and END, respectively.  So it only
accounts the case 4 for now.  It seems there's no way to handle the case
1.  The case 2 might be handled if it saved the timestamp (t0), but it
lacks the information from the B notably the flags which shows the lock
types.  Also it could be a nested lock which it currently ignores.  So
I think we should ignore the case 2.

However we can handle the case 3 if we save the timestamp (t1) at the
end of the period.  And then it can iterate the map entries in the
userspace and update the lock stat accordingly.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
v2: add a comment on mark_end_timestamp  (Ian)

 tools/perf/util/bpf_lock_contention.c         | 120 ++++++++++++++++++
 .../perf/util/bpf_skel/lock_contention.bpf.c  |  16 ++-
 tools/perf/util/bpf_skel/lock_data.h          |   7 +
 3 files changed, 136 insertions(+), 7 deletions(-)

diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
index 31ff19afc20c..9af76c6b2543 100644
--- a/tools/perf/util/bpf_lock_contention.c
+++ b/tools/perf/util/bpf_lock_contention.c
@@ -179,6 +179,123 @@ int lock_contention_prepare(struct lock_contention *con)
 	return 0;
 }
 
+/*
+ * Run the BPF program directly using BPF_PROG_TEST_RUN to update the end
+ * timestamp in ktime so that it can calculate delta easily.
+ */
+static void mark_end_timestamp(void)
+{
+	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
+		.flags = BPF_F_TEST_RUN_ON_CPU,
+	);
+	int prog_fd = bpf_program__fd(skel->progs.end_timestamp);
+
+	bpf_prog_test_run_opts(prog_fd, &opts);
+}
+
+static void update_lock_stat(int map_fd, int pid, u64 end_ts,
+			     enum lock_aggr_mode aggr_mode,
+			     struct tstamp_data *ts_data)
+{
+	u64 delta;
+	struct contention_key stat_key = {};
+	struct contention_data stat_data;
+
+	if (ts_data->timestamp >= end_ts)
+		return;
+
+	delta = end_ts - ts_data->timestamp;
+
+	switch (aggr_mode) {
+	case LOCK_AGGR_CALLER:
+		stat_key.stack_id = ts_data->stack_id;
+		break;
+	case LOCK_AGGR_TASK:
+		stat_key.pid = pid;
+		break;
+	case LOCK_AGGR_ADDR:
+		stat_key.lock_addr_or_cgroup = ts_data->lock;
+		break;
+	case LOCK_AGGR_CGROUP:
+		/* TODO */
+		return;
+	default:
+		return;
+	}
+
+	if (bpf_map_lookup_elem(map_fd, &stat_key, &stat_data) < 0)
+		return;
+
+	stat_data.total_time += delta;
+	stat_data.count++;
+
+	if (delta > stat_data.max_time)
+		stat_data.max_time = delta;
+	if (delta < stat_data.min_time)
+		stat_data.min_time = delta;
+
+	bpf_map_update_elem(map_fd, &stat_key, &stat_data, BPF_EXIST);
+}
+
+/*
+ * Account entries in the tstamp map (which didn't see the corresponding
+ * lock:contention_end tracepoint) using end_ts.
+ */
+static void account_end_timestamp(struct lock_contention *con)
+{
+	int ts_fd, stat_fd;
+	int *prev_key, key;
+	u64 end_ts = skel->bss->end_ts;
+	int total_cpus;
+	enum lock_aggr_mode aggr_mode = con->aggr_mode;
+	struct tstamp_data ts_data, *cpu_data;
+
+	/* Iterate per-task tstamp map (key = TID) */
+	ts_fd = bpf_map__fd(skel->maps.tstamp);
+	stat_fd = bpf_map__fd(skel->maps.lock_stat);
+
+	prev_key = NULL;
+	while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) {
+		if (bpf_map_lookup_elem(ts_fd, &key, &ts_data) == 0) {
+			int pid = key;
+
+			if (aggr_mode == LOCK_AGGR_TASK && con->owner)
+				pid = ts_data.flags;
+
+			update_lock_stat(stat_fd, pid, end_ts, aggr_mode,
+					 &ts_data);
+		}
+
+		prev_key = &key;
+	}
+
+	/* Now it'll check per-cpu tstamp map which doesn't have TID. */
+	if (aggr_mode == LOCK_AGGR_TASK || aggr_mode == LOCK_AGGR_CGROUP)
+		return;
+
+	total_cpus = cpu__max_cpu().cpu;
+	ts_fd = bpf_map__fd(skel->maps.tstamp_cpu);
+
+	cpu_data = calloc(total_cpus, sizeof(*cpu_data));
+	if (cpu_data == NULL)
+		return;
+
+	prev_key = NULL;
+	while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) {
+		if (bpf_map_lookup_elem(ts_fd, &key, cpu_data) < 0)
+			goto next;
+
+		for (int i = 0; i < total_cpus; i++) {
+			update_lock_stat(stat_fd, -1, end_ts, aggr_mode,
+					 &cpu_data[i]);
+		}
+
+next:
+		prev_key = &key;
+	}
+	free(cpu_data);
+}
+
 int lock_contention_start(void)
 {
 	skel->bss->enabled = 1;
@@ -188,6 +305,7 @@ int lock_contention_start(void)
 int lock_contention_stop(void)
 {
 	skel->bss->enabled = 0;
+	mark_end_timestamp();
 	return 0;
 }
 
@@ -301,6 +419,8 @@ int lock_contention_read(struct lock_contention *con)
 	if (stack_trace == NULL)
 		return -1;
 
+	account_end_timestamp(con);
+
 	if (con->aggr_mode == LOCK_AGGR_TASK) {
 		struct thread *idle = __machine__findnew_thread(machine,
 								/*pid=*/0,
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
index 95cd8414f6ef..fb54bd38e7d0 100644
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -19,13 +19,6 @@
 #define LCB_F_PERCPU	(1U << 4)
 #define LCB_F_MUTEX	(1U << 5)
 
-struct tstamp_data {
-	__u64 timestamp;
-	__u64 lock;
-	__u32 flags;
-	__s32 stack_id;
-};
-
 /* callstack storage  */
 struct {
 	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
@@ -140,6 +133,8 @@ int perf_subsys_id = -1;
 /* determine the key of lock stat */
 int aggr_mode;
 
+__u64 end_ts;
+
 /* error stat */
 int task_fail;
 int stack_fail;
@@ -559,4 +554,11 @@ int BPF_PROG(collect_lock_syms)
 	return 0;
 }
 
+SEC("raw_tp/bpf_test_finish")
+int BPF_PROG(end_timestamp)
+{
+	end_ts = bpf_ktime_get_ns();
+	return 0;
+}
+
 char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
index 08482daf61be..36af11faad03 100644
--- a/tools/perf/util/bpf_skel/lock_data.h
+++ b/tools/perf/util/bpf_skel/lock_data.h
@@ -3,6 +3,13 @@
 #ifndef UTIL_BPF_SKEL_LOCK_DATA_H
 #define UTIL_BPF_SKEL_LOCK_DATA_H
 
+struct tstamp_data {
+	u64 timestamp;
+	u64 lock;
+	u32 flags;
+	u32 stack_id;
+};
+
 struct contention_key {
 	u32 stack_id;
 	u32 pid;
-- 
2.44.0.rc1.240.g4c46232300-goog


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] perf lock contention: Account contending locks too
  2024-02-28  5:33 [PATCH v2] perf lock contention: Account contending locks too Namhyung Kim
@ 2024-02-28  6:32 ` Ian Rogers
  2024-02-28 12:22 ` Arnaldo Carvalho de Melo
  2024-03-01 19:30 ` Namhyung Kim
  2 siblings, 0 replies; 9+ messages in thread
From: Ian Rogers @ 2024-02-28  6:32 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Arnaldo Carvalho de Melo, Jiri Olsa, Adrian Hunter,
	Peter Zijlstra, Ingo Molnar, LKML, linux-perf-users, Song Liu,
	bpf

On Tue, Feb 27, 2024 at 9:33 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Currently it accounts the contention using delta between timestamps in
> lock:contention_begin and lock:contention_end tracepoints.  But it means
> the lock should see the both events during the monitoring period.
>
> Actually there are 4 cases that happen with the monitoring:
>
>                 monitoring period
>             /                       \
>             |                       |
>  1:  B------+-----------------------+--------E
>  2:    B----+-------------E         |
>  3:         |           B-----------+----E
>  4:         |     B-------------E   |
>             |                       |
>             t0                      t1
>
> where B and E mean contention BEGIN and END, respectively.  So it only
> accounts the case 4 for now.  It seems there's no way to handle the case
> 1.  The case 2 might be handled if it saved the timestamp (t0), but it
> lacks the information from the B notably the flags which shows the lock
> types.  Also it could be a nested lock which it currently ignores.  So
> I think we should ignore the case 2.
>
> However we can handle the case 3 if we save the timestamp (t1) at the
> end of the period.  And then it can iterate the map entries in the
> userspace and update the lock stat accordingly.
>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>

Reviewed-by: Ian Rogers <irogers@google.com>

Thanks,
Ian

> ---
> v2: add a comment on mark_end_timestamp  (Ian)
>
>  tools/perf/util/bpf_lock_contention.c         | 120 ++++++++++++++++++
>  .../perf/util/bpf_skel/lock_contention.bpf.c  |  16 ++-
>  tools/perf/util/bpf_skel/lock_data.h          |   7 +
>  3 files changed, 136 insertions(+), 7 deletions(-)
>
> diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
> index 31ff19afc20c..9af76c6b2543 100644
> --- a/tools/perf/util/bpf_lock_contention.c
> +++ b/tools/perf/util/bpf_lock_contention.c
> @@ -179,6 +179,123 @@ int lock_contention_prepare(struct lock_contention *con)
>         return 0;
>  }
>
> +/*
> + * Run the BPF program directly using BPF_PROG_TEST_RUN to update the end
> + * timestamp in ktime so that it can calculate delta easily.
> + */
> +static void mark_end_timestamp(void)
> +{
> +       DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
> +               .flags = BPF_F_TEST_RUN_ON_CPU,
> +       );
> +       int prog_fd = bpf_program__fd(skel->progs.end_timestamp);
> +
> +       bpf_prog_test_run_opts(prog_fd, &opts);
> +}
> +
> +static void update_lock_stat(int map_fd, int pid, u64 end_ts,
> +                            enum lock_aggr_mode aggr_mode,
> +                            struct tstamp_data *ts_data)
> +{
> +       u64 delta;
> +       struct contention_key stat_key = {};
> +       struct contention_data stat_data;
> +
> +       if (ts_data->timestamp >= end_ts)
> +               return;
> +
> +       delta = end_ts - ts_data->timestamp;
> +
> +       switch (aggr_mode) {
> +       case LOCK_AGGR_CALLER:
> +               stat_key.stack_id = ts_data->stack_id;
> +               break;
> +       case LOCK_AGGR_TASK:
> +               stat_key.pid = pid;
> +               break;
> +       case LOCK_AGGR_ADDR:
> +               stat_key.lock_addr_or_cgroup = ts_data->lock;
> +               break;
> +       case LOCK_AGGR_CGROUP:
> +               /* TODO */
> +               return;
> +       default:
> +               return;
> +       }
> +
> +       if (bpf_map_lookup_elem(map_fd, &stat_key, &stat_data) < 0)
> +               return;
> +
> +       stat_data.total_time += delta;
> +       stat_data.count++;
> +
> +       if (delta > stat_data.max_time)
> +               stat_data.max_time = delta;
> +       if (delta < stat_data.min_time)
> +               stat_data.min_time = delta;
> +
> +       bpf_map_update_elem(map_fd, &stat_key, &stat_data, BPF_EXIST);
> +}
> +
> +/*
> + * Account entries in the tstamp map (which didn't see the corresponding
> + * lock:contention_end tracepoint) using end_ts.
> + */
> +static void account_end_timestamp(struct lock_contention *con)
> +{
> +       int ts_fd, stat_fd;
> +       int *prev_key, key;
> +       u64 end_ts = skel->bss->end_ts;
> +       int total_cpus;
> +       enum lock_aggr_mode aggr_mode = con->aggr_mode;
> +       struct tstamp_data ts_data, *cpu_data;
> +
> +       /* Iterate per-task tstamp map (key = TID) */
> +       ts_fd = bpf_map__fd(skel->maps.tstamp);
> +       stat_fd = bpf_map__fd(skel->maps.lock_stat);
> +
> +       prev_key = NULL;
> +       while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) {
> +               if (bpf_map_lookup_elem(ts_fd, &key, &ts_data) == 0) {
> +                       int pid = key;
> +
> +                       if (aggr_mode == LOCK_AGGR_TASK && con->owner)
> +                               pid = ts_data.flags;
> +
> +                       update_lock_stat(stat_fd, pid, end_ts, aggr_mode,
> +                                        &ts_data);
> +               }
> +
> +               prev_key = &key;
> +       }
> +
> +       /* Now it'll check per-cpu tstamp map which doesn't have TID. */
> +       if (aggr_mode == LOCK_AGGR_TASK || aggr_mode == LOCK_AGGR_CGROUP)
> +               return;
> +
> +       total_cpus = cpu__max_cpu().cpu;
> +       ts_fd = bpf_map__fd(skel->maps.tstamp_cpu);
> +
> +       cpu_data = calloc(total_cpus, sizeof(*cpu_data));
> +       if (cpu_data == NULL)
> +               return;
> +
> +       prev_key = NULL;
> +       while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) {
> +               if (bpf_map_lookup_elem(ts_fd, &key, cpu_data) < 0)
> +                       goto next;
> +
> +               for (int i = 0; i < total_cpus; i++) {
> +                       update_lock_stat(stat_fd, -1, end_ts, aggr_mode,
> +                                        &cpu_data[i]);
> +               }
> +
> +next:
> +               prev_key = &key;
> +       }
> +       free(cpu_data);
> +}
> +
>  int lock_contention_start(void)
>  {
>         skel->bss->enabled = 1;
> @@ -188,6 +305,7 @@ int lock_contention_start(void)
>  int lock_contention_stop(void)
>  {
>         skel->bss->enabled = 0;
> +       mark_end_timestamp();
>         return 0;
>  }
>
> @@ -301,6 +419,8 @@ int lock_contention_read(struct lock_contention *con)
>         if (stack_trace == NULL)
>                 return -1;
>
> +       account_end_timestamp(con);
> +
>         if (con->aggr_mode == LOCK_AGGR_TASK) {
>                 struct thread *idle = __machine__findnew_thread(machine,
>                                                                 /*pid=*/0,
> diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> index 95cd8414f6ef..fb54bd38e7d0 100644
> --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> @@ -19,13 +19,6 @@
>  #define LCB_F_PERCPU   (1U << 4)
>  #define LCB_F_MUTEX    (1U << 5)
>
> -struct tstamp_data {
> -       __u64 timestamp;
> -       __u64 lock;
> -       __u32 flags;
> -       __s32 stack_id;
> -};
> -
>  /* callstack storage  */
>  struct {
>         __uint(type, BPF_MAP_TYPE_STACK_TRACE);
> @@ -140,6 +133,8 @@ int perf_subsys_id = -1;
>  /* determine the key of lock stat */
>  int aggr_mode;
>
> +__u64 end_ts;
> +
>  /* error stat */
>  int task_fail;
>  int stack_fail;
> @@ -559,4 +554,11 @@ int BPF_PROG(collect_lock_syms)
>         return 0;
>  }
>
> +SEC("raw_tp/bpf_test_finish")
> +int BPF_PROG(end_timestamp)
> +{
> +       end_ts = bpf_ktime_get_ns();
> +       return 0;
> +}
> +
>  char LICENSE[] SEC("license") = "Dual BSD/GPL";
> diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
> index 08482daf61be..36af11faad03 100644
> --- a/tools/perf/util/bpf_skel/lock_data.h
> +++ b/tools/perf/util/bpf_skel/lock_data.h
> @@ -3,6 +3,13 @@
>  #ifndef UTIL_BPF_SKEL_LOCK_DATA_H
>  #define UTIL_BPF_SKEL_LOCK_DATA_H
>
> +struct tstamp_data {
> +       u64 timestamp;
> +       u64 lock;
> +       u32 flags;
> +       u32 stack_id;
> +};
> +
>  struct contention_key {
>         u32 stack_id;
>         u32 pid;
> --
> 2.44.0.rc1.240.g4c46232300-goog
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] perf lock contention: Account contending locks too
  2024-02-28  5:33 [PATCH v2] perf lock contention: Account contending locks too Namhyung Kim
  2024-02-28  6:32 ` Ian Rogers
@ 2024-02-28 12:22 ` Arnaldo Carvalho de Melo
  2024-02-28 20:01   ` Namhyung Kim
  2024-03-01 19:30 ` Namhyung Kim
  2 siblings, 1 reply; 9+ messages in thread
From: Arnaldo Carvalho de Melo @ 2024-02-28 12:22 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Ian Rogers, Jiri Olsa, Adrian Hunter, Peter Zijlstra,
	Ingo Molnar, LKML, linux-perf-users, Song Liu, bpf

On Tue, Feb 27, 2024 at 09:33:35PM -0800, Namhyung Kim wrote:
> Currently it accounts the contention using delta between timestamps in
> lock:contention_begin and lock:contention_end tracepoints.  But it means
> the lock should see the both events during the monitoring period.
> 
> Actually there are 4 cases that happen with the monitoring:
> 
>                 monitoring period
>             /                       \
>             |                       |
>  1:  B------+-----------------------+--------E
>  2:    B----+-------------E         |
>  3:         |           B-----------+----E
>  4:         |     B-------------E   |
>             |                       |
>             t0                      t1
> 
> where B and E mean contention BEGIN and END, respectively.  So it only
> accounts the case 4 for now.  It seems there's no way to handle the case
> 1.  The case 2 might be handled if it saved the timestamp (t0), but it
> lacks the information from the B notably the flags which shows the lock
> types.  Also it could be a nested lock which it currently ignores.  So
> I think we should ignore the case 2.

Perhaps have a separate output listing locks that were found to be with
at least tE - t0 time, with perhaps a backtrace at that END time?

With that we wouldn't miss that info, however incomplete it is and the
user would try running again, perhaps for a longer time, or start
monitoring before the observed workload starts, etc.

Anyway:

Reviewed-by: Arnaldo Carvalho de Melo <acme@redhat.com>

- Arnaldo
 
> However we can handle the case 3 if we save the timestamp (t1) at the
> end of the period.  And then it can iterate the map entries in the
> userspace and update the lock stat accordingly.
> 
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
> v2: add a comment on mark_end_timestamp  (Ian)
> 
>  tools/perf/util/bpf_lock_contention.c         | 120 ++++++++++++++++++
>  .../perf/util/bpf_skel/lock_contention.bpf.c  |  16 ++-
>  tools/perf/util/bpf_skel/lock_data.h          |   7 +
>  3 files changed, 136 insertions(+), 7 deletions(-)
> 
> diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
> index 31ff19afc20c..9af76c6b2543 100644
> --- a/tools/perf/util/bpf_lock_contention.c
> +++ b/tools/perf/util/bpf_lock_contention.c
> @@ -179,6 +179,123 @@ int lock_contention_prepare(struct lock_contention *con)
>  	return 0;
>  }
>  
> +/*
> + * Run the BPF program directly using BPF_PROG_TEST_RUN to update the end
> + * timestamp in ktime so that it can calculate delta easily.
> + */
> +static void mark_end_timestamp(void)
> +{
> +	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
> +		.flags = BPF_F_TEST_RUN_ON_CPU,
> +	);
> +	int prog_fd = bpf_program__fd(skel->progs.end_timestamp);
> +
> +	bpf_prog_test_run_opts(prog_fd, &opts);
> +}
> +
> +static void update_lock_stat(int map_fd, int pid, u64 end_ts,
> +			     enum lock_aggr_mode aggr_mode,
> +			     struct tstamp_data *ts_data)
> +{
> +	u64 delta;
> +	struct contention_key stat_key = {};
> +	struct contention_data stat_data;
> +
> +	if (ts_data->timestamp >= end_ts)
> +		return;
> +
> +	delta = end_ts - ts_data->timestamp;
> +
> +	switch (aggr_mode) {
> +	case LOCK_AGGR_CALLER:
> +		stat_key.stack_id = ts_data->stack_id;
> +		break;
> +	case LOCK_AGGR_TASK:
> +		stat_key.pid = pid;
> +		break;
> +	case LOCK_AGGR_ADDR:
> +		stat_key.lock_addr_or_cgroup = ts_data->lock;
> +		break;
> +	case LOCK_AGGR_CGROUP:
> +		/* TODO */
> +		return;
> +	default:
> +		return;
> +	}
> +
> +	if (bpf_map_lookup_elem(map_fd, &stat_key, &stat_data) < 0)
> +		return;
> +
> +	stat_data.total_time += delta;
> +	stat_data.count++;
> +
> +	if (delta > stat_data.max_time)
> +		stat_data.max_time = delta;
> +	if (delta < stat_data.min_time)
> +		stat_data.min_time = delta;
> +
> +	bpf_map_update_elem(map_fd, &stat_key, &stat_data, BPF_EXIST);
> +}
> +
> +/*
> + * Account entries in the tstamp map (which didn't see the corresponding
> + * lock:contention_end tracepoint) using end_ts.
> + */
> +static void account_end_timestamp(struct lock_contention *con)
> +{
> +	int ts_fd, stat_fd;
> +	int *prev_key, key;
> +	u64 end_ts = skel->bss->end_ts;
> +	int total_cpus;
> +	enum lock_aggr_mode aggr_mode = con->aggr_mode;
> +	struct tstamp_data ts_data, *cpu_data;
> +
> +	/* Iterate per-task tstamp map (key = TID) */
> +	ts_fd = bpf_map__fd(skel->maps.tstamp);
> +	stat_fd = bpf_map__fd(skel->maps.lock_stat);
> +
> +	prev_key = NULL;
> +	while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) {
> +		if (bpf_map_lookup_elem(ts_fd, &key, &ts_data) == 0) {
> +			int pid = key;
> +
> +			if (aggr_mode == LOCK_AGGR_TASK && con->owner)
> +				pid = ts_data.flags;
> +
> +			update_lock_stat(stat_fd, pid, end_ts, aggr_mode,
> +					 &ts_data);
> +		}
> +
> +		prev_key = &key;
> +	}
> +
> +	/* Now it'll check per-cpu tstamp map which doesn't have TID. */
> +	if (aggr_mode == LOCK_AGGR_TASK || aggr_mode == LOCK_AGGR_CGROUP)
> +		return;
> +
> +	total_cpus = cpu__max_cpu().cpu;
> +	ts_fd = bpf_map__fd(skel->maps.tstamp_cpu);
> +
> +	cpu_data = calloc(total_cpus, sizeof(*cpu_data));
> +	if (cpu_data == NULL)
> +		return;
> +
> +	prev_key = NULL;
> +	while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) {
> +		if (bpf_map_lookup_elem(ts_fd, &key, cpu_data) < 0)
> +			goto next;
> +
> +		for (int i = 0; i < total_cpus; i++) {
> +			update_lock_stat(stat_fd, -1, end_ts, aggr_mode,
> +					 &cpu_data[i]);
> +		}
> +
> +next:
> +		prev_key = &key;
> +	}
> +	free(cpu_data);
> +}
> +
>  int lock_contention_start(void)
>  {
>  	skel->bss->enabled = 1;
> @@ -188,6 +305,7 @@ int lock_contention_start(void)
>  int lock_contention_stop(void)
>  {
>  	skel->bss->enabled = 0;
> +	mark_end_timestamp();
>  	return 0;
>  }
>  
> @@ -301,6 +419,8 @@ int lock_contention_read(struct lock_contention *con)
>  	if (stack_trace == NULL)
>  		return -1;
>  
> +	account_end_timestamp(con);
> +
>  	if (con->aggr_mode == LOCK_AGGR_TASK) {
>  		struct thread *idle = __machine__findnew_thread(machine,
>  								/*pid=*/0,
> diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> index 95cd8414f6ef..fb54bd38e7d0 100644
> --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> @@ -19,13 +19,6 @@
>  #define LCB_F_PERCPU	(1U << 4)
>  #define LCB_F_MUTEX	(1U << 5)
>  
> -struct tstamp_data {
> -	__u64 timestamp;
> -	__u64 lock;
> -	__u32 flags;
> -	__s32 stack_id;
> -};
> -
>  /* callstack storage  */
>  struct {
>  	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
> @@ -140,6 +133,8 @@ int perf_subsys_id = -1;
>  /* determine the key of lock stat */
>  int aggr_mode;
>  
> +__u64 end_ts;
> +
>  /* error stat */
>  int task_fail;
>  int stack_fail;
> @@ -559,4 +554,11 @@ int BPF_PROG(collect_lock_syms)
>  	return 0;
>  }
>  
> +SEC("raw_tp/bpf_test_finish")
> +int BPF_PROG(end_timestamp)
> +{
> +	end_ts = bpf_ktime_get_ns();
> +	return 0;
> +}
> +
>  char LICENSE[] SEC("license") = "Dual BSD/GPL";
> diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
> index 08482daf61be..36af11faad03 100644
> --- a/tools/perf/util/bpf_skel/lock_data.h
> +++ b/tools/perf/util/bpf_skel/lock_data.h
> @@ -3,6 +3,13 @@
>  #ifndef UTIL_BPF_SKEL_LOCK_DATA_H
>  #define UTIL_BPF_SKEL_LOCK_DATA_H
>  
> +struct tstamp_data {
> +	u64 timestamp;
> +	u64 lock;
> +	u32 flags;
> +	u32 stack_id;
> +};
> +
>  struct contention_key {
>  	u32 stack_id;
>  	u32 pid;
> -- 
> 2.44.0.rc1.240.g4c46232300-goog

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] perf lock contention: Account contending locks too
  2024-02-28 12:22 ` Arnaldo Carvalho de Melo
@ 2024-02-28 20:01   ` Namhyung Kim
  2024-02-28 20:16     ` Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 9+ messages in thread
From: Namhyung Kim @ 2024-02-28 20:01 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Ian Rogers, Jiri Olsa, Adrian Hunter, Peter Zijlstra,
	Ingo Molnar, LKML, linux-perf-users, Song Liu, bpf

On Wed, Feb 28, 2024 at 4:22 AM Arnaldo Carvalho de Melo
<acme@kernel.org> wrote:
>
> On Tue, Feb 27, 2024 at 09:33:35PM -0800, Namhyung Kim wrote:
> > Currently it accounts the contention using delta between timestamps in
> > lock:contention_begin and lock:contention_end tracepoints.  But it means
> > the lock should see the both events during the monitoring period.
> >
> > Actually there are 4 cases that happen with the monitoring:
> >
> >                 monitoring period
> >             /                       \
> >             |                       |
> >  1:  B------+-----------------------+--------E
> >  2:    B----+-------------E         |
> >  3:         |           B-----------+----E
> >  4:         |     B-------------E   |
> >             |                       |
> >             t0                      t1
> >
> > where B and E mean contention BEGIN and END, respectively.  So it only
> > accounts the case 4 for now.  It seems there's no way to handle the case
> > 1.  The case 2 might be handled if it saved the timestamp (t0), but it
> > lacks the information from the B notably the flags which shows the lock
> > types.  Also it could be a nested lock which it currently ignores.  So
> > I think we should ignore the case 2.
>
> Perhaps have a separate output listing locks that were found to be with
> at least tE - t0 time, with perhaps a backtrace at that END time?

Do you mean long contentions in case 3?  I'm not sure what
you mean by tE, but they started after t0 so cannot be greater
than or equal to the monitoring period.  Maybe we can try with
say, 90% of period but we can still miss something.

And collecting backtrace of other task would be racy as it
may not contend anymore.

>
> With that we wouldn't miss that info, however incomplete it is and the
> user would try running again, perhaps for a longer time, or start
> monitoring before the observed workload starts, etc.

Yeah, it can be useful.  Let me think about it more.

>
> Anyway:
>
> > Reviewed-by: Arnaldo Carvalho de Melo <acme@redhat.com>

Thanks for your review!
Namhyung

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] perf lock contention: Account contending locks too
  2024-02-28 20:01   ` Namhyung Kim
@ 2024-02-28 20:16     ` Arnaldo Carvalho de Melo
  2024-02-28 21:19       ` Namhyung Kim
  0 siblings, 1 reply; 9+ messages in thread
From: Arnaldo Carvalho de Melo @ 2024-02-28 20:16 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Ian Rogers, Jiri Olsa, Adrian Hunter, Peter Zijlstra,
	Ingo Molnar, LKML, linux-perf-users, Song Liu, bpf

On Wed, Feb 28, 2024 at 12:01:55PM -0800, Namhyung Kim wrote:
> On Wed, Feb 28, 2024 at 4:22 AM Arnaldo Carvalho de Melo
> <acme@kernel.org> wrote:
> >
> > On Tue, Feb 27, 2024 at 09:33:35PM -0800, Namhyung Kim wrote:
> > > Currently it accounts the contention using delta between timestamps in
> > > lock:contention_begin and lock:contention_end tracepoints.  But it means
> > > the lock should see the both events during the monitoring period.
> > >
> > > Actually there are 4 cases that happen with the monitoring:
> > >
> > >                 monitoring period
> > >             /                       \
> > >             |                       |
> > >  1:  B------+-----------------------+--------E
> > >  2:    B----+-------------E         |
> > >  3:         |           B-----------+----E
> > >  4:         |     B-------------E   |
> > >             |                       |
> > >             t0                      t1
> > >
> > > where B and E mean contention BEGIN and END, respectively.  So it only
> > > accounts the case 4 for now.  It seems there's no way to handle the case
> > > 1.  The case 2 might be handled if it saved the timestamp (t0), but it
> > > lacks the information from the B notably the flags which shows the lock
> > > types.  Also it could be a nested lock which it currently ignores.  So
> > > I think we should ignore the case 2.
> >
> > Perhaps have a separate output listing locks that were found to be with
> > at least tE - t0 time, with perhaps a backtrace at that END time?
> 
> Do you mean long contentions in case 3?  I'm not sure what do
> you mean by tE, but they started after t0 so cannot be greater

case 2

                monitoring period
            /                       \
            |                       |
 2:    B----+-------------E         |
            |             |         |
            t0            tE        t1

We get a notification for event E, right? We don't have one for B,
because it happened before we were monitoring.

> than or equal to the monitoring period.  Maybe we can try with
> say, 90% of period but we can still miss something.
> 
> > And collecting backtrace of other task would be racy as it
> may not contend anymore.
> 
> > With that we wouldn't miss that info, however incomplete it is and the
> > user would try running again, perhaps for a longer time, or start
> > monitoring before the observed workload starts, etc.
> 
> Yeah, it can be useful.  Let me think about it more.
> 
> >
> > Anyway:
> >
> > > Reviewed-by: Arnaldo Carvalho de Melo <acme@redhat.com>
> 
> Thanks for your review!
> Namhyung
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] perf lock contention: Account contending locks too
  2024-02-28 20:16     ` Arnaldo Carvalho de Melo
@ 2024-02-28 21:19       ` Namhyung Kim
  2024-02-29 17:23         ` Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 9+ messages in thread
From: Namhyung Kim @ 2024-02-28 21:19 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Ian Rogers, Jiri Olsa, Adrian Hunter, Peter Zijlstra,
	Ingo Molnar, LKML, linux-perf-users, Song Liu, bpf

On Wed, Feb 28, 2024 at 12:16 PM Arnaldo Carvalho de Melo
<acme@kernel.org> wrote:
>
> On Wed, Feb 28, 2024 at 12:01:55PM -0800, Namhyung Kim wrote:
> > On Wed, Feb 28, 2024 at 4:22 AM Arnaldo Carvalho de Melo
> > <acme@kernel.org> wrote:
> > >
> > > On Tue, Feb 27, 2024 at 09:33:35PM -0800, Namhyung Kim wrote:
> > > > Currently it accounts the contention using delta between timestamps in
> > > > lock:contention_begin and lock:contention_end tracepoints.  But it means
> > > > the lock should see the both events during the monitoring period.
> > > >
> > > > Actually there are 4 cases that happen with the monitoring:
> > > >
> > > >                 monitoring period
> > > >             /                       \
> > > >             |                       |
> > > >  1:  B------+-----------------------+--------E
> > > >  2:    B----+-------------E         |
> > > >  3:         |           B-----------+----E
> > > >  4:         |     B-------------E   |
> > > >             |                       |
> > > >             t0                      t1
> > > >
> > > > where B and E mean contention BEGIN and END, respectively.  So it only
> > > > accounts the case 4 for now.  It seems there's no way to handle the case
> > > > 1.  The case 2 might be handled if it saved the timestamp (t0), but it
> > > > lacks the information from the B notably the flags which shows the lock
> > > > types.  Also it could be a nested lock which it currently ignores.  So
> > > > I think we should ignore the case 2.
> > >
> > > Perhaps have a separate output listing locks that were found to be with
> > > at least tE - t0 time, with perhaps a backtrace at that END time?
> >
> > Do you mean long contentions in case 3?  I'm not sure what
> > you mean by tE, but they started after t0 so cannot be greater
>
> case 2
>
>                 monitoring period
>             /                       \
>             |                       |
>  2:    B----+-------------E         |
>             |             |         |
>             t0            tE        t1
>
> We get a notification for event E, right? We don´t have one for B,
> because it happened before we were monitoring.

Ah, ok.  But there should be too many events in case 2 and
I don't think users want to see them all.  And they don't have
flags.  But maybe we can update the flag when it sees exactly
the same callstack later.

Thanks,
Namhyung

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] perf lock contention: Account contending locks too
  2024-02-28 21:19       ` Namhyung Kim
@ 2024-02-29 17:23         ` Arnaldo Carvalho de Melo
  2024-02-29 21:53           ` Namhyung Kim
  0 siblings, 1 reply; 9+ messages in thread
From: Arnaldo Carvalho de Melo @ 2024-02-29 17:23 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Ian Rogers, Jiri Olsa, Adrian Hunter, Peter Zijlstra,
	Ingo Molnar, LKML, linux-perf-users, Song Liu, bpf

On Wed, Feb 28, 2024 at 01:19:12PM -0800, Namhyung Kim wrote:
> On Wed, Feb 28, 2024 at 12:16 PM Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> > On Wed, Feb 28, 2024 at 12:01:55PM -0800, Namhyung Kim wrote:
> > > On Wed, Feb 28, 2024 at 4:22 AM Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> > > > On Tue, Feb 27, 2024 at 09:33:35PM -0800, Namhyung Kim wrote:
> > > > > Currently it accounts the contention using delta between timestamps in
> > > > > lock:contention_begin and lock:contention_end tracepoints.  But it means
> > > > > the lock should see the both events during the monitoring period.

> > > > > Actually there are 4 cases that happen with the monitoring:

> > > > >                 monitoring period
> > > > >             /                       \
> > > > >             |                       |
> > > > >  1:  B------+-----------------------+--------E
> > > > >  2:    B----+-------------E         |
> > > > >  3:         |           B-----------+----E
> > > > >  4:         |     B-------------E   |
> > > > >             |                       |
> > > > >             t0                      t1

> > > > > where B and E mean contention BEGIN and END, respectively.  So it only
> > > > > accounts the case 4 for now.  It seems there's no way to handle the case
> > > > > 1.  The case 2 might be handled if it saved the timestamp (t0), but it
> > > > > lacks the information from the B notably the flags which shows the lock
> > > > > types.  Also it could be a nested lock which it currently ignores.  So
> > > > > I think we should ignore the case 2.

> > > > Perhaps have a separate output listing locks that were found to be with
> > > > at least tE - t0 time, with perhaps a backtrace at that END time?

> > > Do you mean long contentions in case 3?  I'm not sure what
> > > you mean by tE, but they started after t0 so cannot be greater

> > case 2

> >                 monitoring period
> >             /                       \
> >             |                       |
> >  2:    B----+-------------E         |
> >             |             |         |
> >             t0            tE        t1
> >
> > We get a notification for event E, right? We don't have one for B,
> > because it happened before we were monitoring.
> 
> Ah, ok.  But there should be too many events in case 2 and
> I don't think users want to see them all.  And they don't have

So maybe a summary, something like:

  N locks that were locked before this session started have been
  released, no further info besides this histogram of in-session
  durations:

    0-N units of time: ++
  N+1-M units of time: ++++
    ...

> flags.  But maybe we can update the flag when it sees exactly
> the same callstack later.

  The callstack, if going all the way to userspace, may have the workload
targeted in the command line (some pid, tid, CPU, etc.) and thus would
point to things the user is probably more interested in than some other
lock that may affect it only indirectly.

- Arnaldo

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] perf lock contention: Account contending locks too
  2024-02-29 17:23         ` Arnaldo Carvalho de Melo
@ 2024-02-29 21:53           ` Namhyung Kim
  0 siblings, 0 replies; 9+ messages in thread
From: Namhyung Kim @ 2024-02-29 21:53 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Ian Rogers, Jiri Olsa, Adrian Hunter, Peter Zijlstra,
	Ingo Molnar, LKML, linux-perf-users, Song Liu, bpf

On Thu, Feb 29, 2024 at 9:23 AM Arnaldo Carvalho de Melo
<acme@kernel.org> wrote:
>
> On Wed, Feb 28, 2024 at 01:19:12PM -0800, Namhyung Kim wrote:
> > On Wed, Feb 28, 2024 at 12:16 PM Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> > > On Wed, Feb 28, 2024 at 12:01:55PM -0800, Namhyung Kim wrote:
> > > > On Wed, Feb 28, 2024 at 4:22 AM Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> > > > > On Tue, Feb 27, 2024 at 09:33:35PM -0800, Namhyung Kim wrote:
> > > > > > Currently it accounts the contention using delta between timestamps in
> > > > > > lock:contention_begin and lock:contention_end tracepoints.  But it means
> > > > > > the lock should see the both events during the monitoring period.
>
> > > > > > Actually there are 4 cases that happen with the monitoring:
>
> > > > > >                 monitoring period
> > > > > >             /                       \
> > > > > >             |                       |
> > > > > >  1:  B------+-----------------------+--------E
> > > > > >  2:    B----+-------------E         |
> > > > > >  3:         |           B-----------+----E
> > > > > >  4:         |     B-------------E   |
> > > > > >             |                       |
> > > > > >             t0                      t1
>
> > > > > > where B and E mean contention BEGIN and END, respectively.  So it only
> > > > > > accounts the case 4 for now.  It seems there's no way to handle the case
> > > > > > 1.  The case 2 might be handled if it saved the timestamp (t0), but it
> > > > > > lacks the information from the B notably the flags which shows the lock
> > > > > > types.  Also it could be a nested lock which it currently ignores.  So
> > > > > > I think we should ignore the case 2.
>
> > > > > Perhaps have a separate output listing locks that were found to be with
> > > > > at least tE - t0 time, with perhaps a backtrace at that END time?
>
> > > > Do you mean long contentions in case 3?  I'm not sure what do
> > > > you mean by tE, but they started after t0 so cannot be greater
>
> > > case 2
>
> > >                 monitoring period
> > >             /                       \
> > >             |                       |
> > >  2:    B----+-------------E         |
> > >             |             |         |
> > >             t0            tE        t1
> > >
> > > We get a notification for event E, right? We don't have one for B,
> > > because it happened before we were monitoring.
> >
> > Ah, ok.  But there should be too many events in case 2 and
> > I don't think users want to see them all.  And they don't have
>
> So maybe a summary, something like:
>
>   N locks that were locked before this session started have been
>   released, no further info besides this histogram of in-session
>   durations:
>
>     0-N units of time: ++
>   N+1-M units of time: ++++
>     ...

Summary could work.  But I'm not sure about the histogram
since different locks would have different behavior - spinlock
vs. mutex/semaphore.  Maybe it's more meaningful when
you have filters or separate histograms for each lock.


>
> > flags.  But maybe we can update the flag when it sees exactly
> > the same callstack later.
>
>   The callstack, if going all the way to userspace may have the workload
> targeted in the command line ( some pid, tid, CPU, etc) and thus would
> point for things the user probably is interested than some other lock
> that may affect it but indirectly.

It doesn't collect user callstacks yet since it requires recording memory
address space information of every process - basically FORK and MMAP.
Maybe we can use callstacks with build-ID and offsets but it also requires
mapping from build-ID to binary somewhere.

Anyway, it's good to add more features to it.  Let me process this
patch first and think about more later. :)

Thanks,
Namhyung

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] perf lock contention: Account contending locks too
  2024-02-28  5:33 [PATCH v2] perf lock contention: Account contending locks too Namhyung Kim
  2024-02-28  6:32 ` Ian Rogers
  2024-02-28 12:22 ` Arnaldo Carvalho de Melo
@ 2024-03-01 19:30 ` Namhyung Kim
  2 siblings, 0 replies; 9+ messages in thread
From: Namhyung Kim @ 2024-03-01 19:30 UTC (permalink / raw)
  To: Ian Rogers, Namhyung Kim, Arnaldo Carvalho de Melo
  Cc: Peter Zijlstra, bpf, Song Liu, Ingo Molnar, LKML, Adrian Hunter,
	Jiri Olsa, linux-perf-users

On Tue, 27 Feb 2024 21:33:35 -0800, Namhyung Kim wrote:
> Currently it accounts the contention using delta between timestamps in
> lock:contention_begin and lock:contention_end tracepoints.  But it means
> the lock should see the both events during the monitoring period.
> 
> Actually there are 4 cases that happen with the monitoring:
> 
>                 monitoring period
>             /                       \
>             |                       |
>  1:  B------+-----------------------+--------E
>  2:    B----+-------------E         |
>  3:         |           B-----------+----E
>  4:         |     B-------------E   |
>             |                       |
>             t0                      t1
> 
> [...]

Applied to perf-tools-next, thanks!

Best regards,
-- 
Namhyung Kim <namhyung@kernel.org>

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2024-03-01 19:30 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-28  5:33 [PATCH v2] perf lock contention: Account contending locks too Namhyung Kim
2024-02-28  6:32 ` Ian Rogers
2024-02-28 12:22 ` Arnaldo Carvalho de Melo
2024-02-28 20:01   ` Namhyung Kim
2024-02-28 20:16     ` Arnaldo Carvalho de Melo
2024-02-28 21:19       ` Namhyung Kim
2024-02-29 17:23         ` Arnaldo Carvalho de Melo
2024-02-29 21:53           ` Namhyung Kim
2024-03-01 19:30 ` Namhyung Kim

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).