* [PATCH v4 1/2]: perf util: map data buffer for preserving collected data
From: Alexey Budankov @ 2018-08-28 15:50 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo
  Cc: Alexander Shishkin, Jiri Olsa, Namhyung Kim, Andi Kleen, linux-kernel


A data buffer and an accompanying AIO control block are added to
the perf_mmap object, and the data buffer is allocated with the
same size as the kernel buffer.

The buffer is then used to preserve profiling data that is ready
for dumping and to queue it for asynchronous writing into the perf
trace file through the record__aio_write() function introduced in
the next patch.

The mmap_aio array, sized to the number of per-cpu kernel buffers,
keeps pointers to the enqueued AIO control blocks so that completed
AIO operations can be monitored.
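
For reference, the POSIX AIO queue-then-poll pattern that the series
builds on looks roughly like the sketch below (the helper names are
illustrative only, not part of the patch; link with -lrt on older
glibc):

#include <aio.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>

/* Queue an asynchronous write of size bytes from buf at file
 * offset off; completion is detected later by polling. */
static int queue_write(struct aiocb *cblock, int fd, void *buf,
		       size_t size, off_t off)
{
	memset(cblock, 0, sizeof(*cblock));
	cblock->aio_fildes = fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	return aio_write(cblock);	/* 0 if queued, -1 on error */
}

/* Poll a queued request until it leaves EINPROGRESS; real code
 * would aio_suspend() or sleep between the polls. */
static ssize_t wait_write(struct aiocb *cblock)
{
	while (aio_error(cblock) == EINPROGRESS)
		;
	return aio_return(cblock);	/* bytes written, or -1 */
}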

Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
---
Changes in v4:
 - converted mmap()/munmap() to malloc()/free() for mmap->data buffer management 
Changes in v2:
 - converted zalloc() to calloc() for allocation of the mmap_aio array,
 - fixed a typo and adjusted the fallback branch code;
---
 tools/perf/util/evlist.c |  7 +++++++
 tools/perf/util/evlist.h |  2 ++
 tools/perf/util/mmap.c   | 10 ++++++++++
 tools/perf/util/mmap.h   |  3 +++
 4 files changed, 22 insertions(+)

diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index e7a4b31a84fb..04596f74766c 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -718,6 +718,8 @@ static void perf_evlist__munmap_nofree(struct perf_evlist *evlist)
 void perf_evlist__munmap(struct perf_evlist *evlist)
 {
 	perf_evlist__munmap_nofree(evlist);
+	if (evlist->mmap_aio)
+		zfree(&evlist->mmap_aio);
 	zfree(&evlist->mmap);
 	zfree(&evlist->overwrite_mmap);
 }
@@ -749,6 +751,11 @@ static struct perf_mmap *perf_evlist__alloc_mmap(struct perf_evlist *evlist,
 		 */
 		refcount_set(&map[i].refcnt, 0);
 	}
+
+	evlist->mmap_aio = calloc(evlist->nr_mmaps, sizeof(struct aiocb *));
+	if (!evlist->mmap_aio)
+		zfree(&map);
+
 	return map;
 }
 
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index dc66436add98..f98b949561fd 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -15,6 +15,7 @@
 #include "util.h"
 #include <signal.h>
 #include <unistd.h>
+#include <aio.h>
 
 struct pollfd;
 struct thread_map;
@@ -43,6 +44,7 @@ struct perf_evlist {
 	} workload;
 	struct fdarray	 pollfd;
 	struct perf_mmap *mmap;
+	struct aiocb	 **mmap_aio;
 	struct perf_mmap *overwrite_mmap;
 	struct thread_map *threads;
 	struct cpu_map	  *cpus;
diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index fc832676a798..570cba187f70 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -155,6 +155,10 @@ void __weak auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp __mayb
 
 void perf_mmap__munmap(struct perf_mmap *map)
 {
+	if (map->data != NULL) {
+		free(map->data);
+		map->data = NULL;
+	}
 	if (map->base != NULL) {
 		munmap(map->base, perf_mmap__mmap_len(map));
 		map->base = NULL;
@@ -190,6 +194,12 @@ int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd)
 		map->base = NULL;
 		return -1;
 	}
+	map->data = malloc(perf_mmap__mmap_len(map));
+	if (map->data == NULL) {
+		pr_debug2("failed to allocate perf event data buffer, error %d\n",
+				errno);
+		return -1;
+	}
 	map->fd = fd;
 
 	if (auxtrace_mmap__mmap(&map->auxtrace_mmap,
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index d82294db1295..1974e621e36b 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -6,6 +6,7 @@
 #include <linux/types.h>
 #include <asm/barrier.h>
 #include <stdbool.h>
+#include <aio.h>
 #include "auxtrace.h"
 #include "event.h"
 
@@ -25,6 +26,8 @@ struct perf_mmap {
 	bool		 overwrite;
 	struct auxtrace_mmap auxtrace_mmap;
 	char		 event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8);
+	void		 *data;
+	struct aiocb	 cblock;
 };
 
 /*

* [PATCH v4 2/2]: perf record: enable asynchronous trace writing
From: Alexey Budankov @ 2018-08-28 15:57 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo
  Cc: Alexander Shishkin, Jiri Olsa, Namhyung Kim, Andi Kleen, linux-kernel


Trace file offsets are calculated linearly by the perf_mmap__push()
code for each successive write operation, while the kernel updates
the file position only in the second lseek() syscall after the loop.
The first lseek() syscall reads the file position that the following
loop iterations then advance.

record__mmap_read_sync() implements a barrier between rounds of
spilling ready profiling data to disk: it waits until the AIO write
requests queued during the round have completed.
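
In isolation the bookkeeping looks roughly like the sketch below
(hypothetical helper callbacks, not the patch code):

#include <unistd.h>
#include <sys/types.h>

struct perf_mmap;	/* opaque in this sketch */

/* Read the file position once, advance it in user space as each
 * write is queued, wait for all queued writes to complete, then
 * write the final position back to the kernel. */
static off_t flush_round(int trace_fd, struct perf_mmap **maps, int nr,
			 ssize_t (*push)(struct perf_mmap *m, off_t off),
			 void (*sync_all)(void))
{
	off_t off = lseek(trace_fd, 0, SEEK_CUR);	/* first lseek() */
	int i;

	for (i = 0; i < nr; i++) {
		ssize_t size = push(maps[i], off);	/* queue at off */
		if (size > 0)
			off += size;			/* advance linearly */
	}
	sync_all();			/* barrier: wait for queued writes */
	return lseek(trace_fd, off, SEEK_SET);		/* second lseek() */
}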

Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
---
Changes in v4:
- converted void *bf to struct perf_mmap *md in signatures
- written comment in perf_mmap__push() just before perf_mmap__get();
- written comment in record__mmap_read_sync() on the possible restart
  of an aio_write() operation and the release of the perf_mmap object afterwards;
- added perf_mmap__put() for the cases of failed aio_write();
Changes in v3:
- written comments about the nanosleep(0.5ms) call prior to aio_suspend()
  to cope with the intrusiveness of its implementation in glibc;
- written comments about the rationale behind copying profiling data
  into the mmap->data buffer;
---
 tools/perf/builtin-record.c | 136 +++++++++++++++++++++++++++++++++++++++++---
 tools/perf/util/mmap.c      |  43 ++++++++++----
 tools/perf/util/mmap.h      |   2 +-
 3 files changed, 161 insertions(+), 20 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 22ebeb92ac51..7b628f9a7770 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -121,6 +121,23 @@ static int record__write(struct record *rec, void *bf, size_t size)
 	return 0;
 }
 
+static int record__aio_write(int trace_fd, struct aiocb *cblock,
+		void *buf, size_t size, off_t off)
+{
+	cblock->aio_fildes = trace_fd;
+	cblock->aio_buf    = buf;
+	cblock->aio_nbytes = size;
+	cblock->aio_offset = off;
+	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
+
+	if (aio_write(cblock) == -1) {
+		pr_err("failed to queue perf data, error: %m\n");
+		return -1;
+	}
+
+	return 0;
+}
+
 static int process_synthesized_event(struct perf_tool *tool,
 				     union perf_event *event,
 				     struct perf_sample *sample __maybe_unused,
@@ -130,12 +147,13 @@ static int process_synthesized_event(struct perf_tool *tool,
 	return record__write(rec, event, event->header.size);
 }
 
-static int record__pushfn(void *to, void *bf, size_t size)
+static int record__pushfn(void *to, struct perf_mmap *map, size_t size, off_t off)
 {
 	struct record *rec = to;
 
 	rec->samples++;
-	return record__write(rec, bf, size);
+	return record__aio_write(rec->session->data->file.fd, &map->cblock,
+			map->data, size, off);
 }
 
 static volatile int done;
@@ -510,13 +528,110 @@ static struct perf_event_header finished_round_event = {
 	.type = PERF_RECORD_FINISHED_ROUND,
 };
 
+static int record__mmap_read_sync(int trace_fd, struct aiocb **cblocks,
+		int cblocks_size, struct record *rec)
+{
+	size_t rem;
+	ssize_t size;
+	off_t rem_off;
+	int i, aio_ret, aio_errno, do_suspend;
+	struct perf_mmap *md;
+	struct timespec timeout0 = { 0, 0 };
+	struct timespec timeoutS = { 0, 500 * 1000 }; /* 0.5ms */
+
+	if (!cblocks_size)
+		return 0;
+
+	do {
+		do_suspend = 0;
+		/* The aio_suspend() implementation inside glibc (as of v2.27)
+		 * is intrusive: it does not simply block waiting for the io
+		 * requests to complete but polls the request queue, inducing
+		 * context switches in the perf tool process. When profiling
+		 * in system wide mode with context switch tracing enabled,
+		 * the trace may be polluted by context switches from the
+		 * perf process itself and grow about 3-5 times bigger than
+		 * when writing the trace serially. To limit the volume of
+		 * context switches induced by the perf tool, the nanosleep()
+		 * call below is added prior to aio_suspend(), sleeping for
+		 * half of the kernel timer tick (usually 1ms, CONFIG_HZ).
+		 */
+		nanosleep(&timeoutS, NULL);
+		if (aio_suspend((const struct aiocb**)cblocks, cblocks_size, &timeout0)) {
+			if (errno == EAGAIN || errno == EINTR) {
+				do_suspend = 1;
+				continue;
+			} else {
+				pr_err("failed to sync perf data, error: %m\n");
+				break;
+			}
+		}
+		for (i = 0; i < cblocks_size; i++) {
+			if (cblocks[i] == NULL) {
+				continue;
+			}
+			aio_errno = aio_error(cblocks[i]);
+			if (aio_errno == EINPROGRESS) {
+				do_suspend = 1;
+				continue;
+			}
+			md = (struct perf_mmap*)((char*)cblocks[i] -
+				offsetof(struct perf_mmap, cblock));
+			size = aio_ret = aio_return((struct aiocb*)cblocks[i]);
+			if (aio_ret < 0) {
+				if (aio_errno == EINTR) {
+					size = 0;
+				} else {
+					pr_err("failed to write perf data, error: %m\n");
+					perf_mmap__put(md);
+					cblocks[i] = NULL;
+					continue;
+				}
+			}
+			rec->bytes_written += size;
+			rem = cblocks[i]->aio_nbytes - (size_t)size;
+			if (rem == 0) {
+				/* The perf_mmap refcount was incremented after a
+				 * successful push() from map->base to map->data
+				 * followed by aio_write() in perf_mmap__push(),
+				 * and is decremented here once the aio write
+				 * request is fully complete. It is also dropped
+				 * when the low level AIO write syscall fails
+				 * with a reason other than -EINTR; see above.
+				 */
+				perf_mmap__put(md);
+				cblocks[i] = NULL;
+				if (switch_output_size(rec))
+					trigger_hit(&switch_output_trigger);
+			} else {
+				/* The aio write request may require a restart
+				 * with the remainder if the kernel didn't
+				 * write the whole pushed chunk at once.
+				 */
+				rem_off = cblocks[i]->aio_offset +
+					cblocks[i]->aio_nbytes - rem;
+				if (!record__aio_write(trace_fd,
+					(struct aiocb *)cblocks[i],
+					md->data + cblocks[i]->aio_nbytes - rem,
+					rem, rem_off))
+					do_suspend = 1;
+			}
+		}
+	} while (do_suspend);
+
+	return 0;
+}
+
 static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist,
 				    bool overwrite)
 {
 	u64 bytes_written = rec->bytes_written;
-	int i;
-	int rc = 0;
+	int i, rc = 0;
 	struct perf_mmap *maps;
+	int trace_fd = rec->session->data->file.fd;
+	struct aiocb **mmap_aio = rec->evlist->mmap_aio;
+	int mmap_aio_size = 0;
+	off_t off;
 
 	if (!evlist)
 		return 0;
@@ -528,14 +643,17 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli
 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
 		return 0;
 
+	off = lseek(trace_fd, 0, SEEK_CUR);
+
 	for (i = 0; i < evlist->nr_mmaps; i++) {
 		struct auxtrace_mmap *mm = &maps[i].auxtrace_mmap;
 
 		if (maps[i].base) {
-			if (perf_mmap__push(&maps[i], rec, record__pushfn) != 0) {
-				rc = -1;
+			rc = perf_mmap__push(&maps[i], rec, record__pushfn, &off);
+			if (rc < 0)
 				goto out;
-			}
+			else if (rc > 0)
+				mmap_aio[mmap_aio_size++] = &maps[i].cblock;
 		}
 
 		if (mm->base && !rec->opts.auxtrace_snapshot_mode &&
@@ -545,6 +663,10 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli
 		}
 	}
 
+	record__mmap_read_sync(trace_fd, mmap_aio, mmap_aio_size, rec);
+
+	lseek(trace_fd, off, SEEK_SET);
+
 	/*
 	 * Mark the round finished in case we wrote
 	 * at least one event.
diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index 570cba187f70..17460086423f 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -290,11 +290,11 @@ int perf_mmap__read_init(struct perf_mmap *map)
 }
 
 int perf_mmap__push(struct perf_mmap *md, void *to,
-		    int push(void *to, void *buf, size_t size))
+		    int push(void *to, struct perf_mmap *md, size_t size, off_t off), off_t *off)
 {
 	u64 head = perf_mmap__read_head(md);
 	unsigned char *data = md->base + page_size;
-	unsigned long size;
+	unsigned long size, size0 = 0;
 	void *buf;
 	int rc = 0;
 
@@ -304,25 +304,44 @@ int perf_mmap__push(struct perf_mmap *md, void *to,
 
 	size = md->end - md->start;
 
+	/* Data is copied into the md->data buffer to release space in
+	 * the kernel buffer as fast as possible, via perf_mmap__consume().
+	 * That lets the kernel proceed with storing more profiling
+	 * data into the kernel buffer without waiting for the other
+	 * per-cpu kernel buffers to be handled.
+	 *
+	 * Copying is done in two steps when the chunk of profiling
+	 * data wraps around the upper bound of the kernel buffer. In
+	 * that case we first move the upper part of the data, from
+	 * md->start up to the upper bound, and then the remainder,
+	 * from the start of the kernel buffer to the end of the chunk.
+	 */
+
 	if ((md->start & md->mask) + size != (md->end & md->mask)) {
 		buf = &data[md->start & md->mask];
-		size = md->mask + 1 - (md->start & md->mask);
-		md->start += size;
-
-		if (push(to, buf, size) < 0) {
-			rc = -1;
-			goto out;
-		}
+		size0 = md->mask + 1 - (md->start & md->mask);
+		md->start += size0;
+		memcpy(md->data, buf, size0);
 	}
 
 	buf = &data[md->start & md->mask];
 	size = md->end - md->start;
 	md->start += size;
+	memcpy(md->data + size0, buf, size);
 
-	if (push(to, buf, size) < 0) {
-		rc = -1;
+	rc = push(to, md, size0 + size, *off) < 0 ? -1 : 1;
+	if (rc == -1)
 		goto out;
-	}
+
+	/* perf_mmap__get() below guards the md->data buffer from
+	 * premature deallocation, because the md->base kernel buffer
+	 * and the md object itself could otherwise be released before
+	 * the just pushed aio request on md->data completes. The
+	 * complementing perf_mmap__put() is made in record__mmap_read_sync()
+	 * after the pushed request completes or the system reports an error.
+	 */
+	perf_mmap__get(md);
+	*off += size0 + size;
 
 	md->prev = head;
 	perf_mmap__consume(md);
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index 1974e621e36b..7949702bc104 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -95,7 +95,7 @@ union perf_event *perf_mmap__read_forward(struct perf_mmap *map);
 union perf_event *perf_mmap__read_event(struct perf_mmap *map);
 
 int perf_mmap__push(struct perf_mmap *md, void *to,
-		    int push(void *to, void *buf, size_t size));
+		    int push(void *to, struct perf_mmap *md, size_t size, off_t off), off_t *off);
 
 size_t perf_mmap__mmap_len(struct perf_mmap *map);
 

* Re: [PATCH v4 0/2]: perf: reduce data loss when profiling highly parallel CPU bound workloads
From: Kim Phillips @ 2018-08-28 19:43 UTC (permalink / raw)
  To: Alexey Budankov
  Cc: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, Andi Kleen,
	linux-kernel

On Tue, 28 Aug 2018 18:44:57 +0300
Alexey Budankov <alexey.budankov@linux.intel.com> wrote:

> Experiment with profiling matrix multiplication code executing 128 
> threads on Intel Xeon Phi (KNM) with 272 cores, like below,
> demonstrates data loss metrics value of 98%:

So I took these two patches for a quick test-drive on intel-pt.

BEFORE (acme's today's perf/core branch):

$ sudo perf version
perf version 4.18.rc7.g55fc647
$ sudo perf record -e intel_pt//  dd if=/dev/zero of=/dev/null count=100000
100000+0 records in
100000+0 records out
51200000 bytes (51 MB, 49 MiB) copied, 0.0868081 s, 590 MB/s
[ perf record: Woken up 21 times to write data ]
[ perf record: Captured and wrote 41.302 MB perf.data ]
$ 

AFTER (== BEFORE + these two patches):

$ sudo ./perf version
perf version 4.18.rc7.gbc1c99
$ sudo perf record -e intel_pt//  dd if=/dev/zero of=/dev/null count=100000
100000+0 records in
100000+0 records out
51200000 bytes (51 MB, 49 MiB) copied, 0.0931142 s, 550 MB/s

...and it's still running, minutes afterwards.  Before I kill it,
here's some strace output:

nanosleep({tv_sec=0, tv_nsec=500000}, NULL) = 0
lseek(3, 332556518, SEEK_SET)           = 332556518
write(3, "D\0\0\0\0\0\10\0", 8)         = 8
lseek(3, 0, SEEK_CUR)                   = 332556526
futex(0x7f221e7252c8, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e7252cc, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e7252c8, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e7252cc, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e7252c8, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e7252cc, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e7252c8, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
nanosleep({tv_sec=0, tv_nsec=500000}, NULL) = 0
lseek(3, 332578462, SEEK_SET)           = 332578462
write(3, "D\0\0\0\0\0\10\0", 8)         = 8
lseek(3, 0, SEEK_CUR)                   = 332578470
futex(0x7f221e7252cc, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
nanosleep({tv_sec=0, tv_nsec=500000}, NULL) = 0
lseek(3, 332598822, SEEK_SET)           = 332598822
write(3, "D\0\0\0\0\0\10\0", 8)         = 8
lseek(3, 0, SEEK_CUR)                   = 332598830
futex(0x7f221e7252c8, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e7252cc, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e7252c8, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e7252cc, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e7252c8, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x7f221e7252cc, FUTEX_WAKE_PRIVATE, 1) = 1
^Cstrace: Process 3597 detached

I can't prove that it's these two patches that create the hang, but
this does look like a livelock situation...hm, hitting ^C doesn't stop
it...had to kill -9 it...erm, does 'perf record -e intel_pt// dd...'
work for you on a more standard machine?:

$ dmesg | grep Perf
[    0.044226] Performance Events: PEBS fmt3+, Skylake events, 32-deep LBR, full-width counters, Intel PMU driver.

Thanks,

Kim

* Re: [PATCH v4 0/2]: perf: reduce data loss when profiling highly parallel CPU bound workloads
From: Alexey Budankov @ 2018-08-29  8:40 UTC (permalink / raw)
  To: Kim Phillips
  Cc: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, Andi Kleen,
	linux-kernel


Hi Kim,

On 28.08.2018 22:43, Kim Phillips wrote:
> On Tue, 28 Aug 2018 18:44:57 +0300
> Alexey Budankov <alexey.budankov@linux.intel.com> wrote:
> 
>> Experiment with profiling matrix multiplication code executing 128 
>> threads on Intel Xeon Phi (KNM) with 272 cores, like below,
>> demonstrates data loss metrics value of 98%:
> 
> So I took these two patches for a quick test-drive on intel-pt.

Thanks for testing that out! The patches haven't been tested in this scenario yet.

> 
> BEFORE (acme's today's perf/core branch):
> 
> $ sudo perf version
> perf version 4.18.rc7.g55fc647
> $ sudo perf record -e intel_pt//  dd if=/dev/zero of=/dev/null count=100000
> 100000+0 records in
> 100000+0 records out
> 51200000 bytes (51 MB, 49 MiB) copied, 0.0868081 s, 590 MB/s
> [ perf record: Woken up 21 times to write data ]
> [ perf record: Captured and wrote 41.302 MB perf.data ]
> $ 
> 
> AFTER (== BEFORE + these two patches):
> 
> $ sudo ./perf version
> perf version 4.18.rc7.gbc1c99
> $ sudo perf record -e intel_pt//  dd if=/dev/zero of=/dev/null count=100000
> 100000+0 records in
> 100000+0 records out
> 51200000 bytes (51 MB, 49 MiB) copied, 0.0931142 s, 550 MB/s
> 
> ...and it's still running, minutes afterwards.  Before I kill it,
> here's some strace output:
> 
> nanosleep({tv_sec=0, tv_nsec=500000}, NULL) = 0
> lseek(3, 332556518, SEEK_SET)           = 332556518
> write(3, "D\0\0\0\0\0\10\0", 8)         = 8
> lseek(3, 0, SEEK_CUR)                   = 332556526
> futex(0x7f221e7252c8, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e7252cc, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e7252c8, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e7252cc, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e7252c8, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e7252cc, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e7252c8, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
> nanosleep({tv_sec=0, tv_nsec=500000}, NULL) = 0
> lseek(3, 332578462, SEEK_SET)           = 332578462
> write(3, "D\0\0\0\0\0\10\0", 8)         = 8
> lseek(3, 0, SEEK_CUR)                   = 332578470
> futex(0x7f221e7252cc, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
> nanosleep({tv_sec=0, tv_nsec=500000}, NULL) = 0
> lseek(3, 332598822, SEEK_SET)           = 332598822
> write(3, "D\0\0\0\0\0\10\0", 8)         = 8
> lseek(3, 0, SEEK_CUR)                   = 332598830
> futex(0x7f221e7252c8, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e7252cc, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e7252c8, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e7252cc, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e7252c8, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e725200, FUTEX_WAKE_PRIVATE, 1) = 1
> futex(0x7f221e7252cc, FUTEX_WAKE_PRIVATE, 1) = 1
> ^Cstrace: Process 3597 detached
> 
> I can't prove that it's these two patches that create the hang, but
> this does look like a livelock situation...hm, hitting ^C doesn't stop
> it...had to kill -9 it...erm, does 'perf record -e intel_pt// dd...'
> work for you on a more standard machine?:
> 
> $ dmesg | grep Perf
> [    0.044226] Performance Events: PEBS fmt3+, Skylake events, 32-deep LBR, full-width counters, Intel PMU driver.

Strace patterns look similar to the ones implemented in the patches. 
Let me reproduce and investigate the hang locally.

Thanks,
Alexey

* Re: [PATCH v4 0/2]: perf: reduce data loss when profiling highly parallel CPU bound workloads
From: Alexey Budankov @ 2018-08-29 10:50 UTC (permalink / raw)
  To: Kim Phillips
  Cc: Ingo Molnar, Peter Zijlstra, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, Andi Kleen,
	linux-kernel

Hi,

On 29.08.2018 11:40, Alexey Budankov wrote:
> 
> Hi Kim,
> 
> On 28.08.2018 22:43, Kim Phillips wrote:
>> On Tue, 28 Aug 2018 18:44:57 +0300
>> Alexey Budankov <alexey.budankov@linux.intel.com> wrote:
>>
>>> Experiment with profiling matrix multiplication code executing 128 
>>> threads on Intel Xeon Phi (KNM) with 272 cores, like below,
>>> demonstrates data loss metrics value of 98%:
>>
>> So I took these two patches for a quick test-drive on intel-pt.
> 
> Thanks for testing that out! The patches haven't been tested in this scenario yet.
> 
>> [...]
> 
> Strace patterns look similar to the ones implemented in the patches. 
> Let me reproduce and investigate the hang locally.

The stack:

Thread 1 (Thread 0x7f170ef4a7c0 (LWP 11182)):
#0  0x00007f170e2dc350 in nanosleep () from /lib64/libpthread.so.0
#1  0x00000000004436e4 in record__mmap_read_sync (trace_fd=3, cblocks=0x275c120, cblocks_size=6, rec=0xabe1c0 <record>) at builtin-record.c:561
#2  0x0000000000443c71 in record__mmap_read_evlist (rec=0xabe1c0 <record>, evlist=0x274f4f0, overwrite=false) at builtin-record.c:671
#3  0x0000000000443d21 in record__mmap_read_all (rec=0xabe1c0 <record>) at builtin-record.c:692
#4  0x0000000000444feb in __cmd_record (rec=0xabe1c0 <record>, argc=4, argv=0x7fffda7606f0) at builtin-record.c:1183
#5  0x00000000004464dc in cmd_record (argc=4, argv=0x7fffda7606f0) at builtin-record.c:2000
#6  0x00000000004cee9e in run_builtin (p=0xace5f8 <commands+216>, argc=8, argv=0x7fffda7606f0) at perf.c:302
#7  0x00000000004cf10b in handle_internal_command (argc=8, argv=0x7fffda7606f0) at perf.c:354
#8  0x00000000004cf25a in run_argv (argcp=0x7fffda76054c, argv=0x7fffda760540) at perf.c:398
#9  0x00000000004cf5c6 in main (argc=8, argv=0x7fffda7606f0) at perf.c:520

The main thread continues consuming kernel buffers even
after the profiled application finishes:

Using CPUID GenuineIntel-6-5E
mmap size 528384B
sync: sched: cblock 0x7fd1d62f0088 size 192
sync: sched: cblock 0x7fd1d63102c8 size 192
sync: sched: cblock 0x7fd1d63203e8 size 288
sync: sched: cblock 0x7fd1d6360868 size 1024
sync: done: cblock 0x7fd1d62f0088
sync: done: cblock 0x7fd1d63102c8
sync: done: cblock 0x7fd1d63203e8
sync: done: cblock 0x7fd1d6360868
...
sync: done: cblock 0x7fd1d62f0088
sync: done: cblock 0x7fd1d63102c8
sync: sched: cblock 0x7fd1d62f0088 size 192
sync: sched: cblock 0x7fd1d63102c8 size 192
sync: done: cblock 0x7fd1d62f0088
sync: done: cblock 0x7fd1d63102c8
sync: sched: cblock 0x7fd1d62f0088 size 192
sync: sched: cblock 0x7fd1d63102c8 size 192
sync: done: cblock 0x7fd1d62f0088
sync: done: cblock 0x7fd1d63102c8
sync: sched: cblock 0x7fd1d62f0088 size 192
sync: sched: cblock 0x7fd1d63102c8 size 192
sync: sched: cblock 0x7fd1d6360868 size 128
100000+0 records in
100000+0 records out
51200000 bytes (51 MB, 49 MiB) copied, 0.0622905 s, 822 MB/s
sig: 17
sync: done: cblock 0x7fd1d62f0088
sync: done: cblock 0x7fd1d63102c8
sync: sched: cblock 0x7fd1d62f0088 size 192
sync: sched: cblock 0x7fd1d63001a8 size 192
sync: sched: cblock 0x7fd1d63102c8 size 192
sync: sched: cblock 0x7fd1d6360868 size 416
sync: done: cblock 0x7fd1d62f0088
sync: done: cblock 0x7fd1d63001a8
sync: done: cblock 0x7fd1d63102c8
sync: done: cblock 0x7fd1d6360868
sync: sched: cblock 0x7fd1d62f0088 size 192
sync: sched: cblock 0x7fd1d63001a8 size 192
sync: sched: cblock 0x7fd1d63102c8 size 192
...

