All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] tools/intel_guc_logger: Utility for capturing GuC firmware logs in a file
@ 2016-09-06 10:43 akash.goel
  2016-09-06 13:17 ` Tvrtko Ursulin
  0 siblings, 1 reply; 17+ messages in thread
From: akash.goel @ 2016-09-06 10:43 UTC (permalink / raw)
  To: intel-gfx; +Cc: Akash Goel

From: Akash Goel <akash.goel@intel.com>

This patch provides a test utility which helps capture GuC firmware logs and
then dump them to file.
The logs are pulled from a debugfs file '/sys/kernel/debug/dri/guc_log' and
stored into a file '/tmp/guc_log_dump.dat', the name of the output file can
be changed through a command line argument.

The utility goes into an infinite loop where it waits for the arrival of new
logs and as soon as new set of logs are produced it captures them in its local
buffer which is then flushed out to the file on disk.
Any time when logging needs to be ended, User can stop this utility (CTRL+C).

Before entering into a loop, it first discards whatever logs are present in
the debugfs file.
This way User can first launch this utility and then start a workload/activity
for which GuC firmware logs are to be actually captured and keep running the
utility for as long as its needed, like once the workload is over this utility
can be forcefully stopped.

If the logging wasn't enabled on GuC side by the Driver at boot time, utility
will first enable the logging and later on when it is stopped (CTRL+C) it will
also pause the logging on GuC side.

Signed-off-by: Akash Goel <akash.goel@intel.com>
---
 tools/Makefile.sources   |   1 +
 tools/intel_guc_logger.c | 441 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 442 insertions(+)
 create mode 100644 tools/intel_guc_logger.c

diff --git a/tools/Makefile.sources b/tools/Makefile.sources
index 2bb6c8e..be58871 100644
--- a/tools/Makefile.sources
+++ b/tools/Makefile.sources
@@ -19,6 +19,7 @@ tools_prog_lists =		\
 	intel_gpu_time		\
 	intel_gpu_top		\
 	intel_gtt		\
+	intel_guc_logger        \
 	intel_infoframes	\
 	intel_l3_parity		\
 	intel_lid		\
diff --git a/tools/intel_guc_logger.c b/tools/intel_guc_logger.c
new file mode 100644
index 0000000..92172fa
--- /dev/null
+++ b/tools/intel_guc_logger.c
@@ -0,0 +1,441 @@
+
+#define _GNU_SOURCE  /* For using O_DIRECT */
+#include <inttypes.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/poll.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <assert.h>
+#include <pthread.h>
+
+#include "igt.h"
+
+#define MB(x) ((uint64_t)(x) * 1024 * 1024)
+#ifndef PAGE_SIZE
+  #define PAGE_SIZE 4096
+#endif
+#define SUBBUF_SIZE (19*PAGE_SIZE)
+/* Need large buffering from logger side to hide the DISK IO latency, Driver
+ * can only store 8 snapshots of GuC log buffer in relay.
+ */
+#define NUM_SUBBUFS 100
+
+#define RELAY_FILE_NAME  "guc_log"
+#define CONTROL_FILE_NAME "i915_guc_log_control"
+
+char *read_buffer;
+char *out_filename;
+int poll_timeout = 2; /* by default 2ms timeout */
+pthread_mutex_t mutex;
+pthread_t flush_thread;
+int verbosity_level = 3; /* by default capture logs at max verbosity */
+uint32_t produced, consumed;
+uint64_t total_bytes_written;
+int num_buffers = NUM_SUBBUFS;
+int relay_fd, outfile_fd = -1;
+bool stop_logging, discard_oldlogs;
+uint32_t test_duration, max_filesize;
+pthread_cond_t underflow_cond, overflow_cond;
+
+static void guc_log_control(bool enable_logging)
+{
+	int control_fd;
+	char data[19];
+	uint64_t val;
+	int ret;
+
+	control_fd = igt_debugfs_open(CONTROL_FILE_NAME, O_WRONLY);
+	if (control_fd < 0)
+		igt_assert_f(0, "Couldn't open the guc log control file");
+
+	val = enable_logging ? ((verbosity_level << 4) | 0x1) : 0;
+
+	snprintf(data, sizeof(data), "0x%" PRIx64, val);
+	ret = write(control_fd, data, strlen(data) + 1);
+	if (ret < 0)
+		igt_assert_f(0, "Couldn't write to the log control file");
+
+	close(control_fd);
+}
+
+static void int_sig_handler(int sig)
+{
+	igt_info("Received signal %d\n", sig);
+
+	stop_logging = true;
+}
+
+static void pull_leftover_data(void)
+{
+	unsigned int bytes_read = 0;
+	int ret;
+
+	while (1) {
+		/* Read the logs from relay buffer */
+		ret = read(relay_fd, read_buffer, SUBBUF_SIZE);
+		if (!ret)
+			break;
+		else if (ret < 0)
+			igt_assert_f(0, "Failed to read from the guc log file");
+		else if (ret < SUBBUF_SIZE)
+			igt_assert_f(0, "invalid read from relay file");
+
+		bytes_read += ret;
+
+		if (outfile_fd > 0) {
+			ret = write(outfile_fd, read_buffer, SUBBUF_SIZE);
+			if (ret < 0)
+				igt_assert_f(0, "Couldn't dump the logs in a file");
+			total_bytes_written += ret;
+		}
+	};
+
+	igt_debug("%u bytes flushed\n", bytes_read);
+}
+
+static int num_filled_bufs(void)
+{
+	return (produced - consumed);
+}
+
+static void pull_data(void)
+{
+	char *ptr;
+	int ret;
+
+	pthread_mutex_lock(&mutex);
+	while (num_filled_bufs() >= num_buffers) {
+		igt_debug("overflow, will wait, produced %u, consumed %u\n", produced, consumed);
+		/* Stall the main thread in case of overflow, as there are no
+		 * buffers available to store the new logs, otherwise there
+		 * could be corruption if both threads work on the same buffer.
+		 */
+		pthread_cond_wait(&overflow_cond, &mutex);
+	};
+	pthread_mutex_unlock(&mutex);
+
+	ptr = read_buffer + (produced % num_buffers) * SUBBUF_SIZE;
+
+	/* Read the logs from relay buffer */
+	ret = read(relay_fd, ptr, SUBBUF_SIZE);
+	if (ret < 0)
+		igt_assert_f(0, "Failed to read from the guc log file");
+	else if (ret && (ret < SUBBUF_SIZE))
+		igt_assert_f(0, "invalid read from relay file");
+
+	pthread_mutex_lock(&mutex);
+	if (ret) {
+		produced++;
+		pthread_cond_signal(&underflow_cond);
+	}
+	pthread_mutex_unlock(&mutex);
+}
+
+static void *flusher(void *arg)
+{
+	char *ptr;
+	int ret;
+
+	igt_debug("Execution started of flusher thread\n");
+
+	do {
+		pthread_mutex_lock(&mutex);
+		while (!num_filled_bufs()) {
+			/* Exit only after completing the flush of all the filled
+			 * buffers as User would expect that all logs captured up
+			 * till the point of interruption/exit are written out to
+			 * the disk file.
+			 */
+			if (stop_logging) {
+				igt_debug("flusher to exit now\n");
+				pthread_mutex_unlock(&mutex);
+				return NULL;
+			}
+			pthread_cond_wait(&underflow_cond, &mutex);
+		};
+		pthread_mutex_unlock(&mutex);
+
+		ptr = read_buffer + (consumed % num_buffers) * SUBBUF_SIZE;
+
+		ret = write(outfile_fd, ptr, SUBBUF_SIZE);
+		if (ret < 0)
+			igt_assert_f(0, "Couldn't dump the logs in a file");
+
+		total_bytes_written += ret;
+		if (max_filesize && (total_bytes_written > MB(max_filesize))) {
+			igt_debug("Reached the target of %" PRIu64 " bytes\n", MB(max_filesize));
+			stop_logging = true;
+		}
+
+		pthread_mutex_lock(&mutex);
+		consumed++;
+		pthread_cond_signal(&overflow_cond);
+		pthread_mutex_unlock(&mutex);
+	} while(1);
+
+	return NULL;
+}
+
+static void init_flusher_thread(void)
+{
+	struct sched_param	thread_sched;
+	pthread_attr_t		p_attr;
+
+	pthread_cond_init(&underflow_cond, NULL);
+	pthread_cond_init(&overflow_cond, NULL);
+	pthread_mutex_init(&mutex, NULL);
+
+	if (pthread_attr_init(&p_attr))
+		igt_assert_f(0, "Error obtaining default thread attributes");
+
+	if (pthread_attr_setinheritsched(&p_attr, PTHREAD_EXPLICIT_SCHED))
+		igt_assert_f(0, "Couldn't set inheritsched");
+
+	if (pthread_attr_setschedpolicy(&p_attr, SCHED_RR))
+		igt_assert_f(0, "Couldn't set thread scheduling policy");
+
+	/* Keep the flusher task also at rt priority, so that it doesn't get
+	 * too late in flushing the collected logs in local buffers to the disk,
+	 * and so main thread always have spare buffers to collect the logs.
+	 */
+	thread_sched.sched_priority = 5;
+	if (pthread_attr_setschedparam(&p_attr, &thread_sched))
+		igt_assert_f(0, "Couldn't set thread priority");
+
+	if (pthread_create(&flush_thread, &p_attr, flusher, NULL))
+		igt_assert_f(0, "thread creation failed");
+
+	if (pthread_attr_destroy(&p_attr))
+		igt_assert_f(0, "Error destroying thread attributes");
+}
+
+static void open_relay_file(void)
+{
+	relay_fd = igt_debugfs_open(RELAY_FILE_NAME, O_RDONLY);
+	if (relay_fd < 0)
+		igt_assert_f(0, "Couldn't open the guc log file");
+
+	/* Purge the old/boot-time logs from the relay buffer.
+	 * This is more for Val team's requirement, where they have to first
+	 * purge the existing logs before starting the tests for which the logs
+	 * are actually needed. After this logger will enter into a loop and
+	 * wait for the new data, at that point benchmark can be launched from
+	 * a different shell.
+	 */
+	if (discard_oldlogs)
+		pull_leftover_data();
+}
+
+static void open_output_file(void)
+{
+	char *filename;
+
+	if (asprintf(&filename, "/tmp/%s", out_filename ? : "guc_log_dump.dat") < 0)
+		igt_assert_f(0, "Couldn't allocate the filename");
+
+	/* Use Direct IO mode for the output file, as the data written is not
+	 * supposed to be accessed again, this saves a copy of data from App's
+	 * buffer to kernel buffer (Page cache). Due to no buffering on kernel
+	 * side, data is flushed out to disk faster and more buffering can be
+	 * done on the logger side to hide the disk IO latency.
+	 */
+	outfile_fd = open(filename, O_CREAT | O_WRONLY | O_TRUNC | O_DIRECT, 0444);
+	if (outfile_fd < 0)
+		igt_assert_f(0, "Couldn't open the output file");
+
+	free(out_filename);
+	free(filename);
+}
+
+static void init_main_thread(void)
+{
+	struct sched_param	thread_sched;
+	/* Run the main thread at highest priority to ensure that it always
+	 * gets woken-up at earliest on arrival of new data and so is always
+	 * ready to pull the logs, otherwise there could be loss logs if
+	 * GuC firmware is generating logs at a very high rate.
+	 */
+	thread_sched.sched_priority = 1;
+	if (sched_setscheduler(getpid(), SCHED_FIFO, &thread_sched))
+		igt_assert_f(0, "Couldn't set the priority");
+
+	if (signal(SIGINT, int_sig_handler) == SIG_ERR)
+		igt_assert_f(0, "SIGINT handler registration failed");
+
+	/* Need an aligned pointer for direct IO */
+	if (posix_memalign((void **)&read_buffer, PAGE_SIZE, num_buffers * SUBBUF_SIZE))
+		igt_assert_f(0, "Couldn't allocate the read buffer");
+
+	/* Keep the pages locked in RAM, avoid page fault overhead */
+	if (mlock(read_buffer, num_buffers * SUBBUF_SIZE))
+		igt_assert_f(0, "Failed to lock memory");
+
+	/* Enable the logging, it may not have been enabled from boot and so
+	 * the relay file also wouldn't have been created.
+	 */
+	guc_log_control(true);
+
+	open_relay_file();
+	open_output_file();
+}
+
+static int parse_options(int opt, int opt_index, void *data)
+{
+	igt_debug("opt %c optarg %s\n", opt, optarg);
+
+	switch(opt) {
+	case 'v':
+		verbosity_level = atoi(optarg);
+		if (verbosity_level < 0)
+			verbosity_level = 0;
+		else if (verbosity_level > 3)
+			verbosity_level = 3;
+
+		igt_debug("verbosity level to be used is %d\n", verbosity_level);
+		break;
+	case 'o':
+		if (asprintf(&out_filename, "%s", optarg) < 0)
+			igt_assert_f(0, "Couldn't allocate the o/p filename");
+		igt_debug("logs to be stored in file /tmp/%s\n", out_filename);
+		break;
+	case 'b':
+		num_buffers = atoi(optarg);
+		if (num_buffers == 0)
+			num_buffers = NUM_SUBBUFS;
+		igt_debug("number of buffers to be used is %d\n", num_buffers);
+		break;
+	case 't':
+		test_duration = atoi(optarg);
+		if (test_duration > 0)
+			igt_debug("logger to run for %d seconds\n", test_duration);
+		break;
+	case 'p':
+		poll_timeout = atoi(optarg);
+		if (poll_timeout == 0)
+			poll_timeout = 2; /* at least use 2ms poll timeout */
+		if (poll_timeout > 0)
+			igt_debug("polling to be done with %d milli seconds timeout\n", poll_timeout);
+		break;
+	case 's':
+		max_filesize = atoi(optarg);
+		if (max_filesize > 0)
+			igt_debug("max allowed size of the output file is %d MBs\n", max_filesize);
+		break;
+	case 'd':
+		discard_oldlogs = true;
+		igt_debug("old/boot-time logs will be discarded\n");
+		break;
+	}
+
+	return 0;
+}
+
+static void process_command_line(int argc, char **argv)
+{
+	static struct option long_options[] = {
+		{"verbosity", required_argument, 0, 'v'},
+		{"outputfilename", required_argument, 0, 'o'},
+		{"buffers", required_argument, 0, 'b'},
+		{"testduration", required_argument, 0, 't'},
+		{"polltimeout", required_argument, 0, 'p'},
+		{"size", required_argument, 0, 's'},
+		{"discard", no_argument, 0, 'd'},
+		{ 0, 0, 0, 0 }
+	};
+
+	const char *help =
+		"  -v --verbosity=level   verbosity level of GuC logging (0-3)\n"
+		"  -o --outfile=name      name of the output file in /tmp directory, where logs will be stored\n"
+		"  -b --buffers=num       number of buffers to be maintained on logger side for storing logs\n"
+		"  -t --testduration=sec  max duration in seconds for which the logger should run, 0 == forever\n"
+		"  -p --polltimeout=ms    polling timeout in ms, -1 == indefinite wait for the new data\n"
+		"  -s --size=MB           max size of output file in MBs after which logging will be stopped, 0 == no limit\n"
+		"  -d --discard           discard the old/boot-time logs before entering into the capture loop\n";
+
+	igt_simple_init_parse_opts(&argc, argv, "v:o:b:t:p:s:d", long_options,
+				   help, parse_options, NULL);
+}
+
+int main(int argc, char **argv)
+{
+	struct pollfd relay_poll_fd;
+	struct timespec start={};
+	int nfds;
+	int ret;
+
+	process_command_line(argc, argv);
+
+	init_main_thread();
+
+	/* Use a separate thread for flushing the logs to a file on disk.
+	 * Main thread will buffer the data from relay file in its pool of
+	 * buffers and other thread will flush the data to disk in background.
+	 * This is needed, albeit by default data is written out to disk in
+	 * async mode, as when there are too many dirty pages in the RAM,
+	 * (/proc/sys/vm/dirty_ratio), kernel starts blocking the processes
+	 * doing the file writes.
+	 */
+	init_flusher_thread();
+
+	relay_poll_fd.fd = relay_fd;
+	relay_poll_fd.events = POLLIN;
+	relay_poll_fd.revents = 0;
+
+	nfds = 1; /* only one fd to poll */
+
+	while (!stop_logging)
+	{
+		if (test_duration && (igt_seconds_elapsed(&start) > test_duration)) {
+			igt_debug("Ran for stipulated %d seconds, exit now\n", test_duration);
+			stop_logging = true;
+			break;
+		}
+
+		/* Wait/poll for the new data to be available, relay doesn't
+		 * provide a blocking read.
+		 * Need to do polling with a timeout instead of indefinite wait,
+		 * to avoid relying on relay for the wakeup, as relay does wakeup
+		 * in a deferred manner on jiffies granularity by scheduling a
+		 * timer and moreover that timer is re-scheduled on every newly
+		 * produced buffer so timer keeps getting pushed out if there
+		 * are multiple flush interrupts in a very quick succession (less
+		 * than a jiffy gap between 2 flush interrupts) and relay runs
+		 * out of sub buffers to store the new logs.
+		 */
+		ret = poll(&relay_poll_fd, nfds, poll_timeout);
+		if (ret < 0) {
+			if (errno == EINTR)
+				break;
+			else
+				igt_assert_f(0, "poll call failed");
+		}
+
+		/* No data available yet, poll again, hopefully new data is round the corner */
+		if (!relay_poll_fd.revents)
+			continue;
+
+		pull_data();
+	}
+
+	/* Pause logging on the GuC side*/
+	guc_log_control(false);
+	pthread_cond_signal(&underflow_cond);
+	pthread_join(flush_thread, NULL);
+	pull_leftover_data();
+	igt_info("total_bytes_written %" PRIu64 "\n", total_bytes_written);
+
+	free(read_buffer);
+	close(relay_fd);
+	close(outfile_fd);
+	igt_exit();
+}
-- 
1.9.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH] tools/intel_guc_logger: Utility for capturing GuC firmware logs in a file
  2016-09-06 10:43 [PATCH] tools/intel_guc_logger: Utility for capturing GuC firmware logs in a file akash.goel
@ 2016-09-06 13:17 ` Tvrtko Ursulin
  2016-09-06 15:33   ` Goel, Akash
  0 siblings, 1 reply; 17+ messages in thread
From: Tvrtko Ursulin @ 2016-09-06 13:17 UTC (permalink / raw)
  To: akash.goel, intel-gfx


Hi,

On 06/09/16 11:43, akash.goel@intel.com wrote:
> From: Akash Goel <akash.goel@intel.com>
>
> This patch provides a test utility which helps capture GuC firmware logs and
> then dump them to file.
> The logs are pulled from a debugfs file '/sys/kernel/debug/dri/guc_log' and
> stored into a file '/tmp/guc_log_dump.dat', the name of the output file can
> be changed through a command line argument.
>
> The utility goes into an infinite loop where it waits for the arrival of new
> logs and as soon as new set of logs are produced it captures them in its local
> buffer which is then flushed out to the file on disk.
> Any time when logging needs to be ended, User can stop this utility (CTRL+C).
>
> Before entering into a loop, it first discards whatever logs are present in
> the debugfs file.
> This way User can first launch this utility and then start a workload/activity
> for which GuC firmware logs are to be actually captured and keep running the
> utility for as long as its needed, like once the workload is over this utility
> can be forcefully stopped.
>
> If the logging wasn't enabled on GuC side by the Driver at boot time, utility
> will first enable the logging and later on when it is stopped (CTRL+C) it will
> also pause the logging on GuC side.
>
> Signed-off-by: Akash Goel <akash.goel@intel.com>
> ---
>   tools/Makefile.sources   |   1 +
>   tools/intel_guc_logger.c | 441 +++++++++++++++++++++++++++++++++++++++++++++++
>   2 files changed, 442 insertions(+)
>   create mode 100644 tools/intel_guc_logger.c
>
> diff --git a/tools/Makefile.sources b/tools/Makefile.sources
> index 2bb6c8e..be58871 100644
> --- a/tools/Makefile.sources
> +++ b/tools/Makefile.sources
> @@ -19,6 +19,7 @@ tools_prog_lists =		\
>   	intel_gpu_time		\
>   	intel_gpu_top		\
>   	intel_gtt		\
> +	intel_guc_logger        \
>   	intel_infoframes	\
>   	intel_l3_parity		\
>   	intel_lid		\
> diff --git a/tools/intel_guc_logger.c b/tools/intel_guc_logger.c
> new file mode 100644
> index 0000000..92172fa
> --- /dev/null
> +++ b/tools/intel_guc_logger.c
> @@ -0,0 +1,441 @@
> +
> +#define _GNU_SOURCE  /* For using O_DIRECT */
> +#include <inttypes.h>
> +#include <stdio.h>
> +#include <fcntl.h>
> +#include <sched.h>
> +#include <sys/ioctl.h>
> +#include <string.h>
> +#include <errno.h>
> +#include <signal.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/poll.h>
> +#include <sys/wait.h>
> +#include <sys/types.h>
> +#include <sys/syscall.h>
> +#include <sys/mman.h>
> +#include <assert.h>
> +#include <pthread.h>
> +
> +#include "igt.h"
> +
> +#define MB(x) ((uint64_t)(x) * 1024 * 1024)
> +#ifndef PAGE_SIZE
> +  #define PAGE_SIZE 4096
> +#endif
> +#define SUBBUF_SIZE (19*PAGE_SIZE)
> +/* Need large buffering from logger side to hide the DISK IO latency, Driver
> + * can only store 8 snapshots of GuC log buffer in relay.
> + */
> +#define NUM_SUBBUFS 100
> +
> +#define RELAY_FILE_NAME  "guc_log"
> +#define CONTROL_FILE_NAME "i915_guc_log_control"
> +
> +char *read_buffer;
> +char *out_filename;
> +int poll_timeout = 2; /* by default 2ms timeout */
> +pthread_mutex_t mutex;
> +pthread_t flush_thread;
> +int verbosity_level = 3; /* by default capture logs at max verbosity */
> +uint32_t produced, consumed;
> +uint64_t total_bytes_written;
> +int num_buffers = NUM_SUBBUFS;
> +int relay_fd, outfile_fd = -1;
> +bool stop_logging, discard_oldlogs;
> +uint32_t test_duration, max_filesize;
> +pthread_cond_t underflow_cond, overflow_cond;
> +
> +static void guc_log_control(bool enable_logging)
> +{
> +	int control_fd;
> +	char data[19];
> +	uint64_t val;
> +	int ret;
> +
> +	control_fd = igt_debugfs_open(CONTROL_FILE_NAME, O_WRONLY);
> +	if (control_fd < 0)
> +		igt_assert_f(0, "Couldn't open the guc log control file");
> +
> +	val = enable_logging ? ((verbosity_level << 4) | 0x1) : 0;
> +
> +	snprintf(data, sizeof(data), "0x%" PRIx64, val);
> +	ret = write(control_fd, data, strlen(data) + 1);

Minor: It looks safe like it is but something like below would maybe be 
more robust?

ret = snprintf(data, sizeof(data), "0x%" PRIx64, val);
igt_assert(ret > 2 && ret < sizeof(data));
ret = write(control_fd, data, ret);
igt_assert(ret > 0); // assuming short writes can't happen

Up to you.

> +	if (ret < 0)
> +		igt_assert_f(0, "Couldn't write to the log control file");
> +
> +	close(control_fd);
> +}
> +
> +static void int_sig_handler(int sig)
> +{
> +	igt_info("Received signal %d\n", sig);
> +
> +	stop_logging = true;
> +}
> +
> +static void pull_leftover_data(void)
> +{
> +	unsigned int bytes_read = 0;
> +	int ret;
> +
> +	while (1) {
> +		/* Read the logs from relay buffer */
> +		ret = read(relay_fd, read_buffer, SUBBUF_SIZE);
> +		if (!ret)
> +			break;
> +		else if (ret < 0)
> +			igt_assert_f(0, "Failed to read from the guc log file");
> +		else if (ret < SUBBUF_SIZE)
> +			igt_assert_f(0, "invalid read from relay file");
> +
> +		bytes_read += ret;
> +
> +		if (outfile_fd > 0) {

 >= 0 I think. Or is it even needed since open_output_file asserts if it 
fails to open?

> +			ret = write(outfile_fd, read_buffer, SUBBUF_SIZE);
> +			if (ret < 0)
> +				igt_assert_f(0, "Couldn't dump the logs in a file");

Do you need to check for short writes as well here? Man page for write 
says it can happen with out of disk space or signals.

> +			total_bytes_written += ret;
> +		}
> +	};
> +
> +	igt_debug("%u bytes flushed\n", bytes_read);
> +}
> +
> +static int num_filled_bufs(void)
> +{
> +	return (produced - consumed);
> +}
> +
> +static void pull_data(void)
> +{
> +	char *ptr;
> +	int ret;
> +
> +	pthread_mutex_lock(&mutex);
> +	while (num_filled_bufs() >= num_buffers) {
> +		igt_debug("overflow, will wait, produced %u, consumed %u\n", produced, consumed);
> +		/* Stall the main thread in case of overflow, as there are no
> +		 * buffers available to store the new logs, otherwise there
> +		 * could be corruption if both threads work on the same buffer.
> +		 */
> +		pthread_cond_wait(&overflow_cond, &mutex);
> +	};
> +	pthread_mutex_unlock(&mutex);
> +
> +	ptr = read_buffer + (produced % num_buffers) * SUBBUF_SIZE;
> +
> +	/* Read the logs from relay buffer */
> +	ret = read(relay_fd, ptr, SUBBUF_SIZE);
> +	if (ret < 0)
> +		igt_assert_f(0, "Failed to read from the guc log file");
> +	else if (ret && (ret < SUBBUF_SIZE))
> +		igt_assert_f(0, "invalid read from relay file");
> +
> +	pthread_mutex_lock(&mutex);
> +	if (ret) {
> +		produced++;
> +		pthread_cond_signal(&underflow_cond);
> +	}
> +	pthread_mutex_unlock(&mutex);

Very minor: can move the mutex under the if.

> +}
> +
> +static void *flusher(void *arg)
> +{
> +	char *ptr;
> +	int ret;
> +
> +	igt_debug("Execution started of flusher thread\n");
> +
> +	do {
> +		pthread_mutex_lock(&mutex);
> +		while (!num_filled_bufs()) {
> +			/* Exit only after completing the flush of all the filled
> +			 * buffers as User would expect that all logs captured up
> +			 * till the point of interruption/exit are written out to
> +			 * the disk file.
> +			 */
> +			if (stop_logging) {
> +				igt_debug("flusher to exit now\n");
> +				pthread_mutex_unlock(&mutex);
> +				return NULL;
> +			}
> +			pthread_cond_wait(&underflow_cond, &mutex);
> +		};
> +		pthread_mutex_unlock(&mutex);
> +
> +		ptr = read_buffer + (consumed % num_buffers) * SUBBUF_SIZE;
> +
> +		ret = write(outfile_fd, ptr, SUBBUF_SIZE);
> +		if (ret < 0)
> +			igt_assert_f(0, "Couldn't dump the logs in a file");

Question about short writes applies here as well.

> +
> +		total_bytes_written += ret;
> +		if (max_filesize && (total_bytes_written > MB(max_filesize))) {
> +			igt_debug("Reached the target of %" PRIu64 " bytes\n", MB(max_filesize));
> +			stop_logging = true;
> +		}
> +
> +		pthread_mutex_lock(&mutex);
> +		consumed++;
> +		pthread_cond_signal(&overflow_cond);
> +		pthread_mutex_unlock(&mutex);
> +	} while(1);

Minor again: You use both "while(1) {}" and "do { } while(1)" in here. 
Maybe stick with one? :D (I think "for (;;)" is preferred, but maybe I 
am misremembering things.)

> +
> +	return NULL;
> +}
> +
> +static void init_flusher_thread(void)
> +{
> +	struct sched_param	thread_sched;
> +	pthread_attr_t		p_attr;
> +
> +	pthread_cond_init(&underflow_cond, NULL);
> +	pthread_cond_init(&overflow_cond, NULL);
> +	pthread_mutex_init(&mutex, NULL);
> +
> +	if (pthread_attr_init(&p_attr))
> +		igt_assert_f(0, "Error obtaining default thread attributes");
> +
> +	if (pthread_attr_setinheritsched(&p_attr, PTHREAD_EXPLICIT_SCHED))
> +		igt_assert_f(0, "Couldn't set inheritsched");
> +
> +	if (pthread_attr_setschedpolicy(&p_attr, SCHED_RR))
> +		igt_assert_f(0, "Couldn't set thread scheduling policy");
> +
> +	/* Keep the flusher task also at rt priority, so that it doesn't get
> +	 * too late in flushing the collected logs in local buffers to the disk,
> +	 * and so main thread always have spare buffers to collect the logs.
> +	 */
> +	thread_sched.sched_priority = 5;
> +	if (pthread_attr_setschedparam(&p_attr, &thread_sched))
> +		igt_assert_f(0, "Couldn't set thread priority");
> +
> +	if (pthread_create(&flush_thread, &p_attr, flusher, NULL))
> +		igt_assert_f(0, "thread creation failed");

More nitpicks: You mix starting error messages with upper case and lower 
case.

> +
> +	if (pthread_attr_destroy(&p_attr))
> +		igt_assert_f(0, "Error destroying thread attributes");
> +}
> +
> +static void open_relay_file(void)
> +{
> +	relay_fd = igt_debugfs_open(RELAY_FILE_NAME, O_RDONLY);
> +	if (relay_fd < 0)
> +		igt_assert_f(0, "Couldn't open the guc log file");
> +
> +	/* Purge the old/boot-time logs from the relay buffer.
> +	 * This is more for Val team's requirement, where they have to first
> +	 * purge the existing logs before starting the tests for which the logs
> +	 * are actually needed. After this logger will enter into a loop and
> +	 * wait for the new data, at that point benchmark can be launched from
> +	 * a different shell.
> +	 */
> +	if (discard_oldlogs)
> +		pull_leftover_data();
> +}
> +
> +static void open_output_file(void)
> +{
> +	char *filename;
> +
> +	if (asprintf(&filename, "/tmp/%s", out_filename ? : "guc_log_dump.dat") < 0)
> +		igt_assert_f(0, "Couldn't allocate the filename");

I think you need to take the out_filename when specified by the user 
as-is and not stick it in /tmp. It is perfectly reasonable that someone 
would want to put it somewhere else.

Also better to move the default name of "guc_log_dump.dat" to where 
defaults like RELAY_FILE_NAME and such are defined.

And I would not default to /tmp but the current directory.

> +
> +	/* Use Direct IO mode for the output file, as the data written is not
> +	 * supposed to be accessed again, this saves a copy of data from App's
> +	 * buffer to kernel buffer (Page cache). Due to no buffering on kernel
> +	 * side, data is flushed out to disk faster and more buffering can be
> +	 * done on the logger side to hide the disk IO latency.
> +	 */
> +	outfile_fd = open(filename, O_CREAT | O_WRONLY | O_TRUNC | O_DIRECT, 0444);

0400 or 0440 would be better I think.

> +	if (outfile_fd < 0)
> +		igt_assert_f(0, "Couldn't open the output file");
> +
> +	free(out_filename);
> +	free(filename);
> +}
> +
> +static void init_main_thread(void)
> +{
> +	struct sched_param	thread_sched;
> +	/* Run the main thread at highest priority to ensure that it always
> +	 * gets woken-up at earliest on arrival of new data and so is always
> +	 * ready to pull the logs, otherwise there could be loss logs if
> +	 * GuC firmware is generating logs at a very high rate.
> +	 */
> +	thread_sched.sched_priority = 1;
> +	if (sched_setscheduler(getpid(), SCHED_FIFO, &thread_sched))
> +		igt_assert_f(0, "Couldn't set the priority");
> +
> +	if (signal(SIGINT, int_sig_handler) == SIG_ERR)
> +		igt_assert_f(0, "SIGINT handler registration failed");
> +
> +	/* Need an aligned pointer for direct IO */
> +	if (posix_memalign((void **)&read_buffer, PAGE_SIZE, num_buffers * SUBBUF_SIZE))
> +		igt_assert_f(0, "Couldn't allocate the read buffer");
> +
> +	/* Keep the pages locked in RAM, avoid page fault overhead */
> +	if (mlock(read_buffer, num_buffers * SUBBUF_SIZE))
> +		igt_assert_f(0, "Failed to lock memory");
> +
> +	/* Enable the logging, it may not have been enabled from boot and so
> +	 * the relay file also wouldn't have been created.
> +	 */
> +	guc_log_control(true);
> +
> +	open_relay_file();
> +	open_output_file();
> +}
> +
> +static int parse_options(int opt, int opt_index, void *data)
> +{
> +	igt_debug("opt %c optarg %s\n", opt, optarg);
> +
> +	switch(opt) {
> +	case 'v':
> +		verbosity_level = atoi(optarg);
> +		if (verbosity_level < 0)
> +			verbosity_level = 0;
> +		else if (verbosity_level > 3)
> +			verbosity_level = 3;

Notify user if invalid value was provided?

> +
> +		igt_debug("verbosity level to be used is %d\n", verbosity_level);
> +		break;
> +	case 'o':
> +		if (asprintf(&out_filename, "%s", optarg) < 0)

strdup?

> +			igt_assert_f(0, "Couldn't allocate the o/p filename");
> +		igt_debug("logs to be stored in file /tmp/%s\n", out_filename);
> +		break;
> +	case 'b':
> +		num_buffers = atoi(optarg);
> +		if (num_buffers == 0)
> +			num_buffers = NUM_SUBBUFS;

Again notify if invalid number/value.

> +		igt_debug("number of buffers to be used is %d\n", num_buffers);
> +		break;
> +	case 't':
> +		test_duration = atoi(optarg);
> +		if (test_duration > 0)

If switch was specified it should have a valid optarg or error reported.

> +			igt_debug("logger to run for %d seconds\n", test_duration);
> +		break;
> +	case 'p':
> +		poll_timeout = atoi(optarg);
> +		if (poll_timeout == 0)
> +			poll_timeout = 2; /* at least use 2ms poll timeout */

Hm I would prefer if you allowed no poll period. I even think it should 
be the default if we think relayfs will be definitely fixed to not 
require it.

In any case, commend is wrong since user can specify 1ms and get away 
with it. :)

> +		if (poll_timeout > 0)
> +			igt_debug("polling to be done with %d milli seconds timeout\n", poll_timeout);

milliseconds

> +		break;
> +	case 's':
> +		max_filesize = atoi(optarg);
> +		if (max_filesize > 0)

Same as for -t.

> +			igt_debug("max allowed size of the output file is %d MBs\n", max_filesize);
> +		break;
> +	case 'd':
> +		discard_oldlogs = true;
> +		igt_debug("old/boot-time logs will be discarded\n");
> +		break;
> +	}
> +
> +	return 0;
> +}
> +
> +static void process_command_line(int argc, char **argv)
> +{
> +	static struct option long_options[] = {
> +		{"verbosity", required_argument, 0, 'v'},
> +		{"outputfilename", required_argument, 0, 'o'},
> +		{"buffers", required_argument, 0, 'b'},
> +		{"testduration", required_argument, 0, 't'},
> +		{"polltimeout", required_argument, 0, 'p'},
> +		{"size", required_argument, 0, 's'},
> +		{"discard", no_argument, 0, 'd'},
> +		{ 0, 0, 0, 0 }
> +	};
> +
> +	const char *help =
> +		"  -v --verbosity=level   verbosity level of GuC logging (0-3)\n"
> +		"  -o --outfile=name      name of the output file in /tmp directory, where logs will be stored\n"
> +		"  -b --buffers=num       number of buffers to be maintained on logger side for storing logs\n"
> +		"  -t --testduration=sec  max duration in seconds for which the logger should run, 0 == forever\n"
> +		"  -p --polltimeout=ms    polling timeout in ms, -1 == indefinite wait for the new data\n"
> +		"  -s --size=MB           max size of output file in MBs after which logging will be stopped, 0 == no limit\n"
> +		"  -d --discard           discard the old/boot-time logs before entering into the capture loop\n";
> +
> +	igt_simple_init_parse_opts(&argc, argv, "v:o:b:t:p:s:d", long_options,
> +				   help, parse_options, NULL);
> +}
> +
> +int main(int argc, char **argv)
> +{
> +	struct pollfd relay_poll_fd;
> +	struct timespec start={};
> +	int nfds;
> +	int ret;
> +
> +	process_command_line(argc, argv);
> +
> +	init_main_thread();
> +
> +	/* Use a separate thread for flushing the logs to a file on disk.
> +	 * Main thread will buffer the data from relay file in its pool of
> +	 * buffers and other thread will flush the data to disk in background.
> +	 * This is needed, albeit by default data is written out to disk in
> +	 * async mode, as when there are too many dirty pages in the RAM,
> +	 * (/proc/sys/vm/dirty_ratio), kernel starts blocking the processes
> +	 * doing the file writes.
> +	 */
> +	init_flusher_thread();
> +
> +	relay_poll_fd.fd = relay_fd;
> +	relay_poll_fd.events = POLLIN;
> +	relay_poll_fd.revents = 0;
> +
> +	nfds = 1; /* only one fd to poll */
> +
> +	while (!stop_logging)
> +	{
> +		if (test_duration && (igt_seconds_elapsed(&start) > test_duration)) {

If you agree to allow no poll period the this would not work right? In 
that case you would need to use alarm(2) or something.

> +			igt_debug("Ran for stipulated %d seconds, exit now\n", test_duration);
> +			stop_logging = true;
> +			break;
> +		}
> +
> +		/* Wait/poll for the new data to be available, relay doesn't
> +		 * provide a blocking read.
> +		 * Need to do polling with a timeout instead of indefinite wait,
> +		 * to avoid relying on relay for the wakeup, as relay does wakeup
> +		 * in a deferred manner on jiffies granularity by scheduling a
> +		 * timer and moreover that timer is re-scheduled on every newly
> +		 * produced buffer so timer keeps getting pushed out if there
> +		 * are multiple flush interrupts in a very quick succession (less
> +		 * than a jiffy gap between 2 flush interrupts) and relay runs
> +		 * out of sub buffers to store the new logs.
> +		 */
> +		ret = poll(&relay_poll_fd, nfds, poll_timeout);
> +		if (ret < 0) {
> +			if (errno == EINTR)
> +				break;
> +			else
> +				igt_assert_f(0, "poll call failed");
> +		}
> +
> +		/* No data available yet, poll again, hopefully new data is round the corner */
> +		if (!relay_poll_fd.revents)
> +			continue;
> +
> +		pull_data();
> +	}
> +
> +	/* Pause logging on the GuC side*/
> +	guc_log_control(false);
> +	pthread_cond_signal(&underflow_cond);
> +	pthread_join(flush_thread, NULL);
> +	pull_leftover_data();
> +	igt_info("total_bytes_written %" PRIu64 "\n", total_bytes_written);
> +
> +	free(read_buffer);
> +	close(relay_fd);
> +	close(outfile_fd);
> +	igt_exit();
> +}
>

So only minor comments. The rest looks reasonable to me and I trust you 
tested it well and it works fine.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] tools/intel_guc_logger: Utility for capturing GuC firmware logs in a file
  2016-09-06 13:17 ` Tvrtko Ursulin
@ 2016-09-06 15:33   ` Goel, Akash
  2016-09-06 15:52     ` Tvrtko Ursulin
  0 siblings, 1 reply; 17+ messages in thread
From: Goel, Akash @ 2016-09-06 15:33 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx; +Cc: akash.goel



On 9/6/2016 6:47 PM, Tvrtko Ursulin wrote:
>
> Hi,
>
> On 06/09/16 11:43, akash.goel@intel.com wrote:
>> From: Akash Goel <akash.goel@intel.com>
>>
>> This patch provides a test utility which helps capture GuC firmware
>> logs and
>> then dump them to file.
>> The logs are pulled from a debugfs file
>> '/sys/kernel/debug/dri/guc_log' and
>> stored into a file '/tmp/guc_log_dump.dat', the name of the output
>> file can
>> be changed through a command line argument.
>>
>> The utility goes into an infinite loop where it waits for the arrival
>> of new
>> logs and as soon as new set of logs are produced it captures them in
>> its local
>> buffer which is then flushed out to the file on disk.
>> Any time when logging needs to be ended, User can stop this utility
>> (CTRL+C).
>>
>> Before entering into a loop, it first discards whatever logs are
>> present in
>> the debugfs file.
>> This way User can first launch this utility and then start a
>> workload/activity
>> for which GuC firmware logs are to be actually captured and keep
>> running the
>> utility for as long as its needed, like once the workload is over this
>> utility
>> can be forcefully stopped.
>>
>> If the logging wasn't enabled on GuC side by the Driver at boot time,
>> utility
>> will first enable the logging and later on when it is stopped (CTRL+C)
>> it will
>> also pause the logging on GuC side.
>>
>> Signed-off-by: Akash Goel <akash.goel@intel.com>
>> ---
>>   tools/Makefile.sources   |   1 +
>>   tools/intel_guc_logger.c | 441
>> +++++++++++++++++++++++++++++++++++++++++++++++
>>   2 files changed, 442 insertions(+)
>>   create mode 100644 tools/intel_guc_logger.c
>>
>> diff --git a/tools/Makefile.sources b/tools/Makefile.sources
>> index 2bb6c8e..be58871 100644
>> --- a/tools/Makefile.sources
>> +++ b/tools/Makefile.sources
>> @@ -19,6 +19,7 @@ tools_prog_lists =        \
>>       intel_gpu_time        \
>>       intel_gpu_top        \
>>       intel_gtt        \
>> +    intel_guc_logger        \
>>       intel_infoframes    \
>>       intel_l3_parity        \
>>       intel_lid        \
>> diff --git a/tools/intel_guc_logger.c b/tools/intel_guc_logger.c
>> new file mode 100644
>> index 0000000..92172fa
>> --- /dev/null
>> +++ b/tools/intel_guc_logger.c
>> @@ -0,0 +1,441 @@
>> +
>> +#define _GNU_SOURCE  /* For using O_DIRECT */
>> +#include <inttypes.h>
>> +#include <stdio.h>
>> +#include <fcntl.h>
>> +#include <sched.h>
>> +#include <sys/ioctl.h>
>> +#include <string.h>
>> +#include <errno.h>
>> +#include <signal.h>
>> +#include <stdlib.h>
>> +#include <unistd.h>
>> +#include <sys/poll.h>
>> +#include <sys/wait.h>
>> +#include <sys/types.h>
>> +#include <sys/syscall.h>
>> +#include <sys/mman.h>
>> +#include <assert.h>
>> +#include <pthread.h>
>> +
>> +#include "igt.h"
>> +
>> +#define MB(x) ((uint64_t)(x) * 1024 * 1024)
>> +#ifndef PAGE_SIZE
>> +  #define PAGE_SIZE 4096
>> +#endif
>> +#define SUBBUF_SIZE (19*PAGE_SIZE)
>> +/* Need large buffering from logger side to hide the DISK IO latency,
>> Driver
>> + * can only store 8 snapshots of GuC log buffer in relay.
>> + */
>> +#define NUM_SUBBUFS 100
>> +
>> +#define RELAY_FILE_NAME  "guc_log"
>> +#define CONTROL_FILE_NAME "i915_guc_log_control"
>> +
>> +char *read_buffer;
>> +char *out_filename;
>> +int poll_timeout = 2; /* by default 2ms timeout */
>> +pthread_mutex_t mutex;
>> +pthread_t flush_thread;
>> +int verbosity_level = 3; /* by default capture logs at max verbosity */
>> +uint32_t produced, consumed;
>> +uint64_t total_bytes_written;
>> +int num_buffers = NUM_SUBBUFS;
>> +int relay_fd, outfile_fd = -1;
>> +bool stop_logging, discard_oldlogs;
>> +uint32_t test_duration, max_filesize;
>> +pthread_cond_t underflow_cond, overflow_cond;
>> +
>> +static void guc_log_control(bool enable_logging)
>> +{
>> +    int control_fd;
>> +    char data[19];
>> +    uint64_t val;
>> +    int ret;
>> +
>> +    control_fd = igt_debugfs_open(CONTROL_FILE_NAME, O_WRONLY);
>> +    if (control_fd < 0)
>> +        igt_assert_f(0, "Couldn't open the guc log control file");
>> +
>> +    val = enable_logging ? ((verbosity_level << 4) | 0x1) : 0;
>> +
>> +    snprintf(data, sizeof(data), "0x%" PRIx64, val);
>> +    ret = write(control_fd, data, strlen(data) + 1);
>
> Minor: It looks safe like it is but something like below would maybe be
> more robust?
>
> ret = snprintf(data, sizeof(data), "0x%" PRIx64, val);
> igt_assert(ret > 2 && ret < sizeof(data));

ok will add, but possibility of failure will be really remote here.
but igt_assert(ret > 0) should suffice.

> ret = write(control_fd, data, ret);
> igt_assert(ret > 0); // assuming short writes can't happen
>
> Up to you.
>
>> +    if (ret < 0)
>> +        igt_assert_f(0, "Couldn't write to the log control file");
>> +
>> +    close(control_fd);
>> +}
>> +
>> +static void int_sig_handler(int sig)
>> +{
>> +    igt_info("Received signal %d\n", sig);
>> +
>> +    stop_logging = true;
>> +}
>> +
>> +static void pull_leftover_data(void)
>> +{
>> +    unsigned int bytes_read = 0;
>> +    int ret;
>> +
>> +    while (1) {
>> +        /* Read the logs from relay buffer */
>> +        ret = read(relay_fd, read_buffer, SUBBUF_SIZE);
>> +        if (!ret)
>> +            break;
>> +        else if (ret < 0)
>> +            igt_assert_f(0, "Failed to read from the guc log file");
>> +        else if (ret < SUBBUF_SIZE)
>> +            igt_assert_f(0, "invalid read from relay file");
>> +
>> +        bytes_read += ret;
>> +
>> +        if (outfile_fd > 0) {
>
>>= 0 I think. Or is it even needed since open_output_file asserts if it
> fails to open?
>
Actually pull_leftover_data() will be called twice, once before opening
the outfile. This is for purging the old logs, if enabled.

>> +            ret = write(outfile_fd, read_buffer, SUBBUF_SIZE);
>> +            if (ret < 0)
>> +                igt_assert_f(0, "Couldn't dump the logs in a file");
>
> Do you need to check for short writes as well here? Man page for write
> says it can happen with out of disk space or signals.
>
To keep things simple can just check 'ret < SUBBUF_SIZE' considering
out of disk space as a serious failure or add a loop here which will run 
until SUBBUF_SIZE bytes have been written.

>> +            total_bytes_written += ret;
>> +        }
>> +    };
>> +
>> +    igt_debug("%u bytes flushed\n", bytes_read);
>> +}
>> +
>> +static int num_filled_bufs(void)
>> +{
>> +    return (produced - consumed);
>> +}
>> +
>> +static void pull_data(void)
>> +{
>> +    char *ptr;
>> +    int ret;
>> +
>> +    pthread_mutex_lock(&mutex);
>> +    while (num_filled_bufs() >= num_buffers) {
>> +        igt_debug("overflow, will wait, produced %u, consumed %u\n",
>> produced, consumed);
>> +        /* Stall the main thread in case of overflow, as there are no
>> +         * buffers available to store the new logs, otherwise there
>> +         * could be corruption if both threads work on the same buffer.
>> +         */
>> +        pthread_cond_wait(&overflow_cond, &mutex);
>> +    };
>> +    pthread_mutex_unlock(&mutex);
>> +
>> +    ptr = read_buffer + (produced % num_buffers) * SUBBUF_SIZE;
>> +
>> +    /* Read the logs from relay buffer */
>> +    ret = read(relay_fd, ptr, SUBBUF_SIZE);
>> +    if (ret < 0)
>> +        igt_assert_f(0, "Failed to read from the guc log file");
>> +    else if (ret && (ret < SUBBUF_SIZE))
>> +        igt_assert_f(0, "invalid read from relay file");
>> +
>> +    pthread_mutex_lock(&mutex);
>> +    if (ret) {
>> +        produced++;
>> +        pthread_cond_signal(&underflow_cond);
>> +    }
>> +    pthread_mutex_unlock(&mutex);
>
> Very minor: can move the mutex under the if.
>
Fine will move under the 'if'.

>> +}
>> +
>> +static void *flusher(void *arg)
>> +{
>> +    char *ptr;
>> +    int ret;
>> +
>> +    igt_debug("Execution started of flusher thread\n");
>> +
>> +    do {
>> +        pthread_mutex_lock(&mutex);
>> +        while (!num_filled_bufs()) {
>> +            /* Exit only after completing the flush of all the filled
>> +             * buffers as User would expect that all logs captured up
>> +             * till the point of interruption/exit are written out to
>> +             * the disk file.
>> +             */
>> +            if (stop_logging) {
>> +                igt_debug("flusher to exit now\n");
>> +                pthread_mutex_unlock(&mutex);
>> +                return NULL;
>> +            }
>> +            pthread_cond_wait(&underflow_cond, &mutex);
>> +        };
>> +        pthread_mutex_unlock(&mutex);
>> +
>> +        ptr = read_buffer + (consumed % num_buffers) * SUBBUF_SIZE;
>> +
>> +        ret = write(outfile_fd, ptr, SUBBUF_SIZE);
>> +        if (ret < 0)
>> +            igt_assert_f(0, "Couldn't dump the logs in a file");
>
> Question about short writes applies here as well.
>
Fine will do the same here whatever is decided for above.

>> +
>> +        total_bytes_written += ret;
>> +        if (max_filesize && (total_bytes_written > MB(max_filesize))) {
>> +            igt_debug("Reached the target of %" PRIu64 " bytes\n",
>> MB(max_filesize));
>> +            stop_logging = true;
>> +        }
>> +
>> +        pthread_mutex_lock(&mutex);
>> +        consumed++;
>> +        pthread_cond_signal(&overflow_cond);
>> +        pthread_mutex_unlock(&mutex);
>> +    } while(1);
>
> Minor again: You use both "while(1) {}" and "do { } while(1)" in here.
> Maybe stick with one? :D (I think "for (;;)" is preferred, but maybe I
> am misremembering things.)
>
Sorry for the inconsistency, will clean this up.

>> +
>> +    return NULL;
>> +}
>> +
>> +static void init_flusher_thread(void)
>> +{
>> +    struct sched_param    thread_sched;
>> +    pthread_attr_t        p_attr;
>> +
>> +    pthread_cond_init(&underflow_cond, NULL);
>> +    pthread_cond_init(&overflow_cond, NULL);
>> +    pthread_mutex_init(&mutex, NULL);
>> +
>> +    if (pthread_attr_init(&p_attr))
>> +        igt_assert_f(0, "Error obtaining default thread attributes");
>> +
>> +    if (pthread_attr_setinheritsched(&p_attr, PTHREAD_EXPLICIT_SCHED))
>> +        igt_assert_f(0, "Couldn't set inheritsched");
>> +
>> +    if (pthread_attr_setschedpolicy(&p_attr, SCHED_RR))
>> +        igt_assert_f(0, "Couldn't set thread scheduling policy");
>> +
>> +    /* Keep the flusher task also at rt priority, so that it doesn't get
>> +     * too late in flushing the collected logs in local buffers to
>> the disk,
>> +     * and so main thread always have spare buffers to collect the logs.
>> +     */
>> +    thread_sched.sched_priority = 5;
>> +    if (pthread_attr_setschedparam(&p_attr, &thread_sched))
>> +        igt_assert_f(0, "Couldn't set thread priority");
>> +
>> +    if (pthread_create(&flush_thread, &p_attr, flusher, NULL))
>> +        igt_assert_f(0, "thread creation failed");
>
> More nitpicks: You mix starting error messages with upper case and lower
> case.

Will use the lower case for all error messages. fine ?
>
>> +
>> +    if (pthread_attr_destroy(&p_attr))
>> +        igt_assert_f(0, "Error destroying thread attributes");
>> +}
>> +
>> +static void open_relay_file(void)
>> +{
>> +    relay_fd = igt_debugfs_open(RELAY_FILE_NAME, O_RDONLY);
>> +    if (relay_fd < 0)
>> +        igt_assert_f(0, "Couldn't open the guc log file");
>> +
>> +    /* Purge the old/boot-time logs from the relay buffer.
>> +     * This is more for Val team's requirement, where they have to first
>> +     * purge the existing logs before starting the tests for which
>> the logs
>> +     * are actually needed. After this logger will enter into a loop and
>> +     * wait for the new data, at that point benchmark can be launched
>> from
>> +     * a different shell.
>> +     */
>> +    if (discard_oldlogs)
>> +        pull_leftover_data();
>> +}
>> +
>> +static void open_output_file(void)
>> +{
>> +    char *filename;
>> +
>> +    if (asprintf(&filename, "/tmp/%s", out_filename ? :
>> "guc_log_dump.dat") < 0)
>> +        igt_assert_f(0, "Couldn't allocate the filename");
>
> I think you need to take the out_filename when specified by the user
> as-is and not stick it in /tmp. It is perfectly reasonable that someone
> would want to put it somewhere else.
>
Ok so User should provide the full path.

> Also better to move the default name of "guc_log_dump.dat" to where
> defaults like RELAY_FILE_NAME and such are defined.
>
fine.

> And I would not default to /tmp but the current directory.
>
Chris had suggested to use /tmp. Fine will use the current directory.

>> +
>> +    /* Use Direct IO mode for the output file, as the data written is
>> not
>> +     * supposed to be accessed again, this saves a copy of data from
>> App's
>> +     * buffer to kernel buffer (Page cache). Due to no buffering on
>> kernel
>> +     * side, data is flushed out to disk faster and more buffering
>> can be
>> +     * done on the logger side to hide the disk IO latency.
>> +     */
>> +    outfile_fd = open(filename, O_CREAT | O_WRONLY | O_TRUNC |
>> O_DIRECT, 0444);
>
> 0400 or 0440 would be better I think.

fine, this might make the file inaccessible to Root User in case the 
logger wasn't running as a Root ?

>
>> +    if (outfile_fd < 0)
>> +        igt_assert_f(0, "Couldn't open the output file");
>> +
>> +    free(out_filename);
>> +    free(filename);
>> +}
>> +
>> +static void init_main_thread(void)
>> +{
>> +    struct sched_param    thread_sched;
>> +    /* Run the main thread at highest priority to ensure that it always
>> +     * gets woken-up at earliest on arrival of new data and so is always
>> +     * ready to pull the logs, otherwise there could be loss logs if
>> +     * GuC firmware is generating logs at a very high rate.
>> +     */
>> +    thread_sched.sched_priority = 1;
>> +    if (sched_setscheduler(getpid(), SCHED_FIFO, &thread_sched))
>> +        igt_assert_f(0, "Couldn't set the priority");
>> +
>> +    if (signal(SIGINT, int_sig_handler) == SIG_ERR)
>> +        igt_assert_f(0, "SIGINT handler registration failed");
>> +
>> +    /* Need an aligned pointer for direct IO */
>> +    if (posix_memalign((void **)&read_buffer, PAGE_SIZE, num_buffers
>> * SUBBUF_SIZE))
>> +        igt_assert_f(0, "Couldn't allocate the read buffer");
>> +
>> +    /* Keep the pages locked in RAM, avoid page fault overhead */
>> +    if (mlock(read_buffer, num_buffers * SUBBUF_SIZE))
>> +        igt_assert_f(0, "Failed to lock memory");
>> +
>> +    /* Enable the logging, it may not have been enabled from boot and so
>> +     * the relay file also wouldn't have been created.
>> +     */
>> +    guc_log_control(true);
>> +
>> +    open_relay_file();
>> +    open_output_file();
>> +}
>> +
>> +static int parse_options(int opt, int opt_index, void *data)
>> +{
>> +    igt_debug("opt %c optarg %s\n", opt, optarg);
>> +
>> +    switch(opt) {
>> +    case 'v':
>> +        verbosity_level = atoi(optarg);
>> +        if (verbosity_level < 0)
>> +            verbosity_level = 0;
>> +        else if (verbosity_level > 3)
>> +            verbosity_level = 3;
>
> Notify user if invalid value was provided?
>
Just notify but don't abort and continue by using a valid value ?

>> +
>> +        igt_debug("verbosity level to be used is %d\n",
>> verbosity_level);
>> +        break;
>> +    case 'o':
>> +        if (asprintf(&out_filename, "%s", optarg) < 0)
>
> strdup?
ok will try to use strdup.
>
>> +            igt_assert_f(0, "Couldn't allocate the o/p filename");
>> +        igt_debug("logs to be stored in file /tmp/%s\n", out_filename);
>> +        break;
>> +    case 'b':
>> +        num_buffers = atoi(optarg);
>> +        if (num_buffers == 0)
>> +            num_buffers = NUM_SUBBUFS;
>
> Again notify if invalid number/value.
>
same doubt as above.

>> +        igt_debug("number of buffers to be used is %d\n", num_buffers);
>> +        break;
>> +    case 't':
>> +        test_duration = atoi(optarg);
>> +        if (test_duration > 0)
>
> If switch was specified it should have a valid optarg or error reported.
>
>> +            igt_debug("logger to run for %d seconds\n", test_duration);
>> +        break;
>> +    case 'p':
>> +        poll_timeout = atoi(optarg);
>> +        if (poll_timeout == 0)
>> +            poll_timeout = 2; /* at least use 2ms poll timeout */
>
> Hm I would prefer if you allowed no poll period.

by no poll period, do you mean indefinite wait ?

That can be done by specifying -1 as a timeout value in the command line,

"-p --polltimeout=ms polling timeout in ms, -1 == indefinite wait for
the new data\n"

> I even think it should be the default if we think relayfs will be definitely fixed to not
> require it.
>
Can keep the indefinite wait as a default.

> In any case, commend is wrong since user can specify 1ms and get away
> with it. :)
>
>> +        if (poll_timeout > 0)
>> +            igt_debug("polling to be done with %d milli seconds
>> timeout\n", poll_timeout);
>
> milliseconds
>
>> +        break;
>> +    case 's':
>> +        max_filesize = atoi(optarg);
>> +        if (max_filesize > 0)
>
> Same as for -t.
>
>> +            igt_debug("max allowed size of the output file is %d
>> MBs\n", max_filesize);
>> +        break;
>> +    case 'd':
>> +        discard_oldlogs = true;
>> +        igt_debug("old/boot-time logs will be discarded\n");
>> +        break;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static void process_command_line(int argc, char **argv)
>> +{
>> +    static struct option long_options[] = {
>> +        {"verbosity", required_argument, 0, 'v'},
>> +        {"outputfilename", required_argument, 0, 'o'},
>> +        {"buffers", required_argument, 0, 'b'},
>> +        {"testduration", required_argument, 0, 't'},
>> +        {"polltimeout", required_argument, 0, 'p'},
>> +        {"size", required_argument, 0, 's'},
>> +        {"discard", no_argument, 0, 'd'},
>> +        { 0, 0, 0, 0 }
>> +    };
>> +
>> +    const char *help =
>> +        "  -v --verbosity=level   verbosity level of GuC logging
>> (0-3)\n"
>> +        "  -o --outfile=name      name of the output file in /tmp
>> directory, where logs will be stored\n"
>> +        "  -b --buffers=num       number of buffers to be maintained
>> on logger side for storing logs\n"
>> +        "  -t --testduration=sec  max duration in seconds for which
>> the logger should run, 0 == forever\n"
>> +        "  -p --polltimeout=ms    polling timeout in ms, -1 ==
>> indefinite wait for the new data\n"
>> +        "  -s --size=MB           max size of output file in MBs
>> after which logging will be stopped, 0 == no limit\n"
>> +        "  -d --discard           discard the old/boot-time logs
>> before entering into the capture loop\n";
>> +
>> +    igt_simple_init_parse_opts(&argc, argv, "v:o:b:t:p:s:d",
>> long_options,
>> +                   help, parse_options, NULL);
>> +}
>> +
>> +int main(int argc, char **argv)
>> +{
>> +    struct pollfd relay_poll_fd;
>> +    struct timespec start={};
>> +    int nfds;
>> +    int ret;
>> +
>> +    process_command_line(argc, argv);
>> +
>> +    init_main_thread();
>> +
>> +    /* Use a separate thread for flushing the logs to a file on disk.
>> +     * Main thread will buffer the data from relay file in its pool of
>> +     * buffers and other thread will flush the data to disk in
>> background.
>> +     * This is needed, albeit by default data is written out to disk in
>> +     * async mode, as when there are too many dirty pages in the RAM,
>> +     * (/proc/sys/vm/dirty_ratio), kernel starts blocking the processes
>> +     * doing the file writes.
>> +     */
>> +    init_flusher_thread();
>> +
>> +    relay_poll_fd.fd = relay_fd;
>> +    relay_poll_fd.events = POLLIN;
>> +    relay_poll_fd.revents = 0;
>> +
>> +    nfds = 1; /* only one fd to poll */
>> +
>> +    while (!stop_logging)
>> +    {
>> +        if (test_duration && (igt_seconds_elapsed(&start) >
>> test_duration)) {
>
> If you agree to allow no poll period the this would not work right? In
> that case you would need to use alarm(2) or something.
>

Can calculate the timeout value for poll call as,
	if (poll_timeout < 0) {
		timeout = test_duration - igt_seconds_elapsed(&start))
	}


>> +            igt_debug("Ran for stipulated %d seconds, exit now\n",
>> test_duration);
>> +            stop_logging = true;
>> +            break;
>> +        }
>> +
>> +        /* Wait/poll for the new data to be available, relay doesn't
>> +         * provide a blocking read.
>> +         * Need to do polling with a timeout instead of indefinite wait,
>> +         * to avoid relying on relay for the wakeup, as relay does
>> wakeup
>> +         * in a deferred manner on jiffies granularity by scheduling a
>> +         * timer and moreover that timer is re-scheduled on every newly
>> +         * produced buffer so timer keeps getting pushed out if there
>> +         * are multiple flush interrupts in a very quick succession
>> (less
>> +         * than a jiffy gap between 2 flush interrupts) and relay runs
>> +         * out of sub buffers to store the new logs.
>> +         */
>> +        ret = poll(&relay_poll_fd, nfds, poll_timeout);
>> +        if (ret < 0) {
>> +            if (errno == EINTR)
>> +                break;
>> +            else
>> +                igt_assert_f(0, "poll call failed");
>> +        }
>> +
>> +        /* No data available yet, poll again, hopefully new data is
>> round the corner */
>> +        if (!relay_poll_fd.revents)
>> +            continue;
>> +
>> +        pull_data();
>> +    }
>> +
>> +    /* Pause logging on the GuC side*/
>> +    guc_log_control(false);
>> +    pthread_cond_signal(&underflow_cond);
>> +    pthread_join(flush_thread, NULL);
>> +    pull_leftover_data();
>> +    igt_info("total_bytes_written %" PRIu64 "\n", total_bytes_written);
>> +
>> +    free(read_buffer);
>> +    close(relay_fd);
>> +    close(outfile_fd);
>> +    igt_exit();
>> +}
>>
>
> So only minor comments. The rest looks reasonable to me and I trust you
> tested it well and it works fine.
>
Thanks much for the thorough review.

Best regards
Akash
> Regards,
>
> Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] tools/intel_guc_logger: Utility for capturing GuC firmware logs in a file
  2016-09-06 15:33   ` Goel, Akash
@ 2016-09-06 15:52     ` Tvrtko Ursulin
  2016-09-07  8:10       ` Goel, Akash
  0 siblings, 1 reply; 17+ messages in thread
From: Tvrtko Ursulin @ 2016-09-06 15:52 UTC (permalink / raw)
  To: Goel, Akash, intel-gfx


On 06/09/16 16:33, Goel, Akash wrote:
> On 9/6/2016 6:47 PM, Tvrtko Ursulin wrote:
>> Hi,
>>
>> On 06/09/16 11:43, akash.goel@intel.com wrote:
>>> From: Akash Goel <akash.goel@intel.com>
>>>
>>> This patch provides a test utility which helps capture GuC firmware
>>> logs and
>>> then dump them to file.
>>> The logs are pulled from a debugfs file
>>> '/sys/kernel/debug/dri/guc_log' and
>>> stored into a file '/tmp/guc_log_dump.dat', the name of the output
>>> file can
>>> be changed through a command line argument.
>>>
>>> The utility goes into an infinite loop where it waits for the arrival
>>> of new
>>> logs and as soon as new set of logs are produced it captures them in
>>> its local
>>> buffer which is then flushed out to the file on disk.
>>> Any time when logging needs to be ended, User can stop this utility
>>> (CTRL+C).
>>>
>>> Before entering into a loop, it first discards whatever logs are
>>> present in
>>> the debugfs file.
>>> This way User can first launch this utility and then start a
>>> workload/activity
>>> for which GuC firmware logs are to be actually captured and keep
>>> running the
>>> utility for as long as its needed, like once the workload is over this
>>> utility
>>> can be forcefully stopped.
>>>
>>> If the logging wasn't enabled on GuC side by the Driver at boot time,
>>> utility
>>> will first enable the logging and later on when it is stopped (CTRL+C)
>>> it will
>>> also pause the logging on GuC side.
>>>
>>> Signed-off-by: Akash Goel <akash.goel@intel.com>
>>> ---
>>>   tools/Makefile.sources   |   1 +
>>>   tools/intel_guc_logger.c | 441
>>> +++++++++++++++++++++++++++++++++++++++++++++++
>>>   2 files changed, 442 insertions(+)
>>>   create mode 100644 tools/intel_guc_logger.c
>>>
>>> diff --git a/tools/Makefile.sources b/tools/Makefile.sources
>>> index 2bb6c8e..be58871 100644
>>> --- a/tools/Makefile.sources
>>> +++ b/tools/Makefile.sources
>>> @@ -19,6 +19,7 @@ tools_prog_lists =        \
>>>       intel_gpu_time        \
>>>       intel_gpu_top        \
>>>       intel_gtt        \
>>> +    intel_guc_logger        \
>>>       intel_infoframes    \
>>>       intel_l3_parity        \
>>>       intel_lid        \
>>> diff --git a/tools/intel_guc_logger.c b/tools/intel_guc_logger.c
>>> new file mode 100644
>>> index 0000000..92172fa
>>> --- /dev/null
>>> +++ b/tools/intel_guc_logger.c
>>> @@ -0,0 +1,441 @@
>>> +
>>> +#define _GNU_SOURCE  /* For using O_DIRECT */
>>> +#include <inttypes.h>
>>> +#include <stdio.h>
>>> +#include <fcntl.h>
>>> +#include <sched.h>
>>> +#include <sys/ioctl.h>
>>> +#include <string.h>
>>> +#include <errno.h>
>>> +#include <signal.h>
>>> +#include <stdlib.h>
>>> +#include <unistd.h>
>>> +#include <sys/poll.h>
>>> +#include <sys/wait.h>
>>> +#include <sys/types.h>
>>> +#include <sys/syscall.h>
>>> +#include <sys/mman.h>
>>> +#include <assert.h>
>>> +#include <pthread.h>
>>> +
>>> +#include "igt.h"
>>> +
>>> +#define MB(x) ((uint64_t)(x) * 1024 * 1024)
>>> +#ifndef PAGE_SIZE
>>> +  #define PAGE_SIZE 4096
>>> +#endif
>>> +#define SUBBUF_SIZE (19*PAGE_SIZE)
>>> +/* Need large buffering from logger side to hide the DISK IO latency,
>>> Driver
>>> + * can only store 8 snapshots of GuC log buffer in relay.
>>> + */
>>> +#define NUM_SUBBUFS 100
>>> +
>>> +#define RELAY_FILE_NAME  "guc_log"
>>> +#define CONTROL_FILE_NAME "i915_guc_log_control"
>>> +
>>> +char *read_buffer;
>>> +char *out_filename;
>>> +int poll_timeout = 2; /* by default 2ms timeout */
>>> +pthread_mutex_t mutex;
>>> +pthread_t flush_thread;
>>> +int verbosity_level = 3; /* by default capture logs at max verbosity */
>>> +uint32_t produced, consumed;
>>> +uint64_t total_bytes_written;
>>> +int num_buffers = NUM_SUBBUFS;
>>> +int relay_fd, outfile_fd = -1;
>>> +bool stop_logging, discard_oldlogs;
>>> +uint32_t test_duration, max_filesize;
>>> +pthread_cond_t underflow_cond, overflow_cond;
>>> +
>>> +static void guc_log_control(bool enable_logging)
>>> +{
>>> +    int control_fd;
>>> +    char data[19];
>>> +    uint64_t val;
>>> +    int ret;
>>> +
>>> +    control_fd = igt_debugfs_open(CONTROL_FILE_NAME, O_WRONLY);
>>> +    if (control_fd < 0)
>>> +        igt_assert_f(0, "Couldn't open the guc log control file");
>>> +
>>> +    val = enable_logging ? ((verbosity_level << 4) | 0x1) : 0;
>>> +
>>> +    snprintf(data, sizeof(data), "0x%" PRIx64, val);
>>> +    ret = write(control_fd, data, strlen(data) + 1);
>>
>> Minor: It looks safe like it is but something like below would maybe be
>> more robust?
>>
>> ret = snprintf(data, sizeof(data), "0x%" PRIx64, val);
>> igt_assert(ret > 2 && ret < sizeof(data));
>
> ok will add, but possibility of failure will be really remote here.
> but igt_assert(ret > 0) should suffice.

Yes there is no possibility for failure as it stands, just more robust 
implementation should someone change something in the future. That's why 
I said you could also decide to keep it as is. My version also avoided 
the strlen since snprintf already tells you that.

>> ret = write(control_fd, data, ret);
>> igt_assert(ret > 0); // assuming short writes can't happen
>>
>> Up to you.
>>
>>> +    if (ret < 0)
>>> +        igt_assert_f(0, "Couldn't write to the log control file");
>>> +
>>> +    close(control_fd);
>>> +}
>>> +
>>> +static void int_sig_handler(int sig)
>>> +{
>>> +    igt_info("Received signal %d\n", sig);
>>> +
>>> +    stop_logging = true;
>>> +}
>>> +
>>> +static void pull_leftover_data(void)
>>> +{
>>> +    unsigned int bytes_read = 0;
>>> +    int ret;
>>> +
>>> +    while (1) {
>>> +        /* Read the logs from relay buffer */
>>> +        ret = read(relay_fd, read_buffer, SUBBUF_SIZE);
>>> +        if (!ret)
>>> +            break;
>>> +        else if (ret < 0)
>>> +            igt_assert_f(0, "Failed to read from the guc log file");
>>> +        else if (ret < SUBBUF_SIZE)
>>> +            igt_assert_f(0, "invalid read from relay file");
>>> +
>>> +        bytes_read += ret;
>>> +
>>> +        if (outfile_fd > 0) {
>>
>>> = 0 I think. Or is it even needed since open_output_file asserts if it
>> fails to open?
>>
> Actually pull_leftover_data() will be called twice, once before opening
> the outfile. This is for purging the old logs, if enabled.

Oh right, OK then.

>
>>> +            ret = write(outfile_fd, read_buffer, SUBBUF_SIZE);
>>> +            if (ret < 0)
>>> +                igt_assert_f(0, "Couldn't dump the logs in a file");
>>
>> Do you need to check for short writes as well here? Man page for write
>> says it can happen with out of disk space or signals.
>>
> To keep things simple can just check 'ret < SUBBUF_SIZE' considering
> out of disk space as a serious failure or add a loop here which will run
> until SUBBUF_SIZE bytes have been written.

Just assert, no point in complicating things.

>>> +            total_bytes_written += ret;
>>> +        }
>>> +    };
>>> +
>>> +    igt_debug("%u bytes flushed\n", bytes_read);
>>> +}
>>> +
>>> +static int num_filled_bufs(void)
>>> +{
>>> +    return (produced - consumed);
>>> +}
>>> +
>>> +static void pull_data(void)
>>> +{
>>> +    char *ptr;
>>> +    int ret;
>>> +
>>> +    pthread_mutex_lock(&mutex);
>>> +    while (num_filled_bufs() >= num_buffers) {
>>> +        igt_debug("overflow, will wait, produced %u, consumed %u\n",
>>> produced, consumed);
>>> +        /* Stall the main thread in case of overflow, as there are no
>>> +         * buffers available to store the new logs, otherwise there
>>> +         * could be corruption if both threads work on the same buffer.
>>> +         */
>>> +        pthread_cond_wait(&overflow_cond, &mutex);
>>> +    };
>>> +    pthread_mutex_unlock(&mutex);
>>> +
>>> +    ptr = read_buffer + (produced % num_buffers) * SUBBUF_SIZE;
>>> +
>>> +    /* Read the logs from relay buffer */
>>> +    ret = read(relay_fd, ptr, SUBBUF_SIZE);
>>> +    if (ret < 0)
>>> +        igt_assert_f(0, "Failed to read from the guc log file");
>>> +    else if (ret && (ret < SUBBUF_SIZE))
>>> +        igt_assert_f(0, "invalid read from relay file");
>>> +
>>> +    pthread_mutex_lock(&mutex);
>>> +    if (ret) {
>>> +        produced++;
>>> +        pthread_cond_signal(&underflow_cond);
>>> +    }
>>> +    pthread_mutex_unlock(&mutex);
>>
>> Very minor: can move the mutex under the if.
>>
> Fine will move under the 'if'.
>
>>> +}
>>> +
>>> +static void *flusher(void *arg)
>>> +{
>>> +    char *ptr;
>>> +    int ret;
>>> +
>>> +    igt_debug("Execution started of flusher thread\n");
>>> +
>>> +    do {
>>> +        pthread_mutex_lock(&mutex);
>>> +        while (!num_filled_bufs()) {
>>> +            /* Exit only after completing the flush of all the filled
>>> +             * buffers as User would expect that all logs captured up
>>> +             * till the point of interruption/exit are written out to
>>> +             * the disk file.
>>> +             */
>>> +            if (stop_logging) {
>>> +                igt_debug("flusher to exit now\n");
>>> +                pthread_mutex_unlock(&mutex);
>>> +                return NULL;
>>> +            }
>>> +            pthread_cond_wait(&underflow_cond, &mutex);
>>> +        };
>>> +        pthread_mutex_unlock(&mutex);
>>> +
>>> +        ptr = read_buffer + (consumed % num_buffers) * SUBBUF_SIZE;
>>> +
>>> +        ret = write(outfile_fd, ptr, SUBBUF_SIZE);
>>> +        if (ret < 0)
>>> +            igt_assert_f(0, "Couldn't dump the logs in a file");
>>
>> Question about short writes applies here as well.
>>
> Fine will do the same here whatever is decided for above.
>
>>> +
>>> +        total_bytes_written += ret;
>>> +        if (max_filesize && (total_bytes_written > MB(max_filesize))) {
>>> +            igt_debug("Reached the target of %" PRIu64 " bytes\n",
>>> MB(max_filesize));
>>> +            stop_logging = true;
>>> +        }
>>> +
>>> +        pthread_mutex_lock(&mutex);
>>> +        consumed++;
>>> +        pthread_cond_signal(&overflow_cond);
>>> +        pthread_mutex_unlock(&mutex);
>>> +    } while(1);
>>
>> Minor again: You use both "while(1) {}" and "do { } while(1)" in here.
>> Maybe stick with one? :D (I think "for (;;)" is preferred, but maybe I
>> am misremembering things.)
>>
> Sorry for the inconsistency, will clean this up.
>
>>> +
>>> +    return NULL;
>>> +}
>>> +
>>> +static void init_flusher_thread(void)
>>> +{
>>> +    struct sched_param    thread_sched;
>>> +    pthread_attr_t        p_attr;
>>> +
>>> +    pthread_cond_init(&underflow_cond, NULL);
>>> +    pthread_cond_init(&overflow_cond, NULL);
>>> +    pthread_mutex_init(&mutex, NULL);
>>> +
>>> +    if (pthread_attr_init(&p_attr))
>>> +        igt_assert_f(0, "Error obtaining default thread attributes");
>>> +
>>> +    if (pthread_attr_setinheritsched(&p_attr, PTHREAD_EXPLICIT_SCHED))
>>> +        igt_assert_f(0, "Couldn't set inheritsched");
>>> +
>>> +    if (pthread_attr_setschedpolicy(&p_attr, SCHED_RR))
>>> +        igt_assert_f(0, "Couldn't set thread scheduling policy");
>>> +
>>> +    /* Keep the flusher task also at rt priority, so that it doesn't
>>> get
>>> +     * too late in flushing the collected logs in local buffers to
>>> the disk,
>>> +     * and so main thread always have spare buffers to collect the
>>> logs.
>>> +     */
>>> +    thread_sched.sched_priority = 5;
>>> +    if (pthread_attr_setschedparam(&p_attr, &thread_sched))
>>> +        igt_assert_f(0, "Couldn't set thread priority");
>>> +
>>> +    if (pthread_create(&flush_thread, &p_attr, flusher, NULL))
>>> +        igt_assert_f(0, "thread creation failed");
>>
>> More nitpicks: You mix starting error messages with upper case and lower
>> case.
>
> Will use the lower case for all error messages. fine ?

Fine with me.

>>
>>> +
>>> +    if (pthread_attr_destroy(&p_attr))
>>> +        igt_assert_f(0, "Error destroying thread attributes");
>>> +}
>>> +
>>> +static void open_relay_file(void)
>>> +{
>>> +    relay_fd = igt_debugfs_open(RELAY_FILE_NAME, O_RDONLY);
>>> +    if (relay_fd < 0)
>>> +        igt_assert_f(0, "Couldn't open the guc log file");
>>> +
>>> +    /* Purge the old/boot-time logs from the relay buffer.
>>> +     * This is more for Val team's requirement, where they have to
>>> first
>>> +     * purge the existing logs before starting the tests for which
>>> the logs
>>> +     * are actually needed. After this logger will enter into a loop
>>> and
>>> +     * wait for the new data, at that point benchmark can be launched
>>> from
>>> +     * a different shell.
>>> +     */
>>> +    if (discard_oldlogs)
>>> +        pull_leftover_data();
>>> +}
>>> +
>>> +static void open_output_file(void)
>>> +{
>>> +    char *filename;
>>> +
>>> +    if (asprintf(&filename, "/tmp/%s", out_filename ? :
>>> "guc_log_dump.dat") < 0)
>>> +        igt_assert_f(0, "Couldn't allocate the filename");
>>
>> I think you need to take the out_filename when specified by the user
>> as-is and not stick it in /tmp. It is perfectly reasonable that someone
>> would want to put it somewhere else.
>>
> Ok so User should provide the full path.

It can be relative, absolute, or just a filename. My suggestion was to 
just use what the user provided for the open(2) call.

>
>> Also better to move the default name of "guc_log_dump.dat" to where
>> defaults like RELAY_FILE_NAME and such are defined.
>>
> fine.
>
>> And I would not default to /tmp but the current directory.
>>
> Chris had suggested to use /tmp. Fine will use the current directory.
>
>>> +
>>> +    /* Use Direct IO mode for the output file, as the data written is
>>> not
>>> +     * supposed to be accessed again, this saves a copy of data from
>>> App's
>>> +     * buffer to kernel buffer (Page cache). Due to no buffering on
>>> kernel
>>> +     * side, data is flushed out to disk faster and more buffering
>>> can be
>>> +     * done on the logger side to hide the disk IO latency.
>>> +     */
>>> +    outfile_fd = open(filename, O_CREAT | O_WRONLY | O_TRUNC |
>>> O_DIRECT, 0444);
>>
>> 0400 or 0440 would be better I think.
>
> fine, this might make the file inaccessible to Root User in case the
> logger wasn't running as a Root ?

I assumed logger will run as root and it would be preferable not to have 
log files accessible by anyone. Either way root can read anything 
regardless of permissions, no?

>>
>>> +    if (outfile_fd < 0)
>>> +        igt_assert_f(0, "Couldn't open the output file");
>>> +
>>> +    free(out_filename);
>>> +    free(filename);
>>> +}
>>> +
>>> +static void init_main_thread(void)
>>> +{
>>> +    struct sched_param    thread_sched;
>>> +    /* Run the main thread at highest priority to ensure that it always
>>> +     * gets woken-up at earliest on arrival of new data and so is
>>> always
>>> +     * ready to pull the logs, otherwise there could be loss logs if
>>> +     * GuC firmware is generating logs at a very high rate.
>>> +     */
>>> +    thread_sched.sched_priority = 1;
>>> +    if (sched_setscheduler(getpid(), SCHED_FIFO, &thread_sched))
>>> +        igt_assert_f(0, "Couldn't set the priority");
>>> +
>>> +    if (signal(SIGINT, int_sig_handler) == SIG_ERR)
>>> +        igt_assert_f(0, "SIGINT handler registration failed");
>>> +
>>> +    /* Need an aligned pointer for direct IO */
>>> +    if (posix_memalign((void **)&read_buffer, PAGE_SIZE, num_buffers
>>> * SUBBUF_SIZE))
>>> +        igt_assert_f(0, "Couldn't allocate the read buffer");
>>> +
>>> +    /* Keep the pages locked in RAM, avoid page fault overhead */
>>> +    if (mlock(read_buffer, num_buffers * SUBBUF_SIZE))
>>> +        igt_assert_f(0, "Failed to lock memory");
>>> +
>>> +    /* Enable the logging, it may not have been enabled from boot
>>> and so
>>> +     * the relay file also wouldn't have been created.
>>> +     */
>>> +    guc_log_control(true);
>>> +
>>> +    open_relay_file();
>>> +    open_output_file();
>>> +}
>>> +
>>> +static int parse_options(int opt, int opt_index, void *data)
>>> +{
>>> +    igt_debug("opt %c optarg %s\n", opt, optarg);
>>> +
>>> +    switch(opt) {
>>> +    case 'v':
>>> +        verbosity_level = atoi(optarg);
>>> +        if (verbosity_level < 0)
>>> +            verbosity_level = 0;
>>> +        else if (verbosity_level > 3)
>>> +            verbosity_level = 3;
>>
>> Notify user if invalid value was provided?
>>
> Just notify but don't abort and continue by using a valid value ?

No strong opinion. Either is much better than silently using a different 
value or ignoring the fact user has typed in garbage on the command line.

>
>>> +
>>> +        igt_debug("verbosity level to be used is %d\n",
>>> verbosity_level);
>>> +        break;
>>> +    case 'o':
>>> +        if (asprintf(&out_filename, "%s", optarg) < 0)
>>
>> strdup?
> ok will try to use strdup.
>>
>>> +            igt_assert_f(0, "Couldn't allocate the o/p filename");
>>> +        igt_debug("logs to be stored in file /tmp/%s\n", out_filename);
>>> +        break;
>>> +    case 'b':
>>> +        num_buffers = atoi(optarg);
>>> +        if (num_buffers == 0)
>>> +            num_buffers = NUM_SUBBUFS;
>>
>> Again notify if invalid number/value.
>>
> same doubt as above.
>
>>> +        igt_debug("number of buffers to be used is %d\n", num_buffers);
>>> +        break;
>>> +    case 't':
>>> +        test_duration = atoi(optarg);
>>> +        if (test_duration > 0)
>>
>> If switch was specified it should have a valid optarg or error reported.
>>
>>> +            igt_debug("logger to run for %d seconds\n", test_duration);
>>> +        break;
>>> +    case 'p':
>>> +        poll_timeout = atoi(optarg);
>>> +        if (poll_timeout == 0)
>>> +            poll_timeout = 2; /* at least use 2ms poll timeout */
>>
>> Hm I would prefer if you allowed no poll period.
>
> by no poll period, do you mean indefinite wait ?
>
> That can be done by specifying -1 as a timeout value in the command line,
>
> "-p --polltimeout=ms polling timeout in ms, -1 == indefinite wait for
> the new data\n"

My bad, I missed that. All good then.

>
>> I even think it should be the default if we think relayfs will be
>> definitely fixed to not
>> require it.
>>
> Can keep the indefinite wait as a default.
>
>> In any case, commend is wrong since user can specify 1ms and get away
>> with it. :)
>>
>>> +        if (poll_timeout > 0)
>>> +            igt_debug("polling to be done with %d milli seconds
>>> timeout\n", poll_timeout);
>>
>> milliseconds
>>
>>> +        break;
>>> +    case 's':
>>> +        max_filesize = atoi(optarg);
>>> +        if (max_filesize > 0)
>>
>> Same as for -t.
>>
>>> +            igt_debug("max allowed size of the output file is %d
>>> MBs\n", max_filesize);
>>> +        break;
>>> +    case 'd':
>>> +        discard_oldlogs = true;
>>> +        igt_debug("old/boot-time logs will be discarded\n");
>>> +        break;
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static void process_command_line(int argc, char **argv)
>>> +{
>>> +    static struct option long_options[] = {
>>> +        {"verbosity", required_argument, 0, 'v'},
>>> +        {"outputfilename", required_argument, 0, 'o'},
>>> +        {"buffers", required_argument, 0, 'b'},
>>> +        {"testduration", required_argument, 0, 't'},
>>> +        {"polltimeout", required_argument, 0, 'p'},
>>> +        {"size", required_argument, 0, 's'},
>>> +        {"discard", no_argument, 0, 'd'},
>>> +        { 0, 0, 0, 0 }
>>> +    };
>>> +
>>> +    const char *help =
>>> +        "  -v --verbosity=level   verbosity level of GuC logging
>>> (0-3)\n"
>>> +        "  -o --outfile=name      name of the output file in /tmp
>>> directory, where logs will be stored\n"
>>> +        "  -b --buffers=num       number of buffers to be maintained
>>> on logger side for storing logs\n"
>>> +        "  -t --testduration=sec  max duration in seconds for which
>>> the logger should run, 0 == forever\n"
>>> +        "  -p --polltimeout=ms    polling timeout in ms, -1 ==
>>> indefinite wait for the new data\n"
>>> +        "  -s --size=MB           max size of output file in MBs
>>> after which logging will be stopped, 0 == no limit\n"
>>> +        "  -d --discard           discard the old/boot-time logs
>>> before entering into the capture loop\n";
>>> +
>>> +    igt_simple_init_parse_opts(&argc, argv, "v:o:b:t:p:s:d",
>>> long_options,
>>> +                   help, parse_options, NULL);
>>> +}
>>> +
>>> +int main(int argc, char **argv)
>>> +{
>>> +    struct pollfd relay_poll_fd;
>>> +    struct timespec start={};
>>> +    int nfds;
>>> +    int ret;
>>> +
>>> +    process_command_line(argc, argv);
>>> +
>>> +    init_main_thread();
>>> +
>>> +    /* Use a separate thread for flushing the logs to a file on disk.
>>> +     * Main thread will buffer the data from relay file in its pool of
>>> +     * buffers and other thread will flush the data to disk in
>>> background.
>>> +     * This is needed, albeit by default data is written out to disk in
>>> +     * async mode, as when there are too many dirty pages in the RAM,
>>> +     * (/proc/sys/vm/dirty_ratio), kernel starts blocking the processes
>>> +     * doing the file writes.
>>> +     */
>>> +    init_flusher_thread();
>>> +
>>> +    relay_poll_fd.fd = relay_fd;
>>> +    relay_poll_fd.events = POLLIN;
>>> +    relay_poll_fd.revents = 0;
>>> +
>>> +    nfds = 1; /* only one fd to poll */
>>> +
>>> +    while (!stop_logging)
>>> +    {
>>> +        if (test_duration && (igt_seconds_elapsed(&start) >
>>> test_duration)) {
>>
>> If you agree to allow no poll period the this would not work right? In
>> that case you would need to use alarm(2) or something.
>>
>
> Can calculate the timeout value for poll call as,
>      if (poll_timeout < 0) {
>          timeout = test_duration - igt_seconds_elapsed(&start))
>      }

My point was that with indefinite poll loop will not run if there is not 
log data so timeout will not work implemented like this.

>
>
>>> +            igt_debug("Ran for stipulated %d seconds, exit now\n",
>>> test_duration);
>>> +            stop_logging = true;
>>> +            break;
>>> +        }
>>> +
>>> +        /* Wait/poll for the new data to be available, relay doesn't
>>> +         * provide a blocking read.
>>> +         * Need to do polling with a timeout instead of indefinite
>>> wait,
>>> +         * to avoid relying on relay for the wakeup, as relay does
>>> wakeup
>>> +         * in a deferred manner on jiffies granularity by scheduling a
>>> +         * timer and moreover that timer is re-scheduled on every newly
>>> +         * produced buffer so timer keeps getting pushed out if there
>>> +         * are multiple flush interrupts in a very quick succession
>>> (less
>>> +         * than a jiffy gap between 2 flush interrupts) and relay runs
>>> +         * out of sub buffers to store the new logs.
>>> +         */
>>> +        ret = poll(&relay_poll_fd, nfds, poll_timeout);
>>> +        if (ret < 0) {
>>> +            if (errno == EINTR)
>>> +                break;
>>> +            else
>>> +                igt_assert_f(0, "poll call failed");
>>> +        }
>>> +
>>> +        /* No data available yet, poll again, hopefully new data is
>>> round the corner */
>>> +        if (!relay_poll_fd.revents)
>>> +            continue;
>>> +
>>> +        pull_data();
>>> +    }
>>> +
>>> +    /* Pause logging on the GuC side*/
>>> +    guc_log_control(false);
>>> +    pthread_cond_signal(&underflow_cond);
>>> +    pthread_join(flush_thread, NULL);
>>> +    pull_leftover_data();
>>> +    igt_info("total_bytes_written %" PRIu64 "\n", total_bytes_written);
>>> +
>>> +    free(read_buffer);
>>> +    close(relay_fd);
>>> +    close(outfile_fd);
>>> +    igt_exit();
>>> +}
>>>
>>
>> So only minor comments. The rest looks reasonable to me and I trust you
>> tested it well and it works fine.
>>
> Thanks much for the thorough review.

Np.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] tools/intel_guc_logger: Utility for capturing GuC firmware logs in a file
  2016-09-06 15:52     ` Tvrtko Ursulin
@ 2016-09-07  8:10       ` Goel, Akash
  2016-09-07  8:44         ` Chris Wilson
  0 siblings, 1 reply; 17+ messages in thread
From: Goel, Akash @ 2016-09-07  8:10 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx; +Cc: akash.goel



On 9/6/2016 9:22 PM, Tvrtko Ursulin wrote:
>
> On 06/09/16 16:33, Goel, Akash wrote:
>> On 9/6/2016 6:47 PM, Tvrtko Ursulin wrote:
>>> Hi,
>>>
>>> On 06/09/16 11:43, akash.goel@intel.com wrote:
>>>> From: Akash Goel <akash.goel@intel.com>
>>>>
>>>> This patch provides a test utility which helps capture GuC firmware
>>>> logs and
>>>> then dump them to file.
>>>> The logs are pulled from a debugfs file
>>>> '/sys/kernel/debug/dri/guc_log' and
>>>> stored into a file '/tmp/guc_log_dump.dat', the name of the output
>>>> file can
>>>> be changed through a command line argument.
>>>>
>>>> The utility goes into an infinite loop where it waits for the arrival
>>>> of new
>>>> logs and as soon as new set of logs are produced it captures them in
>>>> its local
>>>> buffer which is then flushed out to the file on disk.
>>>> Any time when logging needs to be ended, User can stop this utility
>>>> (CTRL+C).
>>>>
>>>> Before entering into a loop, it first discards whatever logs are
>>>> present in
>>>> the debugfs file.
>>>> This way User can first launch this utility and then start a
>>>> workload/activity
>>>> for which GuC firmware logs are to be actually captured and keep
>>>> running the
>>>> utility for as long as its needed, like once the workload is over this
>>>> utility
>>>> can be forcefully stopped.
>>>>
>>>> If the logging wasn't enabled on GuC side by the Driver at boot time,
>>>> utility
>>>> will first enable the logging and later on when it is stopped (CTRL+C)
>>>> it will
>>>> also pause the logging on GuC side.
>>>>
>>>> Signed-off-by: Akash Goel <akash.goel@intel.com>
>>>> ---
>>>>   tools/Makefile.sources   |   1 +
>>>>   tools/intel_guc_logger.c | 441
>>>> +++++++++++++++++++++++++++++++++++++++++++++++
>>>>   2 files changed, 442 insertions(+)
>>>>   create mode 100644 tools/intel_guc_logger.c
>>>>
>>>> diff --git a/tools/Makefile.sources b/tools/Makefile.sources
>>>> index 2bb6c8e..be58871 100644
>>>> --- a/tools/Makefile.sources
>>>> +++ b/tools/Makefile.sources
>>>> @@ -19,6 +19,7 @@ tools_prog_lists =        \
>>>>       intel_gpu_time        \
>>>>       intel_gpu_top        \
>>>>       intel_gtt        \
>>>> +    intel_guc_logger        \
>>>>       intel_infoframes    \
>>>>       intel_l3_parity        \
>>>>       intel_lid        \
>>>> diff --git a/tools/intel_guc_logger.c b/tools/intel_guc_logger.c
>>>> new file mode 100644
>>>> index 0000000..92172fa
>>>> --- /dev/null
>>>> +++ b/tools/intel_guc_logger.c
>>>> @@ -0,0 +1,441 @@
>>>> +
>>>> +#define _GNU_SOURCE  /* For using O_DIRECT */
>>>> +#include <inttypes.h>
>>>> +#include <stdio.h>
>>>> +#include <fcntl.h>
>>>> +#include <sched.h>
>>>> +#include <sys/ioctl.h>
>>>> +#include <string.h>
>>>> +#include <errno.h>
>>>> +#include <signal.h>
>>>> +#include <stdlib.h>
>>>> +#include <unistd.h>
>>>> +#include <sys/poll.h>
>>>> +#include <sys/wait.h>
>>>> +#include <sys/types.h>
>>>> +#include <sys/syscall.h>
>>>> +#include <sys/mman.h>
>>>> +#include <assert.h>
>>>> +#include <pthread.h>
>>>> +
>>>> +#include "igt.h"
>>>> +
>>>> +#define MB(x) ((uint64_t)(x) * 1024 * 1024)
>>>> +#ifndef PAGE_SIZE
>>>> +  #define PAGE_SIZE 4096
>>>> +#endif
>>>> +#define SUBBUF_SIZE (19*PAGE_SIZE)
>>>> +/* Need large buffering from logger side to hide the DISK IO latency,
>>>> Driver
>>>> + * can only store 8 snapshots of GuC log buffer in relay.
>>>> + */
>>>> +#define NUM_SUBBUFS 100
>>>> +
>>>> +#define RELAY_FILE_NAME  "guc_log"
>>>> +#define CONTROL_FILE_NAME "i915_guc_log_control"
>>>> +
>>>> +char *read_buffer;
>>>> +char *out_filename;
>>>> +int poll_timeout = 2; /* by default 2ms timeout */
>>>> +pthread_mutex_t mutex;
>>>> +pthread_t flush_thread;
>>>> +int verbosity_level = 3; /* by default capture logs at max
>>>> verbosity */
>>>> +uint32_t produced, consumed;
>>>> +uint64_t total_bytes_written;
>>>> +int num_buffers = NUM_SUBBUFS;
>>>> +int relay_fd, outfile_fd = -1;
>>>> +bool stop_logging, discard_oldlogs;
>>>> +uint32_t test_duration, max_filesize;
>>>> +pthread_cond_t underflow_cond, overflow_cond;
>>>> +
>>>> +static void guc_log_control(bool enable_logging)
>>>> +{
>>>> +    int control_fd;
>>>> +    char data[19];
>>>> +    uint64_t val;
>>>> +    int ret;
>>>> +
>>>> +    control_fd = igt_debugfs_open(CONTROL_FILE_NAME, O_WRONLY);
>>>> +    if (control_fd < 0)
>>>> +        igt_assert_f(0, "Couldn't open the guc log control file");
>>>> +
>>>> +    val = enable_logging ? ((verbosity_level << 4) | 0x1) : 0;
>>>> +
>>>> +    snprintf(data, sizeof(data), "0x%" PRIx64, val);
>>>> +    ret = write(control_fd, data, strlen(data) + 1);
>>>
>>> Minor: It looks safe like it is but something like below would maybe be
>>> more robust?
>>>
>>> ret = snprintf(data, sizeof(data), "0x%" PRIx64, val);
>>> igt_assert(ret > 2 && ret < sizeof(data));
>>
>> ok will add, but possibility of failure will be really remote here.
>> but igt_assert(ret > 0) should suffice.
>
> Yes there is no possibility for failure as it stands, just more robust
> implementation should someone change something in the future. That's why
> I said you could also decide to keep it as is. My version also avoided
> the strlen since snprintf already tells you that.
>

fine, will use your version then.

>>> ret = write(control_fd, data, ret);
>>> igt_assert(ret > 0); // assuming short writes can't happen
>>>
>>> Up to you.
>>>
>>>> +    if (ret < 0)
>>>> +        igt_assert_f(0, "Couldn't write to the log control file");
>>>> +
>>>> +    close(control_fd);
>>>> +}
>>>> +
>>>> +static void int_sig_handler(int sig)
>>>> +{
>>>> +    igt_info("Received signal %d\n", sig);
>>>> +
>>>> +    stop_logging = true;
>>>> +}
>>>> +
>>>> +static void pull_leftover_data(void)
>>>> +{
>>>> +    unsigned int bytes_read = 0;
>>>> +    int ret;
>>>> +
>>>> +    while (1) {
>>>> +        /* Read the logs from relay buffer */
>>>> +        ret = read(relay_fd, read_buffer, SUBBUF_SIZE);
>>>> +        if (!ret)
>>>> +            break;
>>>> +        else if (ret < 0)
>>>> +            igt_assert_f(0, "Failed to read from the guc log file");
>>>> +        else if (ret < SUBBUF_SIZE)
>>>> +            igt_assert_f(0, "invalid read from relay file");
>>>> +
>>>> +        bytes_read += ret;
>>>> +
>>>> +        if (outfile_fd > 0) {
>>>
>>>> = 0 I think. Or is it even needed since open_output_file asserts if it
>>> fails to open?
>>>
>> Actually pull_leftover_data() will be called twice, once before opening
>> the outfile. This is for purging the old logs, if enabled.
>
> Oh right, OK then.
>
>>
>>>> +            ret = write(outfile_fd, read_buffer, SUBBUF_SIZE);
>>>> +            if (ret < 0)
>>>> +                igt_assert_f(0, "Couldn't dump the logs in a file");
>>>
>>> Do you need to check for short writes as well here? Man page for write
>>> says it can happen with out of disk space or signals.
>>>
>> To keep things simple can just check 'ret < SUBBUF_SIZE' considering
>> out of disk space as a serious failure or add a loop here which will run
>> until SUBBUF_SIZE bytes have been written.
>
> Just assert, no point in complicating things.
>
Fine will assert on (ret < SUBBUF_SIZE)

>>>> +            total_bytes_written += ret;
>>>> +        }
>>>> +    };
>>>> +
>>>> +    igt_debug("%u bytes flushed\n", bytes_read);
>>>> +}
>>>> +
>>>> +static int num_filled_bufs(void)
>>>> +{
>>>> +    return (produced - consumed);
>>>> +}
>>>> +
>>>> +static void pull_data(void)
>>>> +{
>>>> +    char *ptr;
>>>> +    int ret;
>>>> +
>>>> +    pthread_mutex_lock(&mutex);
>>>> +    while (num_filled_bufs() >= num_buffers) {
>>>> +        igt_debug("overflow, will wait, produced %u, consumed %u\n",
>>>> produced, consumed);
>>>> +        /* Stall the main thread in case of overflow, as there are no
>>>> +         * buffers available to store the new logs, otherwise there
>>>> +         * could be corruption if both threads work on the same
>>>> buffer.
>>>> +         */
>>>> +        pthread_cond_wait(&overflow_cond, &mutex);
>>>> +    };
>>>> +    pthread_mutex_unlock(&mutex);
>>>> +
>>>> +    ptr = read_buffer + (produced % num_buffers) * SUBBUF_SIZE;
>>>> +
>>>> +    /* Read the logs from relay buffer */
>>>> +    ret = read(relay_fd, ptr, SUBBUF_SIZE);
>>>> +    if (ret < 0)
>>>> +        igt_assert_f(0, "Failed to read from the guc log file");
>>>> +    else if (ret && (ret < SUBBUF_SIZE))
>>>> +        igt_assert_f(0, "invalid read from relay file");
>>>> +
>>>> +    pthread_mutex_lock(&mutex);
>>>> +    if (ret) {
>>>> +        produced++;
>>>> +        pthread_cond_signal(&underflow_cond);
>>>> +    }
>>>> +    pthread_mutex_unlock(&mutex);
>>>
>>> Very minor: can move the mutex under the if.
>>>
>> Fine will move under the 'if'.
>>
>>>> +}
>>>> +
>>>> +static void *flusher(void *arg)
>>>> +{
>>>> +    char *ptr;
>>>> +    int ret;
>>>> +
>>>> +    igt_debug("Execution started of flusher thread\n");
>>>> +
>>>> +    do {
>>>> +        pthread_mutex_lock(&mutex);
>>>> +        while (!num_filled_bufs()) {
>>>> +            /* Exit only after completing the flush of all the filled
>>>> +             * buffers as User would expect that all logs captured up
>>>> +             * till the point of interruption/exit are written out to
>>>> +             * the disk file.
>>>> +             */
>>>> +            if (stop_logging) {
>>>> +                igt_debug("flusher to exit now\n");
>>>> +                pthread_mutex_unlock(&mutex);
>>>> +                return NULL;
>>>> +            }
>>>> +            pthread_cond_wait(&underflow_cond, &mutex);
>>>> +        };
>>>> +        pthread_mutex_unlock(&mutex);
>>>> +
>>>> +        ptr = read_buffer + (consumed % num_buffers) * SUBBUF_SIZE;
>>>> +
>>>> +        ret = write(outfile_fd, ptr, SUBBUF_SIZE);
>>>> +        if (ret < 0)
>>>> +            igt_assert_f(0, "Couldn't dump the logs in a file");
>>>
>>> Question about short writes applies here as well.
>>>
>> Fine will do the same here whatever is decided for above.
>>
>>>> +
>>>> +        total_bytes_written += ret;
>>>> +        if (max_filesize && (total_bytes_written >
>>>> MB(max_filesize))) {
>>>> +            igt_debug("Reached the target of %" PRIu64 " bytes\n",
>>>> MB(max_filesize));
>>>> +            stop_logging = true;
>>>> +        }
>>>> +
>>>> +        pthread_mutex_lock(&mutex);
>>>> +        consumed++;
>>>> +        pthread_cond_signal(&overflow_cond);
>>>> +        pthread_mutex_unlock(&mutex);
>>>> +    } while(1);
>>>
>>> Minor again: You use both "while(1) {}" and "do { } while(1)" in here.
>>> Maybe stick with one? :D (I think "for (;;)" is preferred, but maybe I
>>> am misremembering things.)
>>>
>> Sorry for the inconsistency, will clean this up.
>>
>>>> +
>>>> +    return NULL;
>>>> +}
>>>> +
>>>> +static void init_flusher_thread(void)
>>>> +{
>>>> +    struct sched_param    thread_sched;
>>>> +    pthread_attr_t        p_attr;
>>>> +
>>>> +    pthread_cond_init(&underflow_cond, NULL);
>>>> +    pthread_cond_init(&overflow_cond, NULL);
>>>> +    pthread_mutex_init(&mutex, NULL);
>>>> +
>>>> +    if (pthread_attr_init(&p_attr))
>>>> +        igt_assert_f(0, "Error obtaining default thread attributes");
>>>> +
>>>> +    if (pthread_attr_setinheritsched(&p_attr, PTHREAD_EXPLICIT_SCHED))
>>>> +        igt_assert_f(0, "Couldn't set inheritsched");
>>>> +
>>>> +    if (pthread_attr_setschedpolicy(&p_attr, SCHED_RR))
>>>> +        igt_assert_f(0, "Couldn't set thread scheduling policy");
>>>> +
>>>> +    /* Keep the flusher task also at rt priority, so that it doesn't
>>>> get
>>>> +     * too late in flushing the collected logs in local buffers to
>>>> the disk,
>>>> +     * and so main thread always have spare buffers to collect the
>>>> logs.
>>>> +     */
>>>> +    thread_sched.sched_priority = 5;
>>>> +    if (pthread_attr_setschedparam(&p_attr, &thread_sched))
>>>> +        igt_assert_f(0, "Couldn't set thread priority");
>>>> +
>>>> +    if (pthread_create(&flush_thread, &p_attr, flusher, NULL))
>>>> +        igt_assert_f(0, "thread creation failed");
>>>
>>> More nitpicks: You mix starting error messages with upper case and lower
>>> case.
>>
>> Will use the lower case for all error messages. fine ?
>
> Fine with me.
>

Thanks
>>>
>>>> +
>>>> +    if (pthread_attr_destroy(&p_attr))
>>>> +        igt_assert_f(0, "Error destroying thread attributes");
>>>> +}
>>>> +
>>>> +static void open_relay_file(void)
>>>> +{
>>>> +    relay_fd = igt_debugfs_open(RELAY_FILE_NAME, O_RDONLY);
>>>> +    if (relay_fd < 0)
>>>> +        igt_assert_f(0, "Couldn't open the guc log file");
>>>> +
>>>> +    /* Purge the old/boot-time logs from the relay buffer.
>>>> +     * This is more for Val team's requirement, where they have to
>>>> first
>>>> +     * purge the existing logs before starting the tests for which
>>>> the logs
>>>> +     * are actually needed. After this logger will enter into a loop
>>>> and
>>>> +     * wait for the new data, at that point benchmark can be launched
>>>> from
>>>> +     * a different shell.
>>>> +     */
>>>> +    if (discard_oldlogs)
>>>> +        pull_leftover_data();
>>>> +}
>>>> +
>>>> +static void open_output_file(void)
>>>> +{
>>>> +    char *filename;
>>>> +
>>>> +    if (asprintf(&filename, "/tmp/%s", out_filename ? :
>>>> "guc_log_dump.dat") < 0)
>>>> +        igt_assert_f(0, "Couldn't allocate the filename");
>>>
>>> I think you need to take the out_filename when specified by the user
>>> as-is and not stick it in /tmp. It is perfectly reasonable that someone
>>> would want to put it somewhere else.
>>>
>> Ok so User should provide the full path.
>
> It can be relative, absolute, or just a filename. My suggestion was to
> just use what the user provided for the open(2) call.
>
Yes the path provided can be absolute/relative, will do the open() call 
with whatever string is provided as a command line.

>>
>>> Also better to move the default name of "guc_log_dump.dat" to where
>>> defaults like RELAY_FILE_NAME and such are defined.
>>>
>> fine.
>>
>>> And I would not default to /tmp but the current directory.
>>>
>> Chris had suggested to use /tmp. Fine will use the current directory.
>>
>>>> +
>>>> +    /* Use Direct IO mode for the output file, as the data written is
>>>> not
>>>> +     * supposed to be accessed again, this saves a copy of data from
>>>> App's
>>>> +     * buffer to kernel buffer (Page cache). Due to no buffering on
>>>> kernel
>>>> +     * side, data is flushed out to disk faster and more buffering
>>>> can be
>>>> +     * done on the logger side to hide the disk IO latency.
>>>> +     */
>>>> +    outfile_fd = open(filename, O_CREAT | O_WRONLY | O_TRUNC |
>>>> O_DIRECT, 0444);
>>>
>>> 0400 or 0440 would be better I think.
>>
>> fine, this might make the file inaccessible to Root User in case the
>> logger wasn't running as a Root ?
>
> I assumed logger will run as root and it would be preferable not to have
> log files accessible by anyone. Either way root can read anything
> regardless of permissions, no?
>
Sorry after working on selinux enabled setups (in Android, where there 
are certain restrictions on Root also), forgot that Root can do anything.

>>>
>>>> +    if (outfile_fd < 0)
>>>> +        igt_assert_f(0, "Couldn't open the output file");
>>>> +
>>>> +    free(out_filename);
>>>> +    free(filename);
>>>> +}
>>>> +
>>>> +static void init_main_thread(void)
>>>> +{
>>>> +    struct sched_param    thread_sched;
>>>> +    /* Run the main thread at highest priority to ensure that it
>>>> always
>>>> +     * gets woken-up at earliest on arrival of new data and so is
>>>> always
>>>> +     * ready to pull the logs, otherwise there could be loss logs if
>>>> +     * GuC firmware is generating logs at a very high rate.
>>>> +     */
>>>> +    thread_sched.sched_priority = 1;
>>>> +    if (sched_setscheduler(getpid(), SCHED_FIFO, &thread_sched))
>>>> +        igt_assert_f(0, "Couldn't set the priority");
>>>> +
>>>> +    if (signal(SIGINT, int_sig_handler) == SIG_ERR)
>>>> +        igt_assert_f(0, "SIGINT handler registration failed");
>>>> +
>>>> +    /* Need an aligned pointer for direct IO */
>>>> +    if (posix_memalign((void **)&read_buffer, PAGE_SIZE, num_buffers
>>>> * SUBBUF_SIZE))
>>>> +        igt_assert_f(0, "Couldn't allocate the read buffer");
>>>> +
>>>> +    /* Keep the pages locked in RAM, avoid page fault overhead */
>>>> +    if (mlock(read_buffer, num_buffers * SUBBUF_SIZE))
>>>> +        igt_assert_f(0, "Failed to lock memory");
>>>> +
>>>> +    /* Enable the logging, it may not have been enabled from boot
>>>> and so
>>>> +     * the relay file also wouldn't have been created.
>>>> +     */
>>>> +    guc_log_control(true);
>>>> +
>>>> +    open_relay_file();
>>>> +    open_output_file();
>>>> +}
>>>> +
>>>> +static int parse_options(int opt, int opt_index, void *data)
>>>> +{
>>>> +    igt_debug("opt %c optarg %s\n", opt, optarg);
>>>> +
>>>> +    switch(opt) {
>>>> +    case 'v':
>>>> +        verbosity_level = atoi(optarg);
>>>> +        if (verbosity_level < 0)
>>>> +            verbosity_level = 0;
>>>> +        else if (verbosity_level > 3)
>>>> +            verbosity_level = 3;
>>>
>>> Notify user if invalid value was provided?
>>>
>> Just notify but don't abort and continue by using a valid value ?
>
> No strong opinion. Either is much better than silently using a different
> value or ignoring the fact user has typed in garbage on the command line.
>
Ok will report/notify and then abort.
Will do the same for other arguments.

Is it fine ?

>>
>>>> +
>>>> +        igt_debug("verbosity level to be used is %d\n",
>>>> verbosity_level);
>>>> +        break;
>>>> +    case 'o':
>>>> +        if (asprintf(&out_filename, "%s", optarg) < 0)
>>>
>>> strdup?
>> ok will try to use strdup.
>>>
>>>> +            igt_assert_f(0, "Couldn't allocate the o/p filename");
>>>> +        igt_debug("logs to be stored in file /tmp/%s\n",
>>>> out_filename);
>>>> +        break;
>>>> +    case 'b':
>>>> +        num_buffers = atoi(optarg);
>>>> +        if (num_buffers == 0)
>>>> +            num_buffers = NUM_SUBBUFS;
>>>
>>> Again notify if invalid number/value.
>>>
>> same doubt as above.
>>
>>>> +        igt_debug("number of buffers to be used is %d\n",
>>>> num_buffers);
>>>> +        break;
>>>> +    case 't':
>>>> +        test_duration = atoi(optarg);
>>>> +        if (test_duration > 0)
>>>
>>> If switch was specified it should have a valid optarg or error reported.
>>>
>>>> +            igt_debug("logger to run for %d seconds\n",
>>>> test_duration);
>>>> +        break;
>>>> +    case 'p':
>>>> +        poll_timeout = atoi(optarg);
>>>> +        if (poll_timeout == 0)
>>>> +            poll_timeout = 2; /* at least use 2ms poll timeout */
>>>
>>> Hm I would prefer if you allowed no poll period.
>>
>> by no poll period, do you mean indefinite wait ?
>>
>> That can be done by specifying -1 as a timeout value in the command line,
>>
>> "-p --polltimeout=ms polling timeout in ms, -1 == indefinite wait for
>> the new data\n"
>
> My bad, I missed that. All good then.
>
>>
>>> I even think it should be the default if we think relayfs will be
>>> definitely fixed to not
>>> require it.
>>>
>> Can keep the indefinite wait as a default.
>>
>>> In any case, commend is wrong since user can specify 1ms and get away
>>> with it. :)
>>>
>>>> +        if (poll_timeout > 0)
>>>> +            igt_debug("polling to be done with %d milli seconds
>>>> timeout\n", poll_timeout);
>>>
>>> milliseconds
>>>
>>>> +        break;
>>>> +    case 's':
>>>> +        max_filesize = atoi(optarg);
>>>> +        if (max_filesize > 0)
>>>
>>> Same as for -t.
>>>
>>>> +            igt_debug("max allowed size of the output file is %d
>>>> MBs\n", max_filesize);
>>>> +        break;
>>>> +    case 'd':
>>>> +        discard_oldlogs = true;
>>>> +        igt_debug("old/boot-time logs will be discarded\n");
>>>> +        break;
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static void process_command_line(int argc, char **argv)
>>>> +{
>>>> +    static struct option long_options[] = {
>>>> +        {"verbosity", required_argument, 0, 'v'},
>>>> +        {"outputfilename", required_argument, 0, 'o'},
>>>> +        {"buffers", required_argument, 0, 'b'},
>>>> +        {"testduration", required_argument, 0, 't'},
>>>> +        {"polltimeout", required_argument, 0, 'p'},
>>>> +        {"size", required_argument, 0, 's'},
>>>> +        {"discard", no_argument, 0, 'd'},
>>>> +        { 0, 0, 0, 0 }
>>>> +    };
>>>> +
>>>> +    const char *help =
>>>> +        "  -v --verbosity=level   verbosity level of GuC logging
>>>> (0-3)\n"
>>>> +        "  -o --outfile=name      name of the output file in /tmp
>>>> directory, where logs will be stored\n"
>>>> +        "  -b --buffers=num       number of buffers to be maintained
>>>> on logger side for storing logs\n"
>>>> +        "  -t --testduration=sec  max duration in seconds for which
>>>> the logger should run, 0 == forever\n"
>>>> +        "  -p --polltimeout=ms    polling timeout in ms, -1 ==
>>>> indefinite wait for the new data\n"
>>>> +        "  -s --size=MB           max size of output file in MBs
>>>> after which logging will be stopped, 0 == no limit\n"
>>>> +        "  -d --discard           discard the old/boot-time logs
>>>> before entering into the capture loop\n";
>>>> +
>>>> +    igt_simple_init_parse_opts(&argc, argv, "v:o:b:t:p:s:d",
>>>> long_options,
>>>> +                   help, parse_options, NULL);
>>>> +}
>>>> +
>>>> +int main(int argc, char **argv)
>>>> +{
>>>> +    struct pollfd relay_poll_fd;
>>>> +    struct timespec start={};
>>>> +    int nfds;
>>>> +    int ret;
>>>> +
>>>> +    process_command_line(argc, argv);
>>>> +
>>>> +    init_main_thread();
>>>> +
>>>> +    /* Use a separate thread for flushing the logs to a file on disk.
>>>> +     * Main thread will buffer the data from relay file in its pool of
>>>> +     * buffers and other thread will flush the data to disk in
>>>> background.
>>>> +     * This is needed, albeit by default data is written out to
>>>> disk in
>>>> +     * async mode, as when there are too many dirty pages in the RAM,
>>>> +     * (/proc/sys/vm/dirty_ratio), kernel starts blocking the
>>>> processes
>>>> +     * doing the file writes.
>>>> +     */
>>>> +    init_flusher_thread();
>>>> +
>>>> +    relay_poll_fd.fd = relay_fd;
>>>> +    relay_poll_fd.events = POLLIN;
>>>> +    relay_poll_fd.revents = 0;
>>>> +
>>>> +    nfds = 1; /* only one fd to poll */
>>>> +
>>>> +    while (!stop_logging)
>>>> +    {
>>>> +        if (test_duration && (igt_seconds_elapsed(&start) >
>>>> test_duration)) {
>>>
>>> If you agree to allow no poll period the this would not work right? In
>>> that case you would need to use alarm(2) or something.
>>>
>>
>> Can calculate the timeout value for poll call as,
>>      if (poll_timeout < 0) {
>>          timeout = test_duration - igt_seconds_elapsed(&start))
>>      }
>
> My point was that with indefinite poll loop will not run if there is not
> log data so timeout will not work implemented like this.
>
I understood your concern in first place but probably didn't put forth 
my point clearly.

For more clarity, this is how think it can be addressed.

--- a/tools/intel_guc_logger.c
+++ b/tools/intel_guc_logger.c
@@ -370,6 +370,8 @@ int main(int argc, char **argv)
  {
  	struct pollfd relay_poll_fd;
  	struct timespec start={};
+	uint32_t time_elapsed;
+	int timeout;
  	int nfds;
  	int ret;

@@ -395,10 +397,17 @@ int main(int argc, char **argv)

  	while (!stop_logging)
  	{
-		if (test_duration && (igt_seconds_elapsed(&start) > test_duration)) {
-			igt_debug("Ran for stipulated %d seconds, exit now\n", test_duration);
-			stop_logging = true;
-			break;
+		timeout = poll_timeout;
+		if (test_duration) {
+			time_elapsed = igt_seconds_elapsed(&start);
+			if (time_elapsed >= test_duration) {
+				igt_debug("Ran for stipulated %d seconds, exit now\n", test_duration);
+				stop_logging = true;
+				break;
+			}
+			if (poll_timeout < 0)
+				timeout = (test_duration - time_elapsed) * 1000;
		}

  		/* Wait/poll for the new data to be available, relay doesn't
@@ -412,7 +421,7 @@ int main(int argc, char **argv)
  		 * than a jiffy gap between 2 flush interrupts) and relay runs
  		 * out of sub buffers to store the new logs.
  		 */
-		ret = poll(&relay_poll_fd, nfds, poll_timeout);
+		ret = poll(&relay_poll_fd, nfds, timeout);
  		if (ret < 0) {
  			if (errno == EINTR)
  				break;

So will not do polling with indefinite timeout and adjust the timeout 
value as per test's duration.
Does it look ok ?

Best regards
Akash
>>
>>
>>>> +            igt_debug("Ran for stipulated %d seconds, exit now\n",
>>>> test_duration);
>>>> +            stop_logging = true;
>>>> +            break;
>>>> +        }
>>>> +
>>>> +        /* Wait/poll for the new data to be available, relay doesn't
>>>> +         * provide a blocking read.
>>>> +         * Need to do polling with a timeout instead of indefinite
>>>> wait,
>>>> +         * to avoid relying on relay for the wakeup, as relay does
>>>> wakeup
>>>> +         * in a deferred manner on jiffies granularity by scheduling a
>>>> +         * timer and moreover that timer is re-scheduled on every
>>>> newly
>>>> +         * produced buffer so timer keeps getting pushed out if there
>>>> +         * are multiple flush interrupts in a very quick succession
>>>> (less
>>>> +         * than a jiffy gap between 2 flush interrupts) and relay runs
>>>> +         * out of sub buffers to store the new logs.
>>>> +         */
>>>> +        ret = poll(&relay_poll_fd, nfds, poll_timeout);
>>>> +        if (ret < 0) {
>>>> +            if (errno == EINTR)
>>>> +                break;
>>>> +            else
>>>> +                igt_assert_f(0, "poll call failed");
>>>> +        }
>>>> +
>>>> +        /* No data available yet, poll again, hopefully new data is
>>>> round the corner */
>>>> +        if (!relay_poll_fd.revents)
>>>> +            continue;
>>>> +
>>>> +        pull_data();
>>>> +    }
>>>> +
>>>> +    /* Pause logging on the GuC side*/
>>>> +    guc_log_control(false);
>>>> +    pthread_cond_signal(&underflow_cond);
>>>> +    pthread_join(flush_thread, NULL);
>>>> +    pull_leftover_data();
>>>> +    igt_info("total_bytes_written %" PRIu64 "\n",
>>>> total_bytes_written);
>>>> +
>>>> +    free(read_buffer);
>>>> +    close(relay_fd);
>>>> +    close(outfile_fd);
>>>> +    igt_exit();
>>>> +}
>>>>
>>>
>>> So only minor comments. The rest looks reasonable to me and I trust you
>>> tested it well and it works fine.
>>>
>> Thanks much for the thorough review.
>
> Np.
>
> Regards,
>
> Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] tools/intel_guc_logger: Utility for capturing GuC firmware logs in a file
  2016-09-07  8:10       ` Goel, Akash
@ 2016-09-07  8:44         ` Chris Wilson
  2016-09-07  9:37           ` Tvrtko Ursulin
  0 siblings, 1 reply; 17+ messages in thread
From: Chris Wilson @ 2016-09-07  8:44 UTC (permalink / raw)
  To: Goel, Akash; +Cc: intel-gfx

On Wed, Sep 07, 2016 at 01:40:27PM +0530, Goel, Akash wrote:
> 
> 
> On 9/6/2016 9:22 PM, Tvrtko Ursulin wrote:
> >
> >On 06/09/16 16:33, Goel, Akash wrote:
> >>On 9/6/2016 6:47 PM, Tvrtko Ursulin wrote:
> >>>Hi,
> >>>
> >>>On 06/09/16 11:43, akash.goel@intel.com wrote:
> >>>>From: Akash Goel <akash.goel@intel.com>
> >>>>
> >>>>This patch provides a test utility which helps capture GuC firmware
> >>>>logs and
> >>>>then dump them to file.
> >>>>The logs are pulled from a debugfs file
> >>>>'/sys/kernel/debug/dri/guc_log' and
> >>>>stored into a file '/tmp/guc_log_dump.dat', the name of the output
> >>>>file can
> >>>>be changed through a command line argument.
> >>>>
> >>>>The utility goes into an infinite loop where it waits for the arrival
> >>>>of new
> >>>>logs and as soon as new set of logs are produced it captures them in
> >>>>its local
> >>>>buffer which is then flushed out to the file on disk.
> >>>>Any time when logging needs to be ended, User can stop this utility
> >>>>(CTRL+C).
> >>>>
> >>>>Before entering into a loop, it first discards whatever logs are
> >>>>present in
> >>>>the debugfs file.
> >>>>This way User can first launch this utility and then start a
> >>>>workload/activity
> >>>>for which GuC firmware logs are to be actually captured and keep
> >>>>running the
> >>>>utility for as long as its needed, like once the workload is over this
> >>>>utility
> >>>>can be forcefully stopped.
> >>>>
> >>>>If the logging wasn't enabled on GuC side by the Driver at boot time,
> >>>>utility
> >>>>will first enable the logging and later on when it is stopped (CTRL+C)
> >>>>it will
> >>>>also pause the logging on GuC side.
> >>>>
> >>>>Signed-off-by: Akash Goel <akash.goel@intel.com>
> >>>>---
> >>>>  tools/Makefile.sources   |   1 +
> >>>>  tools/intel_guc_logger.c | 441
> >>>>+++++++++++++++++++++++++++++++++++++++++++++++
> >>>>  2 files changed, 442 insertions(+)
> >>>>  create mode 100644 tools/intel_guc_logger.c
> >>>>
> >>>>diff --git a/tools/Makefile.sources b/tools/Makefile.sources
> >>>>index 2bb6c8e..be58871 100644
> >>>>--- a/tools/Makefile.sources
> >>>>+++ b/tools/Makefile.sources
> >>>>@@ -19,6 +19,7 @@ tools_prog_lists =        \
> >>>>      intel_gpu_time        \
> >>>>      intel_gpu_top        \
> >>>>      intel_gtt        \
> >>>>+    intel_guc_logger        \
> >>>>      intel_infoframes    \
> >>>>      intel_l3_parity        \
> >>>>      intel_lid        \
> >>>>diff --git a/tools/intel_guc_logger.c b/tools/intel_guc_logger.c
> >>>>new file mode 100644
> >>>>index 0000000..92172fa
> >>>>--- /dev/null
> >>>>+++ b/tools/intel_guc_logger.c
> >>>>@@ -0,0 +1,441 @@
> >>>>+
> >>>>+#define _GNU_SOURCE  /* For using O_DIRECT */
> >>>>+#include <inttypes.h>
> >>>>+#include <stdio.h>
> >>>>+#include <fcntl.h>
> >>>>+#include <sched.h>
> >>>>+#include <sys/ioctl.h>
> >>>>+#include <string.h>
> >>>>+#include <errno.h>
> >>>>+#include <signal.h>
> >>>>+#include <stdlib.h>
> >>>>+#include <unistd.h>
> >>>>+#include <sys/poll.h>
> >>>>+#include <sys/wait.h>
> >>>>+#include <sys/types.h>
> >>>>+#include <sys/syscall.h>
> >>>>+#include <sys/mman.h>
> >>>>+#include <assert.h>
> >>>>+#include <pthread.h>
> >>>>+
> >>>>+#include "igt.h"
> >>>>+
> >>>>+#define MB(x) ((uint64_t)(x) * 1024 * 1024)
> >>>>+#ifndef PAGE_SIZE
> >>>>+  #define PAGE_SIZE 4096
> >>>>+#endif
> >>>>+#define SUBBUF_SIZE (19*PAGE_SIZE)
> >>>>+/* Need large buffering from logger side to hide the DISK IO latency,
> >>>>Driver
> >>>>+ * can only store 8 snapshots of GuC log buffer in relay.
> >>>>+ */
> >>>>+#define NUM_SUBBUFS 100
> >>>>+
> >>>>+#define RELAY_FILE_NAME  "guc_log"
> >>>>+#define CONTROL_FILE_NAME "i915_guc_log_control"
> >>>>+
> >>>>+char *read_buffer;
> >>>>+char *out_filename;
> >>>>+int poll_timeout = 2; /* by default 2ms timeout */
> >>>>+pthread_mutex_t mutex;
> >>>>+pthread_t flush_thread;
> >>>>+int verbosity_level = 3; /* by default capture logs at max
> >>>>verbosity */
> >>>>+uint32_t produced, consumed;
> >>>>+uint64_t total_bytes_written;
> >>>>+int num_buffers = NUM_SUBBUFS;
> >>>>+int relay_fd, outfile_fd = -1;
> >>>>+bool stop_logging, discard_oldlogs;
> >>>>+uint32_t test_duration, max_filesize;
> >>>>+pthread_cond_t underflow_cond, overflow_cond;
> >>>>+
> >>>>+static void guc_log_control(bool enable_logging)
> >>>>+{
> >>>>+    int control_fd;
> >>>>+    char data[19];
> >>>>+    uint64_t val;
> >>>>+    int ret;
> >>>>+
> >>>>+    control_fd = igt_debugfs_open(CONTROL_FILE_NAME, O_WRONLY);
> >>>>+    if (control_fd < 0)
> >>>>+        igt_assert_f(0, "Couldn't open the guc log control file");
> >>>>+
> >>>>+    val = enable_logging ? ((verbosity_level << 4) | 0x1) : 0;
> >>>>+
> >>>>+    snprintf(data, sizeof(data), "0x%" PRIx64, val);
> >>>>+    ret = write(control_fd, data, strlen(data) + 1);
> >>>
> >>>Minor: It looks safe like it is but something like below would maybe be
> >>>more robust?
> >>>
> >>>ret = snprintf(data, sizeof(data), "0x%" PRIx64, val);
> >>>igt_assert(ret > 2 && ret < sizeof(data));
> >>
> >>ok will add, but possibility of failure will be really remote here.
> >>but igt_assert(ret > 0) should suffice.
> >
> >Yes there is no possibility for failure as it stands, just more robust
> >implementation should someone change something in the future. That's why
> >I said you could also decide to keep it as is. My version also avoided
> >the strlen since snprintf already tells you that.
> >
> 
> fine, will use your version then.
> 
> >>>ret = write(control_fd, data, ret);
> >>>igt_assert(ret > 0); // assuming short writes can't happen
> >>>
> >>>Up to you.
> >>>
> >>>>+    if (ret < 0)
> >>>>+        igt_assert_f(0, "Couldn't write to the log control file");
> >>>>+
> >>>>+    close(control_fd);
> >>>>+}
> >>>>+
> >>>>+static void int_sig_handler(int sig)
> >>>>+{
> >>>>+    igt_info("Received signal %d\n", sig);
> >>>>+
> >>>>+    stop_logging = true;
> >>>>+}
> >>>>+
> >>>>+static void pull_leftover_data(void)
> >>>>+{
> >>>>+    unsigned int bytes_read = 0;
> >>>>+    int ret;
> >>>>+
> >>>>+    while (1) {
> >>>>+        /* Read the logs from relay buffer */
> >>>>+        ret = read(relay_fd, read_buffer, SUBBUF_SIZE);
> >>>>+        if (!ret)
> >>>>+            break;
> >>>>+        else if (ret < 0)
> >>>>+            igt_assert_f(0, "Failed to read from the guc log file");
> >>>>+        else if (ret < SUBBUF_SIZE)
> >>>>+            igt_assert_f(0, "invalid read from relay file");
> >>>>+
> >>>>+        bytes_read += ret;
> >>>>+
> >>>>+        if (outfile_fd > 0) {
> >>>
> >>>>= 0 I think. Or is it even needed since open_output_file asserts if it
> >>>fails to open?
> >>>
> >>Actually pull_leftover_data() will be called twice, once before opening
> >>the outfile. This is for purging the old logs, if enabled.
> >
> >Oh right, OK then.
> >
> >>
> >>>>+            ret = write(outfile_fd, read_buffer, SUBBUF_SIZE);
> >>>>+            if (ret < 0)
> >>>>+                igt_assert_f(0, "Couldn't dump the logs in a file");
> >>>
> >>>Do you need to check for short writes as well here? Man page for write
> >>>says it can happen with out of disk space or signals.
> >>>
> >>To keep things simple can just check 'ret < SUBBUF_SIZE' considering
> >>out of disk space as a serious failure or add a loop here which will run
> >>until SUBBUF_SIZE bytes have been written.
> >
> >Just assert, no point in complicating things.
> >
> Fine will assert on (ret < SUBBUF_SIZE)
> 
> >>>>+            total_bytes_written += ret;
> >>>>+        }
> >>>>+    };
> >>>>+
> >>>>+    igt_debug("%u bytes flushed\n", bytes_read);
> >>>>+}
> >>>>+
> >>>>+static int num_filled_bufs(void)
> >>>>+{
> >>>>+    return (produced - consumed);
> >>>>+}
> >>>>+
> >>>>+static void pull_data(void)
> >>>>+{
> >>>>+    char *ptr;
> >>>>+    int ret;
> >>>>+
> >>>>+    pthread_mutex_lock(&mutex);
> >>>>+    while (num_filled_bufs() >= num_buffers) {
> >>>>+        igt_debug("overflow, will wait, produced %u, consumed %u\n",
> >>>>produced, consumed);
> >>>>+        /* Stall the main thread in case of overflow, as there are no
> >>>>+         * buffers available to store the new logs, otherwise there
> >>>>+         * could be corruption if both threads work on the same
> >>>>buffer.
> >>>>+         */
> >>>>+        pthread_cond_wait(&overflow_cond, &mutex);
> >>>>+    };
> >>>>+    pthread_mutex_unlock(&mutex);
> >>>>+
> >>>>+    ptr = read_buffer + (produced % num_buffers) * SUBBUF_SIZE;
> >>>>+
> >>>>+    /* Read the logs from relay buffer */
> >>>>+    ret = read(relay_fd, ptr, SUBBUF_SIZE);
> >>>>+    if (ret < 0)
> >>>>+        igt_assert_f(0, "Failed to read from the guc log file");
> >>>>+    else if (ret && (ret < SUBBUF_SIZE))
> >>>>+        igt_assert_f(0, "invalid read from relay file");
> >>>>+
> >>>>+    pthread_mutex_lock(&mutex);
> >>>>+    if (ret) {
> >>>>+        produced++;
> >>>>+        pthread_cond_signal(&underflow_cond);
> >>>>+    }
> >>>>+    pthread_mutex_unlock(&mutex);
> >>>
> >>>Very minor: can move the mutex under the if.
> >>>
> >>Fine will move under the 'if'.
> >>
> >>>>+}
> >>>>+
> >>>>+static void *flusher(void *arg)
> >>>>+{
> >>>>+    char *ptr;
> >>>>+    int ret;
> >>>>+
> >>>>+    igt_debug("Execution started of flusher thread\n");
> >>>>+
> >>>>+    do {
> >>>>+        pthread_mutex_lock(&mutex);
> >>>>+        while (!num_filled_bufs()) {
> >>>>+            /* Exit only after completing the flush of all the filled
> >>>>+             * buffers as User would expect that all logs captured up
> >>>>+             * till the point of interruption/exit are written out to
> >>>>+             * the disk file.
> >>>>+             */
> >>>>+            if (stop_logging) {
> >>>>+                igt_debug("flusher to exit now\n");
> >>>>+                pthread_mutex_unlock(&mutex);
> >>>>+                return NULL;
> >>>>+            }
> >>>>+            pthread_cond_wait(&underflow_cond, &mutex);
> >>>>+        };
> >>>>+        pthread_mutex_unlock(&mutex);
> >>>>+
> >>>>+        ptr = read_buffer + (consumed % num_buffers) * SUBBUF_SIZE;
> >>>>+
> >>>>+        ret = write(outfile_fd, ptr, SUBBUF_SIZE);
> >>>>+        if (ret < 0)
> >>>>+            igt_assert_f(0, "Couldn't dump the logs in a file");
> >>>
> >>>Question about short writes applies here as well.
> >>>
> >>Fine will do the same here whatever is decided for above.
> >>
> >>>>+
> >>>>+        total_bytes_written += ret;
> >>>>+        if (max_filesize && (total_bytes_written >
> >>>>MB(max_filesize))) {
> >>>>+            igt_debug("Reached the target of %" PRIu64 " bytes\n",
> >>>>MB(max_filesize));
> >>>>+            stop_logging = true;
> >>>>+        }
> >>>>+
> >>>>+        pthread_mutex_lock(&mutex);
> >>>>+        consumed++;
> >>>>+        pthread_cond_signal(&overflow_cond);
> >>>>+        pthread_mutex_unlock(&mutex);
> >>>>+    } while(1);
> >>>
> >>>Minor again: You use both "while(1) {}" and "do { } while(1)" in here.
> >>>Maybe stick with one? :D (I think "for (;;)" is preferred, but maybe I
> >>>am misremembering things.)
> >>>
> >>Sorry for the inconsistency, will clean this up.
> >>
> >>>>+
> >>>>+    return NULL;
> >>>>+}
> >>>>+
> >>>>+static void init_flusher_thread(void)
> >>>>+{
> >>>>+    struct sched_param    thread_sched;
> >>>>+    pthread_attr_t        p_attr;
> >>>>+
> >>>>+    pthread_cond_init(&underflow_cond, NULL);
> >>>>+    pthread_cond_init(&overflow_cond, NULL);
> >>>>+    pthread_mutex_init(&mutex, NULL);
> >>>>+
> >>>>+    if (pthread_attr_init(&p_attr))
> >>>>+        igt_assert_f(0, "Error obtaining default thread attributes");
> >>>>+
> >>>>+    if (pthread_attr_setinheritsched(&p_attr, PTHREAD_EXPLICIT_SCHED))
> >>>>+        igt_assert_f(0, "Couldn't set inheritsched");
> >>>>+
> >>>>+    if (pthread_attr_setschedpolicy(&p_attr, SCHED_RR))
> >>>>+        igt_assert_f(0, "Couldn't set thread scheduling policy");
> >>>>+
> >>>>+    /* Keep the flusher task also at rt priority, so that it doesn't
> >>>>get
> >>>>+     * too late in flushing the collected logs in local buffers to
> >>>>the disk,
> >>>>+     * and so main thread always have spare buffers to collect the
> >>>>logs.
> >>>>+     */
> >>>>+    thread_sched.sched_priority = 5;
> >>>>+    if (pthread_attr_setschedparam(&p_attr, &thread_sched))
> >>>>+        igt_assert_f(0, "Couldn't set thread priority");
> >>>>+
> >>>>+    if (pthread_create(&flush_thread, &p_attr, flusher, NULL))
> >>>>+        igt_assert_f(0, "thread creation failed");
> >>>
> >>>More nitpicks: You mix starting error messages with upper case and lower
> >>>case.
> >>
> >>Will use the lower case for all error messages. fine ?
> >
> >Fine with me.
> >
> 
> Thanks
> >>>
> >>>>+
> >>>>+    if (pthread_attr_destroy(&p_attr))
> >>>>+        igt_assert_f(0, "Error destroying thread attributes");
> >>>>+}
> >>>>+
> >>>>+static void open_relay_file(void)
> >>>>+{
> >>>>+    relay_fd = igt_debugfs_open(RELAY_FILE_NAME, O_RDONLY);
> >>>>+    if (relay_fd < 0)
> >>>>+        igt_assert_f(0, "Couldn't open the guc log file");
> >>>>+
> >>>>+    /* Purge the old/boot-time logs from the relay buffer.
> >>>>+     * This is more for Val team's requirement, where they have to
> >>>>first
> >>>>+     * purge the existing logs before starting the tests for which
> >>>>the logs
> >>>>+     * are actually needed. After this logger will enter into a loop
> >>>>and
> >>>>+     * wait for the new data, at that point benchmark can be launched
> >>>>from
> >>>>+     * a different shell.
> >>>>+     */
> >>>>+    if (discard_oldlogs)
> >>>>+        pull_leftover_data();
> >>>>+}
> >>>>+
> >>>>+static void open_output_file(void)
> >>>>+{
> >>>>+    char *filename;
> >>>>+
> >>>>+    if (asprintf(&filename, "/tmp/%s", out_filename ? :
> >>>>"guc_log_dump.dat") < 0)
> >>>>+        igt_assert_f(0, "Couldn't allocate the filename");
> >>>
> >>>I think you need to take the out_filename when specified by the user
> >>>as-is and not stick it in /tmp. It is perfectly reasonable that someone
> >>>would want to put it somewhere else.
> >>>
> >>Ok so User should provide the full path.
> >
> >It can be relative, absolute, or just a filename. My suggestion was to
> >just use what the user provided for the open(2) call.
> >
> Yes the path provided can be absolute/relative, will do the open()
> call with whatever string is provided as a command line.
> 
> >>
> >>>Also better to move the default name of "guc_log_dump.dat" to where
> >>>defaults like RELAY_FILE_NAME and such are defined.
> >>>
> >>fine.
> >>
> >>>And I would not default to /tmp but the current directory.
> >>>
> >>Chris had suggested to use /tmp. Fine will use the current directory.
> >>
> >>>>+
> >>>>+    /* Use Direct IO mode for the output file, as the data written is
> >>>>not
> >>>>+     * supposed to be accessed again, this saves a copy of data from
> >>>>App's
> >>>>+     * buffer to kernel buffer (Page cache). Due to no buffering on
> >>>>kernel
> >>>>+     * side, data is flushed out to disk faster and more buffering
> >>>>can be
> >>>>+     * done on the logger side to hide the disk IO latency.
> >>>>+     */
> >>>>+    outfile_fd = open(filename, O_CREAT | O_WRONLY | O_TRUNC |
> >>>>O_DIRECT, 0444);
> >>>
> >>>0400 or 0440 would be better I think.
> >>
> >>fine, this might make the file inaccessible to Root User in case the
> >>logger wasn't running as a Root ?
> >
> >I assumed logger will run as root and it would be preferable not to have
> >log files accessible by anyone. Either way root can read anything
> >regardless of permissions, no?
> >
> Sorry after working on selinux enabled setups (in Android, where
> there are certain restrictions on Root also), forgot that Root can
> do anything.
> 
> >>>
> >>>>+    if (outfile_fd < 0)
> >>>>+        igt_assert_f(0, "Couldn't open the output file");
> >>>>+
> >>>>+    free(out_filename);
> >>>>+    free(filename);
> >>>>+}
> >>>>+
> >>>>+static void init_main_thread(void)
> >>>>+{
> >>>>+    struct sched_param    thread_sched;
> >>>>+    /* Run the main thread at highest priority to ensure that it
> >>>>always
> >>>>+     * gets woken-up at earliest on arrival of new data and so is
> >>>>always
> >>>>+     * ready to pull the logs, otherwise there could be loss logs if
> >>>>+     * GuC firmware is generating logs at a very high rate.
> >>>>+     */
> >>>>+    thread_sched.sched_priority = 1;
> >>>>+    if (sched_setscheduler(getpid(), SCHED_FIFO, &thread_sched))
> >>>>+        igt_assert_f(0, "Couldn't set the priority");
> >>>>+
> >>>>+    if (signal(SIGINT, int_sig_handler) == SIG_ERR)
> >>>>+        igt_assert_f(0, "SIGINT handler registration failed");
> >>>>+
> >>>>+    /* Need an aligned pointer for direct IO */
> >>>>+    if (posix_memalign((void **)&read_buffer, PAGE_SIZE, num_buffers
> >>>>* SUBBUF_SIZE))
> >>>>+        igt_assert_f(0, "Couldn't allocate the read buffer");
> >>>>+
> >>>>+    /* Keep the pages locked in RAM, avoid page fault overhead */
> >>>>+    if (mlock(read_buffer, num_buffers * SUBBUF_SIZE))
> >>>>+        igt_assert_f(0, "Failed to lock memory");
> >>>>+
> >>>>+    /* Enable the logging, it may not have been enabled from boot
> >>>>and so
> >>>>+     * the relay file also wouldn't have been created.
> >>>>+     */
> >>>>+    guc_log_control(true);
> >>>>+
> >>>>+    open_relay_file();
> >>>>+    open_output_file();
> >>>>+}
> >>>>+
> >>>>+static int parse_options(int opt, int opt_index, void *data)
> >>>>+{
> >>>>+    igt_debug("opt %c optarg %s\n", opt, optarg);
> >>>>+
> >>>>+    switch(opt) {
> >>>>+    case 'v':
> >>>>+        verbosity_level = atoi(optarg);
> >>>>+        if (verbosity_level < 0)
> >>>>+            verbosity_level = 0;
> >>>>+        else if (verbosity_level > 3)
> >>>>+            verbosity_level = 3;
> >>>
> >>>Notify user if invalid value was provided?
> >>>
> >>Just notify but don't abort and continue by using a valid value ?
> >
> >No strong opinion. Either is much better than silently using a different
> >value or ignoring the fact user has typed in garbage on the command line.
> >
> Ok will report/notify and then abort.
> Will do the same for other arguments.
> 
> Is it fine ?
> 
> >>
> >>>>+
> >>>>+        igt_debug("verbosity level to be used is %d\n",
> >>>>verbosity_level);
> >>>>+        break;
> >>>>+    case 'o':
> >>>>+        if (asprintf(&out_filename, "%s", optarg) < 0)
> >>>
> >>>strdup?
> >>ok will try to use strdup.
> >>>
> >>>>+            igt_assert_f(0, "Couldn't allocate the o/p filename");
> >>>>+        igt_debug("logs to be stored in file /tmp/%s\n",
> >>>>out_filename);
> >>>>+        break;
> >>>>+    case 'b':
> >>>>+        num_buffers = atoi(optarg);
> >>>>+        if (num_buffers == 0)
> >>>>+            num_buffers = NUM_SUBBUFS;
> >>>
> >>>Again notify if invalid number/value.
> >>>
> >>same doubt as above.
> >>
> >>>>+        igt_debug("number of buffers to be used is %d\n",
> >>>>num_buffers);
> >>>>+        break;
> >>>>+    case 't':
> >>>>+        test_duration = atoi(optarg);
> >>>>+        if (test_duration > 0)
> >>>
> >>>If switch was specified it should have a valid optarg or error reported.
> >>>
> >>>>+            igt_debug("logger to run for %d seconds\n",
> >>>>test_duration);
> >>>>+        break;
> >>>>+    case 'p':
> >>>>+        poll_timeout = atoi(optarg);
> >>>>+        if (poll_timeout == 0)
> >>>>+            poll_timeout = 2; /* at least use 2ms poll timeout */
> >>>
> >>>Hm I would prefer if you allowed no poll period.
> >>
> >>by no poll period, do you mean indefinite wait ?
> >>
> >>That can be done by specifying -1 as a timeout value in the command line,
> >>
> >>"-p --polltimeout=ms polling timeout in ms, -1 == indefinite wait for
> >>the new data\n"
> >
> >My bad, I missed that. All good then.
> >
> >>
> >>>I even think it should be the default if we think relayfs will be
> >>>definitely fixed to not
> >>>require it.
> >>>
> >>Can keep the indefinite wait as a default.
> >>
> >>>In any case, commend is wrong since user can specify 1ms and get away
> >>>with it. :)
> >>>
> >>>>+        if (poll_timeout > 0)
> >>>>+            igt_debug("polling to be done with %d milli seconds
> >>>>timeout\n", poll_timeout);
> >>>
> >>>milliseconds
> >>>
> >>>>+        break;
> >>>>+    case 's':
> >>>>+        max_filesize = atoi(optarg);
> >>>>+        if (max_filesize > 0)
> >>>
> >>>Same as for -t.
> >>>
> >>>>+            igt_debug("max allowed size of the output file is %d
> >>>>MBs\n", max_filesize);
> >>>>+        break;
> >>>>+    case 'd':
> >>>>+        discard_oldlogs = true;
> >>>>+        igt_debug("old/boot-time logs will be discarded\n");
> >>>>+        break;
> >>>>+    }
> >>>>+
> >>>>+    return 0;
> >>>>+}
> >>>>+
> >>>>+static void process_command_line(int argc, char **argv)
> >>>>+{
> >>>>+    static struct option long_options[] = {
> >>>>+        {"verbosity", required_argument, 0, 'v'},
> >>>>+        {"outputfilename", required_argument, 0, 'o'},
> >>>>+        {"buffers", required_argument, 0, 'b'},
> >>>>+        {"testduration", required_argument, 0, 't'},
> >>>>+        {"polltimeout", required_argument, 0, 'p'},
> >>>>+        {"size", required_argument, 0, 's'},
> >>>>+        {"discard", no_argument, 0, 'd'},
> >>>>+        { 0, 0, 0, 0 }
> >>>>+    };
> >>>>+
> >>>>+    const char *help =
> >>>>+        "  -v --verbosity=level   verbosity level of GuC logging
> >>>>(0-3)\n"
> >>>>+        "  -o --outfile=name      name of the output file in /tmp
> >>>>directory, where logs will be stored\n"
> >>>>+        "  -b --buffers=num       number of buffers to be maintained
> >>>>on logger side for storing logs\n"
> >>>>+        "  -t --testduration=sec  max duration in seconds for which
> >>>>the logger should run, 0 == forever\n"
> >>>>+        "  -p --polltimeout=ms    polling timeout in ms, -1 ==
> >>>>indefinite wait for the new data\n"
> >>>>+        "  -s --size=MB           max size of output file in MBs
> >>>>after which logging will be stopped, 0 == no limit\n"
> >>>>+        "  -d --discard           discard the old/boot-time logs
> >>>>before entering into the capture loop\n";
> >>>>+
> >>>>+    igt_simple_init_parse_opts(&argc, argv, "v:o:b:t:p:s:d",
> >>>>long_options,
> >>>>+                   help, parse_options, NULL);
> >>>>+}
> >>>>+
> >>>>+int main(int argc, char **argv)
> >>>>+{
> >>>>+    struct pollfd relay_poll_fd;
> >>>>+    struct timespec start={};
> >>>>+    int nfds;
> >>>>+    int ret;
> >>>>+
> >>>>+    process_command_line(argc, argv);
> >>>>+
> >>>>+    init_main_thread();
> >>>>+
> >>>>+    /* Use a separate thread for flushing the logs to a file on disk.
> >>>>+     * Main thread will buffer the data from relay file in its pool of
> >>>>+     * buffers and other thread will flush the data to disk in
> >>>>background.
> >>>>+     * This is needed, albeit by default data is written out to
> >>>>disk in
> >>>>+     * async mode, as when there are too many dirty pages in the RAM,
> >>>>+     * (/proc/sys/vm/dirty_ratio), kernel starts blocking the
> >>>>processes
> >>>>+     * doing the file writes.
> >>>>+     */
> >>>>+    init_flusher_thread();
> >>>>+
> >>>>+    relay_poll_fd.fd = relay_fd;
> >>>>+    relay_poll_fd.events = POLLIN;
> >>>>+    relay_poll_fd.revents = 0;
> >>>>+
> >>>>+    nfds = 1; /* only one fd to poll */
> >>>>+
> >>>>+    while (!stop_logging)
> >>>>+    {
> >>>>+        if (test_duration && (igt_seconds_elapsed(&start) >
> >>>>test_duration)) {
> >>>
> >>>If you agree to allow no poll period the this would not work right? In
> >>>that case you would need to use alarm(2) or something.
> >>>
> >>
> >>Can calculate the timeout value for poll call as,
> >>     if (poll_timeout < 0) {
> >>         timeout = test_duration - igt_seconds_elapsed(&start))
> >>     }
> >
> >My point was that with indefinite poll loop will not run if there is not
> >log data so timeout will not work implemented like this.
> >
> I understood your concern in first place but probably didn't put
> forth my point clearly.
> 
> For more clarity, this is how think it can be addressed.
> 
> --- a/tools/intel_guc_logger.c
> +++ b/tools/intel_guc_logger.c
> @@ -370,6 +370,8 @@ int main(int argc, char **argv)
>  {
>  	struct pollfd relay_poll_fd;
>  	struct timespec start={};
> +	uint32_t time_elapsed;
> +	int timeout;
>  	int nfds;
>  	int ret;
> 
> @@ -395,10 +397,17 @@ int main(int argc, char **argv)
> 
>  	while (!stop_logging)
>  	{
> -		if (test_duration && (igt_seconds_elapsed(&start) > test_duration)) {
> -			igt_debug("Ran for stipulated %d seconds, exit now\n", test_duration);
> -			stop_logging = true;
> -			break;
> +		timeout = poll_timeout;
> +		if (test_duration) {
> +			time_elapsed = igt_seconds_elapsed(&start);
> +			if (time_elapsed >= test_duration) {
> +				igt_debug("Ran for stipulated %d seconds, exit now\n", test_duration);
> +				stop_logging = true;
> +				break;
> +			}
> +			if (poll_timeout < 0)
> +				timeout = (test_duration - time_elapsed) * 1000;
> 		}
> 
>  		/* Wait/poll for the new data to be available, relay doesn't
> @@ -412,7 +421,7 @@ int main(int argc, char **argv)
>  		 * than a jiffy gap between 2 flush interrupts) and relay runs
>  		 * out of sub buffers to store the new logs.
>  		 */
> -		ret = poll(&relay_poll_fd, nfds, poll_timeout);
> +		ret = poll(&relay_poll_fd, nfds, timeout);
>  		if (ret < 0) {
>  			if (errno == EINTR)
>  				break;
> 
> So will not do polling with indefinite timeout and adjust the
> timeout value as per test's duration.
> Does it look ok ?

Since the comment still refers to a kernel bug that you've fixed, it can
just go. The timeout calculation is indeed more simply expressed as
alarm(timeout).

And fixing the blocking read() is about 10 lines in the kernel...
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] tools/intel_guc_logger: Utility for capturing GuC firmware logs in a file
  2016-09-07  8:44         ` Chris Wilson
@ 2016-09-07  9:37           ` Tvrtko Ursulin
  2016-09-07  9:55             ` Chris Wilson
  2016-09-07 10:45             ` [PATCH] " Goel, Akash
  0 siblings, 2 replies; 17+ messages in thread
From: Tvrtko Ursulin @ 2016-09-07  9:37 UTC (permalink / raw)
  To: Chris Wilson, Goel, Akash, intel-gfx


On 07/09/16 09:44, Chris Wilson wrote:
> On Wed, Sep 07, 2016 at 01:40:27PM +0530, Goel, Akash wrote:
>> On 9/6/2016 9:22 PM, Tvrtko Ursulin wrote:

[snip]

>>>>>> +    while (!stop_logging)
>>>>>> +    {
>>>>>> +        if (test_duration && (igt_seconds_elapsed(&start) >
>>>>>> test_duration)) {
>>>>>
>>>>> If you agree to allow no poll period the this would not work right? In
>>>>> that case you would need to use alarm(2) or something.
>>>>>
>>>>
>>>> Can calculate the timeout value for poll call as,
>>>>      if (poll_timeout < 0) {
>>>>          timeout = test_duration - igt_seconds_elapsed(&start))
>>>>      }
>>>
>>> My point was that with indefinite poll loop will not run if there is not
>>> log data so timeout will not work implemented like this.
>>>
>> I understood your concern in first place but probably didn't put
>> forth my point clearly.
>>
>> For more clarity, this is how think it can be addressed.
>>
>> --- a/tools/intel_guc_logger.c
>> +++ b/tools/intel_guc_logger.c
>> @@ -370,6 +370,8 @@ int main(int argc, char **argv)
>>   {
>>   	struct pollfd relay_poll_fd;
>>   	struct timespec start={};
>> +	uint32_t time_elapsed;
>> +	int timeout;
>>   	int nfds;
>>   	int ret;
>>
>> @@ -395,10 +397,17 @@ int main(int argc, char **argv)
>>
>>   	while (!stop_logging)
>>   	{
>> -		if (test_duration && (igt_seconds_elapsed(&start) > test_duration)) {
>> -			igt_debug("Ran for stipulated %d seconds, exit now\n", test_duration);
>> -			stop_logging = true;
>> -			break;
>> +		timeout = poll_timeout;
>> +		if (test_duration) {
>> +			time_elapsed = igt_seconds_elapsed(&start);
>> +			if (time_elapsed >= test_duration) {
>> +				igt_debug("Ran for stipulated %d seconds, exit now\n", test_duration);
>> +				stop_logging = true;
>> +				break;
>> +			}
>> +			if (poll_timeout < 0)
>> +				timeout = (test_duration - time_elapsed) * 1000;
>> 		}
>>
>>   		/* Wait/poll for the new data to be available, relay doesn't
>> @@ -412,7 +421,7 @@ int main(int argc, char **argv)
>>   		 * than a jiffy gap between 2 flush interrupts) and relay runs
>>   		 * out of sub buffers to store the new logs.
>>   		 */
>> -		ret = poll(&relay_poll_fd, nfds, poll_timeout);
>> +		ret = poll(&relay_poll_fd, nfds, timeout);
>>   		if (ret < 0) {
>>   			if (errno == EINTR)
>>   				break;
>>
>> So will not do polling with indefinite timeout and adjust the
>> timeout value as per test's duration.
>> Does it look ok ?
>
> Since the comment still refers to a kernel bug that you've fixed, it can
> just go. The timeout calculation is indeed more simply expressed as
> alarm(timeout).

Yes I wrote privately that's especially true since there is already a 
handler for SIGINT which would do the right thing for SIGALRM as well. I 
don't feel so strongly about this but now that we both think the same 
maybe go for the simpler implementation if you don't mind Akash?

> And fixing the blocking read() is about 10 lines in the kernel...

Haven't checked but if that is the case, since we are already fixing 
relayfs issues, it would be good to do that one as well since it would 
simplify the logger. Because if we do it straight away then we know 
logger can use it, and if we leave it for later then it gets uglier for 
the logger.

But if we cannot make the fix go in the same kernel version (or earlier) 
than the GuC logging then I think we don't need to block on that.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] tools/intel_guc_logger: Utility for capturing GuC firmware logs in a file
  2016-09-07  9:37           ` Tvrtko Ursulin
@ 2016-09-07  9:55             ` Chris Wilson
  2016-09-07 15:27               ` [PATCH v2] " akash.goel
  2016-09-07 10:45             ` [PATCH] " Goel, Akash
  1 sibling, 1 reply; 17+ messages in thread
From: Chris Wilson @ 2016-09-07  9:55 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Goel, Akash, intel-gfx

On Wed, Sep 07, 2016 at 10:37:07AM +0100, Tvrtko Ursulin wrote:
> 
> On 07/09/16 09:44, Chris Wilson wrote:
> >And fixing the blocking read() is about 10 lines in the kernel...
> 
> Haven't checked but if that is the case, since we are already fixing
> relayfs issues, it would be good to do that one as well since it
> would simplify the logger. Because if we do it straight away then we
> know logger can use it, and if we leave it for later then it gets
> uglier for the logger.
> 
> But if we cannot make the fix go in the same kernel version (or
> earlier) than the GuC logging then I think we don't need to block on
> that.

Totally agree. I think the poll() example is more useful anyway, since
that is more likely to fit into the event loop of a real client.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] tools/intel_guc_logger: Utility for capturing GuC firmware logs in a file
  2016-09-07  9:37           ` Tvrtko Ursulin
  2016-09-07  9:55             ` Chris Wilson
@ 2016-09-07 10:45             ` Goel, Akash
  1 sibling, 0 replies; 17+ messages in thread
From: Goel, Akash @ 2016-09-07 10:45 UTC (permalink / raw)
  To: Tvrtko Ursulin, Chris Wilson, intel-gfx



On 9/7/2016 3:07 PM, Tvrtko Ursulin wrote:
>
> On 07/09/16 09:44, Chris Wilson wrote:
>> On Wed, Sep 07, 2016 at 01:40:27PM +0530, Goel, Akash wrote:
>>> On 9/6/2016 9:22 PM, Tvrtko Ursulin wrote:
>
> [snip]
>
>>>>>>> +    while (!stop_logging)
>>>>>>> +    {
>>>>>>> +        if (test_duration && (igt_seconds_elapsed(&start) >
>>>>>>> test_duration)) {
>>>>>>
>>>>>> If you agree to allow no poll period the this would not work
>>>>>> right? In
>>>>>> that case you would need to use alarm(2) or something.
>>>>>>
>>>>>
>>>>> Can calculate the timeout value for poll call as,
>>>>>      if (poll_timeout < 0) {
>>>>>          timeout = test_duration - igt_seconds_elapsed(&start))
>>>>>      }
>>>>
>>>> My point was that with indefinite poll loop will not run if there is
>>>> not
>>>> log data so timeout will not work implemented like this.
>>>>
>>> I understood your concern in first place but probably didn't put
>>> forth my point clearly.
>>>
>>> For more clarity, this is how think it can be addressed.
>>>
>>> --- a/tools/intel_guc_logger.c
>>> +++ b/tools/intel_guc_logger.c
>>> @@ -370,6 +370,8 @@ int main(int argc, char **argv)
>>>   {
>>>       struct pollfd relay_poll_fd;
>>>       struct timespec start={};
>>> +    uint32_t time_elapsed;
>>> +    int timeout;
>>>       int nfds;
>>>       int ret;
>>>
>>> @@ -395,10 +397,17 @@ int main(int argc, char **argv)
>>>
>>>       while (!stop_logging)
>>>       {
>>> -        if (test_duration && (igt_seconds_elapsed(&start) >
>>> test_duration)) {
>>> -            igt_debug("Ran for stipulated %d seconds, exit now\n",
>>> test_duration);
>>> -            stop_logging = true;
>>> -            break;
>>> +        timeout = poll_timeout;
>>> +        if (test_duration) {
>>> +            time_elapsed = igt_seconds_elapsed(&start);
>>> +            if (time_elapsed >= test_duration) {
>>> +                igt_debug("Ran for stipulated %d seconds, exit
>>> now\n", test_duration);
>>> +                stop_logging = true;
>>> +                break;
>>> +            }
>>> +            if (poll_timeout < 0)
>>> +                timeout = (test_duration - time_elapsed) * 1000;
>>>         }
>>>
>>>           /* Wait/poll for the new data to be available, relay doesn't
>>> @@ -412,7 +421,7 @@ int main(int argc, char **argv)
>>>            * than a jiffy gap between 2 flush interrupts) and relay runs
>>>            * out of sub buffers to store the new logs.
>>>            */
>>> -        ret = poll(&relay_poll_fd, nfds, poll_timeout);
>>> +        ret = poll(&relay_poll_fd, nfds, timeout);
>>>           if (ret < 0) {
>>>               if (errno == EINTR)
>>>                   break;
>>>
>>> So will not do polling with indefinite timeout and adjust the
>>> timeout value as per test's duration.
>>> Does it look ok ?
>>
>> Since the comment still refers to a kernel bug that you've fixed, it can
>> just go. The timeout calculation is indeed more simply expressed as
>> alarm(timeout).
>
> Yes I wrote privately that's especially true since there is already a
> handler for SIGINT which would do the right thing for SIGALRM as well. I
> don't feel so strongly about this but now that we both think the same
> maybe go for the simpler implementation if you don't mind Akash?
>
Thanks much for suggestion.
Will use 'alarm(timeout)', its definitely much simpler.

>> And fixing the blocking read() is about 10 lines in the kernel...
>
> Haven't checked but if that is the case, since we are already fixing
> relayfs issues, it would be good to do that one as well since it would
> simplify the logger. Because if we do it straight away then we know
> logger can use it, and if we leave it for later then it gets uglier for
> the logger.
>
> But if we cannot make the fix go in the same kernel version (or earlier)
> than the GuC logging then I think we don't need to block on that.
>

Sorry not sure that whether we would gain much by trying to add the 
support for blocking read in relay.

For a regular disk file, which is of a fixed size, it makes sense to 
have a provision to block the reader until file's data is paged in from 
the disk into RAM.

But for relay, data to be read would invariably be generated dynamically 
which can stop at anytime and thus the reader could get blocked for ever.

I think the current relay semantics are fine that if there is no data 
left to be read in channel buffers zero will be returned and Clients
can get to know about the generation of new data through poll (using a
timeout).

Best regards
Akash

> Regards,



>
> Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH v2] tools/intel_guc_logger: Utility for capturing GuC firmware logs in a file
  2016-09-07  9:55             ` Chris Wilson
@ 2016-09-07 15:27               ` akash.goel
  2016-09-07 15:39                 ` Tvrtko Ursulin
  0 siblings, 1 reply; 17+ messages in thread
From: akash.goel @ 2016-09-07 15:27 UTC (permalink / raw)
  To: intel-gfx; +Cc: Akash Goel

From: Akash Goel <akash.goel@intel.com>

This patch provides a test utility which helps capture GuC firmware logs and
then dump them to file.
The logs are pulled from a debugfs file '/sys/kernel/debug/dri/guc_log' and
stored into a file '/tmp/guc_log_dump.dat', the name of the output file can
be changed through a command line argument.

The utility goes into an infinite loop where it waits for the arrival of new
logs and as soon as new set of logs are produced it captures them in its local
buffer which is then flushed out to the file on disk.
Any time when logging needs to be ended, User can stop this utility (CTRL+C).

Before entering into a loop, it first discards whatever logs are present in
the debugfs file.
This way User can first launch this utility and then start a workload/activity
for which GuC firmware logs are to be actually captured and keep running the
utility for as long as its needed, like once the workload is over this utility
can be forcefully stopped.

If the logging wasn't enabled on GuC side by the Driver at boot time, utility
will first enable the logging and later on when it is stopped (CTRL+C) it will
also pause the logging on GuC side.

v2:
- Use combination of alarm system call & SIGALRM signal to run the utility
  for required duration. (Tvrtko)
- Fix inconsistencies, do minor cleanup and refactoring. (Tvrtko)

Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Signed-off-by: Akash Goel <akash.goel@intel.com>
---
 tools/Makefile.sources   |   1 +
 tools/intel_guc_logger.c | 425 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 426 insertions(+)
 create mode 100644 tools/intel_guc_logger.c

diff --git a/tools/Makefile.sources b/tools/Makefile.sources
index 2bb6c8e..be58871 100644
--- a/tools/Makefile.sources
+++ b/tools/Makefile.sources
@@ -19,6 +19,7 @@ tools_prog_lists =		\
 	intel_gpu_time		\
 	intel_gpu_top		\
 	intel_gtt		\
+	intel_guc_logger        \
 	intel_infoframes	\
 	intel_l3_parity		\
 	intel_lid		\
diff --git a/tools/intel_guc_logger.c b/tools/intel_guc_logger.c
new file mode 100644
index 0000000..6e03282
--- /dev/null
+++ b/tools/intel_guc_logger.c
@@ -0,0 +1,425 @@
+
+#define _GNU_SOURCE  /* For using O_DIRECT */
+#include <inttypes.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/poll.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <assert.h>
+#include <pthread.h>
+
+#include "igt.h"
+
+#define MB(x) ((uint64_t)(x) * 1024 * 1024)
+#ifndef PAGE_SIZE
+  #define PAGE_SIZE 4096
+#endif
+#define SUBBUF_SIZE (19*PAGE_SIZE)
+/* Need large buffering from logger side to hide the DISK IO latency, Driver
+ * can only store 8 snapshots of GuC log buffer in relay.
+ */
+#define NUM_SUBBUFS 100
+
+#define RELAY_FILE_NAME  "guc_log"
+#define DEFAULT_OUTPUT_FILE_NAME  "guc_log_dump.dat"
+#define CONTROL_FILE_NAME "i915_guc_log_control"
+
+char *read_buffer;
+char *out_filename;
+int poll_timeout = 2; /* by default 2ms timeout */
+pthread_mutex_t mutex;
+pthread_t flush_thread;
+int verbosity_level = 3; /* by default capture logs at max verbosity */
+uint32_t produced, consumed;
+uint64_t total_bytes_written;
+int num_buffers = NUM_SUBBUFS;
+int relay_fd, outfile_fd = -1;
+bool stop_logging, discard_oldlogs;
+uint32_t test_duration, max_filesize;
+pthread_cond_t underflow_cond, overflow_cond;
+
+static void guc_log_control(bool enable_logging)
+{
+	int control_fd;
+	char data[19];
+	uint64_t val;
+	int ret;
+
+	control_fd = igt_debugfs_open(CONTROL_FILE_NAME, O_WRONLY);
+	igt_assert_f(control_fd >= 0, "couldn't open the guc log control file");
+
+	val = enable_logging ? ((verbosity_level << 4) | 0x1) : 0;
+
+	ret = snprintf(data, sizeof(data), "0x%" PRIx64, val);
+	igt_assert(ret > 2 && ret < sizeof(data));
+
+	ret = write(control_fd, data, ret);
+	igt_assert_f(ret > 0, "couldn't write to the log control file");
+
+	close(control_fd);
+}
+
+static void int_sig_handler(int sig)
+{
+	igt_info("received signal %d\n", sig);
+
+	stop_logging = true;
+}
+
+static void pull_leftover_data(void)
+{
+	unsigned int bytes_read = 0;
+	int ret;
+
+	do {
+		/* Read the logs from relay buffer */
+		ret = read(relay_fd, read_buffer, SUBBUF_SIZE);
+		if (!ret)
+			break;
+
+		igt_assert_f(ret > 0, "failed to read from the guc log file");
+		igt_assert_f(ret == SUBBUF_SIZE, "invalid read from relay file");
+
+		bytes_read += ret;
+
+		if (outfile_fd >= 0) {
+			ret = write(outfile_fd, read_buffer, SUBBUF_SIZE);
+			igt_assert_f(ret == SUBBUF_SIZE, "couldn't dump the logs in a file");
+			total_bytes_written += ret;
+		}
+	} while(1);
+
+	igt_debug("%u bytes flushed\n", bytes_read);
+}
+
+static int num_filled_bufs(void)
+{
+	return (produced - consumed);
+}
+
+static void pull_data(void)
+{
+	char *ptr;
+	int ret;
+
+	pthread_mutex_lock(&mutex);
+	while (num_filled_bufs() >= num_buffers) {
+		igt_debug("overflow, will wait, produced %u, consumed %u\n", produced, consumed);
+		/* Stall the main thread in case of overflow, as there are no
+		 * buffers available to store the new logs, otherwise there
+		 * could be corruption if both threads work on the same buffer.
+		 */
+		pthread_cond_wait(&overflow_cond, &mutex);
+	};
+	pthread_mutex_unlock(&mutex);
+
+	ptr = read_buffer + (produced % num_buffers) * SUBBUF_SIZE;
+
+	/* Read the logs from relay buffer */
+	ret = read(relay_fd, ptr, SUBBUF_SIZE);
+	igt_assert_f(ret >= 0, "failed to read from the guc log file");
+	igt_assert_f(ret == SUBBUF_SIZE, "invalid read from relay file");
+
+	if (ret) {
+		pthread_mutex_lock(&mutex);
+		produced++;
+		pthread_cond_signal(&underflow_cond);
+		pthread_mutex_unlock(&mutex);
+	}
+}
+
+static void *flusher(void *arg)
+{
+	char *ptr;
+	int ret;
+
+	igt_debug("execution started of flusher thread\n");
+
+	do {
+		pthread_mutex_lock(&mutex);
+		while (!num_filled_bufs()) {
+			/* Exit only after completing the flush of all the filled
+			 * buffers as User would expect that all logs captured up
+			 * till the point of interruption/exit are written out to
+			 * the disk file.
+			 */
+			if (stop_logging) {
+				igt_debug("flusher to exit now\n");
+				pthread_mutex_unlock(&mutex);
+				return NULL;
+			}
+			pthread_cond_wait(&underflow_cond, &mutex);
+		};
+		pthread_mutex_unlock(&mutex);
+
+		ptr = read_buffer + (consumed % num_buffers) * SUBBUF_SIZE;
+
+		ret = write(outfile_fd, ptr, SUBBUF_SIZE);
+		igt_assert_f(ret == SUBBUF_SIZE, "couldn't dump the logs in a file");
+
+		total_bytes_written += ret;
+		if (max_filesize && (total_bytes_written > MB(max_filesize))) {
+			igt_debug("reached the target of %" PRIu64 " bytes\n", MB(max_filesize));
+			stop_logging = true;
+		}
+
+		pthread_mutex_lock(&mutex);
+		consumed++;
+		pthread_cond_signal(&overflow_cond);
+		pthread_mutex_unlock(&mutex);
+	} while(1);
+
+	return NULL;
+}
+
+static void init_flusher_thread(void)
+{
+	struct sched_param	thread_sched;
+	pthread_attr_t		p_attr;
+	int ret;
+
+	pthread_cond_init(&underflow_cond, NULL);
+	pthread_cond_init(&overflow_cond, NULL);
+	pthread_mutex_init(&mutex, NULL);
+
+	ret = pthread_attr_init(&p_attr);
+	igt_assert_f(ret == 0, "error obtaining default thread attributes");
+
+	ret = pthread_attr_setinheritsched(&p_attr, PTHREAD_EXPLICIT_SCHED);
+	igt_assert_f(ret == 0, "couldn't set inheritsched");
+
+	ret = pthread_attr_setschedpolicy(&p_attr, SCHED_RR);
+	igt_assert_f(ret == 0, "couldn't set thread scheduling policy");
+
+	/* Keep the flusher task also at rt priority, so that it doesn't get
+	 * too late in flushing the collected logs in local buffers to the disk,
+	 * and so main thread always have spare buffers to collect the logs.
+	 */
+	thread_sched.sched_priority = 5;
+	ret = pthread_attr_setschedparam(&p_attr, &thread_sched);
+	igt_assert_f(ret == 0, "couldn't set thread priority");
+
+	ret = pthread_create(&flush_thread, &p_attr, flusher, NULL);
+	igt_assert_f(ret == 0, "thread creation failed");
+
+	ret = pthread_attr_destroy(&p_attr);
+	igt_assert_f(ret == 0, "error destroying thread attributes");
+}
+
+static void open_relay_file(void)
+{
+	relay_fd = igt_debugfs_open(RELAY_FILE_NAME, O_RDONLY);
+	igt_assert_f(relay_fd >= 0, "couldn't open the guc log file");
+
+	/* Purge the old/boot-time logs from the relay buffer.
+	 * This is more for Val team's requirement, where they have to first
+	 * purge the existing logs before starting the tests for which the logs
+	 * are actually needed. After this logger will enter into a loop and
+	 * wait for the new data, at that point benchmark can be launched from
+	 * a different shell.
+	 */
+	if (discard_oldlogs)
+		pull_leftover_data();
+}
+
+static void open_output_file(void)
+{
+	/* Use Direct IO mode for the output file, as the data written is not
+	 * supposed to be accessed again, this saves a copy of data from App's
+	 * buffer to kernel buffer (Page cache). Due to no buffering on kernel
+	 * side, data is flushed out to disk faster and more buffering can be
+	 * done on the logger side to hide the disk IO latency.
+	 */
+	outfile_fd = open(out_filename ? : DEFAULT_OUTPUT_FILE_NAME,
+			  O_CREAT | O_WRONLY | O_TRUNC | O_DIRECT,
+			  0440);
+	igt_assert_f(outfile_fd >= 0, "couldn't open the output file");
+
+	free(out_filename);
+}
+
+static void init_main_thread(void)
+{
+	struct sched_param	thread_sched;
+	int ret;
+
+	/* Run the main thread at highest priority to ensure that it always
+	 * gets woken-up at earliest on arrival of new data and so is always
+	 * ready to pull the logs, otherwise there could be loss logs if
+	 * GuC firmware is generating logs at a very high rate.
+	 */
+	thread_sched.sched_priority = 1;
+	ret = sched_setscheduler(getpid(), SCHED_FIFO, &thread_sched);
+	igt_assert_f(ret == 0, "couldn't set the priority");
+
+	if (signal(SIGINT, int_sig_handler) == SIG_ERR)
+		igt_assert_f(0, "SIGINT handler registration failed");
+
+	if (signal(SIGALRM, int_sig_handler) == SIG_ERR)
+		igt_assert_f(0, "SIGALRM handler registration failed");
+
+	/* Need an aligned pointer for direct IO */
+	ret = posix_memalign((void **)&read_buffer, PAGE_SIZE, num_buffers * SUBBUF_SIZE);
+	igt_assert_f(ret == 0, "couldn't allocate the read buffer");
+
+	/* Keep the pages locked in RAM, avoid page fault overhead */
+	ret = mlock(read_buffer, num_buffers * SUBBUF_SIZE);
+	igt_assert_f(ret == 0, "failed to lock memory");
+
+	/* Enable the logging, it may not have been enabled from boot and so
+	 * the relay file also wouldn't have been created.
+	 */
+	guc_log_control(true);
+
+	open_relay_file();
+	open_output_file();
+}
+
+static int parse_options(int opt, int opt_index, void *data)
+{
+	igt_debug("opt %c optarg %s\n", opt, optarg);
+
+	switch(opt) {
+	case 'v':
+		verbosity_level = atoi(optarg);
+		igt_assert_f(verbosity_level < 0 || verbosity_level > 3, "invalid input for -v option");
+		igt_debug("verbosity level to be used is %d\n", verbosity_level);
+		break;
+	case 'o':
+		out_filename = strdup(optarg);
+		igt_assert_f(out_filename, "Couldn't allocate the o/p filename");
+		igt_debug("logs to be stored in file %s\n", out_filename);
+		break;
+	case 'b':
+		num_buffers = atoi(optarg);
+		igt_assert_f(num_buffers > 0, "invalid input for -b option");
+		igt_debug("number of buffers to be used is %d\n", num_buffers);
+		break;
+	case 't':
+		test_duration = atoi(optarg);
+		igt_assert_f(test_duration > 0, "invalid input for -t option");
+		igt_debug("logger to run for %d seconds\n", test_duration);
+		break;
+	case 'p':
+		poll_timeout = atoi(optarg);
+		igt_assert_f(poll_timeout != 0, "invalid input for -p option");
+		if (poll_timeout > 0)
+			igt_debug("polling to be done with %d milli seconds timeout\n", poll_timeout);
+		break;
+	case 's':
+		max_filesize = atoi(optarg);
+		igt_assert_f(max_filesize > 0, "invalid input for -s option");
+		igt_debug("max allowed size of the output file is %d MBs\n", max_filesize);
+		break;
+	case 'd':
+		discard_oldlogs = true;
+		igt_debug("old/boot-time logs will be discarded\n");
+		break;
+	}
+
+	return 0;
+}
+
+static void process_command_line(int argc, char **argv)
+{
+	static struct option long_options[] = {
+		{"verbosity", required_argument, 0, 'v'},
+		{"outputfilename", required_argument, 0, 'o'},
+		{"buffers", required_argument, 0, 'b'},
+		{"testduration", required_argument, 0, 't'},
+		{"polltimeout", required_argument, 0, 'p'},
+		{"size", required_argument, 0, 's'},
+		{"discard", no_argument, 0, 'd'},
+		{ 0, 0, 0, 0 }
+	};
+
+	const char *help =
+		"  -v --verbosity=level   verbosity level of GuC logging (0-3)\n"
+		"  -o --outfile=name      name of the output file in /tmp directory, where logs will be stored\n"
+		"  -b --buffers=num       number of buffers to be maintained on logger side for storing logs\n"
+		"  -t --testduration=sec  max duration in seconds for which the logger should run, 0 == forever\n"
+		"  -p --polltimeout=ms    polling timeout in ms, -1 == indefinite wait for the new data\n"
+		"  -s --size=MB           max size of output file in MBs after which logging will be stopped, 0 == no limit\n"
+		"  -d --discard           discard the old/boot-time logs before entering into the capture loop\n";
+
+	igt_simple_init_parse_opts(&argc, argv, "v:o:b:t:p:s:d", long_options,
+				   help, parse_options, NULL);
+}
+
+int main(int argc, char **argv)
+{
+	struct pollfd relay_poll_fd;
+	int nfds;
+	int ret;
+
+	process_command_line(argc, argv);
+
+	init_main_thread();
+
+	/* Use a separate thread for flushing the logs to a file on disk.
+	 * Main thread will buffer the data from relay file in its pool of
+	 * buffers and other thread will flush the data to disk in background.
+	 * This is needed, albeit by default data is written out to disk in
+	 * async mode, as when there are too many dirty pages in the RAM,
+	 * (/proc/sys/vm/dirty_ratio), kernel starts blocking the processes
+	 * doing the file writes.
+	 */
+	init_flusher_thread();
+
+	relay_poll_fd.fd = relay_fd;
+	relay_poll_fd.events = POLLIN;
+	relay_poll_fd.revents = 0;
+
+	nfds = 1; /* only one fd to poll */
+
+	alarm(test_duration); /* Start the alarm */
+
+	do {
+		/* Wait/poll for the new data to be available, relay doesn't
+		 * provide a blocking read.
+		 * On older kernels need to do polling with a timeout instead of
+		 * indefinite wait to avoid relying on relay for the wakeup, as
+		 * relay used to do the wakeup in a deferred manner on jiffies
+		 * granularity by scheduling a timer and moreover that timer was
+		 * re-scheduled on every newly produced buffer and so was pushed
+		 * out if there were multiple flush interrupts in a very quick
+		 * succession (less than a jiffy gap between 2 flush interrupts)
+		 * causing relay to run out of sub buffers to store new logs.
+		 */
+		ret = poll(&relay_poll_fd, nfds, poll_timeout);
+		if (ret < 0) {
+			if (errno == EINTR)
+				break;
+			else
+				igt_assert_f(0, "poll call failed");
+		}
+
+		/* No data available yet, poll again, hopefully new data is round the corner */
+		if (!relay_poll_fd.revents)
+			continue;
+
+		pull_data();
+	} while (!stop_logging);
+
+	/* Pause logging on the GuC side*/
+	guc_log_control(false);
+	pthread_cond_signal(&underflow_cond);
+	pthread_join(flush_thread, NULL);
+	pull_leftover_data();
+	igt_info("total bytes written %" PRIu64 "\n", total_bytes_written);
+
+	free(read_buffer);
+	close(relay_fd);
+	close(outfile_fd);
+	igt_exit();
+}
-- 
1.9.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] tools/intel_guc_logger: Utility for capturing GuC firmware logs in a file
  2016-09-07 15:27               ` [PATCH v2] " akash.goel
@ 2016-09-07 15:39                 ` Tvrtko Ursulin
  2016-09-08 10:44                   ` [PATCH v3] " akash.goel
  0 siblings, 1 reply; 17+ messages in thread
From: Tvrtko Ursulin @ 2016-09-07 15:39 UTC (permalink / raw)
  To: akash.goel, intel-gfx


On 07/09/16 16:27, akash.goel@intel.com wrote:
> From: Akash Goel <akash.goel@intel.com>
>
> This patch provides a test utility which helps capture GuC firmware logs and
> then dump them to file.
> The logs are pulled from a debugfs file '/sys/kernel/debug/dri/guc_log' and
> stored into a file '/tmp/guc_log_dump.dat', the name of the output file can
> be changed through a command line argument.
>
> The utility goes into an infinite loop where it waits for the arrival of new
> logs and as soon as new set of logs are produced it captures them in its local
> buffer which is then flushed out to the file on disk.
> Any time when logging needs to be ended, User can stop this utility (CTRL+C).
>
> Before entering into a loop, it first discards whatever logs are present in
> the debugfs file.
> This way User can first launch this utility and then start a workload/activity
> for which GuC firmware logs are to be actually captured and keep running the
> utility for as long as its needed, like once the workload is over this utility
> can be forcefully stopped.
>
> If the logging wasn't enabled on GuC side by the Driver at boot time, utility
> will first enable the logging and later on when it is stopped (CTRL+C) it will
> also pause the logging on GuC side.
>
> v2:
> - Use combination of alarm system call & SIGALRM signal to run the utility
>    for required duration. (Tvrtko)
> - Fix inconsistencies, do minor cleanup and refactoring. (Tvrtko)
>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Signed-off-by: Akash Goel <akash.goel@intel.com>
> ---
>   tools/Makefile.sources   |   1 +
>   tools/intel_guc_logger.c | 425 +++++++++++++++++++++++++++++++++++++++++++++++
>   2 files changed, 426 insertions(+)
>   create mode 100644 tools/intel_guc_logger.c
>
> diff --git a/tools/Makefile.sources b/tools/Makefile.sources
> index 2bb6c8e..be58871 100644
> --- a/tools/Makefile.sources
> +++ b/tools/Makefile.sources
> @@ -19,6 +19,7 @@ tools_prog_lists =		\
>   	intel_gpu_time		\
>   	intel_gpu_top		\
>   	intel_gtt		\
> +	intel_guc_logger        \
>   	intel_infoframes	\
>   	intel_l3_parity		\
>   	intel_lid		\
> diff --git a/tools/intel_guc_logger.c b/tools/intel_guc_logger.c
> new file mode 100644
> index 0000000..6e03282
> --- /dev/null
> +++ b/tools/intel_guc_logger.c
> @@ -0,0 +1,425 @@
> +
> +#define _GNU_SOURCE  /* For using O_DIRECT */
> +#include <inttypes.h>
> +#include <stdio.h>
> +#include <fcntl.h>
> +#include <sched.h>
> +#include <sys/ioctl.h>
> +#include <string.h>
> +#include <errno.h>
> +#include <signal.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/poll.h>
> +#include <sys/wait.h>
> +#include <sys/types.h>
> +#include <sys/syscall.h>
> +#include <sys/mman.h>
> +#include <assert.h>
> +#include <pthread.h>
> +
> +#include "igt.h"
> +
> +#define MB(x) ((uint64_t)(x) * 1024 * 1024)
> +#ifndef PAGE_SIZE
> +  #define PAGE_SIZE 4096
> +#endif
> +#define SUBBUF_SIZE (19*PAGE_SIZE)
> +/* Need large buffering from logger side to hide the DISK IO latency, Driver
> + * can only store 8 snapshots of GuC log buffer in relay.
> + */
> +#define NUM_SUBBUFS 100
> +
> +#define RELAY_FILE_NAME  "guc_log"
> +#define DEFAULT_OUTPUT_FILE_NAME  "guc_log_dump.dat"
> +#define CONTROL_FILE_NAME "i915_guc_log_control"
> +
> +char *read_buffer;
> +char *out_filename;
> +int poll_timeout = 2; /* by default 2ms timeout */
> +pthread_mutex_t mutex;
> +pthread_t flush_thread;
> +int verbosity_level = 3; /* by default capture logs at max verbosity */
> +uint32_t produced, consumed;
> +uint64_t total_bytes_written;
> +int num_buffers = NUM_SUBBUFS;
> +int relay_fd, outfile_fd = -1;
> +bool stop_logging, discard_oldlogs;
> +uint32_t test_duration, max_filesize;
> +pthread_cond_t underflow_cond, overflow_cond;
> +
> +static void guc_log_control(bool enable_logging)
> +{
> +	int control_fd;
> +	char data[19];
> +	uint64_t val;
> +	int ret;
> +
> +	control_fd = igt_debugfs_open(CONTROL_FILE_NAME, O_WRONLY);
> +	igt_assert_f(control_fd >= 0, "couldn't open the guc log control file");
> +
> +	val = enable_logging ? ((verbosity_level << 4) | 0x1) : 0;
> +
> +	ret = snprintf(data, sizeof(data), "0x%" PRIx64, val);
> +	igt_assert(ret > 2 && ret < sizeof(data));
> +
> +	ret = write(control_fd, data, ret);
> +	igt_assert_f(ret > 0, "couldn't write to the log control file");
> +
> +	close(control_fd);
> +}
> +
> +static void int_sig_handler(int sig)
> +{
> +	igt_info("received signal %d\n", sig);
> +
> +	stop_logging = true;
> +}
> +
> +static void pull_leftover_data(void)
> +{
> +	unsigned int bytes_read = 0;
> +	int ret;
> +
> +	do {
> +		/* Read the logs from relay buffer */
> +		ret = read(relay_fd, read_buffer, SUBBUF_SIZE);
> +		if (!ret)
> +			break;
> +
> +		igt_assert_f(ret > 0, "failed to read from the guc log file");
> +		igt_assert_f(ret == SUBBUF_SIZE, "invalid read from relay file");
> +
> +		bytes_read += ret;
> +
> +		if (outfile_fd >= 0) {
> +			ret = write(outfile_fd, read_buffer, SUBBUF_SIZE);
> +			igt_assert_f(ret == SUBBUF_SIZE, "couldn't dump the logs in a file");
> +			total_bytes_written += ret;
> +		}
> +	} while(1);
> +
> +	igt_debug("%u bytes flushed\n", bytes_read);
> +}
> +
> +static int num_filled_bufs(void)
> +{
> +	return (produced - consumed);
> +}
> +
> +static void pull_data(void)
> +{
> +	char *ptr;
> +	int ret;
> +
> +	pthread_mutex_lock(&mutex);
> +	while (num_filled_bufs() >= num_buffers) {
> +		igt_debug("overflow, will wait, produced %u, consumed %u\n", produced, consumed);
> +		/* Stall the main thread in case of overflow, as there are no
> +		 * buffers available to store the new logs, otherwise there
> +		 * could be corruption if both threads work on the same buffer.
> +		 */
> +		pthread_cond_wait(&overflow_cond, &mutex);
> +	};
> +	pthread_mutex_unlock(&mutex);
> +
> +	ptr = read_buffer + (produced % num_buffers) * SUBBUF_SIZE;
> +
> +	/* Read the logs from relay buffer */
> +	ret = read(relay_fd, ptr, SUBBUF_SIZE);
> +	igt_assert_f(ret >= 0, "failed to read from the guc log file");
> +	igt_assert_f(ret == SUBBUF_SIZE, "invalid read from relay file");
> +
> +	if (ret) {
> +		pthread_mutex_lock(&mutex);
> +		produced++;
> +		pthread_cond_signal(&underflow_cond);
> +		pthread_mutex_unlock(&mutex);
> +	}
> +}
> +
> +static void *flusher(void *arg)
> +{
> +	char *ptr;
> +	int ret;
> +
> +	igt_debug("execution started of flusher thread\n");
> +
> +	do {
> +		pthread_mutex_lock(&mutex);
> +		while (!num_filled_bufs()) {
> +			/* Exit only after completing the flush of all the filled
> +			 * buffers as User would expect that all logs captured up
> +			 * till the point of interruption/exit are written out to
> +			 * the disk file.
> +			 */
> +			if (stop_logging) {
> +				igt_debug("flusher to exit now\n");
> +				pthread_mutex_unlock(&mutex);
> +				return NULL;
> +			}
> +			pthread_cond_wait(&underflow_cond, &mutex);
> +		};
> +		pthread_mutex_unlock(&mutex);
> +
> +		ptr = read_buffer + (consumed % num_buffers) * SUBBUF_SIZE;
> +
> +		ret = write(outfile_fd, ptr, SUBBUF_SIZE);
> +		igt_assert_f(ret == SUBBUF_SIZE, "couldn't dump the logs in a file");
> +
> +		total_bytes_written += ret;
> +		if (max_filesize && (total_bytes_written > MB(max_filesize))) {
> +			igt_debug("reached the target of %" PRIu64 " bytes\n", MB(max_filesize));
> +			stop_logging = true;
> +		}
> +
> +		pthread_mutex_lock(&mutex);
> +		consumed++;
> +		pthread_cond_signal(&overflow_cond);
> +		pthread_mutex_unlock(&mutex);
> +	} while(1);
> +
> +	return NULL;
> +}
> +
> +static void init_flusher_thread(void)
> +{
> +	struct sched_param	thread_sched;
> +	pthread_attr_t		p_attr;
> +	int ret;
> +
> +	pthread_cond_init(&underflow_cond, NULL);
> +	pthread_cond_init(&overflow_cond, NULL);
> +	pthread_mutex_init(&mutex, NULL);
> +
> +	ret = pthread_attr_init(&p_attr);
> +	igt_assert_f(ret == 0, "error obtaining default thread attributes");
> +
> +	ret = pthread_attr_setinheritsched(&p_attr, PTHREAD_EXPLICIT_SCHED);
> +	igt_assert_f(ret == 0, "couldn't set inheritsched");
> +
> +	ret = pthread_attr_setschedpolicy(&p_attr, SCHED_RR);
> +	igt_assert_f(ret == 0, "couldn't set thread scheduling policy");
> +
> +	/* Keep the flusher task also at rt priority, so that it doesn't get
> +	 * too late in flushing the collected logs in local buffers to the disk,
> +	 * and so main thread always have spare buffers to collect the logs.
> +	 */
> +	thread_sched.sched_priority = 5;
> +	ret = pthread_attr_setschedparam(&p_attr, &thread_sched);
> +	igt_assert_f(ret == 0, "couldn't set thread priority");
> +
> +	ret = pthread_create(&flush_thread, &p_attr, flusher, NULL);
> +	igt_assert_f(ret == 0, "thread creation failed");
> +
> +	ret = pthread_attr_destroy(&p_attr);
> +	igt_assert_f(ret == 0, "error destroying thread attributes");
> +}
> +
> +static void open_relay_file(void)
> +{
> +	relay_fd = igt_debugfs_open(RELAY_FILE_NAME, O_RDONLY);
> +	igt_assert_f(relay_fd >= 0, "couldn't open the guc log file");
> +
> +	/* Purge the old/boot-time logs from the relay buffer.
> +	 * This is more for Val team's requirement, where they have to first
> +	 * purge the existing logs before starting the tests for which the logs
> +	 * are actually needed. After this logger will enter into a loop and
> +	 * wait for the new data, at that point benchmark can be launched from
> +	 * a different shell.
> +	 */
> +	if (discard_oldlogs)
> +		pull_leftover_data();
> +}
> +
> +static void open_output_file(void)
> +{
> +	/* Use Direct IO mode for the output file, as the data written is not
> +	 * supposed to be accessed again, this saves a copy of data from App's
> +	 * buffer to kernel buffer (Page cache). Due to no buffering on kernel
> +	 * side, data is flushed out to disk faster and more buffering can be
> +	 * done on the logger side to hide the disk IO latency.
> +	 */
> +	outfile_fd = open(out_filename ? : DEFAULT_OUTPUT_FILE_NAME,
> +			  O_CREAT | O_WRONLY | O_TRUNC | O_DIRECT,
> +			  0440);
> +	igt_assert_f(outfile_fd >= 0, "couldn't open the output file");
> +
> +	free(out_filename);
> +}
> +
> +static void init_main_thread(void)
> +{
> +	struct sched_param	thread_sched;
> +	int ret;
> +
> +	/* Run the main thread at highest priority to ensure that it always
> +	 * gets woken-up at earliest on arrival of new data and so is always
> +	 * ready to pull the logs, otherwise there could be loss logs if
> +	 * GuC firmware is generating logs at a very high rate.
> +	 */
> +	thread_sched.sched_priority = 1;
> +	ret = sched_setscheduler(getpid(), SCHED_FIFO, &thread_sched);
> +	igt_assert_f(ret == 0, "couldn't set the priority");
> +
> +	if (signal(SIGINT, int_sig_handler) == SIG_ERR)
> +		igt_assert_f(0, "SIGINT handler registration failed");
> +
> +	if (signal(SIGALRM, int_sig_handler) == SIG_ERR)
> +		igt_assert_f(0, "SIGALRM handler registration failed");
> +
> +	/* Need an aligned pointer for direct IO */
> +	ret = posix_memalign((void **)&read_buffer, PAGE_SIZE, num_buffers * SUBBUF_SIZE);
> +	igt_assert_f(ret == 0, "couldn't allocate the read buffer");
> +
> +	/* Keep the pages locked in RAM, avoid page fault overhead */
> +	ret = mlock(read_buffer, num_buffers * SUBBUF_SIZE);
> +	igt_assert_f(ret == 0, "failed to lock memory");
> +
> +	/* Enable the logging, it may not have been enabled from boot and so
> +	 * the relay file also wouldn't have been created.
> +	 */
> +	guc_log_control(true);
> +
> +	open_relay_file();
> +	open_output_file();
> +}
> +
> +static int parse_options(int opt, int opt_index, void *data)
> +{
> +	igt_debug("opt %c optarg %s\n", opt, optarg);
> +
> +	switch(opt) {
> +	case 'v':
> +		verbosity_level = atoi(optarg);
> +		igt_assert_f(verbosity_level < 0 || verbosity_level > 3, "invalid input for -v option");
> +		igt_debug("verbosity level to be used is %d\n", verbosity_level);
> +		break;
> +	case 'o':
> +		out_filename = strdup(optarg);
> +		igt_assert_f(out_filename, "Couldn't allocate the o/p filename");
> +		igt_debug("logs to be stored in file %s\n", out_filename);
> +		break;
> +	case 'b':
> +		num_buffers = atoi(optarg);
> +		igt_assert_f(num_buffers > 0, "invalid input for -b option");
> +		igt_debug("number of buffers to be used is %d\n", num_buffers);
> +		break;
> +	case 't':
> +		test_duration = atoi(optarg);
> +		igt_assert_f(test_duration > 0, "invalid input for -t option");
> +		igt_debug("logger to run for %d seconds\n", test_duration);
> +		break;
> +	case 'p':
> +		poll_timeout = atoi(optarg);
> +		igt_assert_f(poll_timeout != 0, "invalid input for -p option");
> +		if (poll_timeout > 0)
> +			igt_debug("polling to be done with %d milli seconds timeout\n", poll_timeout);
> +		break;
> +	case 's':
> +		max_filesize = atoi(optarg);
> +		igt_assert_f(max_filesize > 0, "invalid input for -s option");
> +		igt_debug("max allowed size of the output file is %d MBs\n", max_filesize);
> +		break;
> +	case 'd':
> +		discard_oldlogs = true;
> +		igt_debug("old/boot-time logs will be discarded\n");
> +		break;
> +	}
> +
> +	return 0;
> +}
> +
> +static void process_command_line(int argc, char **argv)
> +{
> +	static struct option long_options[] = {
> +		{"verbosity", required_argument, 0, 'v'},
> +		{"outputfilename", required_argument, 0, 'o'},
> +		{"buffers", required_argument, 0, 'b'},
> +		{"testduration", required_argument, 0, 't'},
> +		{"polltimeout", required_argument, 0, 'p'},
> +		{"size", required_argument, 0, 's'},
> +		{"discard", no_argument, 0, 'd'},
> +		{ 0, 0, 0, 0 }
> +	};
> +
> +	const char *help =
> +		"  -v --verbosity=level   verbosity level of GuC logging (0-3)\n"
> +		"  -o --outfile=name      name of the output file in /tmp directory, where logs will be stored\n"
> +		"  -b --buffers=num       number of buffers to be maintained on logger side for storing logs\n"
> +		"  -t --testduration=sec  max duration in seconds for which the logger should run, 0 == forever\n"
> +		"  -p --polltimeout=ms    polling timeout in ms, -1 == indefinite wait for the new data\n"
> +		"  -s --size=MB           max size of output file in MBs after which logging will be stopped, 0 == no limit\n"
> +		"  -d --discard           discard the old/boot-time logs before entering into the capture loop\n";
> +
> +	igt_simple_init_parse_opts(&argc, argv, "v:o:b:t:p:s:d", long_options,
> +				   help, parse_options, NULL);
> +}
> +
> +int main(int argc, char **argv)
> +{
> +	struct pollfd relay_poll_fd;
> +	int nfds;
> +	int ret;
> +
> +	process_command_line(argc, argv);
> +
> +	init_main_thread();
> +
> +	/* Use a separate thread for flushing the logs to a file on disk.
> +	 * Main thread will buffer the data from relay file in its pool of
> +	 * buffers and other thread will flush the data to disk in background.
> +	 * This is needed, albeit by default data is written out to disk in
> +	 * async mode, as when there are too many dirty pages in the RAM,
> +	 * (/proc/sys/vm/dirty_ratio), kernel starts blocking the processes
> +	 * doing the file writes.
> +	 */
> +	init_flusher_thread();
> +
> +	relay_poll_fd.fd = relay_fd;
> +	relay_poll_fd.events = POLLIN;
> +	relay_poll_fd.revents = 0;
> +
> +	nfds = 1; /* only one fd to poll */
> +
> +	alarm(test_duration); /* Start the alarm */
> +
> +	do {
> +		/* Wait/poll for the new data to be available, relay doesn't
> +		 * provide a blocking read.
> +		 * On older kernels need to do polling with a timeout instead of
> +		 * indefinite wait to avoid relying on relay for the wakeup, as
> +		 * relay used to do the wakeup in a deferred manner on jiffies
> +		 * granularity by scheduling a timer and moreover that timer was
> +		 * re-scheduled on every newly produced buffer and so was pushed
> +		 * out if there were multiple flush interrupts in a very quick
> +		 * succession (less than a jiffy gap between 2 flush interrupts)
> +		 * causing relay to run out of sub buffers to store new logs.
> +		 */
> +		ret = poll(&relay_poll_fd, nfds, poll_timeout);
> +		if (ret < 0) {
> +			if (errno == EINTR)
> +				break;
> +			else
> +				igt_assert_f(0, "poll call failed");
> +		}
> +
> +		/* No data available yet, poll again, hopefully new data is round the corner */
> +		if (!relay_poll_fd.revents)
> +			continue;
> +
> +		pull_data();
> +	} while (!stop_logging);
> +
> +	/* Pause logging on the GuC side*/
> +	guc_log_control(false);
> +	pthread_cond_signal(&underflow_cond);
> +	pthread_join(flush_thread, NULL);
> +	pull_leftover_data();
> +	igt_info("total bytes written %" PRIu64 "\n", total_bytes_written);
> +
> +	free(read_buffer);
> +	close(relay_fd);
> +	close(outfile_fd);
> +	igt_exit();
> +}
>

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH v3] tools/intel_guc_logger: Utility for capturing GuC firmware logs in a file
  2016-09-07 15:39                 ` Tvrtko Ursulin
@ 2016-09-08 10:44                   ` akash.goel
  2016-10-10 10:59                     ` [PATCH v4] " akash.goel
  0 siblings, 1 reply; 17+ messages in thread
From: akash.goel @ 2016-09-08 10:44 UTC (permalink / raw)
  To: intel-gfx; +Cc: Akash Goel

From: Akash Goel <akash.goel@intel.com>

This patch provides a test utility which helps capture GuC firmware logs and
then dump them to file.
The logs are pulled from a debugfs file '/sys/kernel/debug/dri/guc_log' and
by default stored into a file 'guc_log_dump.dat'. The name, including the
location, of the output file can be changed through a command line argument.

The utility goes into an infinite loop where it waits for the arrival of new
logs and as soon as new set of logs are produced it captures them in its local
buffer which is then flushed out to the file on disk.
Any time when logging needs to be ended, User can stop this utility (CTRL+C).

Before entering into a loop, it first discards whatever logs are present in
the debugfs file.
This way User can first launch this utility and then start a workload/activity
for which GuC firmware logs are to be actually captured and keep running the
utility for as long as its needed, like once the workload is over this utility
can be forcefully stopped.

If the logging wasn't enabled on GuC side by the Driver at boot time, utility
will first enable the logging and later on when it is stopped (CTRL+C) it will
also pause the logging on GuC side.

v2:
- Use combination of alarm system call & SIGALRM signal to run the utility
  for required duration. (Tvrtko)
- Fix inconsistencies, do minor cleanup and refactoring. (Tvrtko)

v3:
- Fix discrepancy for the output file command line option and update the
  Usage/help string.

Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 tools/Makefile.sources   |   1 +
 tools/intel_guc_logger.c | 428 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 429 insertions(+)
 create mode 100644 tools/intel_guc_logger.c

diff --git a/tools/Makefile.sources b/tools/Makefile.sources
index 2bb6c8e..be58871 100644
--- a/tools/Makefile.sources
+++ b/tools/Makefile.sources
@@ -19,6 +19,7 @@ tools_prog_lists =		\
 	intel_gpu_time		\
 	intel_gpu_top		\
 	intel_gtt		\
+	intel_guc_logger        \
 	intel_infoframes	\
 	intel_l3_parity		\
 	intel_lid		\
diff --git a/tools/intel_guc_logger.c b/tools/intel_guc_logger.c
new file mode 100644
index 0000000..c7addd5
--- /dev/null
+++ b/tools/intel_guc_logger.c
@@ -0,0 +1,428 @@
+
+#define _GNU_SOURCE  /* For using O_DIRECT */
+#include <inttypes.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/poll.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <assert.h>
+#include <pthread.h>
+
+#include "igt.h"
+
+#define MB(x) ((uint64_t)(x) * 1024 * 1024)
+#ifndef PAGE_SIZE
+  #define PAGE_SIZE 4096
+#endif
+/* Currently the size of GuC log buffer is 19 pages & so is the size of relay
+ * subbuffer. If the size changes in future, then this define also needs to be
+ * updated accordingly.
+ */
+#define SUBBUF_SIZE (19*PAGE_SIZE)
+/* Need large buffering from logger side to hide the DISK IO latency, Driver
+ * can only store 8 snapshots of GuC log buffer in relay.
+ */
+#define NUM_SUBBUFS 100
+
+#define RELAY_FILE_NAME  "guc_log"
+#define DEFAULT_OUTPUT_FILE_NAME  "guc_log_dump.dat"
+#define CONTROL_FILE_NAME "i915_guc_log_control"
+
+char *read_buffer;
+char *out_filename;
+int poll_timeout = 2; /* by default 2ms timeout */
+pthread_mutex_t mutex;
+pthread_t flush_thread;
+int verbosity_level = 3; /* by default capture logs at max verbosity */
+uint32_t produced, consumed;
+uint64_t total_bytes_written;
+int num_buffers = NUM_SUBBUFS;
+int relay_fd, outfile_fd = -1;
+bool stop_logging, discard_oldlogs;
+uint32_t test_duration, max_filesize;
+pthread_cond_t underflow_cond, overflow_cond;
+
+static void guc_log_control(bool enable_logging)
+{
+	int control_fd;
+	char data[19];
+	uint64_t val;
+	int ret;
+
+	control_fd = igt_debugfs_open(CONTROL_FILE_NAME, O_WRONLY);
+	igt_assert_f(control_fd >= 0, "couldn't open the guc log control file");
+
+	val = enable_logging ? ((verbosity_level << 4) | 0x1) : 0;
+
+	ret = snprintf(data, sizeof(data), "0x%" PRIx64, val);
+	igt_assert(ret > 2 && ret < sizeof(data));
+
+	ret = write(control_fd, data, ret);
+	igt_assert_f(ret > 0, "couldn't write to the log control file");
+
+	close(control_fd);
+}
+
+static void int_sig_handler(int sig)
+{
+	igt_info("received signal %d\n", sig);
+
+	stop_logging = true;
+}
+
+static void pull_leftover_data(void)
+{
+	unsigned int bytes_read = 0;
+	int ret;
+
+	do {
+		/* Read the logs from relay buffer */
+		ret = read(relay_fd, read_buffer, SUBBUF_SIZE);
+		if (!ret)
+			break;
+
+		igt_assert_f(ret > 0, "failed to read from the guc log file");
+		igt_assert_f(ret == SUBBUF_SIZE, "invalid read from relay file");
+
+		bytes_read += ret;
+
+		if (outfile_fd >= 0) {
+			ret = write(outfile_fd, read_buffer, SUBBUF_SIZE);
+			igt_assert_f(ret == SUBBUF_SIZE, "couldn't dump the logs in a file");
+			total_bytes_written += ret;
+		}
+	} while(1);
+
+	igt_debug("%u bytes flushed\n", bytes_read);
+}
+
+static int num_filled_bufs(void)
+{
+	return (produced - consumed);
+}
+
+static void pull_data(void)
+{
+	char *ptr;
+	int ret;
+
+	pthread_mutex_lock(&mutex);
+	while (num_filled_bufs() >= num_buffers) {
+		igt_debug("overflow, will wait, produced %u, consumed %u\n", produced, consumed);
+		/* Stall the main thread in case of overflow, as there are no
+		 * buffers available to store the new logs, otherwise there
+		 * could be corruption if both threads work on the same buffer.
+		 */
+		pthread_cond_wait(&overflow_cond, &mutex);
+	};
+	pthread_mutex_unlock(&mutex);
+
+	ptr = read_buffer + (produced % num_buffers) * SUBBUF_SIZE;
+
+	/* Read the logs from relay buffer */
+	ret = read(relay_fd, ptr, SUBBUF_SIZE);
+	igt_assert_f(ret >= 0, "failed to read from the guc log file");
+	igt_assert_f(ret == SUBBUF_SIZE, "invalid read from relay file");
+
+	if (ret) {
+		pthread_mutex_lock(&mutex);
+		produced++;
+		pthread_cond_signal(&underflow_cond);
+		pthread_mutex_unlock(&mutex);
+	}
+}
+
+static void *flusher(void *arg)
+{
+	char *ptr;
+	int ret;
+
+	igt_debug("execution started of flusher thread\n");
+
+	do {
+		pthread_mutex_lock(&mutex);
+		while (!num_filled_bufs()) {
+			/* Exit only after completing the flush of all the filled
+			 * buffers as User would expect that all logs captured up
+			 * till the point of interruption/exit are written out to
+			 * the disk file.
+			 */
+			if (stop_logging) {
+				igt_debug("flusher to exit now\n");
+				pthread_mutex_unlock(&mutex);
+				return NULL;
+			}
+			pthread_cond_wait(&underflow_cond, &mutex);
+		};
+		pthread_mutex_unlock(&mutex);
+
+		ptr = read_buffer + (consumed % num_buffers) * SUBBUF_SIZE;
+
+		ret = write(outfile_fd, ptr, SUBBUF_SIZE);
+		igt_assert_f(ret == SUBBUF_SIZE, "couldn't dump the logs in a file");
+
+		total_bytes_written += ret;
+		if (max_filesize && (total_bytes_written > MB(max_filesize))) {
+			igt_debug("reached the target of %" PRIu64 " bytes\n", MB(max_filesize));
+			stop_logging = true;
+		}
+
+		pthread_mutex_lock(&mutex);
+		consumed++;
+		pthread_cond_signal(&overflow_cond);
+		pthread_mutex_unlock(&mutex);
+	} while(1);
+
+	return NULL;
+}
+
+static void init_flusher_thread(void)
+{
+	struct sched_param	thread_sched;
+	pthread_attr_t		p_attr;
+	int ret;
+
+	pthread_cond_init(&underflow_cond, NULL);
+	pthread_cond_init(&overflow_cond, NULL);
+	pthread_mutex_init(&mutex, NULL);
+
+	ret = pthread_attr_init(&p_attr);
+	igt_assert_f(ret == 0, "error obtaining default thread attributes");
+
+	ret = pthread_attr_setinheritsched(&p_attr, PTHREAD_EXPLICIT_SCHED);
+	igt_assert_f(ret == 0, "couldn't set inheritsched");
+
+	ret = pthread_attr_setschedpolicy(&p_attr, SCHED_RR);
+	igt_assert_f(ret == 0, "couldn't set thread scheduling policy");
+
+	/* Keep the flusher task also at rt priority, so that it doesn't get
+	 * too late in flushing the collected logs in local buffers to the disk,
+	 * and so main thread always have spare buffers to collect the logs.
+	 */
+	thread_sched.sched_priority = 5;
+	ret = pthread_attr_setschedparam(&p_attr, &thread_sched);
+	igt_assert_f(ret == 0, "couldn't set thread priority");
+
+	ret = pthread_create(&flush_thread, &p_attr, flusher, NULL);
+	igt_assert_f(ret == 0, "thread creation failed");
+
+	ret = pthread_attr_destroy(&p_attr);
+	igt_assert_f(ret == 0, "error destroying thread attributes");
+}
+
+static void open_relay_file(void)
+{
+	relay_fd = igt_debugfs_open(RELAY_FILE_NAME, O_RDONLY);
+	igt_assert_f(relay_fd >= 0, "couldn't open the guc log file");
+
+	/* Purge the old/boot-time logs from the relay buffer.
+	 * This is more for Val team's requirement, where they have to first
+	 * purge the existing logs before starting the tests for which the logs
+	 * are actually needed. After this logger will enter into a loop and
+	 * wait for the new data, at that point benchmark can be launched from
+	 * a different shell.
+	 */
+	if (discard_oldlogs)
+		pull_leftover_data();
+}
+
+static void open_output_file(void)
+{
+	/* Use Direct IO mode for the output file, as the data written is not
+	 * supposed to be accessed again, this saves a copy of data from App's
+	 * buffer to kernel buffer (Page cache). Due to no buffering on kernel
+	 * side, data is flushed out to disk faster and more buffering can be
+	 * done on the logger side to hide the disk IO latency.
+	 */
+	outfile_fd = open(out_filename ? : DEFAULT_OUTPUT_FILE_NAME,
+			  O_CREAT | O_WRONLY | O_TRUNC | O_DIRECT,
+			  0440);
+	igt_assert_f(outfile_fd >= 0, "couldn't open the output file");
+
+	free(out_filename);
+}
+
+static void init_main_thread(void)
+{
+	struct sched_param	thread_sched;
+	int ret;
+
+	/* Run the main thread at highest priority to ensure that it always
+	 * gets woken-up at earliest on arrival of new data and so is always
+	 * ready to pull the logs, otherwise there could be loss logs if
+	 * GuC firmware is generating logs at a very high rate.
+	 */
+	thread_sched.sched_priority = 1;
+	ret = sched_setscheduler(getpid(), SCHED_FIFO, &thread_sched);
+	igt_assert_f(ret == 0, "couldn't set the priority");
+
+	if (signal(SIGINT, int_sig_handler) == SIG_ERR)
+		igt_assert_f(0, "SIGINT handler registration failed");
+
+	if (signal(SIGALRM, int_sig_handler) == SIG_ERR)
+		igt_assert_f(0, "SIGALRM handler registration failed");
+
+	/* Need an aligned pointer for direct IO */
+	ret = posix_memalign((void **)&read_buffer, PAGE_SIZE, num_buffers * SUBBUF_SIZE);
+	igt_assert_f(ret == 0, "couldn't allocate the read buffer");
+
+	/* Keep the pages locked in RAM, avoid page fault overhead */
+	ret = mlock(read_buffer, num_buffers * SUBBUF_SIZE);
+	igt_assert_f(ret == 0, "failed to lock memory");
+
+	/* Enable the logging, it may not have been enabled from boot and so
+	 * the relay file also wouldn't have been created.
+	 */
+	guc_log_control(true);
+
+	open_relay_file();
+	open_output_file();
+}
+
+static int parse_options(int opt, int opt_index, void *data)
+{
+	igt_debug("opt %c optarg %s\n", opt, optarg);
+
+	switch(opt) {
+	case 'v':
+		verbosity_level = atoi(optarg);
+		igt_assert_f(verbosity_level < 0 || verbosity_level > 3, "invalid input for -v option");
+		igt_debug("verbosity level to be used is %d\n", verbosity_level);
+		break;
+	case 'o':
+		out_filename = strdup(optarg);
+		igt_assert_f(out_filename, "Couldn't allocate the o/p filename");
+		igt_debug("logs to be stored in file %s\n", out_filename);
+		break;
+	case 'b':
+		num_buffers = atoi(optarg);
+		igt_assert_f(num_buffers > 0, "invalid input for -b option");
+		igt_debug("number of buffers to be used is %d\n", num_buffers);
+		break;
+	case 't':
+		test_duration = atoi(optarg);
+		igt_assert_f(test_duration > 0, "invalid input for -t option");
+		igt_debug("logger to run for %d second\n", test_duration);
+		break;
+	case 'p':
+		poll_timeout = atoi(optarg);
+		igt_assert_f(poll_timeout != 0, "invalid input for -p option");
+		if (poll_timeout > 0)
+			igt_debug("polling to be done with %d millisecond timeout\n", poll_timeout);
+		break;
+	case 's':
+		max_filesize = atoi(optarg);
+		igt_assert_f(max_filesize > 0, "invalid input for -s option");
+		igt_debug("max allowed size of the output file is %d MB\n", max_filesize);
+		break;
+	case 'd':
+		discard_oldlogs = true;
+		igt_debug("old/boot-time logs will be discarded\n");
+		break;
+	}
+
+	return 0;
+}
+
+static void process_command_line(int argc, char **argv)
+{
+	static struct option long_options[] = {
+		{"verbosity", required_argument, 0, 'v'},
+		{"outputfile", required_argument, 0, 'o'},
+		{"buffers", required_argument, 0, 'b'},
+		{"testduration", required_argument, 0, 't'},
+		{"polltimeout", required_argument, 0, 'p'},
+		{"size", required_argument, 0, 's'},
+		{"discard", no_argument, 0, 'd'},
+		{ 0, 0, 0, 0 }
+	};
+
+	const char *help =
+		"  -v --verbosity=level   verbosity level of GuC logging (0-3)\n"
+		"  -o --outputfile=name   name of the output file, including the location, where logs will be stored\n"
+		"  -b --buffers=num       number of buffers to be maintained on logger side for storing logs\n"
+		"  -t --testduration=sec  max duration in seconds for which the logger should run\n"
+		"  -p --polltimeout=ms    polling timeout in ms, -1 == indefinite wait for the new data\n"
+		"  -s --size=MB           max size of output file in MBs after which logging will be stopped\n"
+		"  -d --discard           discard the old/boot-time logs before entering into the capture loop\n";
+
+	igt_simple_init_parse_opts(&argc, argv, "v:o:b:t:p:s:d", long_options,
+				   help, parse_options, NULL);
+}
+
+int main(int argc, char **argv)
+{
+	struct pollfd relay_poll_fd;
+	int nfds;
+	int ret;
+
+	process_command_line(argc, argv);
+
+	init_main_thread();
+
+	/* Use a separate thread for flushing the logs to a file on disk.
+	 * Main thread will buffer the data from relay file in its pool of
+	 * buffers and other thread will flush the data to disk in background.
+	 * This is needed, albeit by default data is written out to disk in
+	 * async mode, as when there are too many dirty pages in the RAM,
+	 * (/proc/sys/vm/dirty_ratio), kernel starts blocking the processes
+	 * doing the file writes.
+	 */
+	init_flusher_thread();
+
+	relay_poll_fd.fd = relay_fd;
+	relay_poll_fd.events = POLLIN;
+	relay_poll_fd.revents = 0;
+
+	nfds = 1; /* only one fd to poll */
+
+	alarm(test_duration); /* Start the alarm */
+
+	do {
+		/* Wait/poll for the new data to be available, relay doesn't
+		 * provide a blocking read.
+		 * On older kernels need to do polling with a timeout instead of
+		 * indefinite wait to avoid relying on relay for the wakeup, as
+		 * relay used to do the wakeup in a deferred manner on jiffies
+		 * granularity by scheduling a timer and moreover that timer was
+		 * re-scheduled on every newly produced buffer and so was pushed
+		 * out if there were multiple flush interrupts in a very quick
+		 * succession (less than a jiffy gap between 2 flush interrupts)
+		 * causing relay to run out of sub buffers to store new logs.
+		 */
+		ret = poll(&relay_poll_fd, nfds, poll_timeout);
+		if (ret < 0) {
+			if (errno == EINTR)
+				break;
+			igt_assert_f(0, "poll call failed");
+		}
+
+		/* No data available yet, poll again, hopefully new data is round the corner */
+		if (!relay_poll_fd.revents)
+			continue;
+
+		pull_data();
+	} while (!stop_logging);
+
+	/* Pause logging on the GuC side*/
+	guc_log_control(false);
+	pthread_cond_signal(&underflow_cond);
+	pthread_join(flush_thread, NULL);
+	pull_leftover_data();
+	igt_info("total bytes written %" PRIu64 "\n", total_bytes_written);
+
+	free(read_buffer);
+	close(relay_fd);
+	close(outfile_fd);
+	igt_exit();
+}
-- 
1.9.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH v4] tools/intel_guc_logger: Utility for capturing GuC firmware logs in a file
  2016-09-08 10:44                   ` [PATCH v3] " akash.goel
@ 2016-10-10 10:59                     ` akash.goel
  2016-10-10 13:52                       ` Tvrtko Ursulin
  0 siblings, 1 reply; 17+ messages in thread
From: akash.goel @ 2016-10-10 10:59 UTC (permalink / raw)
  To: intel-gfx; +Cc: Akash Goel

From: Akash Goel <akash.goel@intel.com>

This patch provides a test utility which helps capture GuC firmware logs and
then dump them to file.
The logs are pulled from a debugfs file '/sys/kernel/debug/dri/guc_log' and
by default stored into a file 'guc_log_dump.dat'. The name, including the
location, of the output file can be changed through a command line argument.

The utility goes into an infinite loop where it waits for the arrival of new
logs and as soon as new set of logs are produced it captures them in its local
buffer which is then flushed out to the file on disk.
Any time when logging needs to be ended, User can stop this utility (CTRL+C).

Before entering into a loop, it first discards whatever logs are present in
the debugfs file.
This way User can first launch this utility and then start a workload/activity
for which GuC firmware logs are to be actually captured and keep running the
utility for as long as its needed, like once the workload is over this utility
can be forcefully stopped.

If the logging wasn't enabled on GuC side by the Driver at boot time, utility
will first enable the logging and later on when it is stopped (CTRL+C) it will
also pause the logging on GuC side.

v2:
- Use combination of alarm system call & SIGALRM signal to run the utility
  for required duration. (Tvrtko)
- Fix inconsistencies, do minor cleanup and refactoring. (Tvrtko)

v3:
- Fix discrepancy for the output file command line option and update the
  Usage/help string.

v4:
- Update the exit condition for flusher thread, now will exit only after
  the capture loop is over and not when the flag to stop logging is set.
  This handles a corner case, due to which the dump of last captured buffer
  was getting missed.
- Add a newline character at the end of assert messages.
- Avoid the assert for the case, which occurs very rarely, when there are no
  bytes read from the relay file.

Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> (v3)
---
 tools/Makefile.sources   |   1 +
 tools/intel_guc_logger.c | 438 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 439 insertions(+)
 create mode 100644 tools/intel_guc_logger.c

diff --git a/tools/Makefile.sources b/tools/Makefile.sources
index 2bb6c8e..be58871 100644
--- a/tools/Makefile.sources
+++ b/tools/Makefile.sources
@@ -19,6 +19,7 @@ tools_prog_lists =		\
 	intel_gpu_time		\
 	intel_gpu_top		\
 	intel_gtt		\
+	intel_guc_logger        \
 	intel_infoframes	\
 	intel_l3_parity		\
 	intel_lid		\
diff --git a/tools/intel_guc_logger.c b/tools/intel_guc_logger.c
new file mode 100644
index 0000000..159a54e
--- /dev/null
+++ b/tools/intel_guc_logger.c
@@ -0,0 +1,438 @@
+
+#define _GNU_SOURCE  /* For using O_DIRECT */
+#include <inttypes.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/poll.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <assert.h>
+#include <pthread.h>
+
+#include "igt.h"
+
+#define MB(x) ((uint64_t)(x) * 1024 * 1024)
+#ifndef PAGE_SIZE
+  #define PAGE_SIZE 4096
+#endif
+/* Currently the size of GuC log buffer is 19 pages & so is the size of relay
+ * subbuffer. If the size changes in future, then this define also needs to be
+ * updated accordingly.
+ */
+#define SUBBUF_SIZE (19*PAGE_SIZE)
+/* Need large buffering from logger side to hide the DISK IO latency, Driver
+ * can only store 8 snapshots of GuC log buffer in relay.
+ */
+#define NUM_SUBBUFS 100
+
+#define RELAY_FILE_NAME  "guc_log"
+#define DEFAULT_OUTPUT_FILE_NAME  "guc_log_dump.dat"
+#define CONTROL_FILE_NAME "i915_guc_log_control"
+
+char *read_buffer;
+char *out_filename;
+int poll_timeout = 2; /* by default 2ms timeout */
+pthread_mutex_t mutex;
+pthread_t flush_thread;
+int verbosity_level = 3; /* by default capture logs at max verbosity */
+uint32_t produced, consumed;
+uint64_t total_bytes_written;
+int num_buffers = NUM_SUBBUFS;
+int relay_fd, outfile_fd = -1;
+uint32_t test_duration, max_filesize;
+pthread_cond_t underflow_cond, overflow_cond;
+bool stop_logging, discard_oldlogs, capturing_stopped;
+
+static void guc_log_control(bool enable_logging)
+{
+	int control_fd;
+	char data[19];
+	uint64_t val;
+	int ret;
+
+	control_fd = igt_debugfs_open(CONTROL_FILE_NAME, O_WRONLY);
+	igt_assert_f(control_fd >= 0, "couldn't open the guc log control file\n");
+
+	val = enable_logging ? ((verbosity_level << 4) | 0x1) : 0;
+
+	ret = snprintf(data, sizeof(data), "0x%" PRIx64, val);
+	igt_assert(ret > 2 && ret < sizeof(data));
+
+	ret = write(control_fd, data, ret);
+	igt_assert_f(ret > 0, "couldn't write to the log control file\n");
+
+	close(control_fd);
+}
+
+static void int_sig_handler(int sig)
+{
+	igt_info("received signal %d\n", sig);
+
+	stop_logging = true;
+}
+
+static void pull_leftover_data(void)
+{
+	unsigned int bytes_read = 0;
+	int ret;
+
+	do {
+		/* Read the logs from relay buffer */
+		ret = read(relay_fd, read_buffer, SUBBUF_SIZE);
+		if (!ret)
+			break;
+
+		igt_assert_f(ret > 0, "failed to read from the guc log file\n");
+		igt_assert_f(ret == SUBBUF_SIZE, "invalid read from relay file\n");
+
+		bytes_read += ret;
+
+		if (outfile_fd >= 0) {
+			ret = write(outfile_fd, read_buffer, SUBBUF_SIZE);
+			igt_assert_f(ret == SUBBUF_SIZE, "couldn't dump the logs in a file\n");
+			total_bytes_written += ret;
+		}
+	} while(1);
+
+	igt_debug("%u bytes flushed\n", bytes_read);
+}
+
+static int num_filled_bufs(void)
+{
+	return (produced - consumed);
+}
+
+static void pull_data(void)
+{
+	char *ptr;
+	int ret;
+
+	pthread_mutex_lock(&mutex);
+	while (num_filled_bufs() >= num_buffers) {
+		igt_debug("overflow, will wait, produced %u, consumed %u\n", produced, consumed);
+		/* Stall the main thread in case of overflow, as there are no
+		 * buffers available to store the new logs, otherwise there
+		 * could be corruption if both threads work on the same buffer.
+		 */
+		pthread_cond_wait(&overflow_cond, &mutex);
+	};
+	pthread_mutex_unlock(&mutex);
+
+	ptr = read_buffer + (produced % num_buffers) * SUBBUF_SIZE;
+
+	/* Read the logs from relay buffer */
+	ret = read(relay_fd, ptr, SUBBUF_SIZE);
+	igt_assert_f(ret >= 0, "failed to read from the guc log file\n");
+	igt_assert_f(!ret || ret == SUBBUF_SIZE, "invalid read from relay file\n");
+
+	if (ret) {
+		pthread_mutex_lock(&mutex);
+		produced++;
+		pthread_cond_signal(&underflow_cond);
+		pthread_mutex_unlock(&mutex);
+	} else {
+		/* Occasionally (very rare) read from the relay file returns no
+		 * data, albeit the polling done prior to read call indicated
+		 * availability of data.
+		 */
+		igt_debug("no data read from the relay file\n");
+	}
+}
+
+static void *flusher(void *arg)
+{
+	char *ptr;
+	int ret;
+
+	igt_debug("execution started of flusher thread\n");
+
+	do {
+		pthread_mutex_lock(&mutex);
+		while (!num_filled_bufs()) {
+			/* Exit only after completing the flush of all the filled
+			 * buffers as User would expect that all logs captured up
+			 * till the point of interruption/exit are written out to
+			 * the disk file.
+			 */
+			if (capturing_stopped) {
+				igt_debug("flusher to exit now\n");
+				pthread_mutex_unlock(&mutex);
+				return NULL;
+			}
+			pthread_cond_wait(&underflow_cond, &mutex);
+		};
+		pthread_mutex_unlock(&mutex);
+
+		ptr = read_buffer + (consumed % num_buffers) * SUBBUF_SIZE;
+
+		ret = write(outfile_fd, ptr, SUBBUF_SIZE);
+		igt_assert_f(ret == SUBBUF_SIZE, "couldn't dump the logs in a file\n");
+
+		total_bytes_written += ret;
+		if (max_filesize && (total_bytes_written > MB(max_filesize))) {
+			igt_debug("reached the target of %" PRIu64 " bytes\n", MB(max_filesize));
+			stop_logging = true;
+		}
+
+		pthread_mutex_lock(&mutex);
+		consumed++;
+		pthread_cond_signal(&overflow_cond);
+		pthread_mutex_unlock(&mutex);
+	} while(1);
+
+	return NULL;
+}
+
+static void init_flusher_thread(void)
+{
+	struct sched_param	thread_sched;
+	pthread_attr_t		p_attr;
+	int ret;
+
+	pthread_cond_init(&underflow_cond, NULL);
+	pthread_cond_init(&overflow_cond, NULL);
+	pthread_mutex_init(&mutex, NULL);
+
+	ret = pthread_attr_init(&p_attr);
+	igt_assert_f(ret == 0, "error obtaining default thread attributes\n");
+
+	ret = pthread_attr_setinheritsched(&p_attr, PTHREAD_EXPLICIT_SCHED);
+	igt_assert_f(ret == 0, "couldn't set inheritsched\n");
+
+	ret = pthread_attr_setschedpolicy(&p_attr, SCHED_RR);
+	igt_assert_f(ret == 0, "couldn't set thread scheduling policy\n");
+
+	/* Keep the flusher task also at rt priority, so that it doesn't get
+	 * too late in flushing the collected logs in local buffers to the disk,
+	 * and so main thread always have spare buffers to collect the logs.
+	 */
+	thread_sched.sched_priority = 5;
+	ret = pthread_attr_setschedparam(&p_attr, &thread_sched);
+	igt_assert_f(ret == 0, "couldn't set thread priority\n");
+
+	ret = pthread_create(&flush_thread, &p_attr, flusher, NULL);
+	igt_assert_f(ret == 0, "thread creation failed\n");
+
+	ret = pthread_attr_destroy(&p_attr);
+	igt_assert_f(ret == 0, "error destroying thread attributes\n");
+}
+
+static void open_relay_file(void)
+{
+	relay_fd = igt_debugfs_open(RELAY_FILE_NAME, O_RDONLY);
+	igt_assert_f(relay_fd >= 0, "couldn't open the guc log file\n");
+
+	/* Purge the old/boot-time logs from the relay buffer.
+	 * This is more for Val team's requirement, where they have to first
+	 * purge the existing logs before starting the tests for which the logs
+	 * are actually needed. After this logger will enter into a loop and
+	 * wait for the new data, at that point benchmark can be launched from
+	 * a different shell.
+	 */
+	if (discard_oldlogs)
+		pull_leftover_data();
+}
+
+static void open_output_file(void)
+{
+	/* Use Direct IO mode for the output file, as the data written is not
+	 * supposed to be accessed again, this saves a copy of data from App's
+	 * buffer to kernel buffer (Page cache). Due to no buffering on kernel
+	 * side, data is flushed out to disk faster and more buffering can be
+	 * done on the logger side to hide the disk IO latency.
+	 */
+	outfile_fd = open(out_filename ? : DEFAULT_OUTPUT_FILE_NAME,
+			  O_CREAT | O_WRONLY | O_TRUNC | O_DIRECT,
+			  0440);
+	igt_assert_f(outfile_fd >= 0, "couldn't open the output file\n");
+
+	free(out_filename);
+}
+
+static void init_main_thread(void)
+{
+	struct sched_param	thread_sched;
+	int ret;
+
+	/* Run the main thread at highest priority to ensure that it always
+	 * gets woken-up at earliest on arrival of new data and so is always
+	 * ready to pull the logs, otherwise there could be loss logs if
+	 * GuC firmware is generating logs at a very high rate.
+	 */
+	thread_sched.sched_priority = 1;
+	ret = sched_setscheduler(getpid(), SCHED_FIFO, &thread_sched);
+	igt_assert_f(ret == 0, "couldn't set the priority\n");
+
+	if (signal(SIGINT, int_sig_handler) == SIG_ERR)
+		igt_assert_f(0, "SIGINT handler registration failed\n");
+
+	if (signal(SIGALRM, int_sig_handler) == SIG_ERR)
+		igt_assert_f(0, "SIGALRM handler registration failed\n");
+
+	/* Need an aligned pointer for direct IO */
+	ret = posix_memalign((void **)&read_buffer, PAGE_SIZE, num_buffers * SUBBUF_SIZE);
+	igt_assert_f(ret == 0, "couldn't allocate the read buffer\n");
+
+	/* Keep the pages locked in RAM, avoid page fault overhead */
+	ret = mlock(read_buffer, num_buffers * SUBBUF_SIZE);
+	igt_assert_f(ret == 0, "failed to lock memory\n");
+
+	/* Enable the logging, it may not have been enabled from boot and so
+	 * the relay file also wouldn't have been created.
+	 */
+	guc_log_control(true);
+
+	open_relay_file();
+	open_output_file();
+}
+
+static int parse_options(int opt, int opt_index, void *data)
+{
+	igt_debug("opt %c optarg %s\n", opt, optarg);
+
+	switch(opt) {
+	case 'v':
+		verbosity_level = atoi(optarg);
+		igt_assert_f(verbosity_level < 0 || verbosity_level > 3, "invalid input for -v option\n");
+		igt_debug("verbosity level to be used is %d\n", verbosity_level);
+		break;
+	case 'o':
+		out_filename = strdup(optarg);
+		igt_assert_f(out_filename, "Couldn't allocate the o/p filename\n");
+		igt_debug("logs to be stored in file %s\n", out_filename);
+		break;
+	case 'b':
+		num_buffers = atoi(optarg);
+		igt_assert_f(num_buffers > 0, "invalid input for -b option\n");
+		igt_debug("number of buffers to be used is %d\n", num_buffers);
+		break;
+	case 't':
+		test_duration = atoi(optarg);
+		igt_assert_f(test_duration > 0, "invalid input for -t option\n");
+		igt_debug("logger to run for %d second\n", test_duration);
+		break;
+	case 'p':
+		poll_timeout = atoi(optarg);
+		igt_assert_f(poll_timeout != 0, "invalid input for -p option\n");
+		if (poll_timeout > 0)
+			igt_debug("polling to be done with %d millisecond timeout\n", poll_timeout);
+		break;
+	case 's':
+		max_filesize = atoi(optarg);
+		igt_assert_f(max_filesize > 0, "invalid input for -s option\n");
+		igt_debug("max allowed size of the output file is %d MB\n", max_filesize);
+		break;
+	case 'd':
+		discard_oldlogs = true;
+		igt_debug("old/boot-time logs will be discarded\n");
+		break;
+	}
+
+	return 0;
+}
+
+static void process_command_line(int argc, char **argv)
+{
+	static struct option long_options[] = {
+		{"verbosity", required_argument, 0, 'v'},
+		{"outputfile", required_argument, 0, 'o'},
+		{"buffers", required_argument, 0, 'b'},
+		{"testduration", required_argument, 0, 't'},
+		{"polltimeout", required_argument, 0, 'p'},
+		{"size", required_argument, 0, 's'},
+		{"discard", no_argument, 0, 'd'},
+		{ 0, 0, 0, 0 }
+	};
+
+	const char *help =
+		"  -v --verbosity=level   verbosity level of GuC logging (0-3)\n"
+		"  -o --outputfile=name   name of the output file, including the location, where logs will be stored\n"
+		"  -b --buffers=num       number of buffers to be maintained on logger side for storing logs\n"
+		"  -t --testduration=sec  max duration in seconds for which the logger should run\n"
+		"  -p --polltimeout=ms    polling timeout in ms, -1 == indefinite wait for the new data\n"
+		"  -s --size=MB           max size of output file in MBs after which logging will be stopped\n"
+		"  -d --discard           discard the old/boot-time logs before entering into the capture loop\n";
+
+	igt_simple_init_parse_opts(&argc, argv, "v:o:b:t:p:s:d", long_options,
+				   help, parse_options, NULL);
+}
+
+int main(int argc, char **argv)
+{
+	struct pollfd relay_poll_fd;
+	int nfds;
+	int ret;
+
+	process_command_line(argc, argv);
+
+	init_main_thread();
+
+	/* Use a separate thread for flushing the logs to a file on disk.
+	 * Main thread will buffer the data from relay file in its pool of
+	 * buffers and other thread will flush the data to disk in background.
+	 * This is needed, albeit by default data is written out to disk in
+	 * async mode, as when there are too many dirty pages in the RAM,
+	 * (/proc/sys/vm/dirty_ratio), kernel starts blocking the processes
+	 * doing the file writes.
+	 */
+	init_flusher_thread();
+
+	relay_poll_fd.fd = relay_fd;
+	relay_poll_fd.events = POLLIN;
+	relay_poll_fd.revents = 0;
+
+	nfds = 1; /* only one fd to poll */
+
+	alarm(test_duration); /* Start the alarm */
+
+	do {
+		/* Wait/poll for the new data to be available, relay doesn't
+		 * provide a blocking read.
+		 * On older kernels need to do polling with a timeout instead of
+		 * indefinite wait to avoid relying on relay for the wakeup, as
+		 * relay used to do the wakeup in a deferred manner on jiffies
+		 * granularity by scheduling a timer and moreover that timer was
+		 * re-scheduled on every newly produced buffer and so was pushed
+		 * out if there were multiple flush interrupts in a very quick
+		 * succession (less than a jiffy gap between 2 flush interrupts)
+		 * causing relay to run out of sub buffers to store new logs.
+		 */
+		ret = poll(&relay_poll_fd, nfds, poll_timeout);
+		if (ret < 0) {
+			if (errno == EINTR)
+				break;
+			igt_assert_f(0, "poll call failed\n");
+		}
+
+		/* No data available yet, poll again, hopefully new data is round the corner */
+		if (!relay_poll_fd.revents)
+			continue;
+
+		pull_data();
+	} while (!stop_logging);
+
+	/* Pause logging on the GuC side */
+	guc_log_control(false);
+
+	/* Signal flusher thread to make an exit */
+	capturing_stopped = 1;
+	pthread_cond_signal(&underflow_cond);
+	pthread_join(flush_thread, NULL);
+
+	pull_leftover_data();
+	igt_info("total bytes written %" PRIu64 "\n", total_bytes_written);
+
+	free(read_buffer);
+	close(relay_fd);
+	close(outfile_fd);
+	igt_exit();
+}
-- 
1.9.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] tools/intel_guc_logger: Utility for capturing GuC firmware logs in a file
  2016-10-10 10:59                     ` [PATCH v4] " akash.goel
@ 2016-10-10 13:52                       ` Tvrtko Ursulin
  2016-10-10 14:31                         ` Goel, Akash
  0 siblings, 1 reply; 17+ messages in thread
From: Tvrtko Ursulin @ 2016-10-10 13:52 UTC (permalink / raw)
  To: akash.goel, intel-gfx


On 10/10/2016 11:59, akash.goel@intel.com wrote:
> From: Akash Goel <akash.goel@intel.com>
>
> This patch provides a test utility which helps capture GuC firmware logs and
> then dump them to file.
> The logs are pulled from a debugfs file '/sys/kernel/debug/dri/guc_log' and
> by default stored into a file 'guc_log_dump.dat'. The name, including the
> location, of the output file can be changed through a command line argument.
>
> The utility goes into an infinite loop where it waits for the arrival of new
> logs and as soon as new set of logs are produced it captures them in its local
> buffer which is then flushed out to the file on disk.
> Any time when logging needs to be ended, User can stop this utility (CTRL+C).
>
> Before entering into a loop, it first discards whatever logs are present in
> the debugfs file.
> This way User can first launch this utility and then start a workload/activity
> for which GuC firmware logs are to be actually captured and keep running the
> utility for as long as its needed, like once the workload is over this utility
> can be forcefully stopped.
>
> If the logging wasn't enabled on GuC side by the Driver at boot time, utility
> will first enable the logging and later on when it is stopped (CTRL+C) it will
> also pause the logging on GuC side.
>
> v2:
> - Use combination of alarm system call & SIGALRM signal to run the utility
>    for required duration. (Tvrtko)
> - Fix inconsistencies, do minor cleanup and refactoring. (Tvrtko)
>
> v3:
> - Fix discrepancy for the output file command line option and update the
>    Usage/help string.
>
> v4:
> - Update the exit condition for flusher thread, now will exit only after
>    the capture loop is over and not when the flag to stop logging is set.
>    This handles a corner case, due to which the dump of last captured buffer
>    was getting missed.
> - Add a newline character at the end of assert messages.
> - Avoid the assert for the case, which occurs very rarely, when there are no
>    bytes read from the relay file.
>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Signed-off-by: Akash Goel <akash.goel@intel.com>
> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> (v3)
> ---
>   tools/Makefile.sources   |   1 +
>   tools/intel_guc_logger.c | 438 +++++++++++++++++++++++++++++++++++++++++++++++
>   2 files changed, 439 insertions(+)
>   create mode 100644 tools/intel_guc_logger.c
>
> diff --git a/tools/Makefile.sources b/tools/Makefile.sources
> index 2bb6c8e..be58871 100644
> --- a/tools/Makefile.sources
> +++ b/tools/Makefile.sources
> @@ -19,6 +19,7 @@ tools_prog_lists =		\
>   	intel_gpu_time		\
>   	intel_gpu_top		\
>   	intel_gtt		\
> +	intel_guc_logger        \
>   	intel_infoframes	\
>   	intel_l3_parity		\
>   	intel_lid		\
> diff --git a/tools/intel_guc_logger.c b/tools/intel_guc_logger.c
> new file mode 100644
> index 0000000..159a54e
> --- /dev/null
> +++ b/tools/intel_guc_logger.c
> @@ -0,0 +1,438 @@
> +
> +#define _GNU_SOURCE  /* For using O_DIRECT */
> +#include <inttypes.h>
> +#include <stdio.h>
> +#include <fcntl.h>
> +#include <sched.h>
> +#include <sys/ioctl.h>
> +#include <string.h>
> +#include <errno.h>
> +#include <signal.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/poll.h>
> +#include <sys/wait.h>
> +#include <sys/types.h>
> +#include <sys/syscall.h>
> +#include <sys/mman.h>
> +#include <assert.h>
> +#include <pthread.h>
> +
> +#include "igt.h"
> +
> +#define MB(x) ((uint64_t)(x) * 1024 * 1024)
> +#ifndef PAGE_SIZE
> +  #define PAGE_SIZE 4096
> +#endif
> +/* Currently the size of GuC log buffer is 19 pages & so is the size of relay
> + * subbuffer. If the size changes in future, then this define also needs to be
> + * updated accordingly.
> + */
> +#define SUBBUF_SIZE (19*PAGE_SIZE)
> +/* Need large buffering from logger side to hide the DISK IO latency, Driver
> + * can only store 8 snapshots of GuC log buffer in relay.
> + */
> +#define NUM_SUBBUFS 100
> +
> +#define RELAY_FILE_NAME  "guc_log"
> +#define DEFAULT_OUTPUT_FILE_NAME  "guc_log_dump.dat"
> +#define CONTROL_FILE_NAME "i915_guc_log_control"
> +
> +char *read_buffer;
> +char *out_filename;
> +int poll_timeout = 2; /* by default 2ms timeout */
> +pthread_mutex_t mutex;
> +pthread_t flush_thread;
> +int verbosity_level = 3; /* by default capture logs at max verbosity */
> +uint32_t produced, consumed;
> +uint64_t total_bytes_written;
> +int num_buffers = NUM_SUBBUFS;
> +int relay_fd, outfile_fd = -1;
> +uint32_t test_duration, max_filesize;
> +pthread_cond_t underflow_cond, overflow_cond;
> +bool stop_logging, discard_oldlogs, capturing_stopped;
> +
> +static void guc_log_control(bool enable_logging)
> +{
> +	int control_fd;
> +	char data[19];
> +	uint64_t val;
> +	int ret;
> +
> +	control_fd = igt_debugfs_open(CONTROL_FILE_NAME, O_WRONLY);
> +	igt_assert_f(control_fd >= 0, "couldn't open the guc log control file\n");
> +
> +	val = enable_logging ? ((verbosity_level << 4) | 0x1) : 0;
> +
> +	ret = snprintf(data, sizeof(data), "0x%" PRIx64, val);
> +	igt_assert(ret > 2 && ret < sizeof(data));
> +
> +	ret = write(control_fd, data, ret);
> +	igt_assert_f(ret > 0, "couldn't write to the log control file\n");
> +
> +	close(control_fd);
> +}
> +
> +static void int_sig_handler(int sig)
> +{
> +	igt_info("received signal %d\n", sig);
> +
> +	stop_logging = true;
> +}
> +
> +static void pull_leftover_data(void)
> +{
> +	unsigned int bytes_read = 0;
> +	int ret;
> +
> +	do {
> +		/* Read the logs from relay buffer */
> +		ret = read(relay_fd, read_buffer, SUBBUF_SIZE);
> +		if (!ret)
> +			break;
> +
> +		igt_assert_f(ret > 0, "failed to read from the guc log file\n");
> +		igt_assert_f(ret == SUBBUF_SIZE, "invalid read from relay file\n");
> +
> +		bytes_read += ret;
> +
> +		if (outfile_fd >= 0) {
> +			ret = write(outfile_fd, read_buffer, SUBBUF_SIZE);
> +			igt_assert_f(ret == SUBBUF_SIZE, "couldn't dump the logs in a file\n");
> +			total_bytes_written += ret;
> +		}
> +	} while(1);
> +
> +	igt_debug("%u bytes flushed\n", bytes_read);
> +}
> +
> +static int num_filled_bufs(void)
> +{
> +	return (produced - consumed);
> +}
> +
> +static void pull_data(void)
> +{
> +	char *ptr;
> +	int ret;
> +
> +	pthread_mutex_lock(&mutex);
> +	while (num_filled_bufs() >= num_buffers) {
> +		igt_debug("overflow, will wait, produced %u, consumed %u\n", produced, consumed);
> +		/* Stall the main thread in case of overflow, as there are no
> +		 * buffers available to store the new logs, otherwise there
> +		 * could be corruption if both threads work on the same buffer.
> +		 */
> +		pthread_cond_wait(&overflow_cond, &mutex);
> +	};
> +	pthread_mutex_unlock(&mutex);
> +
> +	ptr = read_buffer + (produced % num_buffers) * SUBBUF_SIZE;
> +
> +	/* Read the logs from relay buffer */
> +	ret = read(relay_fd, ptr, SUBBUF_SIZE);
> +	igt_assert_f(ret >= 0, "failed to read from the guc log file\n");
> +	igt_assert_f(!ret || ret == SUBBUF_SIZE, "invalid read from relay file\n");
> +
> +	if (ret) {
> +		pthread_mutex_lock(&mutex);
> +		produced++;
> +		pthread_cond_signal(&underflow_cond);
> +		pthread_mutex_unlock(&mutex);
> +	} else {
> +		/* Occasionally (very rare) read from the relay file returns no
> +		 * data, albeit the polling done prior to read call indicated
> +		 * availability of data.
> +		 */
> +		igt_debug("no data read from the relay file\n");

This worries me. Relayfs kernel side is quite simple which would suggest 
there is maybe a bug in the logger? And it is very important that we 
trust logger correctly capturing data. Because people will spend a lot 
of time debugging issues from the logs and it would be bad to have to 
chase irregularities caused by a problem in the logger.

Could you please try and identify why exactly is this happening?

Regards,

Tvrtko

> +	}
> +}
> +
> +static void *flusher(void *arg)
> +{
> +	char *ptr;
> +	int ret;
> +
> +	igt_debug("execution started of flusher thread\n");
> +
> +	do {
> +		pthread_mutex_lock(&mutex);
> +		while (!num_filled_bufs()) {
> +			/* Exit only after completing the flush of all the filled
> +			 * buffers as User would expect that all logs captured up
> +			 * till the point of interruption/exit are written out to
> +			 * the disk file.
> +			 */
> +			if (capturing_stopped) {
> +				igt_debug("flusher to exit now\n");
> +				pthread_mutex_unlock(&mutex);
> +				return NULL;
> +			}
> +			pthread_cond_wait(&underflow_cond, &mutex);
> +		};
> +		pthread_mutex_unlock(&mutex);
> +
> +		ptr = read_buffer + (consumed % num_buffers) * SUBBUF_SIZE;
> +
> +		ret = write(outfile_fd, ptr, SUBBUF_SIZE);
> +		igt_assert_f(ret == SUBBUF_SIZE, "couldn't dump the logs in a file\n");
> +
> +		total_bytes_written += ret;
> +		if (max_filesize && (total_bytes_written > MB(max_filesize))) {
> +			igt_debug("reached the target of %" PRIu64 " bytes\n", MB(max_filesize));
> +			stop_logging = true;
> +		}
> +
> +		pthread_mutex_lock(&mutex);
> +		consumed++;
> +		pthread_cond_signal(&overflow_cond);
> +		pthread_mutex_unlock(&mutex);
> +	} while(1);
> +
> +	return NULL;
> +}
> +
> +static void init_flusher_thread(void)
> +{
> +	struct sched_param	thread_sched;
> +	pthread_attr_t		p_attr;
> +	int ret;
> +
> +	pthread_cond_init(&underflow_cond, NULL);
> +	pthread_cond_init(&overflow_cond, NULL);
> +	pthread_mutex_init(&mutex, NULL);
> +
> +	ret = pthread_attr_init(&p_attr);
> +	igt_assert_f(ret == 0, "error obtaining default thread attributes\n");
> +
> +	ret = pthread_attr_setinheritsched(&p_attr, PTHREAD_EXPLICIT_SCHED);
> +	igt_assert_f(ret == 0, "couldn't set inheritsched\n");
> +
> +	ret = pthread_attr_setschedpolicy(&p_attr, SCHED_RR);
> +	igt_assert_f(ret == 0, "couldn't set thread scheduling policy\n");
> +
> +	/* Keep the flusher task also at rt priority, so that it doesn't get
> +	 * too late in flushing the collected logs in local buffers to the disk,
> +	 * and so main thread always have spare buffers to collect the logs.
> +	 */
> +	thread_sched.sched_priority = 5;
> +	ret = pthread_attr_setschedparam(&p_attr, &thread_sched);
> +	igt_assert_f(ret == 0, "couldn't set thread priority\n");
> +
> +	ret = pthread_create(&flush_thread, &p_attr, flusher, NULL);
> +	igt_assert_f(ret == 0, "thread creation failed\n");
> +
> +	ret = pthread_attr_destroy(&p_attr);
> +	igt_assert_f(ret == 0, "error destroying thread attributes\n");
> +}
> +
> +static void open_relay_file(void)
> +{
> +	relay_fd = igt_debugfs_open(RELAY_FILE_NAME, O_RDONLY);
> +	igt_assert_f(relay_fd >= 0, "couldn't open the guc log file\n");
> +
> +	/* Purge the old/boot-time logs from the relay buffer.
> +	 * This is more for Val team's requirement, where they have to first
> +	 * purge the existing logs before starting the tests for which the logs
> +	 * are actually needed. After this logger will enter into a loop and
> +	 * wait for the new data, at that point benchmark can be launched from
> +	 * a different shell.
> +	 */
> +	if (discard_oldlogs)
> +		pull_leftover_data();
> +}
> +
> +static void open_output_file(void)
> +{
> +	/* Use Direct IO mode for the output file, as the data written is not
> +	 * supposed to be accessed again, this saves a copy of data from App's
> +	 * buffer to kernel buffer (Page cache). Due to no buffering on kernel
> +	 * side, data is flushed out to disk faster and more buffering can be
> +	 * done on the logger side to hide the disk IO latency.
> +	 */
> +	outfile_fd = open(out_filename ? : DEFAULT_OUTPUT_FILE_NAME,
> +			  O_CREAT | O_WRONLY | O_TRUNC | O_DIRECT,
> +			  0440);
> +	igt_assert_f(outfile_fd >= 0, "couldn't open the output file\n");
> +
> +	free(out_filename);
> +}
> +
> +static void init_main_thread(void)
> +{
> +	struct sched_param	thread_sched;
> +	int ret;
> +
> +	/* Run the main thread at highest priority to ensure that it always
> +	 * gets woken-up at earliest on arrival of new data and so is always
> +	 * ready to pull the logs, otherwise there could be loss logs if
> +	 * GuC firmware is generating logs at a very high rate.
> +	 */
> +	thread_sched.sched_priority = 1;
> +	ret = sched_setscheduler(getpid(), SCHED_FIFO, &thread_sched);
> +	igt_assert_f(ret == 0, "couldn't set the priority\n");
> +
> +	if (signal(SIGINT, int_sig_handler) == SIG_ERR)
> +		igt_assert_f(0, "SIGINT handler registration failed\n");
> +
> +	if (signal(SIGALRM, int_sig_handler) == SIG_ERR)
> +		igt_assert_f(0, "SIGALRM handler registration failed\n");
> +
> +	/* Need an aligned pointer for direct IO */
> +	ret = posix_memalign((void **)&read_buffer, PAGE_SIZE, num_buffers * SUBBUF_SIZE);
> +	igt_assert_f(ret == 0, "couldn't allocate the read buffer\n");
> +
> +	/* Keep the pages locked in RAM, avoid page fault overhead */
> +	ret = mlock(read_buffer, num_buffers * SUBBUF_SIZE);
> +	igt_assert_f(ret == 0, "failed to lock memory\n");
> +
> +	/* Enable the logging, it may not have been enabled from boot and so
> +	 * the relay file also wouldn't have been created.
> +	 */
> +	guc_log_control(true);
> +
> +	open_relay_file();
> +	open_output_file();
> +}
> +
> +static int parse_options(int opt, int opt_index, void *data)
> +{
> +	igt_debug("opt %c optarg %s\n", opt, optarg);
> +
> +	switch(opt) {
> +	case 'v':
> +		verbosity_level = atoi(optarg);
> +		igt_assert_f(verbosity_level < 0 || verbosity_level > 3, "invalid input for -v option\n");
> +		igt_debug("verbosity level to be used is %d\n", verbosity_level);
> +		break;
> +	case 'o':
> +		out_filename = strdup(optarg);
> +		igt_assert_f(out_filename, "Couldn't allocate the o/p filename\n");
> +		igt_debug("logs to be stored in file %s\n", out_filename);
> +		break;
> +	case 'b':
> +		num_buffers = atoi(optarg);
> +		igt_assert_f(num_buffers > 0, "invalid input for -b option\n");
> +		igt_debug("number of buffers to be used is %d\n", num_buffers);
> +		break;
> +	case 't':
> +		test_duration = atoi(optarg);
> +		igt_assert_f(test_duration > 0, "invalid input for -t option\n");
> +		igt_debug("logger to run for %d second\n", test_duration);
> +		break;
> +	case 'p':
> +		poll_timeout = atoi(optarg);
> +		igt_assert_f(poll_timeout != 0, "invalid input for -p option\n");
> +		if (poll_timeout > 0)
> +			igt_debug("polling to be done with %d millisecond timeout\n", poll_timeout);
> +		break;
> +	case 's':
> +		max_filesize = atoi(optarg);
> +		igt_assert_f(max_filesize > 0, "invalid input for -s option\n");
> +		igt_debug("max allowed size of the output file is %d MB\n", max_filesize);
> +		break;
> +	case 'd':
> +		discard_oldlogs = true;
> +		igt_debug("old/boot-time logs will be discarded\n");
> +		break;
> +	}
> +
> +	return 0;
> +}
> +
> +static void process_command_line(int argc, char **argv)
> +{
> +	static struct option long_options[] = {
> +		{"verbosity", required_argument, 0, 'v'},
> +		{"outputfile", required_argument, 0, 'o'},
> +		{"buffers", required_argument, 0, 'b'},
> +		{"testduration", required_argument, 0, 't'},
> +		{"polltimeout", required_argument, 0, 'p'},
> +		{"size", required_argument, 0, 's'},
> +		{"discard", no_argument, 0, 'd'},
> +		{ 0, 0, 0, 0 }
> +	};
> +
> +	const char *help =
> +		"  -v --verbosity=level   verbosity level of GuC logging (0-3)\n"
> +		"  -o --outputfile=name   name of the output file, including the location, where logs will be stored\n"
> +		"  -b --buffers=num       number of buffers to be maintained on logger side for storing logs\n"
> +		"  -t --testduration=sec  max duration in seconds for which the logger should run\n"
> +		"  -p --polltimeout=ms    polling timeout in ms, -1 == indefinite wait for the new data\n"
> +		"  -s --size=MB           max size of output file in MBs after which logging will be stopped\n"
> +		"  -d --discard           discard the old/boot-time logs before entering into the capture loop\n";
> +
> +	igt_simple_init_parse_opts(&argc, argv, "v:o:b:t:p:s:d", long_options,
> +				   help, parse_options, NULL);
> +}
> +
> +int main(int argc, char **argv)
> +{
> +	struct pollfd relay_poll_fd;
> +	int nfds;
> +	int ret;
> +
> +	process_command_line(argc, argv);
> +
> +	init_main_thread();
> +
> +	/* Use a separate thread for flushing the logs to a file on disk.
> +	 * Main thread will buffer the data from relay file in its pool of
> +	 * buffers and other thread will flush the data to disk in background.
> +	 * This is needed, albeit by default data is written out to disk in
> +	 * async mode, as when there are too many dirty pages in the RAM,
> +	 * (/proc/sys/vm/dirty_ratio), kernel starts blocking the processes
> +	 * doing the file writes.
> +	 */
> +	init_flusher_thread();
> +
> +	relay_poll_fd.fd = relay_fd;
> +	relay_poll_fd.events = POLLIN;
> +	relay_poll_fd.revents = 0;
> +
> +	nfds = 1; /* only one fd to poll */
> +
> +	alarm(test_duration); /* Start the alarm */
> +
> +	do {
> +		/* Wait/poll for the new data to be available, relay doesn't
> +		 * provide a blocking read.
> +		 * On older kernels need to do polling with a timeout instead of
> +		 * indefinite wait to avoid relying on relay for the wakeup, as
> +		 * relay used to do the wakeup in a deferred manner on jiffies
> +		 * granularity by scheduling a timer and moreover that timer was
> +		 * re-scheduled on every newly produced buffer and so was pushed
> +		 * out if there were multiple flush interrupts in a very quick
> +		 * succession (less than a jiffy gap between 2 flush interrupts)
> +		 * causing relay to run out of sub buffers to store new logs.
> +		 */
> +		ret = poll(&relay_poll_fd, nfds, poll_timeout);
> +		if (ret < 0) {
> +			if (errno == EINTR)
> +				break;
> +			igt_assert_f(0, "poll call failed\n");
> +		}
> +
> +		/* No data available yet, poll again, hopefully new data is round the corner */
> +		if (!relay_poll_fd.revents)
> +			continue;
> +
> +		pull_data();
> +	} while (!stop_logging);
> +
> +	/* Pause logging on the GuC side */
> +	guc_log_control(false);
> +
> +	/* Signal flusher thread to make an exit */
> +	capturing_stopped = 1;
> +	pthread_cond_signal(&underflow_cond);
> +	pthread_join(flush_thread, NULL);
> +
> +	pull_leftover_data();
> +	igt_info("total bytes written %" PRIu64 "\n", total_bytes_written);
> +
> +	free(read_buffer);
> +	close(relay_fd);
> +	close(outfile_fd);
> +	igt_exit();
> +}

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] tools/intel_guc_logger: Utility for capturing GuC firmware logs in a file
  2016-10-10 13:52                       ` Tvrtko Ursulin
@ 2016-10-10 14:31                         ` Goel, Akash
  2016-10-12 13:32                           ` Tvrtko Ursulin
  0 siblings, 1 reply; 17+ messages in thread
From: Goel, Akash @ 2016-10-10 14:31 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx; +Cc: akash.goel



On 10/10/2016 7:22 PM, Tvrtko Ursulin wrote:
>
> On 10/10/2016 11:59, akash.goel@intel.com wrote:
>> From: Akash Goel <akash.goel@intel.com>
>>
>> This patch provides a test utility which helps capture GuC firmware
>> logs and
>> then dump them to file.
>> The logs are pulled from a debugfs file
>> '/sys/kernel/debug/dri/guc_log' and
>> by default stored into a file 'guc_log_dump.dat'. The name, including the
>> location, of the output file can be changed through a command line
>> argument.
>>
>> The utility goes into an infinite loop where it waits for the arrival
>> of new
>> logs and as soon as new set of logs are produced it captures them in
>> its local
>> buffer which is then flushed out to the file on disk.
>> Any time when logging needs to be ended, User can stop this utility
>> (CTRL+C).
>>
>> Before entering into a loop, it first discards whatever logs are
>> present in
>> the debugfs file.
>> This way User can first launch this utility and then start a
>> workload/activity
>> for which GuC firmware logs are to be actually captured and keep
>> running the
>> utility for as long as its needed, like once the workload is over this
>> utility
>> can be forcefully stopped.
>>
>> If the logging wasn't enabled on GuC side by the Driver at boot time,
>> utility
>> will first enable the logging and later on when it is stopped (CTRL+C)
>> it will
>> also pause the logging on GuC side.
>>
>> v2:
>> - Use combination of alarm system call & SIGALRM signal to run the
>> utility
>>    for required duration. (Tvrtko)
>> - Fix inconsistencies, do minor cleanup and refactoring. (Tvrtko)
>>
>> v3:
>> - Fix discrepancy for the output file command line option and update the
>>    Usage/help string.
>>
>> v4:
>> - Update the exit condition for flusher thread, now will exit only after
>>    the capture loop is over and not when the flag to stop logging is set.
>>    This handles a corner case, due to which the dump of last captured
>> buffer
>>    was getting missed.
>> - Add a newline character at the end of assert messages.
>> - Avoid the assert for the case, which occurs very rarely, when there
>> are no
>>    bytes read from the relay file.
>>
>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
>> Signed-off-by: Akash Goel <akash.goel@intel.com>
>> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> (v3)
>> ---
>>   tools/Makefile.sources   |   1 +
>>   tools/intel_guc_logger.c | 438
>> +++++++++++++++++++++++++++++++++++++++++++++++
>>   2 files changed, 439 insertions(+)
>>   create mode 100644 tools/intel_guc_logger.c
>>
>> diff --git a/tools/Makefile.sources b/tools/Makefile.sources
>> index 2bb6c8e..be58871 100644
>> --- a/tools/Makefile.sources
>> +++ b/tools/Makefile.sources
>> @@ -19,6 +19,7 @@ tools_prog_lists =        \
>>       intel_gpu_time        \
>>       intel_gpu_top        \
>>       intel_gtt        \
>> +    intel_guc_logger        \
>>       intel_infoframes    \
>>       intel_l3_parity        \
>>       intel_lid        \
>> diff --git a/tools/intel_guc_logger.c b/tools/intel_guc_logger.c
>> new file mode 100644
>> index 0000000..159a54e
>> --- /dev/null
>> +++ b/tools/intel_guc_logger.c
>> @@ -0,0 +1,438 @@
>> +
>> +#define _GNU_SOURCE  /* For using O_DIRECT */
>> +#include <inttypes.h>
>> +#include <stdio.h>
>> +#include <fcntl.h>
>> +#include <sched.h>
>> +#include <sys/ioctl.h>
>> +#include <string.h>
>> +#include <errno.h>
>> +#include <signal.h>
>> +#include <stdlib.h>
>> +#include <unistd.h>
>> +#include <sys/poll.h>
>> +#include <sys/wait.h>
>> +#include <sys/types.h>
>> +#include <sys/syscall.h>
>> +#include <sys/mman.h>
>> +#include <assert.h>
>> +#include <pthread.h>
>> +
>> +#include "igt.h"
>> +
>> +#define MB(x) ((uint64_t)(x) * 1024 * 1024)
>> +#ifndef PAGE_SIZE
>> +  #define PAGE_SIZE 4096
>> +#endif
>> +/* Currently the size of GuC log buffer is 19 pages & so is the size
>> of relay
>> + * subbuffer. If the size changes in future, then this define also
>> needs to be
>> + * updated accordingly.
>> + */
>> +#define SUBBUF_SIZE (19*PAGE_SIZE)
>> +/* Need large buffering from logger side to hide the DISK IO latency,
>> Driver
>> + * can only store 8 snapshots of GuC log buffer in relay.
>> + */
>> +#define NUM_SUBBUFS 100
>> +
>> +#define RELAY_FILE_NAME  "guc_log"
>> +#define DEFAULT_OUTPUT_FILE_NAME  "guc_log_dump.dat"
>> +#define CONTROL_FILE_NAME "i915_guc_log_control"
>> +
>> +char *read_buffer;
>> +char *out_filename;
>> +int poll_timeout = 2; /* by default 2ms timeout */
>> +pthread_mutex_t mutex;
>> +pthread_t flush_thread;
>> +int verbosity_level = 3; /* by default capture logs at max verbosity */
>> +uint32_t produced, consumed;
>> +uint64_t total_bytes_written;
>> +int num_buffers = NUM_SUBBUFS;
>> +int relay_fd, outfile_fd = -1;
>> +uint32_t test_duration, max_filesize;
>> +pthread_cond_t underflow_cond, overflow_cond;
>> +bool stop_logging, discard_oldlogs, capturing_stopped;
>> +
>> +static void guc_log_control(bool enable_logging)
>> +{
>> +    int control_fd;
>> +    char data[19];
>> +    uint64_t val;
>> +    int ret;
>> +
>> +    control_fd = igt_debugfs_open(CONTROL_FILE_NAME, O_WRONLY);
>> +    igt_assert_f(control_fd >= 0, "couldn't open the guc log control
>> file\n");
>> +
>> +    val = enable_logging ? ((verbosity_level << 4) | 0x1) : 0;
>> +
>> +    ret = snprintf(data, sizeof(data), "0x%" PRIx64, val);
>> +    igt_assert(ret > 2 && ret < sizeof(data));
>> +
>> +    ret = write(control_fd, data, ret);
>> +    igt_assert_f(ret > 0, "couldn't write to the log control file\n");
>> +
>> +    close(control_fd);
>> +}
>> +
>> +static void int_sig_handler(int sig)
>> +{
>> +    igt_info("received signal %d\n", sig);
>> +
>> +    stop_logging = true;
>> +}
>> +
>> +static void pull_leftover_data(void)
>> +{
>> +    unsigned int bytes_read = 0;
>> +    int ret;
>> +
>> +    do {
>> +        /* Read the logs from relay buffer */
>> +        ret = read(relay_fd, read_buffer, SUBBUF_SIZE);
>> +        if (!ret)
>> +            break;
>> +
>> +        igt_assert_f(ret > 0, "failed to read from the guc log file\n");
>> +        igt_assert_f(ret == SUBBUF_SIZE, "invalid read from relay
>> file\n");
>> +
>> +        bytes_read += ret;
>> +
>> +        if (outfile_fd >= 0) {
>> +            ret = write(outfile_fd, read_buffer, SUBBUF_SIZE);
>> +            igt_assert_f(ret == SUBBUF_SIZE, "couldn't dump the logs
>> in a file\n");
>> +            total_bytes_written += ret;
>> +        }
>> +    } while(1);
>> +
>> +    igt_debug("%u bytes flushed\n", bytes_read);
>> +}
>> +
>> +static int num_filled_bufs(void)
>> +{
>> +    return (produced - consumed);
>> +}
>> +
>> +static void pull_data(void)
>> +{
>> +    char *ptr;
>> +    int ret;
>> +
>> +    pthread_mutex_lock(&mutex);
>> +    while (num_filled_bufs() >= num_buffers) {
>> +        igt_debug("overflow, will wait, produced %u, consumed %u\n",
>> produced, consumed);
>> +        /* Stall the main thread in case of overflow, as there are no
>> +         * buffers available to store the new logs, otherwise there
>> +         * could be corruption if both threads work on the same buffer.
>> +         */
>> +        pthread_cond_wait(&overflow_cond, &mutex);
>> +    };
>> +    pthread_mutex_unlock(&mutex);
>> +
>> +    ptr = read_buffer + (produced % num_buffers) * SUBBUF_SIZE;
>> +
>> +    /* Read the logs from relay buffer */
>> +    ret = read(relay_fd, ptr, SUBBUF_SIZE);
>> +    igt_assert_f(ret >= 0, "failed to read from the guc log file\n");
>> +    igt_assert_f(!ret || ret == SUBBUF_SIZE, "invalid read from relay
>> file\n");
>> +
>> +    if (ret) {
>> +        pthread_mutex_lock(&mutex);
>> +        produced++;
>> +        pthread_cond_signal(&underflow_cond);
>> +        pthread_mutex_unlock(&mutex);
>> +    } else {
>> +        /* Occasionally (very rare) read from the relay file returns no
>> +         * data, albeit the polling done prior to read call indicated
>> +         * availability of data.
>> +         */
>> +        igt_debug("no data read from the relay file\n");
>
> This worries me. Relayfs kernel side is quite simple which would suggest
> there is maybe a bug in the logger? And it is very important that we
> trust logger correctly capturing data. Because people will spend a lot
> of time debugging issues from the logs and it would be bad to have to
> chase irregularities caused by a problem in the logger.
>
Actually I see this issue rarely and only after logging some GBs of 
data. And most likely it won't have any bearing on the correctness of 
captured data, the missed snapshot shall be pulled in the next iteration 
of capture loop. Also no-overwrite mode of relay is being used.

As per my hunch, it can't be ruled out that this is not a relayfs issue.

> Could you please try and identify why exactly is this happening?
Will try to investigate both from relayfs side as well as logger side, 
but it would take some time.
This may not be considered a blocker.

Best regards
Akash
>
> Regards,
>
> Tvrtko
>
>> +    }
>> +}
>> +
>> +static void *flusher(void *arg)
>> +{
>> +    char *ptr;
>> +    int ret;
>> +
>> +    igt_debug("execution started of flusher thread\n");
>> +
>> +    do {
>> +        pthread_mutex_lock(&mutex);
>> +        while (!num_filled_bufs()) {
>> +            /* Exit only after completing the flush of all the filled
>> +             * buffers as User would expect that all logs captured up
>> +             * till the point of interruption/exit are written out to
>> +             * the disk file.
>> +             */
>> +            if (capturing_stopped) {
>> +                igt_debug("flusher to exit now\n");
>> +                pthread_mutex_unlock(&mutex);
>> +                return NULL;
>> +            }
>> +            pthread_cond_wait(&underflow_cond, &mutex);
>> +        };
>> +        pthread_mutex_unlock(&mutex);
>> +
>> +        ptr = read_buffer + (consumed % num_buffers) * SUBBUF_SIZE;
>> +
>> +        ret = write(outfile_fd, ptr, SUBBUF_SIZE);
>> +        igt_assert_f(ret == SUBBUF_SIZE, "couldn't dump the logs in a
>> file\n");
>> +
>> +        total_bytes_written += ret;
>> +        if (max_filesize && (total_bytes_written > MB(max_filesize))) {
>> +            igt_debug("reached the target of %" PRIu64 " bytes\n",
>> MB(max_filesize));
>> +            stop_logging = true;
>> +        }
>> +
>> +        pthread_mutex_lock(&mutex);
>> +        consumed++;
>> +        pthread_cond_signal(&overflow_cond);
>> +        pthread_mutex_unlock(&mutex);
>> +    } while(1);
>> +
>> +    return NULL;
>> +}
>> +
>> +static void init_flusher_thread(void)
>> +{
>> +    struct sched_param    thread_sched;
>> +    pthread_attr_t        p_attr;
>> +    int ret;
>> +
>> +    pthread_cond_init(&underflow_cond, NULL);
>> +    pthread_cond_init(&overflow_cond, NULL);
>> +    pthread_mutex_init(&mutex, NULL);
>> +
>> +    ret = pthread_attr_init(&p_attr);
>> +    igt_assert_f(ret == 0, "error obtaining default thread
>> attributes\n");
>> +
>> +    ret = pthread_attr_setinheritsched(&p_attr, PTHREAD_EXPLICIT_SCHED);
>> +    igt_assert_f(ret == 0, "couldn't set inheritsched\n");
>> +
>> +    ret = pthread_attr_setschedpolicy(&p_attr, SCHED_RR);
>> +    igt_assert_f(ret == 0, "couldn't set thread scheduling policy\n");
>> +
>> +    /* Keep the flusher task also at rt priority, so that it doesn't get
>> +     * too late in flushing the collected logs in local buffers to
>> the disk,
>> +     * and so main thread always have spare buffers to collect the logs.
>> +     */
>> +    thread_sched.sched_priority = 5;
>> +    ret = pthread_attr_setschedparam(&p_attr, &thread_sched);
>> +    igt_assert_f(ret == 0, "couldn't set thread priority\n");
>> +
>> +    ret = pthread_create(&flush_thread, &p_attr, flusher, NULL);
>> +    igt_assert_f(ret == 0, "thread creation failed\n");
>> +
>> +    ret = pthread_attr_destroy(&p_attr);
>> +    igt_assert_f(ret == 0, "error destroying thread attributes\n");
>> +}
>> +
>> +static void open_relay_file(void)
>> +{
>> +    relay_fd = igt_debugfs_open(RELAY_FILE_NAME, O_RDONLY);
>> +    igt_assert_f(relay_fd >= 0, "couldn't open the guc log file\n");
>> +
>> +    /* Purge the old/boot-time logs from the relay buffer.
>> +     * This is more for Val team's requirement, where they have to first
>> +     * purge the existing logs before starting the tests for which
>> the logs
>> +     * are actually needed. After this logger will enter into a loop and
>> +     * wait for the new data, at that point benchmark can be launched
>> from
>> +     * a different shell.
>> +     */
>> +    if (discard_oldlogs)
>> +        pull_leftover_data();
>> +}
>> +
>> +static void open_output_file(void)
>> +{
>> +    /* Use Direct IO mode for the output file, as the data written is
>> not
>> +     * supposed to be accessed again, this saves a copy of data from
>> App's
>> +     * buffer to kernel buffer (Page cache). Due to no buffering on
>> kernel
>> +     * side, data is flushed out to disk faster and more buffering
>> can be
>> +     * done on the logger side to hide the disk IO latency.
>> +     */
>> +    outfile_fd = open(out_filename ? : DEFAULT_OUTPUT_FILE_NAME,
>> +              O_CREAT | O_WRONLY | O_TRUNC | O_DIRECT,
>> +              0440);
>> +    igt_assert_f(outfile_fd >= 0, "couldn't open the output file\n");
>> +
>> +    free(out_filename);
>> +}
>> +
>> +static void init_main_thread(void)
>> +{
>> +    struct sched_param    thread_sched;
>> +    int ret;
>> +
>> +    /* Run the main thread at highest priority to ensure that it always
>> +     * gets woken-up at earliest on arrival of new data and so is always
>> +     * ready to pull the logs, otherwise there could be loss logs if
>> +     * GuC firmware is generating logs at a very high rate.
>> +     */
>> +    thread_sched.sched_priority = 1;
>> +    ret = sched_setscheduler(getpid(), SCHED_FIFO, &thread_sched);
>> +    igt_assert_f(ret == 0, "couldn't set the priority\n");
>> +
>> +    if (signal(SIGINT, int_sig_handler) == SIG_ERR)
>> +        igt_assert_f(0, "SIGINT handler registration failed\n");
>> +
>> +    if (signal(SIGALRM, int_sig_handler) == SIG_ERR)
>> +        igt_assert_f(0, "SIGALRM handler registration failed\n");
>> +
>> +    /* Need an aligned pointer for direct IO */
>> +    ret = posix_memalign((void **)&read_buffer, PAGE_SIZE,
>> num_buffers * SUBBUF_SIZE);
>> +    igt_assert_f(ret == 0, "couldn't allocate the read buffer\n");
>> +
>> +    /* Keep the pages locked in RAM, avoid page fault overhead */
>> +    ret = mlock(read_buffer, num_buffers * SUBBUF_SIZE);
>> +    igt_assert_f(ret == 0, "failed to lock memory\n");
>> +
>> +    /* Enable the logging, it may not have been enabled from boot and so
>> +     * the relay file also wouldn't have been created.
>> +     */
>> +    guc_log_control(true);
>> +
>> +    open_relay_file();
>> +    open_output_file();
>> +}
>> +
>> +static int parse_options(int opt, int opt_index, void *data)
>> +{
>> +    igt_debug("opt %c optarg %s\n", opt, optarg);
>> +
>> +    switch(opt) {
>> +    case 'v':
>> +        verbosity_level = atoi(optarg);
>> +        igt_assert_f(verbosity_level < 0 || verbosity_level > 3,
>> "invalid input for -v option\n");
>> +        igt_debug("verbosity level to be used is %d\n",
>> verbosity_level);
>> +        break;
>> +    case 'o':
>> +        out_filename = strdup(optarg);
>> +        igt_assert_f(out_filename, "Couldn't allocate the o/p
>> filename\n");
>> +        igt_debug("logs to be stored in file %s\n", out_filename);
>> +        break;
>> +    case 'b':
>> +        num_buffers = atoi(optarg);
>> +        igt_assert_f(num_buffers > 0, "invalid input for -b option\n");
>> +        igt_debug("number of buffers to be used is %d\n", num_buffers);
>> +        break;
>> +    case 't':
>> +        test_duration = atoi(optarg);
>> +        igt_assert_f(test_duration > 0, "invalid input for -t
>> option\n");
>> +        igt_debug("logger to run for %d second\n", test_duration);
>> +        break;
>> +    case 'p':
>> +        poll_timeout = atoi(optarg);
>> +        igt_assert_f(poll_timeout != 0, "invalid input for -p
>> option\n");
>> +        if (poll_timeout > 0)
>> +            igt_debug("polling to be done with %d millisecond
>> timeout\n", poll_timeout);
>> +        break;
>> +    case 's':
>> +        max_filesize = atoi(optarg);
>> +        igt_assert_f(max_filesize > 0, "invalid input for -s option\n");
>> +        igt_debug("max allowed size of the output file is %d MB\n",
>> max_filesize);
>> +        break;
>> +    case 'd':
>> +        discard_oldlogs = true;
>> +        igt_debug("old/boot-time logs will be discarded\n");
>> +        break;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static void process_command_line(int argc, char **argv)
>> +{
>> +    static struct option long_options[] = {
>> +        {"verbosity", required_argument, 0, 'v'},
>> +        {"outputfile", required_argument, 0, 'o'},
>> +        {"buffers", required_argument, 0, 'b'},
>> +        {"testduration", required_argument, 0, 't'},
>> +        {"polltimeout", required_argument, 0, 'p'},
>> +        {"size", required_argument, 0, 's'},
>> +        {"discard", no_argument, 0, 'd'},
>> +        { 0, 0, 0, 0 }
>> +    };
>> +
>> +    const char *help =
>> +        "  -v --verbosity=level   verbosity level of GuC logging
>> (0-3)\n"
>> +        "  -o --outputfile=name   name of the output file, including
>> the location, where logs will be stored\n"
>> +        "  -b --buffers=num       number of buffers to be maintained
>> on logger side for storing logs\n"
>> +        "  -t --testduration=sec  max duration in seconds for which
>> the logger should run\n"
>> +        "  -p --polltimeout=ms    polling timeout in ms, -1 ==
>> indefinite wait for the new data\n"
>> +        "  -s --size=MB           max size of output file in MBs
>> after which logging will be stopped\n"
>> +        "  -d --discard           discard the old/boot-time logs
>> before entering into the capture loop\n";
>> +
>> +    igt_simple_init_parse_opts(&argc, argv, "v:o:b:t:p:s:d",
>> long_options,
>> +                   help, parse_options, NULL);
>> +}
>> +
>> +int main(int argc, char **argv)
>> +{
>> +    struct pollfd relay_poll_fd;
>> +    int nfds;
>> +    int ret;
>> +
>> +    process_command_line(argc, argv);
>> +
>> +    init_main_thread();
>> +
>> +    /* Use a separate thread for flushing the logs to a file on disk.
>> +     * Main thread will buffer the data from relay file in its pool of
>> +     * buffers and other thread will flush the data to disk in
>> background.
>> +     * This is needed, albeit by default data is written out to disk in
>> +     * async mode, as when there are too many dirty pages in the RAM,
>> +     * (/proc/sys/vm/dirty_ratio), kernel starts blocking the processes
>> +     * doing the file writes.
>> +     */
>> +    init_flusher_thread();
>> +
>> +    relay_poll_fd.fd = relay_fd;
>> +    relay_poll_fd.events = POLLIN;
>> +    relay_poll_fd.revents = 0;
>> +
>> +    nfds = 1; /* only one fd to poll */
>> +
>> +    alarm(test_duration); /* Start the alarm */
>> +
>> +    do {
>> +        /* Wait/poll for the new data to be available, relay doesn't
>> +         * provide a blocking read.
>> +         * On older kernels need to do polling with a timeout instead of
>> +         * indefinite wait to avoid relying on relay for the wakeup, as
>> +         * relay used to do the wakeup in a deferred manner on jiffies
>> +         * granularity by scheduling a timer and moreover that timer was
>> +         * re-scheduled on every newly produced buffer and so was pushed
>> +         * out if there were multiple flush interrupts in a very quick
>> +         * succession (less than a jiffy gap between 2 flush interrupts)
>> +         * causing relay to run out of sub buffers to store new logs.
>> +         */
>> +        ret = poll(&relay_poll_fd, nfds, poll_timeout);
>> +        if (ret < 0) {
>> +            if (errno == EINTR)
>> +                break;
>> +            igt_assert_f(0, "poll call failed\n");
>> +        }
>> +
>> +        /* No data available yet, poll again, hopefully new data is
>> round the corner */
>> +        if (!relay_poll_fd.revents)
>> +            continue;
>> +
>> +        pull_data();
>> +    } while (!stop_logging);
>> +
>> +    /* Pause logging on the GuC side */
>> +    guc_log_control(false);
>> +
>> +    /* Signal flusher thread to make an exit */
>> +    capturing_stopped = 1;
>> +    pthread_cond_signal(&underflow_cond);
>> +    pthread_join(flush_thread, NULL);
>> +
>> +    pull_leftover_data();
>> +    igt_info("total bytes written %" PRIu64 "\n", total_bytes_written);
>> +
>> +    free(read_buffer);
>> +    close(relay_fd);
>> +    close(outfile_fd);
>> +    igt_exit();
>> +}
>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] tools/intel_guc_logger: Utility for capturing GuC firmware logs in a file
  2016-10-10 14:31                         ` Goel, Akash
@ 2016-10-12 13:32                           ` Tvrtko Ursulin
  2016-10-25  9:01                             ` Petri Latvala
  0 siblings, 1 reply; 17+ messages in thread
From: Tvrtko Ursulin @ 2016-10-12 13:32 UTC (permalink / raw)
  To: Goel, Akash, intel-gfx


On 10/10/2016 15:31, Goel, Akash wrote:
>
>
> On 10/10/2016 7:22 PM, Tvrtko Ursulin wrote:
>>
>> On 10/10/2016 11:59, akash.goel@intel.com wrote:
>>> From: Akash Goel <akash.goel@intel.com>
>>>
>>> This patch provides a test utility which helps capture GuC firmware
>>> logs and
>>> then dump them to file.
>>> The logs are pulled from a debugfs file
>>> '/sys/kernel/debug/dri/guc_log' and
>>> by default stored into a file 'guc_log_dump.dat'. The name, 
>>> including the
>>> location, of the output file can be changed through a command line
>>> argument.
>>>
>>> The utility goes into an infinite loop where it waits for the arrival
>>> of new
>>> logs and as soon as new set of logs are produced it captures them in
>>> its local
>>> buffer which is then flushed out to the file on disk.
>>> Any time when logging needs to be ended, User can stop this utility
>>> (CTRL+C).
>>>
>>> Before entering into a loop, it first discards whatever logs are
>>> present in
>>> the debugfs file.
>>> This way User can first launch this utility and then start a
>>> workload/activity
>>> for which GuC firmware logs are to be actually captured and keep
>>> running the
>>> utility for as long as its needed, like once the workload is over this
>>> utility
>>> can be forcefully stopped.
>>>
>>> If the logging wasn't enabled on GuC side by the Driver at boot time,
>>> utility
>>> will first enable the logging and later on when it is stopped (CTRL+C)
>>> it will
>>> also pause the logging on GuC side.
>>>
>>> v2:
>>> - Use combination of alarm system call & SIGALRM signal to run the
>>> utility
>>>    for required duration. (Tvrtko)
>>> - Fix inconsistencies, do minor cleanup and refactoring. (Tvrtko)
>>>
>>> v3:
>>> - Fix discrepancy for the output file command line option and update 
>>> the
>>>    Usage/help string.
>>>
>>> v4:
>>> - Update the exit condition for flusher thread, now will exit only 
>>> after
>>>    the capture loop is over and not when the flag to stop logging is 
>>> set.
>>>    This handles a corner case, due to which the dump of last captured
>>> buffer
>>>    was getting missed.
>>> - Add a newline character at the end of assert messages.
>>> - Avoid the assert for the case, which occurs very rarely, when there
>>> are no
>>>    bytes read from the relay file.
>>>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
>>> Signed-off-by: Akash Goel <akash.goel@intel.com>
>>> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> (v3)
>>> ---
>>>   tools/Makefile.sources   |   1 +
>>>   tools/intel_guc_logger.c | 438
>>> +++++++++++++++++++++++++++++++++++++++++++++++
>>>   2 files changed, 439 insertions(+)
>>>   create mode 100644 tools/intel_guc_logger.c
>>>
>>> diff --git a/tools/Makefile.sources b/tools/Makefile.sources
>>> index 2bb6c8e..be58871 100644
>>> --- a/tools/Makefile.sources
>>> +++ b/tools/Makefile.sources
>>> @@ -19,6 +19,7 @@ tools_prog_lists =        \
>>>       intel_gpu_time        \
>>>       intel_gpu_top        \
>>>       intel_gtt        \
>>> +    intel_guc_logger        \
>>>       intel_infoframes    \
>>>       intel_l3_parity        \
>>>       intel_lid        \
>>> diff --git a/tools/intel_guc_logger.c b/tools/intel_guc_logger.c
>>> new file mode 100644
>>> index 0000000..159a54e
>>> --- /dev/null
>>> +++ b/tools/intel_guc_logger.c
>>> @@ -0,0 +1,438 @@
>>> +
>>> +#define _GNU_SOURCE  /* For using O_DIRECT */
>>> +#include <inttypes.h>
>>> +#include <stdio.h>
>>> +#include <fcntl.h>
>>> +#include <sched.h>
>>> +#include <sys/ioctl.h>
>>> +#include <string.h>
>>> +#include <errno.h>
>>> +#include <signal.h>
>>> +#include <stdlib.h>
>>> +#include <unistd.h>
>>> +#include <sys/poll.h>
>>> +#include <sys/wait.h>
>>> +#include <sys/types.h>
>>> +#include <sys/syscall.h>
>>> +#include <sys/mman.h>
>>> +#include <assert.h>
>>> +#include <pthread.h>
>>> +
>>> +#include "igt.h"
>>> +
>>> +#define MB(x) ((uint64_t)(x) * 1024 * 1024)
>>> +#ifndef PAGE_SIZE
>>> +  #define PAGE_SIZE 4096
>>> +#endif
>>> +/* Currently the size of GuC log buffer is 19 pages & so is the size
>>> of relay
>>> + * subbuffer. If the size changes in future, then this define also
>>> needs to be
>>> + * updated accordingly.
>>> + */
>>> +#define SUBBUF_SIZE (19*PAGE_SIZE)
>>> +/* Need large buffering from logger side to hide the DISK IO latency,
>>> Driver
>>> + * can only store 8 snapshots of GuC log buffer in relay.
>>> + */
>>> +#define NUM_SUBBUFS 100
>>> +
>>> +#define RELAY_FILE_NAME  "guc_log"
>>> +#define DEFAULT_OUTPUT_FILE_NAME  "guc_log_dump.dat"
>>> +#define CONTROL_FILE_NAME "i915_guc_log_control"
>>> +
>>> +char *read_buffer;
>>> +char *out_filename;
>>> +int poll_timeout = 2; /* by default 2ms timeout */
>>> +pthread_mutex_t mutex;
>>> +pthread_t flush_thread;
>>> +int verbosity_level = 3; /* by default capture logs at max 
>>> verbosity */
>>> +uint32_t produced, consumed;
>>> +uint64_t total_bytes_written;
>>> +int num_buffers = NUM_SUBBUFS;
>>> +int relay_fd, outfile_fd = -1;
>>> +uint32_t test_duration, max_filesize;
>>> +pthread_cond_t underflow_cond, overflow_cond;
>>> +bool stop_logging, discard_oldlogs, capturing_stopped;
>>> +
>>> +static void guc_log_control(bool enable_logging)
>>> +{
>>> +    int control_fd;
>>> +    char data[19];
>>> +    uint64_t val;
>>> +    int ret;
>>> +
>>> +    control_fd = igt_debugfs_open(CONTROL_FILE_NAME, O_WRONLY);
>>> +    igt_assert_f(control_fd >= 0, "couldn't open the guc log control
>>> file\n");
>>> +
>>> +    val = enable_logging ? ((verbosity_level << 4) | 0x1) : 0;
>>> +
>>> +    ret = snprintf(data, sizeof(data), "0x%" PRIx64, val);
>>> +    igt_assert(ret > 2 && ret < sizeof(data));
>>> +
>>> +    ret = write(control_fd, data, ret);
>>> +    igt_assert_f(ret > 0, "couldn't write to the log control file\n");
>>> +
>>> +    close(control_fd);
>>> +}
>>> +
>>> +static void int_sig_handler(int sig)
>>> +{
>>> +    igt_info("received signal %d\n", sig);
>>> +
>>> +    stop_logging = true;
>>> +}
>>> +
>>> +static void pull_leftover_data(void)
>>> +{
>>> +    unsigned int bytes_read = 0;
>>> +    int ret;
>>> +
>>> +    do {
>>> +        /* Read the logs from relay buffer */
>>> +        ret = read(relay_fd, read_buffer, SUBBUF_SIZE);
>>> +        if (!ret)
>>> +            break;
>>> +
>>> +        igt_assert_f(ret > 0, "failed to read from the guc log 
>>> file\n");
>>> +        igt_assert_f(ret == SUBBUF_SIZE, "invalid read from relay
>>> file\n");
>>> +
>>> +        bytes_read += ret;
>>> +
>>> +        if (outfile_fd >= 0) {
>>> +            ret = write(outfile_fd, read_buffer, SUBBUF_SIZE);
>>> +            igt_assert_f(ret == SUBBUF_SIZE, "couldn't dump the logs
>>> in a file\n");
>>> +            total_bytes_written += ret;
>>> +        }
>>> +    } while(1);
>>> +
>>> +    igt_debug("%u bytes flushed\n", bytes_read);
>>> +}
>>> +
>>> +static int num_filled_bufs(void)
>>> +{
>>> +    return (produced - consumed);
>>> +}
>>> +
>>> +static void pull_data(void)
>>> +{
>>> +    char *ptr;
>>> +    int ret;
>>> +
>>> +    pthread_mutex_lock(&mutex);
>>> +    while (num_filled_bufs() >= num_buffers) {
>>> +        igt_debug("overflow, will wait, produced %u, consumed %u\n",
>>> produced, consumed);
>>> +        /* Stall the main thread in case of overflow, as there are no
>>> +         * buffers available to store the new logs, otherwise there
>>> +         * could be corruption if both threads work on the same 
>>> buffer.
>>> +         */
>>> +        pthread_cond_wait(&overflow_cond, &mutex);
>>> +    };
>>> +    pthread_mutex_unlock(&mutex);
>>> +
>>> +    ptr = read_buffer + (produced % num_buffers) * SUBBUF_SIZE;
>>> +
>>> +    /* Read the logs from relay buffer */
>>> +    ret = read(relay_fd, ptr, SUBBUF_SIZE);
>>> +    igt_assert_f(ret >= 0, "failed to read from the guc log file\n");
>>> +    igt_assert_f(!ret || ret == SUBBUF_SIZE, "invalid read from relay
>>> file\n");
>>> +
>>> +    if (ret) {
>>> +        pthread_mutex_lock(&mutex);
>>> +        produced++;
>>> +        pthread_cond_signal(&underflow_cond);
>>> +        pthread_mutex_unlock(&mutex);
>>> +    } else {
>>> +        /* Occasionally (very rare) read from the relay file 
>>> returns no
>>> +         * data, albeit the polling done prior to read call indicated
>>> +         * availability of data.
>>> +         */
>>> +        igt_debug("no data read from the relay file\n");
>>
>> This worries me. Relayfs kernel side is quite simple which would suggest
>> there is maybe a bug in the logger? And it is very important that we
>> trust logger correctly capturing data. Because people will spend a lot
>> of time debugging issues from the logs and it would be bad to have to
>> chase irregularities caused by a problem in the logger.
>>
> Actually I see this issue rarely and only after logging some GBs of 
> data. And most likely it won't have any bearing on the correctness of 
> captured data, the missed snapshot shall be pulled in the next 
> iteration of capture loop. Also no-overwrite mode of relay is being used.
>
> As per my hunch, it can't be ruled out that this is not a relayfs issue.
>
>> Could you please try and identify why exactly is this happening?
> Will try to investigate both from relayfs side as well as logger side, 
> but it would take some time.
> This may not be considered a blocker.
>

As you reported off list, after some deep investigation there seems to 
be no potential for lost packets. "Empty wakeup" is just a consequence 
of unsynchronized nature of relayfs.

In the light of that I am happy with the logger, so:

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] tools/intel_guc_logger: Utility for capturing GuC firmware logs in a file
  2016-10-12 13:32                           ` Tvrtko Ursulin
@ 2016-10-25  9:01                             ` Petri Latvala
  0 siblings, 0 replies; 17+ messages in thread
From: Petri Latvala @ 2016-10-25  9:01 UTC (permalink / raw)
  To: intel-gfx; +Cc: Goel, Akash

This patch is now pushed, thanks.


--
Petri Latvala


On Wed, Oct 12, 2016 at 02:32:17PM +0100, Tvrtko Ursulin wrote:
> 
> On 10/10/2016 15:31, Goel, Akash wrote:
> > 
> > 
> > On 10/10/2016 7:22 PM, Tvrtko Ursulin wrote:
> > > 
> > > On 10/10/2016 11:59, akash.goel@intel.com wrote:
> > > > From: Akash Goel <akash.goel@intel.com>
> > > > 
> > > > This patch provides a test utility which helps capture GuC firmware
> > > > logs and
> > > > then dump them to file.
> > > > The logs are pulled from a debugfs file
> > > > '/sys/kernel/debug/dri/guc_log' and
> > > > by default stored into a file 'guc_log_dump.dat'. The name,
> > > > including the
> > > > location, of the output file can be changed through a command line
> > > > argument.
> > > > 
> > > > The utility goes into an infinite loop where it waits for the arrival
> > > > of new
> > > > logs and as soon as new set of logs are produced it captures them in
> > > > its local
> > > > buffer which is then flushed out to the file on disk.
> > > > Any time when logging needs to be ended, User can stop this utility
> > > > (CTRL+C).
> > > > 
> > > > Before entering into a loop, it first discards whatever logs are
> > > > present in
> > > > the debugfs file.
> > > > This way User can first launch this utility and then start a
> > > > workload/activity
> > > > for which GuC firmware logs are to be actually captured and keep
> > > > running the
> > > > utility for as long as its needed, like once the workload is over this
> > > > utility
> > > > can be forcefully stopped.
> > > > 
> > > > If the logging wasn't enabled on GuC side by the Driver at boot time,
> > > > utility
> > > > will first enable the logging and later on when it is stopped (CTRL+C)
> > > > it will
> > > > also pause the logging on GuC side.
> > > > 
> > > > v2:
> > > > - Use combination of alarm system call & SIGALRM signal to run the
> > > > utility
> > > >    for required duration. (Tvrtko)
> > > > - Fix inconsistencies, do minor cleanup and refactoring. (Tvrtko)
> > > > 
> > > > v3:
> > > > - Fix discrepancy for the output file command line option and
> > > > update the
> > > >    Usage/help string.
> > > > 
> > > > v4:
> > > > - Update the exit condition for flusher thread, now will exit
> > > > only after
> > > >    the capture loop is over and not when the flag to stop
> > > > logging is set.
> > > >    This handles a corner case, due to which the dump of last captured
> > > > buffer
> > > >    was getting missed.
> > > > - Add a newline character at the end of assert messages.
> > > > - Avoid the assert for the case, which occurs very rarely, when there
> > > > are no
> > > >    bytes read from the relay file.
> > > > 
> > > > Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> > > > Signed-off-by: Akash Goel <akash.goel@intel.com>
> > > > Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> (v3)
> > > > ---
> > > >   tools/Makefile.sources   |   1 +
> > > >   tools/intel_guc_logger.c | 438
> > > > +++++++++++++++++++++++++++++++++++++++++++++++
> > > >   2 files changed, 439 insertions(+)
> > > >   create mode 100644 tools/intel_guc_logger.c
> > > > 
> > > > diff --git a/tools/Makefile.sources b/tools/Makefile.sources
> > > > index 2bb6c8e..be58871 100644
> > > > --- a/tools/Makefile.sources
> > > > +++ b/tools/Makefile.sources
> > > > @@ -19,6 +19,7 @@ tools_prog_lists =        \
> > > >       intel_gpu_time        \
> > > >       intel_gpu_top        \
> > > >       intel_gtt        \
> > > > +    intel_guc_logger        \
> > > >       intel_infoframes    \
> > > >       intel_l3_parity        \
> > > >       intel_lid        \
> > > > diff --git a/tools/intel_guc_logger.c b/tools/intel_guc_logger.c
> > > > new file mode 100644
> > > > index 0000000..159a54e
> > > > --- /dev/null
> > > > +++ b/tools/intel_guc_logger.c
> > > > @@ -0,0 +1,438 @@
> > > > +
> > > > +#define _GNU_SOURCE  /* For using O_DIRECT */
> > > > +#include <inttypes.h>
> > > > +#include <stdio.h>
> > > > +#include <fcntl.h>
> > > > +#include <sched.h>
> > > > +#include <sys/ioctl.h>
> > > > +#include <string.h>
> > > > +#include <errno.h>
> > > > +#include <signal.h>
> > > > +#include <stdlib.h>
> > > > +#include <unistd.h>
> > > > +#include <sys/poll.h>
> > > > +#include <sys/wait.h>
> > > > +#include <sys/types.h>
> > > > +#include <sys/syscall.h>
> > > > +#include <sys/mman.h>
> > > > +#include <assert.h>
> > > > +#include <pthread.h>
> > > > +
> > > > +#include "igt.h"
> > > > +
> > > > +#define MB(x) ((uint64_t)(x) * 1024 * 1024)
> > > > +#ifndef PAGE_SIZE
> > > > +  #define PAGE_SIZE 4096
> > > > +#endif
> > > > +/* Currently the size of GuC log buffer is 19 pages & so is the size
> > > > of relay
> > > > + * subbuffer. If the size changes in future, then this define also
> > > > needs to be
> > > > + * updated accordingly.
> > > > + */
> > > > +#define SUBBUF_SIZE (19*PAGE_SIZE)
> > > > +/* Need large buffering from logger side to hide the DISK IO latency,
> > > > Driver
> > > > + * can only store 8 snapshots of GuC log buffer in relay.
> > > > + */
> > > > +#define NUM_SUBBUFS 100
> > > > +
> > > > +#define RELAY_FILE_NAME  "guc_log"
> > > > +#define DEFAULT_OUTPUT_FILE_NAME  "guc_log_dump.dat"
> > > > +#define CONTROL_FILE_NAME "i915_guc_log_control"
> > > > +
> > > > +char *read_buffer;
> > > > +char *out_filename;
> > > > +int poll_timeout = 2; /* by default 2ms timeout */
> > > > +pthread_mutex_t mutex;
> > > > +pthread_t flush_thread;
> > > > +int verbosity_level = 3; /* by default capture logs at max
> > > > verbosity */
> > > > +uint32_t produced, consumed;
> > > > +uint64_t total_bytes_written;
> > > > +int num_buffers = NUM_SUBBUFS;
> > > > +int relay_fd, outfile_fd = -1;
> > > > +uint32_t test_duration, max_filesize;
> > > > +pthread_cond_t underflow_cond, overflow_cond;
> > > > +bool stop_logging, discard_oldlogs, capturing_stopped;
> > > > +
> > > > +static void guc_log_control(bool enable_logging)
> > > > +{
> > > > +    int control_fd;
> > > > +    char data[19];
> > > > +    uint64_t val;
> > > > +    int ret;
> > > > +
> > > > +    control_fd = igt_debugfs_open(CONTROL_FILE_NAME, O_WRONLY);
> > > > +    igt_assert_f(control_fd >= 0, "couldn't open the guc log control
> > > > file\n");
> > > > +
> > > > +    val = enable_logging ? ((verbosity_level << 4) | 0x1) : 0;
> > > > +
> > > > +    ret = snprintf(data, sizeof(data), "0x%" PRIx64, val);
> > > > +    igt_assert(ret > 2 && ret < sizeof(data));
> > > > +
> > > > +    ret = write(control_fd, data, ret);
> > > > +    igt_assert_f(ret > 0, "couldn't write to the log control file\n");
> > > > +
> > > > +    close(control_fd);
> > > > +}
> > > > +
> > > > +static void int_sig_handler(int sig)
> > > > +{
> > > > +    igt_info("received signal %d\n", sig);
> > > > +
> > > > +    stop_logging = true;
> > > > +}
> > > > +
> > > > +static void pull_leftover_data(void)
> > > > +{
> > > > +    unsigned int bytes_read = 0;
> > > > +    int ret;
> > > > +
> > > > +    do {
> > > > +        /* Read the logs from relay buffer */
> > > > +        ret = read(relay_fd, read_buffer, SUBBUF_SIZE);
> > > > +        if (!ret)
> > > > +            break;
> > > > +
> > > > +        igt_assert_f(ret > 0, "failed to read from the guc log
> > > > file\n");
> > > > +        igt_assert_f(ret == SUBBUF_SIZE, "invalid read from relay
> > > > file\n");
> > > > +
> > > > +        bytes_read += ret;
> > > > +
> > > > +        if (outfile_fd >= 0) {
> > > > +            ret = write(outfile_fd, read_buffer, SUBBUF_SIZE);
> > > > +            igt_assert_f(ret == SUBBUF_SIZE, "couldn't dump the logs
> > > > in a file\n");
> > > > +            total_bytes_written += ret;
> > > > +        }
> > > > +    } while(1);
> > > > +
> > > > +    igt_debug("%u bytes flushed\n", bytes_read);
> > > > +}
> > > > +
> > > > +static int num_filled_bufs(void)
> > > > +{
> > > > +    return (produced - consumed);
> > > > +}
> > > > +
> > > > +static void pull_data(void)
> > > > +{
> > > > +    char *ptr;
> > > > +    int ret;
> > > > +
> > > > +    pthread_mutex_lock(&mutex);
> > > > +    while (num_filled_bufs() >= num_buffers) {
> > > > +        igt_debug("overflow, will wait, produced %u, consumed %u\n",
> > > > produced, consumed);
> > > > +        /* Stall the main thread in case of overflow, as there are no
> > > > +         * buffers available to store the new logs, otherwise there
> > > > +         * could be corruption if both threads work on the same
> > > > buffer.
> > > > +         */
> > > > +        pthread_cond_wait(&overflow_cond, &mutex);
> > > > +    };
> > > > +    pthread_mutex_unlock(&mutex);
> > > > +
> > > > +    ptr = read_buffer + (produced % num_buffers) * SUBBUF_SIZE;
> > > > +
> > > > +    /* Read the logs from relay buffer */
> > > > +    ret = read(relay_fd, ptr, SUBBUF_SIZE);
> > > > +    igt_assert_f(ret >= 0, "failed to read from the guc log file\n");
> > > > +    igt_assert_f(!ret || ret == SUBBUF_SIZE, "invalid read from relay
> > > > file\n");
> > > > +
> > > > +    if (ret) {
> > > > +        pthread_mutex_lock(&mutex);
> > > > +        produced++;
> > > > +        pthread_cond_signal(&underflow_cond);
> > > > +        pthread_mutex_unlock(&mutex);
> > > > +    } else {
> > > > +        /* Occasionally (very rare) read from the relay file
> > > > returns no
> > > > +         * data, albeit the polling done prior to read call indicated
> > > > +         * availability of data.
> > > > +         */
> > > > +        igt_debug("no data read from the relay file\n");
> > > 
> > > This worries me. Relayfs kernel side is quite simple which would suggest
> > > there is maybe a bug in the logger? And it is very important that we
> > > trust logger correctly capturing data. Because people will spend a lot
> > > of time debugging issues from the logs and it would be bad to have to
> > > chase irregularities caused by a problem in the logger.
> > > 
> > Actually I see this issue rarely and only after logging some GBs of
> > data. And most likely it won't have any bearing on the correctness of
> > captured data, the missed snapshot shall be pulled in the next iteration
> > of capture loop. Also no-overwrite mode of relay is being used.
> > 
> > As per my hunch, it can't be ruled out that this is not a relayfs issue.
> > 
> > > Could you please try and identify why exactly is this happening?
> > Will try to investigate both from relayfs side as well as logger side,
> > but it would take some time.
> > This may not be considered a blocker.
> > 
> 
> As you reported off list, after some deep investigation there seems to be no
> potential for lost packets. "Empty wakeup" is just a consequence of
> unsynchronized nature of relayfs.
> 
> In the light of that I am happy with the logger, so:
> 
> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Regards,
> 
> Tvrtko
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2016-10-25  9:02 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-09-06 10:43 [PATCH] tools/intel_guc_logger: Utility for capturing GuC firmware logs in a file akash.goel
2016-09-06 13:17 ` Tvrtko Ursulin
2016-09-06 15:33   ` Goel, Akash
2016-09-06 15:52     ` Tvrtko Ursulin
2016-09-07  8:10       ` Goel, Akash
2016-09-07  8:44         ` Chris Wilson
2016-09-07  9:37           ` Tvrtko Ursulin
2016-09-07  9:55             ` Chris Wilson
2016-09-07 15:27               ` [PATCH v2] " akash.goel
2016-09-07 15:39                 ` Tvrtko Ursulin
2016-09-08 10:44                   ` [PATCH v3] " akash.goel
2016-10-10 10:59                     ` [PATCH v4] " akash.goel
2016-10-10 13:52                       ` Tvrtko Ursulin
2016-10-10 14:31                         ` Goel, Akash
2016-10-12 13:32                           ` Tvrtko Ursulin
2016-10-25  9:01                             ` Petri Latvala
2016-09-07 10:45             ` [PATCH] " Goel, Akash

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.