From: Adrian Hunter <adrian.hunter@intel.com>
To: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>, Namhyung Kim <namhyung@kernel.org>,
	Ian Rogers <irogers@google.com>, Andi Kleen <ak@linux.intel.com>,
	linux-kernel@vger.kernel.org, kvm@vger.kernel.org
Subject: [PATCH 24/35] perf inject: Add support for injecting guest sideband events
Date: Mon, 11 Jul 2022 12:32:07 +0300
Message-ID: <20220711093218.10967-25-adrian.hunter@intel.com>
In-Reply-To: <20220711093218.10967-1-adrian.hunter@intel.com>

Inject events from a perf.data file recorded in a virtual machine into
a perf.data file recorded on the host at the same time.

Only sideband events (e.g. mmap, comm, fork, exit) and build IDs are
injected.  Additionally, the guest kcore_dir is copied into the output
directory as kcore_dir__<machine PID> (i.e. with the machine PID appended).

This is non-trivial because:
 o It is not possible to process 2 sessions simultaneously, so guest
 events are first written to a temporary file.
 o To avoid conflicts, guest sample IDs are replaced with new, unused sample
 IDs.
 o Each guest event's CPU is changed to the host CPU, because that is more
 useful for reporting and analysis.
 o Each sample ID is mapped to a machine PID, which is recorded together
 with the VCPU in the id index. This is important to allow guest events to
 be related to the guest machine and VCPU.
 o Timestamps must be converted (a small worked example follows this list).
 o Events are inserted to obey finished-round ordering.
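
A small worked example of the timestamp conversion (all values below are
made up for illustration, not taken from a real system): with
--guest-data=guest.perf.data,12345,0x1000,2.0 a guest perf timestamp is
first converted to a guest TSC value (if the guest supplies TSC conversion
parameters), then

    host_tsc = (guest_tsc - 0x1000) / 2.0

and the result is converted back to a host perf timestamp using the host's
TSC conversion parameters. That order of operations matches the x86 TSC
Offset and Multiplier values mentioned in the documentation below.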

The anticipated use-case is:
 - start recording sideband events in a guest machine
 - start recording an AUX area trace on the host that can also trace the
 guest (e.g. Intel PT)
 - run test case on the guest
 - stop recording on the host
 - stop recording on the guest
 - copy the guest perf.data file to the host
 - inject the guest perf.data file sideband events into the host perf.data
 file using perf inject (an example command sequence is sketched below)
 - the resulting perf.data file can now be used
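
For illustration, the steps above might look roughly like the following
(file names, the QEMU PID and the Intel PT event syntax are placeholders,
and the TSC offset/scale arguments are optional and system-specific):

  # In the guest
  perf record -o guest.perf.data -a --sample-identifier --buildid-all --switch-events &
  # On the host (an AUX area trace, e.g. Intel PT, set up to trace the guest too)
  perf record -o host.perf.data -a -e intel_pt// &
  # Run the test case in the guest, stop both recordings, copy
  # guest.perf.data to the host, then:
  perf inject -i host.perf.data -o injected.perf.data \
          --guest-data=guest.perf.data,<QEMU PID>[,<TSC offset>[,<TSC multiplier>]]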

Subsequent patches provide Intel PT support for this.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
---
 tools/perf/Documentation/perf-inject.txt |   17 +
 tools/perf/builtin-inject.c              | 1043 +++++++++++++++++++++-
 2 files changed, 1059 insertions(+), 1 deletion(-)

diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt
index 0570a1ccd344..646aa31586ed 100644
--- a/tools/perf/Documentation/perf-inject.txt
+++ b/tools/perf/Documentation/perf-inject.txt
@@ -85,6 +85,23 @@ include::itrace.txt[]
 	without updating it. Currently this option is supported only by
 	Intel PT, refer linkperf:perf-intel-pt[1]
 
+--guest-data=<path>,<pid>[,<time offset>[,<time scale>]]::
+	Insert events from a perf.data file recorded in a virtual machine at
+	the same time as the input perf.data file was recorded on the host.
+	The Process ID (PID) of the QEMU hypervisor process must be provided,
+	and the time offset and time scale (multiplier) will likely be needed
+	to convert guest timestamps into host timestamps. For example, for
+	x86 the TSC Offset and Multiplier could be provided for a virtual machine
+	using the guest Linux kernel command line option no-kvmclock.
+	Currently only mmap, mmap2, comm, task, context_switch, ksymbol,
+	and text_poke events are inserted, as well as build ID information.
+	The QEMU option -name debug-threads=on is needed so that thread names
+	can be used to determine which thread is running which VCPU. Note
+	libvirt seems to use this by default.
+	When using perf record in the guest, option --sample-identifier
+	should be used, and also --buildid-all and --switch-events may be
+	useful.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1],
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index c800911f68e7..fd4547bb75f7 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -26,6 +26,7 @@
 #include "util/thread.h"
 #include "util/namespaces.h"
 #include "util/util.h"
+#include "util/tsc.h"
 
 #include <internal/lib.h>
 
@@ -35,8 +36,70 @@
 
 #include <linux/list.h>
 #include <linux/string.h>
+#include <linux/zalloc.h>
+#include <linux/hash.h>
 #include <errno.h>
 #include <signal.h>
+#include <inttypes.h>
+
+struct guest_event {
+	struct perf_sample		sample;
+	union perf_event		*event;
+	char				event_buf[PERF_SAMPLE_MAX_SIZE];
+};
+
+struct guest_id {
+	/* hlist_node must be first, see free_hlist() */
+	struct hlist_node		node;
+	u64				id;
+	u64				host_id;
+	u32				vcpu;
+};
+
+struct guest_tid {
+	/* hlist_node must be first, see free_hlist() */
+	struct hlist_node		node;
+	/* Thread ID of QEMU thread */
+	u32				tid;
+	u32				vcpu;
+};
+
+struct guest_vcpu {
+	/* Current host CPU */
+	u32				cpu;
+	/* Thread ID of QEMU thread */
+	u32				tid;
+};
+
+struct guest_session {
+	char				*perf_data_file;
+	u32				machine_pid;
+	u64				time_offset;
+	double				time_scale;
+	struct perf_tool		tool;
+	struct perf_data		data;
+	struct perf_session		*session;
+	char				*tmp_file_name;
+	int				tmp_fd;
+	struct perf_tsc_conversion	host_tc;
+	struct perf_tsc_conversion	guest_tc;
+	bool				copy_kcore_dir;
+	bool				have_tc;
+	bool				fetched;
+	bool				ready;
+	u16				dflt_id_hdr_size;
+	u64				dflt_id;
+	u64				highest_id;
+	/* Array of guest_vcpu */
+	struct guest_vcpu		*vcpu;
+	size_t				vcpu_cnt;
+	/* Hash table for guest_id */
+	struct hlist_head		heads[PERF_EVLIST__HLIST_SIZE];
+	/* Hash table for guest_tid */
+	struct hlist_head		tids[PERF_EVLIST__HLIST_SIZE];
+	/* Place to stash next guest event */
+	struct guest_event		ev;
+};
 
 struct perf_inject {
 	struct perf_tool	tool;
@@ -59,6 +122,7 @@ struct perf_inject {
 	struct itrace_synth_opts itrace_synth_opts;
 	char			event_copy[PERF_SAMPLE_MAX_SIZE];
 	struct perf_file_section secs[HEADER_FEAT_BITS];
+	struct guest_session	guest_session;
 };
 
 struct event_entry {
@@ -698,6 +762,841 @@ static int perf_inject__sched_stat(struct perf_tool *tool,
 	return perf_event__repipe(tool, event_sw, &sample_sw, machine);
 }
 
+static struct guest_vcpu *guest_session__vcpu(struct guest_session *gs, u32 vcpu)
+{
+	if (realloc_array_as_needed(gs->vcpu, gs->vcpu_cnt, vcpu, NULL))
+		return NULL;
+	return &gs->vcpu[vcpu];
+}
+
+static int guest_session__output_bytes(struct guest_session *gs, void *buf, size_t sz)
+{
+	ssize_t ret = writen(gs->tmp_fd, buf, sz);
+
+	return ret < 0 ? ret : 0;
+}
+
+static int guest_session__repipe(struct perf_tool *tool,
+				 union perf_event *event,
+				 struct perf_sample *sample __maybe_unused,
+				 struct machine *machine __maybe_unused)
+{
+	struct guest_session *gs = container_of(tool, struct guest_session, tool);
+
+	return guest_session__output_bytes(gs, event, event->header.size);
+}
+
+static int guest_session__map_tid(struct guest_session *gs, u32 tid, u32 vcpu)
+{
+	struct guest_tid *guest_tid = zalloc(sizeof(*guest_tid));
+	int hash;
+
+	if (!guest_tid)
+		return -ENOMEM;
+
+	guest_tid->tid = tid;
+	guest_tid->vcpu = vcpu;
+	hash = hash_32(guest_tid->tid, PERF_EVLIST__HLIST_BITS);
+	hlist_add_head(&guest_tid->node, &gs->tids[hash]);
+
+	return 0;
+}
+
+static int host_peek_vm_comms_cb(struct perf_session *session __maybe_unused,
+				 union perf_event *event,
+				 u64 offset __maybe_unused, void *data)
+{
+	struct guest_session *gs = data;
+	unsigned int vcpu;
+	struct guest_vcpu *guest_vcpu;
+	int ret;
+
+	if (event->header.type != PERF_RECORD_COMM ||
+	    event->comm.pid != gs->machine_pid)
+		return 0;
+
+	/*
+	 * QEMU option -name debug-threads=on causes thread names to be
+	 * formatted as below, although it is not an ABI. Also libvirt seems to
+	 * use this by default. Here we rely on it to tell us which thread is
+	 * which VCPU.
+	 */
+	ret = sscanf(event->comm.comm, "CPU %u/KVM", &vcpu);
+	if (ret <= 0)
+		return ret;
+	pr_debug("Found VCPU: tid %u comm %s vcpu %u\n",
+		 event->comm.tid, event->comm.comm, vcpu);
+	if (vcpu > INT_MAX) {
+		pr_err("Invalid VCPU %u\n", vcpu);
+		return -EINVAL;
+	}
+	guest_vcpu = guest_session__vcpu(gs, vcpu);
+	if (!guest_vcpu)
+		return -ENOMEM;
+	if (guest_vcpu->tid && guest_vcpu->tid != event->comm.tid) {
+		pr_err("Fatal error: Two threads found with the same VCPU\n");
+		return -EINVAL;
+	}
+	guest_vcpu->tid = event->comm.tid;
+
+	return guest_session__map_tid(gs, event->comm.tid, vcpu);
+}
+
+static int host_peek_vm_comms(struct perf_session *session, struct guest_session *gs)
+{
+	return perf_session__peek_events(session, session->header.data_offset,
+					 session->header.data_size,
+					 host_peek_vm_comms_cb, gs);
+}
+
+static bool evlist__is_id_used(struct evlist *evlist, u64 id)
+{
+	return evlist__id2sid(evlist, id);
+}
+
+static u64 guest_session__allocate_new_id(struct guest_session *gs, struct evlist *host_evlist)
+{
+	do {
+		gs->highest_id += 1;
+	} while (!gs->highest_id || evlist__is_id_used(host_evlist, gs->highest_id));
+
+	return gs->highest_id;
+}
+
+static int guest_session__map_id(struct guest_session *gs, u64 id, u64 host_id, u32 vcpu)
+{
+	struct guest_id *guest_id = zalloc(sizeof(*guest_id));
+	int hash;
+
+	if (!guest_id)
+		return -ENOMEM;
+
+	guest_id->id = id;
+	guest_id->host_id = host_id;
+	guest_id->vcpu = vcpu;
+	hash = hash_64(guest_id->id, PERF_EVLIST__HLIST_BITS);
+	hlist_add_head(&guest_id->node, &gs->heads[hash]);
+
+	return 0;
+}
+
+static u64 evlist__find_highest_id(struct evlist *evlist)
+{
+	struct evsel *evsel;
+	u64 highest_id = 1;
+
+	evlist__for_each_entry(evlist, evsel) {
+		u32 j;
+
+		for (j = 0; j < evsel->core.ids; j++) {
+			u64 id = evsel->core.id[j];
+
+			if (id > highest_id)
+				highest_id = id;
+		}
+	}
+
+	return highest_id;
+}
+
+static int guest_session__map_ids(struct guest_session *gs, struct evlist *host_evlist)
+{
+	struct evlist *evlist = gs->session->evlist;
+	struct evsel *evsel;
+	int ret;
+
+	evlist__for_each_entry(evlist, evsel) {
+		u32 j;
+
+		for (j = 0; j < evsel->core.ids; j++) {
+			struct perf_sample_id *sid;
+			u64 host_id;
+			u64 id;
+
+			id = evsel->core.id[j];
+			sid = evlist__id2sid(evlist, id);
+			if (!sid || sid->cpu.cpu == -1)
+				continue;
+			host_id = guest_session__allocate_new_id(gs, host_evlist);
+			ret = guest_session__map_id(gs, id, host_id, sid->cpu.cpu);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+static struct guest_id *guest_session__lookup_id(struct guest_session *gs, u64 id)
+{
+	struct hlist_head *head;
+	struct guest_id *guest_id;
+	int hash;
+
+	hash = hash_64(id, PERF_EVLIST__HLIST_BITS);
+	head = &gs->heads[hash];
+
+	hlist_for_each_entry(guest_id, head, node)
+		if (guest_id->id == id)
+			return guest_id;
+
+	return NULL;
+}
+
+static int process_attr(struct perf_tool *tool, union perf_event *event,
+			struct perf_sample *sample __maybe_unused,
+			struct machine *machine __maybe_unused)
+{
+	struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+
+	return perf_event__process_attr(tool, event, &inject->session->evlist);
+}
+
+static int guest_session__add_attr(struct guest_session *gs, struct evsel *evsel)
+{
+	struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session);
+	struct perf_event_attr attr = evsel->core.attr;
+	u64 *id_array;
+	u32 *vcpu_array;
+	int ret = -ENOMEM;
+	u32 i;
+
+	id_array = calloc(evsel->core.ids, sizeof(*id_array));
+	if (!id_array)
+		return -ENOMEM;
+
+	vcpu_array = calloc(evsel->core.ids, sizeof(*vcpu_array));
+	if (!vcpu_array)
+		goto out;
+
+	for (i = 0; i < evsel->core.ids; i++) {
+		u64 id = evsel->core.id[i];
+		struct guest_id *guest_id = guest_session__lookup_id(gs, id);
+
+		if (!guest_id) {
+			pr_err("Failed to find guest id %"PRIu64"\n", id);
+			ret = -EINVAL;
+			goto out;
+		}
+		id_array[i] = guest_id->host_id;
+		vcpu_array[i] = guest_id->vcpu;
+	}
+
+	attr.sample_type |= PERF_SAMPLE_IDENTIFIER;
+	attr.exclude_host = 1;
+	attr.exclude_guest = 0;
+
+	ret = perf_event__synthesize_attr(&inject->tool, &attr, evsel->core.ids,
+					  id_array, process_attr);
+	if (ret)
+		pr_err("Failed to add guest attr.\n");
+
+	for (i = 0; i < evsel->core.ids; i++) {
+		struct perf_sample_id *sid;
+		u32 vcpu = vcpu_array[i];
+
+		sid = evlist__id2sid(inject->session->evlist, id_array[i]);
+		/* Guest event is per-thread from the host point of view */
+		sid->cpu.cpu = -1;
+		sid->tid = gs->vcpu[vcpu].tid;
+		sid->machine_pid = gs->machine_pid;
+		sid->vcpu.cpu = vcpu;
+	}
+out:
+	free(vcpu_array);
+	free(id_array);
+	return ret;
+}
+
+static int guest_session__add_attrs(struct guest_session *gs)
+{
+	struct evlist *evlist = gs->session->evlist;
+	struct evsel *evsel;
+	int ret;
+
+	evlist__for_each_entry(evlist, evsel) {
+		ret = guest_session__add_attr(gs, evsel);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int synthesize_id_index(struct perf_inject *inject, size_t new_cnt)
+{
+	struct perf_session *session = inject->session;
+	struct evlist *evlist = session->evlist;
+	struct machine *machine = &session->machines.host;
+	size_t from = evlist->core.nr_entries - new_cnt;
+
+	return __perf_event__synthesize_id_index(&inject->tool, perf_event__repipe,
+						 evlist, machine, from);
+}
+
+static struct guest_tid *guest_session__lookup_tid(struct guest_session *gs, u32 tid)
+{
+	struct hlist_head *head;
+	struct guest_tid *guest_tid;
+	int hash;
+
+	hash = hash_32(tid, PERF_EVLIST__HLIST_BITS);
+	head = &gs->tids[hash];
+
+	hlist_for_each_entry(guest_tid, head, node)
+		if (guest_tid->tid == tid)
+			return guest_tid;
+
+	return NULL;
+}
+
+static bool dso__is_in_kernel_space(struct dso *dso)
+{
+	if (dso__is_vdso(dso))
+		return false;
+
+	return dso__is_kcore(dso) ||
+	       dso->kernel ||
+	       is_kernel_module(dso->long_name, PERF_RECORD_MISC_CPUMODE_UNKNOWN);
+}
+
+static u64 evlist__first_id(struct evlist *evlist)
+{
+	struct evsel *evsel;
+
+	evlist__for_each_entry(evlist, evsel) {
+		if (evsel->core.ids)
+			return evsel->core.id[0];
+	}
+	return 0;
+}
+
+static int process_build_id(struct perf_tool *tool,
+			    union perf_event *event,
+			    struct perf_sample *sample __maybe_unused,
+			    struct machine *machine __maybe_unused)
+{
+	struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+
+	return perf_event__process_build_id(inject->session, event);
+}
+
+static int synthesize_build_id(struct perf_inject *inject, struct dso *dso, pid_t machine_pid)
+{
+	struct machine *machine = perf_session__findnew_machine(inject->session, machine_pid);
+	u8 cpumode = dso__is_in_kernel_space(dso) ?
+			PERF_RECORD_MISC_GUEST_KERNEL :
+			PERF_RECORD_MISC_GUEST_USER;
+
+	if (!machine)
+		return -ENOMEM;
+
+	dso->hit = 1;
+
+	return perf_event__synthesize_build_id(&inject->tool, dso, cpumode,
+					       process_build_id, machine);
+}
+
+static int guest_session__add_build_ids(struct guest_session *gs)
+{
+	struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session);
+	struct machine *machine = &gs->session->machines.host;
+	struct dso *dso;
+	int ret;
+
+	/* Build IDs will be put in the Build ID feature section */
+	perf_header__set_feat(&inject->session->header, HEADER_BUILD_ID);
+
+	dsos__for_each_with_build_id(dso, &machine->dsos.head) {
+		ret = synthesize_build_id(inject, dso, gs->machine_pid);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int guest_session__ksymbol_event(struct perf_tool *tool,
+					union perf_event *event,
+					struct perf_sample *sample __maybe_unused,
+					struct machine *machine __maybe_unused)
+{
+	struct guest_session *gs = container_of(tool, struct guest_session, tool);
+
+	/* Only support out-of-line i.e. no BPF support */
+	if (event->ksymbol.ksym_type != PERF_RECORD_KSYMBOL_TYPE_OOL)
+		return 0;
+
+	return guest_session__output_bytes(gs, event, event->header.size);
+}
+
+static int guest_session__start(struct guest_session *gs, const char *name, bool force)
+{
+	char tmp_file_name[] = "/tmp/perf-inject-guest_session-XXXXXX";
+	struct perf_session *session;
+	int ret;
+
+	/* Only these events will be injected */
+	gs->tool.mmap		= guest_session__repipe;
+	gs->tool.mmap2		= guest_session__repipe;
+	gs->tool.comm		= guest_session__repipe;
+	gs->tool.fork		= guest_session__repipe;
+	gs->tool.exit		= guest_session__repipe;
+	gs->tool.lost		= guest_session__repipe;
+	gs->tool.context_switch	= guest_session__repipe;
+	gs->tool.ksymbol	= guest_session__ksymbol_event;
+	gs->tool.text_poke	= guest_session__repipe;
+	/*
+	 * Processing a build ID creates a struct dso with that build ID. Later,
+	 * all guest dsos are iterated and the build IDs processed into the host
+	 * session where they will be output to the Build ID feature section
+	 * when the perf.data file header is written.
+	 */
+	gs->tool.build_id	= perf_event__process_build_id;
+	/* Process the id index to know what VCPU an ID belongs to */
+	gs->tool.id_index	= perf_event__process_id_index;
+
+	gs->tool.ordered_events	= true;
+	gs->tool.ordering_requires_timestamps = true;
+
+	gs->data.path	= name;
+	gs->data.force	= force;
+	gs->data.mode	= PERF_DATA_MODE_READ;
+
+	session = perf_session__new(&gs->data, &gs->tool);
+	if (IS_ERR(session))
+		return PTR_ERR(session);
+	gs->session = session;
+
+	/*
+	 * Initial events have zero'd ID samples. Get default ID sample size
+	 * used for removing them.
+	 */
+	gs->dflt_id_hdr_size = session->machines.host.id_hdr_size;
+	/* And default ID for adding back a host-compatible ID sample */
+	gs->dflt_id = evlist__first_id(session->evlist);
+	if (!gs->dflt_id) {
+		pr_err("Guest data has no sample IDs");
+		return -EINVAL;
+	}
+
+	/* Temporary file for guest events */
+	gs->tmp_file_name = strdup(tmp_file_name);
+	if (!gs->tmp_file_name)
+		return -ENOMEM;
+	gs->tmp_fd = mkstemp(gs->tmp_file_name);
+	if (gs->tmp_fd < 0)
+		return -errno;
+
+	if (zstd_init(&gs->session->zstd_data, 0) < 0)
+		pr_warning("Guest session decompression initialization failed.\n");
+
+	/*
+	 * perf does not support processing 2 sessions simultaneously, so output
+	 * guest events to a temporary file.
+	 */
+	ret = perf_session__process_events(gs->session);
+	if (ret)
+		return ret;
+
+	if (lseek(gs->tmp_fd, 0, SEEK_SET))
+		return -errno;
+
+	return 0;
+}
+
+/* Free hlist nodes assuming hlist_node is the first member of hlist entries */
+static void free_hlist(struct hlist_head *heads, size_t hlist_sz)
+{
+	struct hlist_node *pos, *n;
+	size_t i;
+
+	for (i = 0; i < hlist_sz; ++i) {
+		hlist_for_each_safe(pos, n, &heads[i]) {
+			hlist_del(pos);
+			free(pos);
+		}
+	}
+}
+
+static void guest_session__exit(struct guest_session *gs)
+{
+	if (gs->session) {
+		perf_session__delete(gs->session);
+		free_hlist(gs->heads, PERF_EVLIST__HLIST_SIZE);
+		free_hlist(gs->tids, PERF_EVLIST__HLIST_SIZE);
+	}
+	if (gs->tmp_file_name) {
+		if (gs->tmp_fd >= 0)
+			close(gs->tmp_fd);
+		unlink(gs->tmp_file_name);
+		free(gs->tmp_file_name);
+	}
+	free(gs->vcpu);
+	free(gs->perf_data_file);
+}
+
+static void get_tsc_conv(struct perf_tsc_conversion *tc, struct perf_record_time_conv *time_conv)
+{
+	tc->time_shift		= time_conv->time_shift;
+	tc->time_mult		= time_conv->time_mult;
+	tc->time_zero		= time_conv->time_zero;
+	tc->time_cycles		= time_conv->time_cycles;
+	tc->time_mask		= time_conv->time_mask;
+	tc->cap_user_time_zero	= time_conv->cap_user_time_zero;
+	tc->cap_user_time_short	= time_conv->cap_user_time_short;
+}
+
+static void guest_session__get_tc(struct guest_session *gs)
+{
+	struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session);
+
+	get_tsc_conv(&gs->host_tc, &inject->session->time_conv);
+	get_tsc_conv(&gs->guest_tc, &gs->session->time_conv);
+}
+
+static void guest_session__convert_time(struct guest_session *gs, u64 guest_time, u64 *host_time)
+{
+	u64 tsc;
+
+	if (!guest_time) {
+		*host_time = 0;
+		return;
+	}
+
+	if (gs->guest_tc.cap_user_time_zero)
+		tsc = perf_time_to_tsc(guest_time, &gs->guest_tc);
+	else
+		tsc = guest_time;
+
+	/*
+	 * This is the correct order of operations for x86 if the TSC Offset and
+	 * Multiplier values are used.
+	 */
+	tsc -= gs->time_offset;
+	tsc /= gs->time_scale;
+
+	if (gs->host_tc.cap_user_time_zero)
+		*host_time = tsc_to_perf_time(tsc, &gs->host_tc);
+	else
+		*host_time = tsc;
+}
+
+static int guest_session__fetch(struct guest_session *gs)
+{
+	void *buf = gs->ev.event_buf;
+	struct perf_event_header *hdr = buf;
+	size_t hdr_sz = sizeof(*hdr);
+	ssize_t ret;
+
+	ret = readn(gs->tmp_fd, buf, hdr_sz);
+	if (ret < 0)
+		return ret;
+
+	if (!ret) {
+		/* Zero size means EOF */
+		hdr->size = 0;
+		return 0;
+	}
+
+	buf += hdr_sz;
+
+	ret = readn(gs->tmp_fd, buf, hdr->size - hdr_sz);
+	if (ret < 0)
+		return ret;
+
+	gs->ev.event = (union perf_event *)gs->ev.event_buf;
+	gs->ev.sample.time = 0;
+
+	if (hdr->type >= PERF_RECORD_USER_TYPE_START) {
+		pr_err("Unexpected type fetching guest event");
+		return 0;
+	}
+
+	ret = evlist__parse_sample(gs->session->evlist, gs->ev.event, &gs->ev.sample);
+	if (ret) {
+		pr_err("Parse failed fetching guest event");
+		return ret;
+	}
+
+	if (!gs->have_tc) {
+		guest_session__get_tc(gs);
+		gs->have_tc = true;
+	}
+
+	guest_session__convert_time(gs, gs->ev.sample.time, &gs->ev.sample.time);
+
+	return 0;
+}
+
+static int evlist__append_id_sample(struct evlist *evlist, union perf_event *ev,
+				    const struct perf_sample *sample)
+{
+	struct evsel *evsel;
+	void *array;
+	int ret;
+
+	evsel = evlist__id2evsel(evlist, sample->id);
+	array = ev;
+
+	if (!evsel) {
+		pr_err("No evsel for id %"PRIu64"\n", sample->id);
+		return -EINVAL;
+	}
+
+	array += ev->header.size;
+	ret = perf_event__synthesize_id_sample(array, evsel->core.attr.sample_type, sample);
+	if (ret < 0)
+		return ret;
+
+	if (ret & 7) {
+		pr_err("Bad id sample size %d\n", ret);
+		return -EINVAL;
+	}
+
+	ev->header.size += ret;
+
+	return 0;
+}
+
+static int guest_session__inject_events(struct guest_session *gs, u64 timestamp)
+{
+	struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session);
+	int ret;
+
+	if (!gs->ready)
+		return 0;
+
+	while (1) {
+		struct perf_sample *sample;
+		struct guest_id *guest_id;
+		union perf_event *ev;
+		u16 id_hdr_size;
+		u8 cpumode;
+		u64 id;
+
+		if (!gs->fetched) {
+			ret = guest_session__fetch(gs);
+			if (ret)
+				return ret;
+			gs->fetched = true;
+		}
+
+		ev = gs->ev.event;
+		sample = &gs->ev.sample;
+
+		if (!ev->header.size)
+			return 0; /* EOF */
+
+		if (sample->time > timestamp)
+			return 0;
+
+		/* Change cpumode to guest */
+		cpumode = ev->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+		if (cpumode & PERF_RECORD_MISC_USER)
+			cpumode = PERF_RECORD_MISC_GUEST_USER;
+		else
+			cpumode = PERF_RECORD_MISC_GUEST_KERNEL;
+		ev->header.misc &= ~PERF_RECORD_MISC_CPUMODE_MASK;
+		ev->header.misc |= cpumode;
+
+		id = sample->id;
+		if (!id) {
+			id = gs->dflt_id;
+			id_hdr_size = gs->dflt_id_hdr_size;
+		} else {
+			struct evsel *evsel = evlist__id2evsel(gs->session->evlist, id);
+
+			id_hdr_size = evsel__id_hdr_size(evsel);
+		}
+
+		if (id_hdr_size & 7) {
+			pr_err("Bad id_hdr_size %u\n", id_hdr_size);
+			return -EINVAL;
+		}
+
+		if (ev->header.size & 7) {
+			pr_err("Bad event size %u\n", ev->header.size);
+			return -EINVAL;
+		}
+
+		/* Remove guest id sample */
+		ev->header.size -= id_hdr_size;
+
+		if (ev->header.size & 7) {
+			pr_err("Bad raw event size %u\n", ev->header.size);
+			return -EINVAL;
+		}
+
+		guest_id = guest_session__lookup_id(gs, id);
+		if (!guest_id) {
+			pr_err("Guest event with unknown id %llu\n",
+			       (unsigned long long)id);
+			return -EINVAL;
+		}
+
+		/* Change to host ID to avoid conflicting ID values */
+		sample->id = guest_id->host_id;
+		sample->stream_id = guest_id->host_id;
+
+		if (sample->cpu != (u32)-1) {
+			if (sample->cpu >= gs->vcpu_cnt) {
+				pr_err("Guest event with unknown VCPU %u\n",
+				       sample->cpu);
+				return -EINVAL;
+			}
+			/* Change to host CPU instead of guest VCPU */
+			sample->cpu = gs->vcpu[sample->cpu].cpu;
+		}
+
+		/* New id sample with new ID and CPU */
+		ret = evlist__append_id_sample(inject->session->evlist, ev, sample);
+		if (ret)
+			return ret;
+
+		if (ev->header.size & 7) {
+			pr_err("Bad new event size %u\n", ev->header.size);
+			return -EINVAL;
+		}
+
+		gs->fetched = false;
+
+		ret = output_bytes(inject, ev, ev->header.size);
+		if (ret)
+			return ret;
+	}
+}
+
+static int guest_session__flush_events(struct guest_session *gs)
+{
+	return guest_session__inject_events(gs, -1);
+}
+
+static int host__repipe(struct perf_tool *tool,
+			union perf_event *event,
+			struct perf_sample *sample,
+			struct machine *machine)
+{
+	struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+	int ret;
+
+	ret = guest_session__inject_events(&inject->guest_session, sample->time);
+	if (ret)
+		return ret;
+
+	return perf_event__repipe(tool, event, sample, machine);
+}
+
+static int host__finished_init(struct perf_session *session, union perf_event *event)
+{
+	struct perf_inject *inject = container_of(session->tool, struct perf_inject, tool);
+	struct guest_session *gs = &inject->guest_session;
+	int ret;
+
+	/*
+	 * Peek through host COMM events to find QEMU threads and the VCPU they
+	 * are running.
+	 */
+	ret = host_peek_vm_comms(session, gs);
+	if (ret)
+		return ret;
+
+	if (!gs->vcpu_cnt) {
+		pr_err("No VCPU theads found for pid %u\n", gs->machine_pid);
+		return -EINVAL;
+	}
+
+	/*
+	 * Allocate new (unused) host sample IDs and map them to the guest IDs.
+	 */
+	gs->highest_id = evlist__find_highest_id(session->evlist);
+	ret = guest_session__map_ids(gs, session->evlist);
+	if (ret)
+		return ret;
+
+	ret = guest_session__add_attrs(gs);
+	if (ret)
+		return ret;
+
+	ret = synthesize_id_index(inject, gs->session->evlist->core.nr_entries);
+	if (ret) {
+		pr_err("Failed to synthesize id_index\n");
+		return ret;
+	}
+
+	ret = guest_session__add_build_ids(gs);
+	if (ret) {
+		pr_err("Failed to add guest build IDs\n");
+		return ret;
+	}
+
+	gs->ready = true;
+
+	ret = guest_session__inject_events(gs, 0);
+	if (ret)
+		return ret;
+
+	return perf_event__repipe_op2_synth(session, event);
+}
+
+/*
+ * Obey finished-round ordering. The FINISHED_ROUND event is first processed
+ * which flushes host events to file up until the last flush time. Then inject
+ * guest events up to the same time. Finally write out the FINISHED_ROUND event
+ * itself.
+ */
+static int host__finished_round(struct perf_tool *tool,
+				union perf_event *event,
+				struct ordered_events *oe)
+{
+	struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+	int ret = perf_event__process_finished_round(tool, event, oe);
+	u64 timestamp = ordered_events__last_flush_time(oe);
+
+	if (ret)
+		return ret;
+
+	ret = guest_session__inject_events(&inject->guest_session, timestamp);
+	if (ret)
+		return ret;
+
+	return perf_event__repipe_oe_synth(tool, event, oe);
+}
+
+static int host__context_switch(struct perf_tool *tool,
+				union perf_event *event,
+				struct perf_sample *sample,
+				struct machine *machine)
+{
+	struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+	bool out = event->header.misc & PERF_RECORD_MISC_SWITCH_OUT;
+	struct guest_session *gs = &inject->guest_session;
+	u32 pid = event->context_switch.next_prev_pid;
+	u32 tid = event->context_switch.next_prev_tid;
+	struct guest_tid *guest_tid;
+	u32 vcpu;
+
+	if (out || pid != gs->machine_pid)
+		goto out;
+
+	guest_tid = guest_session__lookup_tid(gs, tid);
+	if (!guest_tid)
+		goto out;
+
+	if (sample->cpu == (u32)-1) {
+		pr_err("Switch event does not have CPU\n");
+		return -EINVAL;
+	}
+
+	vcpu = guest_tid->vcpu;
+	if (vcpu >= gs->vcpu_cnt)
+		return -EINVAL;
+
+	/* Guest is switching in, record which CPU the VCPU is now running on */
+	gs->vcpu[vcpu].cpu = sample->cpu;
+out:
+	return host__repipe(tool, event, sample, machine);
+}
+
 static void sig_handler(int sig __maybe_unused)
 {
 	session_done = 1;
@@ -767,6 +1666,61 @@ static int parse_vm_time_correlation(const struct option *opt, const char *str,
 	return inject->itrace_synth_opts.vm_tm_corr_args ? 0 : -ENOMEM;
 }
 
+static int parse_guest_data(const struct option *opt, const char *str, int unset)
+{
+	struct perf_inject *inject = opt->value;
+	struct guest_session *gs = &inject->guest_session;
+	char *tok;
+	char *s;
+
+	if (unset)
+		return 0;
+
+	if (!str)
+		goto bad_args;
+
+	s = strdup(str);
+	if (!s)
+		return -ENOMEM;
+
+	gs->perf_data_file = strsep(&s, ",");
+	if (!gs->perf_data_file)
+		goto bad_args;
+
+	gs->copy_kcore_dir = has_kcore_dir(gs->perf_data_file);
+	if (gs->copy_kcore_dir)
+		inject->output.is_dir = true;
+
+	tok = strsep(&s, ",");
+	if (!tok)
+		goto bad_args;
+	gs->machine_pid = strtoul(tok, NULL, 0);
+	if (!inject->guest_session.machine_pid)
+		goto bad_args;
+
+	gs->time_scale = 1;
+
+	tok = strsep(&s, ",");
+	if (!tok)
+		goto out;
+	gs->time_offset = strtoull(tok, NULL, 0);
+
+	tok = strsep(&s, ",");
+	if (!tok)
+		goto out;
+	gs->time_scale = strtod(tok, NULL);
+	if (!gs->time_scale)
+		goto bad_args;
+out:
+	return 0;
+
+bad_args:
+	pr_err("--guest-data option requires guest perf.data file name, "
+	       "guest machine PID, and optionally guest timestamp offset, "
+	       "and guest timestamp scale factor, separated by commas.\n");
+	return -1;
+}
+
 static int save_section_info_cb(struct perf_file_section *section,
 				struct perf_header *ph __maybe_unused,
 				int feat, int fd __maybe_unused, void *data)
@@ -896,6 +1850,22 @@ static int copy_kcore_dir(struct perf_inject *inject)
 	return ret;
 }
 
+static int guest_session__copy_kcore_dir(struct guest_session *gs)
+{
+	struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session);
+	char *cmd;
+	int ret;
+
+	ret = asprintf(&cmd, "cp -r -n %s/kcore_dir %s/kcore_dir__%u >/dev/null 2>&1",
+		       gs->perf_data_file, inject->output.path, gs->machine_pid);
+	if (ret < 0)
+		return ret;
+	pr_debug("%s\n", cmd);
+	ret = system(cmd);
+	free(cmd);
+	return ret;
+}
+
 static int output_fd(struct perf_inject *inject)
 {
 	return inject->in_place_update ? -1 : perf_data__fd(&inject->output);
@@ -904,6 +1874,7 @@ static int output_fd(struct perf_inject *inject)
 static int __cmd_inject(struct perf_inject *inject)
 {
 	int ret = -EINVAL;
+	struct guest_session *gs = &inject->guest_session;
 	struct perf_session *session = inject->session;
 	int fd = output_fd(inject);
 	u64 output_data_offset;
@@ -968,6 +1939,47 @@ static int __cmd_inject(struct perf_inject *inject)
 		output_data_offset = roundup(8192 + session->header.data_offset, 4096);
 		if (inject->strip)
 			strip_init(inject);
+	} else if (gs->perf_data_file) {
+		char *name = gs->perf_data_file;
+
+		/*
+		 * Not strictly necessary, but keep these events in order wrt
+		 * guest events.
+		 */
+		inject->tool.mmap		= host__repipe;
+		inject->tool.mmap2		= host__repipe;
+		inject->tool.comm		= host__repipe;
+		inject->tool.fork		= host__repipe;
+		inject->tool.exit		= host__repipe;
+		inject->tool.lost		= host__repipe;
+		inject->tool.context_switch	= host__repipe;
+		inject->tool.ksymbol		= host__repipe;
+		inject->tool.text_poke		= host__repipe;
+		/*
+		 * Once the host session has initialized, set up sample ID
+		 * mapping and feed in guest attrs, build IDs and initial
+		 * events.
+		 */
+		inject->tool.finished_init	= host__finished_init;
+		/* Obey finished round ordering */
+		inject->tool.finished_round	= host__finished_round;
+		/* Keep track of which CPU a VCPU is running on */
+		inject->tool.context_switch	= host__context_switch;
+		/*
+		 * Must order events to be able to obey finished round
+		 * ordering.
+		 */
+		inject->tool.ordered_events	= true;
+		inject->tool.ordering_requires_timestamps = true;
+		/* Set up a separate session to process guest perf.data file */
+		ret = guest_session__start(gs, name, session->data->force);
+		if (ret) {
+			pr_err("Failed to process %s, error %d\n", name, ret);
+			return ret;
+		}
+		/* Allow space in the header for guest attributes */
+		output_data_offset += gs->session->header.data_offset;
+		output_data_offset = roundup(output_data_offset, 4096);
 	}
 
 	if (!inject->itrace_synth_opts.set)
@@ -980,6 +1992,18 @@ static int __cmd_inject(struct perf_inject *inject)
 	if (ret)
 		return ret;
 
+	if (gs->session) {
+		/*
+		 * Remaining guest events have later timestamps. Flush them
+		 * out to file.
+		 */
+		ret = guest_session__flush_events(gs);
+		if (ret) {
+			pr_err("Failed to flush guest events\n");
+			return ret;
+		}
+	}
+
 	if (!inject->is_pipe && !inject->in_place_update) {
 		struct inject_fc inj_fc = {
 			.fc.copy = feat_copy_cb,
@@ -1014,8 +2038,17 @@ static int __cmd_inject(struct perf_inject *inject)
 
 		if (inject->copy_kcore_dir) {
 			ret = copy_kcore_dir(inject);
-			if (ret)
+			if (ret) {
+				pr_err("Failed to copy kcore\n");
 				return ret;
+			}
+		}
+		if (gs->copy_kcore_dir) {
+			ret = guest_session__copy_kcore_dir(gs);
+			if (ret) {
+				pr_err("Failed to copy guest kcore\n");
+				return ret;
+			}
 		}
 	}
 
@@ -1113,6 +2146,12 @@ int cmd_inject(int argc, const char **argv)
 		OPT_CALLBACK_OPTARG(0, "vm-time-correlation", &inject, NULL, "opts",
 				    "correlate time between VM guests and the host",
 				    parse_vm_time_correlation),
+		OPT_CALLBACK_OPTARG(0, "guest-data", &inject, NULL, "opts",
+				    "inject events from a guest perf.data file",
+				    parse_guest_data),
+		OPT_STRING(0, "guestmount", &symbol_conf.guestmount, "directory",
+			   "guest mount directory under which every guest os"
+			   " instance has a subdir"),
 		OPT_END()
 	};
 	const char * const inject_usage[] = {
@@ -1243,6 +2282,8 @@ int cmd_inject(int argc, const char **argv)
 
 	ret = __cmd_inject(&inject);
 
+	guest_session__exit(&inject.guest_session);
+
 out_delete:
 	zstd_fini(&(inject.session->zstd_data));
 	perf_session__delete(inject.session);
-- 
2.25.1


