All of lore.kernel.org
 help / color / mirror / Atom feed
* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-14 23:52 Carl Love
  2007-02-15 14:37   ` Arnd Bergmann
  0 siblings, 1 reply; 66+ messages in thread
From: Carl Love @ 2007-02-14 23:52 UTC (permalink / raw)
  To: linuxppc-dev, cbe-oss-dev, linux-kernel, oprofile-list

This is the second update to the patch previously posted by Maynard
Johnson as "PATCH 4/4. Add support to OProfile for profiling CELL".  

This repost addresses most of the issues from Milton Miller. There are 
a few suggestions that Arnd has said are beyond the scope of what he 
wants to do at this point.  We will be meeting with Arnd to get
clarification on just what he wants to defer.

We are still working on the bug in the overlay code.

Subject: Add support to OProfile for profiling Cell BE SPUs

From: Maynard Johnson <maynardj@us.ibm.com>

This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
code.

Signed-off-by: Carl Love <carll@us.ibm.com>
Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com>

Index: linux-2.6.20-rc1/arch/powerpc/configs/cell_defconfig
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/configs/cell_defconfig	2007-01-18 16:43:14.000000000 -0600
+++ linux-2.6.20-rc1/arch/powerpc/configs/cell_defconfig	2007-02-13 19:05:45.096004736 -0600
@@ -1404,6 +1404,7 @@
 #
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=y
+CONFIG_OPROFILE_CELL=y
 # CONFIG_KPROBES is not set
 
 #
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/pr_util.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/pr_util.h	2007-02-14 10:43:19.659123008 -0600
@@ -0,0 +1,87 @@
+ /*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef PR_UTIL_H
+#define PR_UTIL_H
+
+#include <linux/cpumask.h>
+#include <linux/oprofile.h>
+#include <asm/cell-pmu.h>
+#include <asm/spu.h>
+
+static inline int number_of_online_nodes(void)
+{
+	u32 cpu; u32 tmp;
+	int nodes = 0;
+	for_each_online_cpu(cpu) {
+		tmp = cbe_cpu_to_node(cpu) + 1;
+		if (tmp > nodes)
+			nodes++;
+	}
+	return nodes;
+}
+
+/* Defines used for sync_start */
+#define SKIP_GENERIC_SYNC 0
+#define SYNC_START_ERROR -1
+#define DO_GENERIC_SYNC 1
+
+struct  spu_overlay_info
+{
+	unsigned int vma;
+	unsigned int size;
+	unsigned int offset;
+	unsigned int buf;
+};
+
+struct vma_to_fileoffset_map
+{
+	struct vma_to_fileoffset_map *next;
+	unsigned int vma;
+	unsigned int size;
+	unsigned int offset;
+	unsigned int guard_ptr;
+	unsigned int guard_val;
+};
+
+/* The three functions below are for maintaining and accessing
+ * the vma-to-fileoffset map.
+ */
+struct vma_to_fileoffset_map * create_vma_map(const struct spu * spu,
+					      u64 objectid);
+unsigned int vma_map_lookup(struct vma_to_fileoffset_map *map,
+			    unsigned int vma, const struct spu * aSpu);
+void vma_map_free(struct vma_to_fileoffset_map *map);
+
+/*
+ * Entry point for SPU profiling.
+ * cycles_reset is the SPU_CYCLES count value specified by the user.
+ */
+void start_spu_profiling(unsigned int cycles_reset);
+
+void stop_spu_profiling(void);
+
+
+/* add the necessary profiling hooks */
+int spu_sync_start(void);
+
+/* remove the hooks */
+int spu_sync_stop(void);
+
+/* Record SPU program counter samples to the oprofile event buffer. */
+void spu_sync_buffer(int spu_num, unsigned int * samples,
+		     int num_samples);
+
+void set_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset);
+
+#endif    // PR_UTIL_H
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_profiler.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_profiler.c	2007-02-14 17:03:33.931041216 -0600
@@ -0,0 +1,201 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Authors: Maynard Johnson <maynardj@us.ibm.com>
+ *          Carl Love <carll@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/hrtimer.h>
+#include <linux/smp.h>
+#include <linux/slab.h>
+#include <asm/cell-pmu.h>
+#include <asm/time.h>
+#include "pr_util.h"
+
+#define TRACE_ARRAY_SIZE 1024
+#define SCALE_SHIFT 14
+
+static u32 * samples;
+
+static int spu_prof_running = 0;
+static unsigned int profiling_interval = 0;
+
+extern int spu_prof_num_nodes;
+
+
+#define NUM_SPU_BITS_TRBUF 16
+#define SPUS_PER_TB_ENTRY   4
+#define SPUS_PER_NODE       8
+
+#define SPU_PC_MASK         0xFFFF
+
+void set_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset)
+{
+	unsigned long nsPerCyc;
+	if (!freq_khz)
+		freq_khz = ppc_proc_freq/1000;
+
+	/* To calculate a timeout in nanoseconds, the basic
+	 * formula is ns = cycles_reset * (NSEC_PER_SEC / cpu frequency).
+	 * To avoid floating point math, we use the scale math
+	 * technique as described in linux/jiffies.h.  We use
+	 * a scale factor of SCALE_SHIFT,which provides 4 decimal places
+	 * of precision, which is close enough for the purpose at hand.
+	 *
+	 * The value of the timeout should be small enough that the hw
+	 * trace buffer will not get more then a bout 1/3 full for the 
+	 * maximum user specified (the LFSR value) hw sampling frequency.
+	 * This is to ensure the trace buffer will never fill even if the
+	 * kernel thread scheduling varies under a heavey system load.
+	 */
+
+	nsPerCyc = (USEC_PER_SEC << SCALE_SHIFT)/freq_khz;
+	profiling_interval = (nsPerCyc * cycles_reset) >> SCALE_SHIFT;
+
+}
+
+/*
+ * Extract SPU PC from trace buffer entry
+ */
+static void spu_pc_extract(int cpu, int entry)
+{
+	/* the trace buffer is 128 bits */
+	u64 trace_buffer[2];
+	u64 spu_mask;
+	int spu;
+
+	spu_mask = SPU_PC_MASK;
+
+	/* Each SPU PC is 16 bits; hence, four spus in each of
+	 * the two 64-bit buffer entries that make up the
+	 * 128-bit trace_buffer entry.  Process two 64-bit values 
+	 * simultaneously.
+	 * trace[0] SPU PC contents are: 0 1 2 3
+	 * trace[1] SPU PC contents are: 4 5 6 7
+	 */
+
+	cbe_read_trace_buffer(cpu, trace_buffer);
+
+	for (spu = SPUS_PER_TB_ENTRY-1; spu >= 0; spu--) {
+		/* spu PC trace entry is upper 16 bits of the
+		 * 18 bit SPU program counter
+		 */
+		samples[spu * TRACE_ARRAY_SIZE + entry]
+			= (spu_mask & trace_buffer[0]) << 2;
+		samples[(spu + SPUS_PER_TB_ENTRY) * TRACE_ARRAY_SIZE + entry] 
+			= (spu_mask & trace_buffer[1]) << 2;
+
+		trace_buffer[0] = trace_buffer[0] >> NUM_SPU_BITS_TRBUF;
+		trace_buffer[1] = trace_buffer[1] >> NUM_SPU_BITS_TRBUF;
+	}
+}
+
+static int cell_spu_pc_collection(int cpu)
+{
+	u32 trace_addr;
+	int entry;
+
+	/* process the collected SPU PC for the node */
+
+	entry = 0;
+
+	trace_addr = cbe_read_pm(cpu, trace_address);
+	while (!(trace_addr & CBE_PM_TRACE_BUF_EMPTY))
+	{
+		/* there is data in the trace buffer to process */
+		spu_pc_extract(cpu, entry);
+
+		entry++;
+
+		if (entry >= TRACE_ARRAY_SIZE)
+			/* spu_samples is full */
+			break;
+
+		trace_addr = cbe_read_pm(cpu, trace_address);
+	}
+	return(entry);
+}
+
+
+static int profile_spus(struct hrtimer * timer)
+{
+	ktime_t kt;
+	int cpu, node, k, num_samples, spu_num;
+
+	if (!spu_prof_running)
+		goto stop;
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+
+		node = cbe_cpu_to_node(cpu);
+
+		num_samples = cell_spu_pc_collection(cpu);
+
+		if (num_samples == 0)
+			continue;
+
+		for (k = 0; k < SPUS_PER_NODE; k++) {
+			spu_num = k + (node * SPUS_PER_NODE);
+			spu_sync_buffer(spu_num,
+					samples + (k * TRACE_ARRAY_SIZE),
+					num_samples); 
+		}
+	}
+	smp_wmb();
+
+	kt = ktime_set(0, profiling_interval);
+	if (!spu_prof_running)
+		goto stop;
+	hrtimer_forward(timer, timer->base->get_time(), kt);
+	return HRTIMER_RESTART;
+
+ stop:
+	printk(KERN_INFO "SPU_PROF: spu-prof timer ending\n");
+	return HRTIMER_NORESTART;
+}
+
+static struct hrtimer timer;
+/*
+ * Entry point for SPU profiling.
+ * NOTE:  SPU profiling is done system-wide, not per-CPU.
+ *
+ * cycles_reset is the count value specified by the user when
+ * setting up OProfile to count SPU_CYCLES.
+ */
+void start_spu_profiling(unsigned int cycles_reset) {
+
+	ktime_t kt;
+
+	pr_debug("timer resolution: %lu\n",
+		 TICK_NSEC);
+	kt = ktime_set(0, profiling_interval);
+	hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_REL);
+	timer.expires = kt;
+	timer.function = profile_spus;
+
+	/* Allocate arrays for collecting SPU PC samples */
+	samples = (u32 *) kzalloc(SPUS_PER_NODE *
+				  TRACE_ARRAY_SIZE * sizeof(u32), GFP_KERNEL);
+
+	spu_prof_running = 1;
+	hrtimer_start(&timer, kt, HRTIMER_REL);
+}
+
+void stop_spu_profiling(void)
+{
+	spu_prof_running = 0;
+	hrtimer_cancel(&timer);
+	kfree(samples);
+	pr_debug("SPU_PROF: stop_spu_profiling issued\n");
+}
+
+
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_task_sync.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_task_sync.c	2007-02-14 12:19:37.494128264 -0600
@@ -0,0 +1,453 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* The purpose of this file is to handle SPU event task switching
+ * and to record SPU context information into the OProfile
+ * event buffer.
+ *
+ * Additionally, the spu_sync_buffer function is provided as a helper
+ * for recoding actual SPU program counter samples to the event buffer.
+ */
+
+#include <linux/dcookies.h>
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/numa.h>
+#include <linux/oprofile.h>
+#include <linux/spinlock.h>
+#include "pr_util.h"
+
+#define RELEASE_ALL 9999
+
+static spinlock_t buffer_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t cache_lock = SPIN_LOCK_UNLOCKED;
+static int num_spu_nodes;
+int spu_prof_num_nodes;
+
+/* Container for caching information about an active SPU task.
+ *
+ */
+struct cached_info {
+	struct vma_to_fileoffset_map * map;
+	struct spu * the_spu;   /* needed to access pointer to local_store */
+	struct kref cache_ref;
+};
+
+static struct cached_info * spu_info[MAX_NUMNODES * 8];
+
+static void destroy_cached_info(struct kref * kref)
+{
+	struct cached_info * info;
+	info = container_of(kref, struct cached_info, cache_ref);
+	vma_map_free(info->map);
+	kfree(info);
+	module_put(THIS_MODULE);
+}
+
+/* Return the cached_info for the passed SPU number.
+ * ATTENTION:  Callers are responsible for obtaining the
+ *             cache_lock if needed prior to invoking this function.
+ */
+static struct cached_info * get_cached_info(struct spu * the_spu, int spu_num)
+{
+	struct cached_info * ret_info = NULL;
+	if (spu_num >= num_spu_nodes) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Invalid index %d into spu info cache\n",
+		       __FUNCTION__, __LINE__, spu_num);
+		goto out;
+	}
+	if (!spu_info[spu_num] && the_spu) {
+		spu_info[spu_num] = (struct cached_info *)
+			spu_get_profile_private(the_spu->ctx);
+		if (spu_info[spu_num])
+			kref_get(&spu_info[spu_num]->cache_ref);
+	}
+
+	ret_info = spu_info[spu_num];
+ out:
+	return ret_info;
+}
+
+
+/* Looks for cached info for the passed spu.  If not found, the
+ * cached info is created for the passed spu.
+ * Returns 0 for success; otherwise, -1 for error.
+ */
+static int
+prepare_cached_spu_info(struct spu * spu, unsigned int objectId)
+{
+	unsigned long flags = 0;
+	struct vma_to_fileoffset_map * new_map;
+	int retval = 0;
+	struct cached_info * info;
+
+        /* We won't bother getting cache_lock here since
+	 * don't do anything with the cached_info that's returned.
+	 */
+	info = get_cached_info(spu, spu->number);
+
+	if (info) {
+		pr_debug("Found cached SPU info.\n");
+		goto out;
+	}
+
+	/* Create cached_info and set spu_info[spu->number] to point to it.
+	 * spu->number is a system-wide value, not a per-node value.
+	 */
+	info = kzalloc(sizeof(struct cached_info), GFP_KERNEL);
+	if (!info) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: create vma_map failed\n",
+		       __FUNCTION__, __LINE__);
+		goto err_alloc;
+	}
+	new_map = create_vma_map(spu, objectId);
+	if (!new_map) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: create vma_map failed\n",
+		       __FUNCTION__, __LINE__);
+		goto err_alloc;
+	}
+
+	pr_debug("Created vma_map\n");
+	info->map = new_map;
+	info->the_spu = spu;
+	kref_init(&info->cache_ref);
+	spin_lock_irqsave(&cache_lock, flags);
+	spu_info[spu->number] = info;
+	/* Increment count before passing off ref to SPUFS. */
+	kref_get(&info->cache_ref);
+
+        /* We increment the module refcount here since SPUFS is
+	 * responsible for the final destruction of the cached_info,
+	 * and it must be able to access the destroy_cached_info()
+	 * function defined in the OProfile module.  We decrement
+	 * the module refcount in destroy_cached_info.
+	 */
+	try_module_get(THIS_MODULE);
+	spu_set_profile_private(spu->ctx, info, &info->cache_ref,
+				destroy_cached_info);
+	spin_unlock_irqrestore(&cache_lock, flags);
+	goto out;
+
+err_alloc:
+	retval = -1;
+out:
+	return retval;
+}
+
+/*
+ * NOTE:  The caller is responsible for locking the
+ *	  cache_lock prior to calling this function.
+ */
+static int release_cached_info(int spu_index)
+{
+	int index, end;
+	if (spu_index == RELEASE_ALL) {
+		end = num_spu_nodes;
+		index = 0;
+	} else {
+	        if (spu_index >= num_spu_nodes) {
+        	        printk(KERN_ERR "SPU_PROF: "
+			       "%s, line %d: "
+			       "Invalid index %d into spu info cache\n",
+               	               __FUNCTION__, __LINE__, spu_index);
+	                goto out;
+	        }
+		end = spu_index +1;
+		index = spu_index;
+	}
+	for (; index < end; index++) {
+		if (spu_info[index]) {
+			kref_put(&spu_info[index]->cache_ref,
+				 destroy_cached_info);
+			spu_info[index] = NULL;
+		}
+	}
+
+out:
+	return 0;
+}
+
+/* The source code for fast_get_dcookie was "borrowed"
+ * from drivers/oprofile/buffer_sync.c.
+ */
+
+/* Optimisation. We can manage without taking the dcookie sem
+ * because we cannot reach this code without at least one
+ * dcookie user still being registered (namely, the reader
+ * of the event buffer).
+ */
+static inline unsigned long fast_get_dcookie(struct dentry * dentry,
+					     struct vfsmount * vfsmnt)
+{
+	unsigned long cookie;
+
+	if (dentry->d_cookie)
+		return (unsigned long)dentry;
+	get_dcookie(dentry, vfsmnt, &cookie);
+	return cookie;
+}
+
+/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
+ * which corresponds loosely to "application name". Also, determine
+ * the offset for the SPU ELF object.  If computed offset is
+ * non-zero, it implies an embedded SPU object; otherwise, it's a
+ * separate SPU binary, in which case we retrieve it's dcookie.
+ */
+static unsigned long
+get_exec_dcookie_and_offset(
+	struct spu * spu, unsigned int * offsetp,
+	unsigned long * spu_bin_dcookie,
+	unsigned int spu_ref)
+{
+	unsigned long cookie = 0;
+	unsigned int my_offset = 0;
+	struct vm_area_struct * vma;
+	struct mm_struct * mm = spu->mm;
+
+	if (!mm)
+		goto out;
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (!vma->vm_file)
+			continue;
+		if (!(vma->vm_flags & VM_EXECUTABLE))
+			continue;
+		cookie = fast_get_dcookie(vma->vm_file->f_dentry,
+					  vma->vm_file->f_vfsmnt);
+		pr_debug("got dcookie for %s\n",
+			 vma->vm_file->f_dentry->d_name.name);
+		break;
+	}
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->vm_start > spu_ref || vma->vm_end < spu_ref)
+			continue;
+		my_offset = spu_ref - vma->vm_start;
+		pr_debug("Found spu ELF at %X for file %s\n", my_offset,
+			 vma->vm_file->f_dentry->d_name.name);
+		*offsetp = my_offset;
+		if (my_offset == 0) {
+			if (!vma->vm_file) {
+				goto fail_no_spu_cookie;
+			}
+			*spu_bin_dcookie = fast_get_dcookie(
+				vma->vm_file->f_dentry,
+				vma->vm_file->f_vfsmnt);
+			pr_debug("got dcookie for %s\n",
+				 vma->vm_file->f_dentry->d_name.name);
+		}
+		break;
+	}
+
+ out:
+	return cookie;
+
+ fail_no_spu_cookie:
+	printk(KERN_ERR "SPU_PROF: "
+	       "%s, line %d: Cannot find dcookie for SPU binary\n",
+	       __FUNCTION__, __LINE__);
+	goto out;
+}
+
+
+
+/* This function finds or creates cached context information for the
+ * passed SPU and records SPU context information into the OProfile
+ * event buffer.
+ */
+static int process_context_switch(struct spu * spu, unsigned int objectId)
+{
+	unsigned long flags;
+	int retval = 0;
+	unsigned int offset = 0;
+	unsigned long spu_cookie = 0, app_dcookie = 0;
+	retval = prepare_cached_spu_info(spu, objectId);
+	if (retval == -1) {
+		goto out;
+	}
+        /* Get dcookie first because a mutex_lock is taken in that
+	 * code path, so interrupts must not be disabled.
+	 */
+	app_dcookie = get_exec_dcookie_and_offset(spu, &offset,
+						  &spu_cookie, objectId);
+
+        /* Record context info in event buffer */
+	spin_lock_irqsave(&buffer_lock, flags);
+	add_event_entry(ESCAPE_CODE);
+	add_event_entry(SPU_CTX_SWITCH_CODE);
+	add_event_entry(spu->number);
+	add_event_entry(spu->pid);
+	add_event_entry(spu->tgid);
+	add_event_entry(app_dcookie);
+
+	add_event_entry(ESCAPE_CODE);
+	if (offset) {
+	  /* When offset is non-zero,  this means the SPU ELF was embedded;
+	   * otherwise, it was loaded from a separate binary file.  For the
+	   * embedded case, we record the offset of the SPU ELF into the PPU
+	   * executable; for the non-embedded case, we record a dcookie that
+	   * points to the location of the SPU binary that was loaded.
+	   */
+		add_event_entry(SPU_OFFSET_CODE);
+		add_event_entry(offset);
+	} else {
+		add_event_entry(SPU_COOKIE_CODE);
+		add_event_entry(spu_cookie);
+	}
+	spin_unlock_irqrestore(&buffer_lock, flags);
+	smp_wmb();
+out:
+	return retval;
+}
+
+/*
+ * This function is invoked on either a bind_context or unbind_context.
+ * If called for an unbind_context, the val arg is 0; otherwise,
+ * it is the object-id value for the spu context.
+ * The data arg is of type 'struct spu *'.
+ */
+static int spu_active_notify(struct notifier_block * self, unsigned long val,
+			     void * data)
+{
+	int retval;
+	unsigned long flags = 0;
+	struct spu * the_spu = data;
+	pr_debug("SPU event notification arrived\n");
+	if (!val){
+		spin_lock_irqsave(&cache_lock, flags);
+		retval = release_cached_info(the_spu->number);
+		spin_unlock_irqrestore(&cache_lock, flags);
+	} else {
+		retval = process_context_switch(the_spu, val);
+	}
+	return retval;
+}
+
+static struct notifier_block spu_active = {
+	.notifier_call = spu_active_notify,
+};
+
+/* The main purpose of this function is to synchronize
+ * OProfile with SPUFS by registering to be notified of
+ * SPU task switches.
+ *
+ * NOTE: When profiling SPUs, we must ensure that only
+ * spu_sync_start is invoked and not the generic sync_start
+ * in drivers/oprofile/oprof.c.  A return value of
+ * SKIP_GENERIC_SYNC or SYNC_START_ERROR will
+ * accomplish this.
+ */
+int spu_sync_start(void) {
+	int ret = SKIP_GENERIC_SYNC;
+	int register_ret;
+	unsigned long flags = 0;
+	spu_prof_num_nodes = number_of_online_nodes();
+	num_spu_nodes = spu_prof_num_nodes * 8;
+
+	spin_lock_irqsave(&buffer_lock, flags);
+	add_event_entry(ESCAPE_CODE);
+	add_event_entry(SPU_PROFILING_CODE);
+	add_event_entry(num_spu_nodes);
+	spin_unlock_irqrestore(&buffer_lock, flags);
+
+        /* Register for SPU events  */
+	register_ret = spu_switch_event_register(&spu_active);
+	if (register_ret) {
+		ret = SYNC_START_ERROR;
+		goto out;
+	}
+
+	pr_debug("spu_sync_start -- running.\n");
+out:
+	return ret;
+}
+
+/* Record SPU program counter samples to the oprofile event buffer. */
+void spu_sync_buffer(int spu_num, unsigned int * samples,
+		     int num_samples)
+{
+	unsigned long long file_offset;
+	unsigned long cache_lock_flags = 0;
+	unsigned long buffer_lock_flags = 0;
+	int i;
+	struct vma_to_fileoffset_map * map;
+	struct spu * the_spu;
+	unsigned long long spu_num_ll = spu_num;
+	unsigned long long spu_num_shifted = spu_num_ll << 32;
+	struct cached_info * c_info;
+
+        /* We need to obtain the cache_lock here because it's
+	 * possible that after getting the cached_info, the SPU job
+	 * corresponding to this cached_info may end, thus resulting
+	 * in the destruction of the cached_info.
+	 */
+	spin_lock_irqsave(&cache_lock, cache_lock_flags);
+	c_info = get_cached_info(NULL, spu_num);
+	if (c_info == NULL) {
+        /* This legitimately happens when the SPU task ends before all
+	 * samples are recorded.  No big deal -- so we just drop a few samples.
+	 */
+		pr_debug("SPU_PROF: No cached SPU contex "
+			  "for SPU #%d. Dropping samples.\n", spu_num);
+		spin_unlock_irqrestore(&cache_lock, cache_lock_flags);
+		return ;
+	}
+
+	map = c_info->map;
+	the_spu = c_info->the_spu;
+	spin_lock_irqsave(&buffer_lock, buffer_lock_flags);
+	for (i = 0; i < num_samples; i++) {
+		unsigned int sample = *(samples+i);
+		file_offset = 0;
+		if (sample == 0)
+			continue;
+		file_offset = vma_map_lookup(
+			map, sample, the_spu);
+		/* For now, we'll drop samples that can't be mapped.
+		 * This can happen for generated stubs executed from
+		 * the SPU stack.  Do we need to record these somehow?
+		 */
+		if (unlikely(file_offset == 0xffffffff))
+			continue;
+		add_event_entry(file_offset | spu_num_shifted);
+	}
+	spin_unlock_irqrestore(&buffer_lock, buffer_lock_flags);
+	spin_unlock_irqrestore(&cache_lock, cache_lock_flags);
+}
+
+
+int spu_sync_stop(void)
+{
+	unsigned long flags = 0;
+	int ret = spu_switch_event_unregister(&spu_active);
+	if (ret) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: spu_switch_event_unregister returned %d\n",
+		       __FUNCTION__, __LINE__, ret);
+		goto out;
+	}
+
+	spin_lock_irqsave(&cache_lock, flags);
+	ret = release_cached_info(RELEASE_ALL);
+	spin_unlock_irqrestore(&cache_lock, flags);
+out:
+	pr_debug("spu_sync_stop -- done.\n");
+	return ret;
+}
+
+
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/vma_map.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/vma_map.c	2007-02-14 13:30:33.159035328 -0600
@@ -0,0 +1,277 @@
+ /*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* The code in this source file is responsible for generating
+ * vma-to-fileOffset maps for both overlay and non-overlay SPU
+ * applications.
+ */
+
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/elf.h>
+#include "pr_util.h"
+
+
+void vma_map_free(struct vma_to_fileoffset_map *map)
+{
+	while (map) {
+		struct vma_to_fileoffset_map *next = map->next;
+		kfree(map);
+		map = next;
+	}
+}
+
+unsigned int
+vma_map_lookup(struct vma_to_fileoffset_map *map, unsigned int vma,
+	       const struct spu * aSpu)
+{
+	u32 offset = 0xffffffff;
+	u32 ovly_grd;
+	for (; map; map = map->next) {
+		if (vma < map->vma || vma >= map->vma + map->size)
+			continue;
+
+		if (map->guard_ptr) {
+			ovly_grd = *(u32 *)(aSpu->local_store + map->guard_ptr);
+			if (ovly_grd != map->guard_val)
+				continue;
+		}
+		offset = vma - map->vma + map->offset;
+		break;
+	}
+
+	return offset;
+}
+
+static struct vma_to_fileoffset_map *
+vma_map_add(struct vma_to_fileoffset_map * map, unsigned int vma,
+	    unsigned int size, unsigned int offset, unsigned int guard_ptr,
+	    unsigned int guard_val)
+{
+	struct vma_to_fileoffset_map * new =
+		kzalloc(sizeof(struct vma_to_fileoffset_map), GFP_KERNEL);
+	if (!new) {
+		printk(KERN_ERR "SPU_PROF: %s, line %d: malloc failed\n",
+		       __FUNCTION__, __LINE__);
+		vma_map_free(map);
+		return NULL;
+	}
+
+	new->next = map;
+	new->vma = vma;
+	new->size = size;
+	new->offset = offset;
+	new->guard_ptr = guard_ptr;
+	new->guard_val = guard_val;
+
+	return new;
+}
+
+
+/* Parse SPE ELF header and generate a list of vma_maps.
+ * A pointer to the first vma_map in the generated list
+ * of vma_maps is returned.  */
+struct vma_to_fileoffset_map * create_vma_map(const struct spu * aSpu,
+					      unsigned long spu_elf_start)
+{
+	static const unsigned char expected[EI_PAD] = {
+		[EI_MAG0] = ELFMAG0,
+		[EI_MAG1] = ELFMAG1,
+		[EI_MAG2] = ELFMAG2,
+		[EI_MAG3] = ELFMAG3,
+		[EI_CLASS] = ELFCLASS32,
+		[EI_DATA] = ELFDATA2MSB,
+		[EI_VERSION] = EV_CURRENT,
+		[EI_OSABI] = ELFOSABI_NONE
+	};
+
+	struct vma_to_fileoffset_map * map = NULL;
+	struct spu_overlay_info ovly;
+	unsigned int overlay_tbl_offset = -1;
+	unsigned long phdr_start, shdr_start;
+	Elf32_Ehdr ehdr;
+	Elf32_Phdr phdr;
+	Elf32_Shdr shdr, shdr_str;
+	Elf32_Sym sym;
+	int i, j;
+	char name[32];
+
+	unsigned int ovly_table_sym = 0;
+	unsigned int ovly_buf_table_sym = 0;
+	unsigned int ovly_table_end_sym = 0;
+	unsigned int ovly_buf_table_end_sym = 0;
+	unsigned long ovly_table;
+	unsigned int n_ovlys;
+
+	/* Get and validate ELF header.  */
+
+	if (copy_from_user(&ehdr, (void *) spu_elf_start, sizeof (ehdr)))
+		goto fail;
+
+	if (memcmp(ehdr.e_ident, expected, EI_PAD) != 0) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected e_ident parsing SPU ELF\n",
+		       __FUNCTION__, __LINE__);
+		goto fail;
+	}
+	if (ehdr.e_machine != EM_SPU) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected e_machine parsing SPU ELF\n",
+		       __FUNCTION__,  __LINE__);
+		goto fail;
+	}
+	if (ehdr.e_type != ET_EXEC) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected e_type parsing SPU ELF\n",
+		       __FUNCTION__, __LINE__);
+		goto fail;
+	}
+	phdr_start = spu_elf_start + ehdr.e_phoff;
+	shdr_start = spu_elf_start + ehdr.e_shoff;
+
+	/* Traverse program headers.  */
+	for (i = 0; i < ehdr.e_phnum; i++) {
+		if (copy_from_user(&phdr,
+				   (void *) (phdr_start + i * sizeof(phdr)),
+				   sizeof(phdr)))
+			goto fail;
+
+		if (phdr.p_type != PT_LOAD)
+			continue;
+		if (phdr.p_flags & (1 << 27))
+			continue;
+
+		map = vma_map_add(map, phdr.p_vaddr, phdr.p_memsz,
+				  phdr.p_offset, 0, 0);
+		if (!map)
+			goto fail;
+	}
+
+	pr_debug("SPU_PROF: Created non-overlay maps\n");
+	/* Traverse section table and search for overlay-related symbols.  */
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		if (copy_from_user(&shdr,
+				   (void *) (shdr_start + i * sizeof(shdr)),
+				   sizeof(shdr)))
+			goto fail;
+
+		if (shdr.sh_type != SHT_SYMTAB)
+			continue;
+		if (shdr.sh_entsize != sizeof (sym))
+			continue;
+
+		if (copy_from_user(&shdr_str,
+				   (void *) (shdr_start + shdr.sh_link *
+					     sizeof(shdr)),
+				   sizeof(shdr)))
+			goto fail;
+
+		if (shdr_str.sh_type != SHT_STRTAB)
+			goto fail;;
+
+		for (j = 0; j < shdr.sh_size / sizeof (sym); j++) {
+			if (copy_from_user(&sym, (void *) (spu_elf_start +
+						       shdr.sh_offset + j *
+							   sizeof (sym)),
+					   sizeof (sym)))
+				goto fail;
+
+			if (copy_from_user(name, (void *)
+					   (spu_elf_start + shdr_str.sh_offset +
+					    sym.st_name),
+					   20))
+				goto fail;
+
+			if (memcmp(name, "_ovly_table", 12) == 0)
+				ovly_table_sym = sym.st_value;
+			if (memcmp(name, "_ovly_buf_table", 16) == 0)
+				ovly_buf_table_sym = sym.st_value;
+			if (memcmp(name, "_ovly_table_end", 16) == 0)
+				ovly_table_end_sym = sym.st_value;
+			if (memcmp(name, "_ovly_buf_table_end", 20) == 0)
+				ovly_buf_table_end_sym = sym.st_value;
+		}
+	}
+
+	/* If we don't have overlays, we're done.  */
+	if (ovly_table_sym == 0 || ovly_buf_table_sym == 0
+	    || ovly_table_end_sym == 0 || ovly_buf_table_end_sym == 0) {
+		pr_debug("SPU_PROF: No overlay table found\n");
+		goto out;
+	}
+	else {
+		pr_debug("SPU_PROF: Overlay table found\n");
+	}
+
+        /* The _ovly_table symbol represents a table with one entry
+	 * per overlay section.  The _ovly_buf_table symbol represents
+	 * a table with one entry per overlay region.
+        /* The struct spu_overlay_info gives the structure of the _ovly_table
+	 * entries.  The structure of _ovly_table_buf is simply one
+	 * u32 word per entry.
+	 */
+	overlay_tbl_offset = vma_map_lookup(map, ovly_table_sym, aSpu);
+	if (overlay_tbl_offset < 0) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Error finding SPU overlay table\n",
+		       __FUNCTION__, __LINE__);
+		goto fail;
+	}
+	ovly_table = spu_elf_start + overlay_tbl_offset;
+
+	n_ovlys = (ovly_table_end_sym -
+		   ovly_table_sym) / sizeof (ovly);
+
+	/* Traverse overlay table.  */
+	for (i = 0; i < n_ovlys; i++) {
+		if (copy_from_user(&ovly, (void *)
+				   (ovly_table + i * sizeof (ovly)),
+				   sizeof (ovly)))
+			goto fail;
+
+		/* The ovly.vma/size/offset arguments are analogous to the same
+		 * arguments used above for non-overlay maps.  The final two
+		 * args are referred to as the guard pointer and the guard
+		 * value.
+		 * The guard pointer is an entry in the _ovly_buf_table,
+		 * computed using ovly.buf as the index into the table.  Since
+		 * ovly.buf values begin at '1' to reference the first (or 0th)
+		 * entry in the _ovly_buf_table, the computation subtracts 1
+		 * from ovly.buf.
+		 * The guard value is stored in the _ovly_buf_table entry and
+		 * is an index (starting at 1) back to the _ovly_table entry
+		 * that is pointing at this _ovly_buf_table entry.  So, for
+		 * example, for an overlay scenario with one overlay segment
+		 * and two overlay sections:
+		 *      - Section 1 points to the first entry of the
+		 *        _ovly_buf_table, which contains a guard value
+		 *        of '1', referencing the first (index=0) entry of
+		 *        _ovly_table.
+		 *      - Section 2 points to the second entry of the
+		 *        _ovly_buf_table, which contains a guard value
+		 *        of '2', referencing the second (index=1) entry of
+		 *        _ovly_table.
+		 */
+		map = vma_map_add(map, ovly.vma, ovly.size, ovly.offset,
+				   ovly_buf_table_sym + (ovly.buf - 1) * 4, i + 1);
+		if (!map)
+			goto fail;
+	}
+	goto out;
+
+ fail:
+	map = NULL;
+ out:
+	return map;
+}
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/common.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/common.c	2007-01-18 16:43:14.000000000 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/common.c	2007-02-14 17:03:33.932041064 -0600
@@ -29,6 +29,8 @@
 static struct op_counter_config ctr[OP_MAX_COUNTER];
 static struct op_system_config sys;
 
+static int op_powerpc_flag;
+
 static void op_handle_interrupt(struct pt_regs *regs)
 {
 	model->handle_interrupt(regs, ctr);
@@ -36,25 +38,41 @@
 
 static void op_powerpc_cpu_setup(void *dummy)
 {
-	model->cpu_setup(ctr);
+	int ret;
+
+	ret = model->cpu_setup(ctr);
+
+	if (ret != 0)
+		op_powerpc_flag = ret;
 }
 
 static int op_powerpc_setup(void)
 {
 	int err;
 
+	op_powerpc_flag = 0;
+
 	/* Grab the hardware */
 	err = reserve_pmc_hardware(op_handle_interrupt);
 	if (err)
 		return err;
 
 	/* Pre-compute the values to stuff in the hardware registers.  */
-	model->reg_setup(ctr, &sys, model->num_counters);
+	op_powerpc_flag = model->reg_setup(ctr, &sys, model->num_counters);
 
-	/* Configure the registers on all cpus.  */
+	if (op_powerpc_flag)
+		goto out;
+
+	/* Configure the registers on all cpus.  If an error occurs on one 
+	 * of the cpus, op_powerpc_flag will be set to the error */
 	on_each_cpu(op_powerpc_cpu_setup, NULL, 0, 1);
 
-	return 0;
+out:    if (op_powerpc_flag) {
+		/* error on setup release the performance counter hardware */
+		release_pmc_hardware();
+	}
+
+	return op_powerpc_flag;
 }
 
 static void op_powerpc_shutdown(void)
@@ -64,16 +82,27 @@
 
 static void op_powerpc_cpu_start(void *dummy)
 {
-	model->start(ctr);
+	/* If any of the cpus have return an error, set the
+	 * global flag to the error so it can be returned
+	 * to the generic OProfile caller.
+	 */
+	int ret;
+
+	ret = model->start(ctr);
+	if (ret != 0)
+		op_powerpc_flag = ret;
 }
 
 static int op_powerpc_start(void)
 {
+	op_powerpc_flag = 0;
+
 	if (model->global_start)
-		model->global_start(ctr);
-	if (model->start)
+		return model->global_start(ctr);
+	if (model->start) {
 		on_each_cpu(op_powerpc_cpu_start, NULL, 0, 1);
-	return 0;
+		return op_powerpc_flag;
+	}
 }
 
 static inline void op_powerpc_cpu_stop(void *dummy)
@@ -150,6 +179,8 @@
 #ifdef CONFIG_PPC_CELL_NATIVE
 		case PPC_OPROFILE_CELL:
 			model = &op_model_cell;
+			ops->sync_start = model->sync_start;
+			ops->sync_stop = model->sync_stop;
 			break;
 #endif
 		case PPC_OPROFILE_RS64:
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Kconfig	2007-01-18 16:43:14.000000000 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig	2007-02-13 19:04:46.271028904 -0600
@@ -7,7 +7,8 @@
 
 config OPROFILE
 	tristate "OProfile system profiling (EXPERIMENTAL)"
-	depends on PROFILING
+	default m
+	depends on SPU_FS && PROFILING
 	help
 	  OProfile is a profiling system capable of profiling the
 	  whole system, include the kernel, kernel modules, libraries,
@@ -15,3 +16,10 @@
 
 	  If unsure, say N.
 
+config OPROFILE_CELL
+	bool "OProfile for Cell Broadband Engine"
+	depends on SPU_FS && OPROFILE
+	default y
+	help
+	  OProfile for Cell BE requires special support enabled
+	  by this option.
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/Makefile
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Makefile	2007-01-18 16:43:14.000000000 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/Makefile	2007-02-07 13:27:38.014976712 -0600
@@ -11,7 +11,8 @@
 		timer_int.o )
 
 oprofile-y := $(DRIVER_OBJS) common.o backtrace.o
-oprofile-$(CONFIG_PPC_CELL_NATIVE) += op_model_cell.o
+oprofile-$(CONFIG_PPC_CELL_NATIVE) += op_model_cell.o \
+					cell/spu_profiler.o cell/vma_map.o cell/spu_task_sync.o
 oprofile-$(CONFIG_PPC64) += op_model_rs64.o op_model_power4.o
 oprofile-$(CONFIG_FSL_BOOKE) += op_model_fsl_booke.o
 oprofile-$(CONFIG_6xx) += op_model_7450.o
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/op_model_cell.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/op_model_cell.c	2007-02-07 13:03:22.857936584 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/op_model_cell.c	2007-02-14 17:40:41.599060824 -0600
@@ -37,11 +37,21 @@
 #include <asm/system.h>
 
 #include "../platforms/cell/interrupt.h"
+#include "cell/pr_util.h"
+
+/* spu_cycle_reset is the number of cycles between samples.
+ * This variable is used for SPU profiling and should ONLY be set
+ * at the beginning of cell_reg_setup; otherwise, it's read-only.
+ */
+static unsigned int spu_cycle_reset = 0;
+
+#define NUM_SPUS_PER_NODE    8
+#define SPU_CYCLES_EVENT_NUM 2  /*  event number for SPU_CYCLES */
 
 #define PPU_CYCLES_EVENT_NUM 1	/*  event number for CYCLES */
 #define PPU_CYCLES_GRP_NUM   1  /* special group number for identifying
-                                 * PPU_CYCLES event
-                                 */
+				 * PPU_CYCLES event
+				 */
 #define CBE_COUNT_ALL_CYCLES 0x42800000	/* PPU cycle event specifier */
 
 #define NUM_THREADS 2         /* number of physical threads in
@@ -50,6 +60,15 @@
 #define NUM_TRACE_BUS_WORDS 4
 #define NUM_INPUT_BUS_WORDS 2
 
+#define EFWCALL  ENOSYS         /* Use an existing error number that is as
+				 * close as possible for a FW call that failed.
+				 * The probability of the call failing is
+				 * very low.  Passing up the error number
+				 * ensures that the user will see an error
+				 * message saying OProfile did not start.
+				 * Dmesg will contain an accurate message
+				 * about the failure.
+				 */
 
 struct pmc_cntrl_data {
 	unsigned long vcntr;
@@ -64,7 +83,7 @@
 
 struct pm_signal {
 	u16 cpu;		/* Processor to modify */
-	u16 sub_unit;		/* hw subunit this applies to (if applicable) */
+	u16 sub_unit;		/* hw subunit this applies to (if applicable)*/
 	short int signal_group;	/* Signal Group to Enable/Disable */
 	u8 bus_word;		/* Enable/Disable on this Trace/Trigger/Event
 				 * Bus Word(s) (bitmask)
@@ -140,14 +159,25 @@
 /*
  * Firmware interface functions
  */
+
 static int
 rtas_ibm_cbe_perftools(int subfunc, int passthru,
 		       void *address, unsigned long length)
 {
 	u64 paddr = __pa(address);
 
-	return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru,
-			 paddr >> 32, paddr & 0xffffffff, length);
+	pm_rtas_token = rtas_token("ibm,cbe-perftools");
+
+	if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
+		printk(KERN_ERR
+		       "%s: rtas token ibm,cbe-perftools unknown\n",
+		       __FUNCTION__);
+		return -EFWCALL;
+	} else {
+
+		return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, 
+			 passthru, paddr >> 32, paddr & 0xffffffff, length); 
+	}
 }
 
 static void pm_rtas_reset_signals(u32 node)
@@ -174,24 +204,28 @@
 				     &pm_signal_local,
 				     sizeof(struct pm_signal));
 
-	if (ret)
+	if (unlikely(ret))
+		/* Not a fatal error. For Oprofile stop, the oprofile
+		 * functions do not support returning an error for 
+		 * failure to stop OProfile.  
+		 */
 		printk(KERN_WARNING "%s: rtas returned: %d\n",
 		       __FUNCTION__, ret);
 }
 
-static void pm_rtas_activate_signals(u32 node, u32 count)
+static int pm_rtas_activate_signals(u32 node, u32 count)
 {
 	int ret;
 	int i, j;
 	struct pm_signal pm_signal_local[NR_PHYS_CTRS];
 
 	/* There is no debug setup required for the cycles event.
-	* Note that only events in the same group can be used.
-        * Otherwise, there will be conflicts in correctly routing
-        * the signals on the debug bus.  It is the responsiblity
-        * of the OProfile user tool to check the events are in
-        * the same group.
-        */
+	 * Note that only events in the same group can be used.
+	 * Otherwise, there will be conflicts in correctly routing
+	 * the signals on the debug bus.  It is the responsiblity
+	 * of the OProfile user tool to check the events are in
+	 * the same group.
+	 */
 
 	i = 0;
 	for (j = 0; j < count; j++) {
@@ -212,10 +246,14 @@
 					     pm_signal_local,
 					     i * sizeof(struct pm_signal));
 
-		if (ret)
+		if (unlikely(ret)) {
 			printk(KERN_WARNING "%s: rtas returned: %d\n",
 			       __FUNCTION__, ret);
+			return -EFWCALL;
+		}
 	}
+
+	return 0;
 }
 
 /*
@@ -297,6 +335,7 @@
 					input_bus[j] = i;
 					pm_regs.group_control |=
 					    (i << (31 - i));
+
 					break;
 				}
 			}
@@ -481,17 +520,15 @@
 }
 
 /* This function is called once for all cpus combined */
-static void
+static int
 cell_reg_setup(struct op_counter_config *ctr,
 	       struct op_system_config *sys, int num_ctrs)
 {
 	int i, j, cpu;
+	spu_cycle_reset = 0;
 
-	pm_rtas_token = rtas_token("ibm,cbe-perftools");
-	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
-		printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE\n",
-		       __FUNCTION__);
-		goto out;
+	if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
+		spu_cycle_reset = ctr[0].count;
 	}
 
 	num_counters = num_ctrs;
@@ -568,28 +605,27 @@
 		for (i = 0; i < num_counters; ++i) {
 			per_cpu(pmc_values, cpu)[i] = reset_value[i];
 		}
-out:
-	;
+
+	return 0;
 }
 
+
+
 /* This function is called once for each cpu */
-static void cell_cpu_setup(struct op_counter_config *cntr)
+static int cell_cpu_setup(struct op_counter_config *cntr)
 {
 	u32 cpu = smp_processor_id();
 	u32 num_enabled = 0;
 	int i;
 
+	if (spu_cycle_reset)
+		return 0;
+
 	/* There is one performance monitor per processor chip (i.e. node),
 	 * so we only need to perform this function once per node.
 	 */
 	if (cbe_get_hw_thread_id(cpu))
-		goto out;
-
-	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
-		printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE\n",
-		       __FUNCTION__);
-		goto out;
-	}
+		return 0;
 
 	/* Stop all counters */
 	cbe_disable_pm(cpu);
@@ -608,16 +644,253 @@
 		}
 	}
 
-	pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled);
+	/* the pm_rtas_activate_signals will return -EFWCALL if the FW
+	 * call failed. 
+	 */
+	return (pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled));
+	
+}
+
+#define size 24
+#define ENTRIES  (0x1<<8) /* 256 */
+#define MAXLFSR  0xFFFFFF
+
+int initial_lfsr[] =
+{16777215, 3797240, 13519805, 11602690, 6497030, 7614675, 2328937, 2889445,
+ 12364575, 8723156, 2450594, 16280864, 14742496, 10904589, 6434212, 4996256,
+ 5814270, 13014041, 9825245, 410260, 904096, 15151047, 15487695, 3061843,
+ 16482682, 7938572, 4893279, 9390321, 4320879, 5686402, 1711063, 10176714,
+ 4512270, 1057359, 16700434, 5731602, 2070114, 16030890, 1208230, 15603106,
+ 11857845, 6470172, 1362790, 7316876, 8534496, 1629197, 10003072, 1714539,
+ 1814669, 7106700, 5427154, 3395151, 3683327, 12950450, 16620273, 12122372,
+ 7194999, 9952750, 3608260, 13604295, 2266835, 14943567, 7079230, 777380,
+ 4516801, 1737661, 8730333, 13796927, 3247181, 9950017, 3481896, 16527555,
+ 13116123, 14505033, 9781119, 4860212, 7403253, 13264219, 12269980, 100120,
+ 664506, 607795, 8274553, 13133688, 6215305, 13208866, 16439693, 3320753,
+ 8773582, 13874619, 1784784, 4513501, 11002978, 9318515, 3038856, 14254582,
+ 15484958, 15967857, 13504461, 13657322, 14724513, 13955736, 5695315, 7330509,
+ 12630101, 6826854, 439712, 4609055, 13288878, 1309632, 4996398, 11392266,
+ 793740, 7653789, 2472670, 14641200, 5164364, 5482529, 10415855, 1629108,
+ 2012376, 13661123, 14655718, 9534083, 16637925, 2537745, 9787923, 12750103,
+ 4660370, 3283461, 14862772, 7034955, 6679872, 8918232, 6506913, 103649,
+ 6085577, 13324033, 14251613, 11058220, 11998181, 3100233, 468898, 7104918,
+ 12498413, 14408165, 1208514, 15712321, 3088687, 14778333, 3632503, 11151952,
+ 98896, 9159367, 8866146, 4780737, 4925758, 12362320, 4122783, 8543358,
+ 7056879, 10876914, 6282881, 1686625, 5100373, 4573666, 9265515, 13593840,
+ 5853060, 1188880, 4237111, 15765555, 14344137, 4608332, 6590210, 13745050,
+ 10916568, 12340402, 7145275, 4417153, 2300360, 12079643, 7608534, 15238251,
+ 4947424, 7014722, 3984546, 7168073, 10759589, 16293080, 3757181, 4577717,
+ 5163790, 2488841, 4650617, 3650022, 5440654, 1814617, 6939232, 15540909,
+ 501788, 1060986, 5058235, 5078222, 3734500, 10762065, 390862, 5172712,
+ 1070780, 7904429, 1669757, 3439997, 2956788, 14944927, 12496638, 994152,
+ 8901173, 11827497, 4268056, 15725859, 1694506, 5451950, 2892428, 1434298,
+ 9048323, 13558747, 15083840, 8154495, 15830901, 391127, 14970070, 2451434,
+ 2080347, 10775644, 14599429, 12540753, 4813943, 16140655, 2421772, 12724304,
+ 12935733, 7206473, 5697333, 10328104, 2418008, 13547986, 284246, 1732363,
+ 16375319, 8109554, 16372365, 14346072, 1835890, 13059499, 2442500, 4110674};
+
+/*
+ * The hardware uses an LFSR counting sequence to determine when to capture
+ * the SPU PCs.  The SPU PC capture is done when the LFSR sequence reaches the
+ * last value in the sequence.  An LFSR sequence is like a puesdo random
+ * number sequence where each number occurs once in the sequence but the
+ * sequence is not in numerical order.  To reduce the calculation time, a
+ * sequence of 256 precomputed values in the LFSR sequence are stored in a
+ * table.  The nearest precomputed value is used as the initial point from
+ * which to caculate the desired LFSR value that is n from the end of the
+ * sequence.  The lookup table reduces the maximum number of iterations in
+ * the loop from 2^24 to 2^16.
+ */
+static int calculate_lfsr(int n)
+{
+  int i;
+
+  int start_lfsr_index;
+  unsigned int newlfsr0;
+  unsigned int lfsr = MAXLFSR;
+  unsigned int binsize = (MAXLFSR+1)/ENTRIES;
+  unsigned int howmany;
+
+  start_lfsr_index = (MAXLFSR - n) / binsize;
+  lfsr = initial_lfsr[start_lfsr_index];
+  howmany = (MAXLFSR - n) - (start_lfsr_index * (binsize));
+
+  for (i = 2; i < howmany+2; i++) {
+    newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^
+		((lfsr >> (size - 1 - 1)) & 1) ^
+		(((lfsr >> (size - 1 - 6)) & 1) ^
+		 ((lfsr >> (size - 1 - 23)) & 1)));
+
+    lfsr >>= 1;
+    lfsr = lfsr | (newlfsr0 << (size - 1));
+  }
+  return lfsr;
+}
+
+static int pm_rtas_activate_spu_profiling(u32 node)
+{
+	int ret, i;
+	struct pm_signal pm_signal_local[NR_PHYS_CTRS];
+
+	/* Set up the rtas call to configure the debug bus to
+	 * route the SPU PCs.  Setup the pm_signal for each SPU */
+	for (i = 0; i < NUM_SPUS_PER_NODE; i++) {
+		pm_signal_local[i].cpu = node;
+		pm_signal_local[i].signal_group = 41;
+		pm_signal_local[i].bus_word = 1 << i / 2; /* spu i on
+							   * word (i/2)
+							   */
+		pm_signal_local[i].sub_unit = i;	/* spu i */
+		pm_signal_local[i].bit = 63;
+	}
+
+	pm_rtas_token = rtas_token("ibm,cbe-perftools");
+
+	if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
+		printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE \n",
+		       __FUNCTION__);
+		return -EFWCALL;
+	} else {
+
+		ret = rtas_ibm_cbe_perftools(SUBFUNC_ACTIVATE, 
+				     PASSTHRU_ENABLE, pm_signal_local,
+				     (NUM_SPUS_PER_NODE 
+				      * sizeof(struct pm_signal))); 
+
+		if (unlikely(ret)) {
+			printk(KERN_WARNING "%s: rtas returned: %d\n",
+			       __FUNCTION__, ret);
+			return -EFWCALL;
+		}
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_CPU_FREQ
+static int
+oprof_cpufreq_notify(struct notifier_block *nb, unsigned long val, void *data)
+{
+	int ret = 0;
+	struct cpufreq_freqs * frq = data;
+	if ((val == CPUFREQ_PRECHANGE && frq->old < frq->new) ||
+	    (val == CPUFREQ_POSTCHANGE && frq->old > frq->new) ||
+	    (val == CPUFREQ_RESUMECHANGE || val == CPUFREQ_SUSPENDCHANGE))
+		set_profiling_frequency(frq->new, spu_cycle_reset);
+	return ret;
+}
+
+static struct notifier_block cpu_freq_notifier_block = {
+	.notifier_call	= oprof_cpufreq_notify
+};
+#endif
+
+static int cell_global_start_spu(struct op_counter_config *ctr)
+{
+	int subfunc, rtn_value;
+	unsigned int lfsr_value;
+	int cpu;
+	int ret = 0;
+	int rtas_error = 0;
+	unsigned int cpu_khzfreq = 0;
+
+	/* The SPU profiling uses time-based profiling based on
+	 * cpu frequency, so if configured with the CPU_FREQ
+	 * option, we should detect frequency changes and react
+	 * accordingly.
+	 */
+#ifdef CONFIG_CPU_FREQ
+	ret = cpufreq_register_notifier(&cpu_freq_notifier_block,
+					CPUFREQ_TRANSITION_NOTIFIER);
+	if (ret < 0) 
+		/* this is not a fatal error */
+		printk(KERN_ERR "CPU freq change registration failed: %d\n",
+		       ret);
+
+	else
+		cpu_khzfreq = cpufreq_quick_get(smp_processor_id());
+#endif
+
+	set_profiling_frequency(cpu_khzfreq, spu_cycle_reset);
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+		/* Setup SPU cycle-based profiling.
+		 * Set perf_mon_control bit 0 to a zero before
+		 * enabling spu collection hardware.
+		 */
+		cbe_write_pm(cpu, pm_control, 0);
+
+		if (spu_cycle_reset > 0xFFFFFE)
+				lfsr_value = calculate_lfsr(1);  /* use largest
+								  * possible 
+								  * value
+								  */
+		else
+		    lfsr_value = calculate_lfsr(spu_cycle_reset);
+
+		if (lfsr_value == 0) {  /* must use a non zero value.  Zero
+					 * disables data collection.
+					 */
+				lfsr_value = calculate_lfsr(1);  /* use largest
+								  * possible 
+								  * value
+								 */
+		}
+
+		lfsr_value = lfsr_value << 8; /* shift lfsr to correct
+					       * register location
+					       */
+
+		ret = pm_rtas_activate_spu_profiling(cbe_cpu_to_node(cpu));
+
+		if (unlikely(ret)) {
+			rtas_error = ret;
+			goto out;
+		}
+
+		pm_rtas_token = rtas_token("ibm,cbe-spu-perftools");
+
+		if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
+			printk(KERN_ERR
+			       "%s: rtas token ibm,cbe-spu-perftools unknown\n",
+			       __FUNCTION__);
+			rtas_error = -EFWCALL;
+			goto out;
+		}
+
+		subfunc = 2;	// 2 - activate SPU tracing, 3 - deactivate
+
+		/* If the rtas token lookup failed, the rtas call will also 
+		 * fail.  Failure of the rtas_call will not cause any 
+		 * additional issues. 
+		 */
+		rtn_value = rtas_call(pm_rtas_token, 3, 1, NULL, subfunc,
+		  cbe_cpu_to_node(cpu), lfsr_value); 
+
+		if (unlikely(rtn_value != 0)) {
+			printk(KERN_ERR
+			       "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n",
+			       __FUNCTION__, rtn_value);
+			rtas_error = -EFWCALL;
+			goto out;
+		}
+	}
+
+	start_spu_profiling(spu_cycle_reset);
+
+	oprofile_running = 1;
+	return 0;
+
 out:
-	;
+	return rtas_error;
 }
 
-static void cell_global_start(struct op_counter_config *ctr)
+static int cell_global_start_ppu(struct op_counter_config *ctr)
 {
-	u32 cpu;
+	u32 cpu, i;
 	u32 interrupt_mask = 0;
-	u32 i;
 
 	/* This routine gets called once for the system.
 	 * There is one performance monitor per node, so we
@@ -656,9 +929,76 @@
 	 * the above for-loop.
 	 */
 	start_virt_cntrs();
+
+	return 0;
 }
 
-static void cell_global_stop(void)
+
+static int cell_global_start(struct op_counter_config *ctr)
+{
+	if (spu_cycle_reset) {
+		return cell_global_start_spu(ctr);
+	} else {
+		return cell_global_start_ppu(ctr);
+	}
+}
+
+static void cell_global_stop_spu(void)
+/* Note the generic OProfile stop calls do not support returning 
+ * an error on stop.  Hence, will not return an error if the FW
+ * calls fail on stop.  Failure to reset the debug bus is not an issue.
+ * Failure to disable the SPU profiling is not an issue.  The FW calls
+ * to enable the performance counters and debug bus will work even if 
+ * the hardware was not cleanly reset.
+ */
+{
+	int subfunc, rtn_value;
+	unsigned int lfsr_value;
+	int cpu;
+
+	oprofile_running = 0;
+
+#ifdef CONFIG_CPU_FREQ
+	cpufreq_unregister_notifier(&cpu_freq_notifier_block,
+				    CPUFREQ_TRANSITION_NOTIFIER);
+#endif
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+
+		pm_rtas_token = rtas_token("ibm,cbe-spu-perftools");
+
+		if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
+			printk(KERN_ERR
+			       "%s: rtas token ibm,cbe-spu-perftools unknown\n",
+			       __FUNCTION__);
+		} else {
+
+			subfunc = 3;	/* 2 - activate SPU tracing, 
+					 * 3 - deactivate
+					 */
+			lfsr_value = 0x8f100000;
+
+			rtn_value = rtas_call(pm_rtas_token, 3, 1, NULL, 
+					      subfunc, cbe_cpu_to_node(cpu), 
+					      lfsr_value);
+
+			if (unlikely(rtn_value != 0)) {
+				printk(KERN_ERR
+				       "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n",
+			       __FUNCTION__, rtn_value);
+			}
+		}
+
+		/* Deactivate the signals */
+		pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
+	}
+
+	stop_spu_profiling();
+}
+
+static void cell_global_stop_ppu(void)
 {
 	int cpu;
 
@@ -686,6 +1026,16 @@
 	}
 }
 
+static void cell_global_stop(void)
+{
+	if (spu_cycle_reset) {
+		cell_global_stop_spu();
+	} else {
+		cell_global_stop_ppu();
+	}
+
+}
+
 static void
 cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
 {
@@ -754,10 +1104,33 @@
 	spin_unlock_irqrestore(&virt_cntr_lock, flags);
 }
 
+/* This function is called from the generic OProfile
+ * driver.  When profiling PPUs, we need to do the
+ * generic sync start; otherwise, do spu_sync_start.
+ */
+static int cell_sync_start(void)
+{
+	if (spu_cycle_reset)
+		return spu_sync_start();
+	else
+		return DO_GENERIC_SYNC;
+}
+
+static int cell_sync_stop(void)
+{
+	if (spu_cycle_reset)
+		return spu_sync_stop();
+	else
+		return 1;
+}
+
+
 struct op_powerpc_model op_model_cell = {
 	.reg_setup = cell_reg_setup,
 	.cpu_setup = cell_cpu_setup,
 	.global_start = cell_global_start,
 	.global_stop = cell_global_stop,
+	.sync_start = cell_sync_start,
+	.sync_stop = cell_sync_stop,
 	.handle_interrupt = cell_handle_interrupt,
 };
Index: linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/sched.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/platforms/cell/spufs/sched.c	2007-02-07 13:11:02.910889616 -0600
+++ linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/sched.c	2007-02-07 13:27:38.023975344 -0600
@@ -129,6 +129,7 @@
 	ctx->spu = spu;
 	ctx->ops = &spu_hw_ops;
 	spu->pid = current->pid;
+	spu->tgid = current->tgid;
 	spu->prio = current->prio;
 	spu->mm = ctx->owner;
 	mm_needs_global_tlbie(spu->mm);
@@ -161,6 +162,7 @@
 	spu->dma_callback = NULL;
 	spu->mm = NULL;
 	spu->pid = 0;
+	spu->tgid = 0;
 	spu->prio = MAX_PRIO;
 	ctx->ops = &spu_backing_ops;
 	ctx->spu = NULL;
Index: linux-2.6.20-rc1/drivers/oprofile/buffer_sync.c
===================================================================
--- linux-2.6.20-rc1.orig/drivers/oprofile/buffer_sync.c	2007-01-18 16:43:11.000000000 -0600
+++ linux-2.6.20-rc1/drivers/oprofile/buffer_sync.c	2007-02-07 13:27:38.025975040 -0600
@@ -26,6 +26,7 @@
 #include <linux/profile.h>
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/oprofile.h>
  
 #include "oprofile_stats.h"
 #include "event_buffer.h"
Index: linux-2.6.20-rc1/drivers/oprofile/event_buffer.h
===================================================================
--- linux-2.6.20-rc1.orig/drivers/oprofile/event_buffer.h	2007-01-18 16:43:11.000000000 -0600
+++ linux-2.6.20-rc1/drivers/oprofile/event_buffer.h	2007-02-07 13:41:54.276001640 -0600
@@ -19,28 +19,10 @@
  
 /* wake up the process sleeping on the event file */
 void wake_up_buffer_waiter(void);
- 
-/* Each escaped entry is prefixed by ESCAPE_CODE
- * then one of the following codes, then the
- * relevant data.
- */
-#define ESCAPE_CODE			~0UL
-#define CTX_SWITCH_CODE 		1
-#define CPU_SWITCH_CODE 		2
-#define COOKIE_SWITCH_CODE 		3
-#define KERNEL_ENTER_SWITCH_CODE	4
-#define KERNEL_EXIT_SWITCH_CODE		5
-#define MODULE_LOADED_CODE		6
-#define CTX_TGID_CODE			7
-#define TRACE_BEGIN_CODE		8
-#define TRACE_END_CODE			9
- 
+
 #define INVALID_COOKIE ~0UL
 #define NO_COOKIE 0UL
 
-/* add data to the event buffer */
-void add_event_entry(unsigned long data);
- 
 extern struct file_operations event_buffer_fops;
  
 /* mutex between sync_cpu_buffers() and the
Index: linux-2.6.20-rc1/drivers/oprofile/oprof.c
===================================================================
--- linux-2.6.20-rc1.orig/drivers/oprofile/oprof.c	2007-01-18 16:43:11.000000000 -0600
+++ linux-2.6.20-rc1/drivers/oprofile/oprof.c	2007-02-09 14:52:52.436977672 -0600
@@ -53,9 +53,23 @@
 	 * us missing task deaths and eventually oopsing
 	 * when trying to process the event buffer.
 	 */
+	if (oprofile_ops.sync_start) {
+		int sync_ret = oprofile_ops.sync_start();
+		switch (sync_ret) {
+			case 0: goto post_sync;
+				break;
+			case 1: goto do_generic;
+				break;
+			case -1: goto out3;
+				break;
+			default: goto out3;
+		}
+	}
+do_generic:
 	if ((err = sync_start()))
 		goto out3;
 
+post_sync:
 	is_setup = 1;
 	mutex_unlock(&start_mutex);
 	return 0;
@@ -118,7 +132,19 @@
 void oprofile_shutdown(void)
 {
 	mutex_lock(&start_mutex);
+        if (oprofile_ops.sync_stop) {
+                int sync_ret = oprofile_ops.sync_stop();
+                switch (sync_ret) {
+                        case 0: goto post_sync;
+                                break;
+                        case 1: goto do_generic;
+                                break;
+			default: goto post_sync;
+                }
+        }
+do_generic:
 	sync_stop();
+post_sync:
 	if (oprofile_ops.shutdown)
 		oprofile_ops.shutdown();
 	is_setup = 0;
Index: linux-2.6.20-rc1/include/asm-powerpc/oprofile_impl.h
===================================================================
--- linux-2.6.20-rc1.orig/include/asm-powerpc/oprofile_impl.h	2007-01-18 16:43:19.000000000 -0600
+++ linux-2.6.20-rc1/include/asm-powerpc/oprofile_impl.h	2007-02-14 17:03:33.957037264 -0600
@@ -39,14 +39,16 @@
 
 /* Per-arch configuration */
 struct op_powerpc_model {
-	void (*reg_setup) (struct op_counter_config *,
+	int (*reg_setup) (struct op_counter_config *,
 			   struct op_system_config *,
 			   int num_counters);
-	void (*cpu_setup) (struct op_counter_config *);
-	void (*start) (struct op_counter_config *);
-        void (*global_start) (struct op_counter_config *);
+	int  (*cpu_setup) (struct op_counter_config *);
+	int  (*start) (struct op_counter_config *);
+        int  (*global_start) (struct op_counter_config *);
 	void (*stop) (void);
 	void (*global_stop) (void);
+	int (*sync_start)(void);
+	int (*sync_stop)(void);
 	void (*handle_interrupt) (struct pt_regs *,
 				  struct op_counter_config *);
 	int num_counters;
Index: linux-2.6.20-rc1/include/asm-powerpc/spu.h
===================================================================
--- linux-2.6.20-rc1.orig/include/asm-powerpc/spu.h	2007-02-07 13:09:53.545918192 -0600
+++ linux-2.6.20-rc1/include/asm-powerpc/spu.h	2007-02-07 13:30:06.580890768 -0600
@@ -128,6 +128,7 @@
 	struct spu_runqueue *rq;
 	unsigned long long timestamp;
 	pid_t pid;
+	pid_t tgid;
 	int prio;
 	int class_0_pending;
 	spinlock_t register_lock;
@@ -153,6 +154,11 @@
 int spu_irq_class_0_bottom(struct spu *spu);
 int spu_irq_class_1_bottom(struct spu *spu);
 void spu_irq_setaffinity(struct spu *spu, int cpu);
+void * spu_get_profile_private(struct spu_context * ctx);
+void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
+			     struct kref * prof_info_kref,
+			     void (* prof_info_release) (struct kref * kref));
+
 
 /* system callbacks from the SPU */
 struct spu_syscall_block {
Index: linux-2.6.20-rc1/include/linux/oprofile.h
===================================================================
--- linux-2.6.20-rc1.orig/include/linux/oprofile.h	2007-01-18 16:43:18.000000000 -0600
+++ linux-2.6.20-rc1/include/linux/oprofile.h	2007-02-07 13:30:41.737005680 -0600
@@ -17,6 +17,28 @@
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
  
+/* Each escaped entry is prefixed by ESCAPE_CODE
+ * then one of the following codes, then the
+ * relevant data.
+ * These #defines live in this file so that arch-specific
+ * buffer sync'ing code can access them.
+ */
+#define ESCAPE_CODE                     ~0UL
+#define CTX_SWITCH_CODE                 1
+#define CPU_SWITCH_CODE                 2
+#define COOKIE_SWITCH_CODE              3
+#define KERNEL_ENTER_SWITCH_CODE        4
+#define KERNEL_EXIT_SWITCH_CODE         5
+#define MODULE_LOADED_CODE              6
+#define CTX_TGID_CODE                   7
+#define TRACE_BEGIN_CODE                8
+#define TRACE_END_CODE                  9
+#define XEN_ENTER_SWITCH_CODE          10
+#define SPU_PROFILING_CODE             11
+#define SPU_CTX_SWITCH_CODE            12
+#define SPU_OFFSET_CODE                13
+#define SPU_COOKIE_CODE                14
+
 struct super_block;
 struct dentry;
 struct file_operations;
@@ -35,6 +57,14 @@
 	int (*start)(void);
 	/* Stop delivering interrupts. */
 	void (*stop)(void);
+	/* Arch-specific buffer sync functions.
+	 * Return value = 0:  Success
+	 * Return value = -1: Failure
+	 * Return value = 1:  Run generic sync function
+	 */
+	int (*sync_start)(void);
+	int (*sync_stop)(void);
+
 	/* Initiate a stack backtrace. Optional. */
 	void (*backtrace)(struct pt_regs * const regs, unsigned int depth);
 	/* CPU identification string. */
@@ -56,6 +86,13 @@
 void oprofile_arch_exit(void);
 
 /**
+ * Add data to the event buffer.
+ * The data passed is free-form, but typically consists of
+ * file offsets, dcookies, context information, and ESCAPE codes.
+ */
+void add_event_entry(unsigned long data);
+
+/**
  * Add a sample. This may be called from any context. Pass
  * smp_processor_id() as cpu.
  */
Index: linux-2.6.20-rc1/kernel/hrtimer.c
===================================================================
--- linux-2.6.20-rc1.orig/kernel/hrtimer.c	2007-01-18 16:43:05.000000000 -0600
+++ linux-2.6.20-rc1/kernel/hrtimer.c	2007-02-07 13:27:38.037973216 -0600
@@ -335,6 +335,7 @@
 
 	return orun;
 }
+EXPORT_SYMBOL_GPL(hrtimer_forward);
 
 /*
  * enqueue_hrtimer - internal function to (re)start a timer
Index: linux-2.6.20-rc1/arch/powerpc/kernel/time.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/kernel/time.c	2007-02-02 15:47:08.000000000 -0600
+++ linux-2.6.20-rc1/arch/powerpc/kernel/time.c	2007-02-07 13:27:38.041972608 -0600
@@ -122,6 +122,7 @@
 static long timezone_offset;
 
 unsigned long ppc_proc_freq;
+EXPORT_SYMBOL(ppc_proc_freq);
 unsigned long ppc_tb_freq;
 
 static u64 tb_last_jiffy __cacheline_aligned_in_smp;
Index: linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/spufs.h
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/platforms/cell/spufs/spufs.h	2007-02-07 13:09:53.541918800 -0600
+++ linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/spufs.h	2007-02-07 13:27:38.042972456 -0600
@@ -75,6 +75,9 @@
 
 	struct list_head gang_list;
 	struct spu_gang *gang;
+	void * profile_private;		/* To be used only by profiler */
+	struct kref * prof_priv_kref;
+	void (* prof_priv_release) (struct kref *kref);
 };
 
 struct spu_gang {
Index: linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/context.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/platforms/cell/spufs/context.c	2007-02-05 14:42:04.000000000 -0600
+++ linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/context.c	2007-02-07 13:27:38.044972152 -0600
@@ -22,6 +22,7 @@
 
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 #include <linux/slab.h>
 #include <asm/spu.h>
 #include <asm/spu_csa.h>
@@ -71,6 +72,8 @@
 	spu_fini_csa(&ctx->csa);
 	if (ctx->gang)
 		spu_gang_remove_ctx(ctx->gang, ctx);
+	if (ctx->prof_priv_kref)
+		kref_put(ctx->prof_priv_kref, ctx->prof_priv_release);
 	kfree(ctx);
 }
 
@@ -200,3 +203,29 @@
 
 	downgrade_write(&ctx->state_sema);
 }
+
+/* This interface allows a profiler (e.g., OProfile) to store
+ * spu_context information needed for profiling, allowing it to
+ * be saved across context save/restore operation.
+ *
+ * Assumes the caller has already incremented the ref count to
+ * profile_info; then spu_context_destroy must call kref_put
+ * on prof_info_kref.
+ */
+void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
+			     struct kref * prof_info_kref,
+			     void (* prof_info_release) (struct kref * kref))
+{
+	ctx->profile_private = profile_info;
+	ctx->prof_priv_kref = prof_info_kref;
+	ctx->prof_priv_release = prof_info_release;
+}
+EXPORT_SYMBOL_GPL(spu_set_profile_private);
+
+void * spu_get_profile_private(struct spu_context * ctx)
+{
+	return ctx->profile_private;
+}
+EXPORT_SYMBOL_GPL(spu_get_profile_private);
+
+
Index: linux-2.6.20-rc1/include/linux/dcookies.h
===================================================================
--- linux-2.6.20-rc1.orig/include/linux/dcookies.h	2007-01-18 16:43:18.626538432 -0600
+++ linux-2.6.20-rc1/include/linux/dcookies.h	2007-02-09 17:06:12.226931336 -0600
@@ -12,6 +12,7 @@
 
 #ifdef CONFIG_PROFILING
  
+#include <linux/dcache.h>
 #include <linux/types.h>
  
 struct dcookie_user;
Index: linux-2.6.20-rc1/include/linux/elf-em.h
===================================================================
--- linux-2.6.20-rc1.orig/include/linux/elf-em.h	2007-01-18 16:43:18.000000000 -0600
+++ linux-2.6.20-rc1/include/linux/elf-em.h	2007-02-14 12:07:17.059067768 -0600
@@ -21,6 +21,7 @@
 #define EM_SPARC32PLUS	18	/* Sun's "v8plus" */
 #define EM_PPC		20	/* PowerPC */
 #define EM_PPC64	21       /* PowerPC64 */
+#define EM_SPU		23	/* Cell BE SPU */
 #define EM_SH		42	/* SuperH */
 #define EM_SPARCV9	43	/* SPARC v9 64-bit */
 #define EM_IA_64	50	/* HP/Intel IA-64 */



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-14 23:52 [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch Carl Love
@ 2007-02-15 14:37   ` Arnd Bergmann
  0 siblings, 0 replies; 66+ messages in thread
From: Arnd Bergmann @ 2007-02-15 14:37 UTC (permalink / raw)
  To: cbe-oss-dev; +Cc: Carl Love, linuxppc-dev, linux-kernel, oprofile-list

On Thursday 15 February 2007 00:52, Carl Love wrote:


> --- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Kconfig	2007-01-18 16:43:14.000000000 -0600
> +++ linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig	2007-02-13 19:04:46.271028904 -0600
> @@ -7,7 +7,8 @@
>  
>  config OPROFILE
>  	tristate "OProfile system profiling (EXPERIMENTAL)"
> -	depends on PROFILING
> +	default m
> +	depends on SPU_FS && PROFILING
>  	help
>  	  OProfile is a profiling system capable of profiling the
>  	  whole system, include the kernel, kernel modules, libraries,

Milton already commented on this being wrong. I think what you want
is
	depends on PROFILING && (SPU_FS = n || SPU_FS)

that should make sure that when SPU_FS=y that OPROFILE can not be 'm'.

> @@ -15,3 +16,10 @@
>  
>  	  If unsure, say N.
>  
> +config OPROFILE_CELL
> +	bool "OProfile for Cell Broadband Engine"
> +	depends on SPU_FS && OPROFILE
> +	default y
> +	help
> +	  OProfile for Cell BE requires special support enabled
> +	  by this option.

You should at least mention that this allows profiling the spus.
  
> +#define EFWCALL  ENOSYS         /* Use an existing error number that is as
> +				 * close as possible for a FW call that failed.
> +				 * The probability of the call failing is
> +				 * very low.  Passing up the error number
> +				 * ensures that the user will see an error
> +				 * message saying OProfile did not start.
> +				 * Dmesg will contain an accurate message
> +				 * about the failure.
> +				 */

ENOSYS looks wrong though. It would appear to the user as if the oprofile
function in the kernel was not present. I'd suggest EIO, and not use 
an extra define for that.


>  static int
>  rtas_ibm_cbe_perftools(int subfunc, int passthru,
>  		       void *address, unsigned long length)
>  {
>  	u64 paddr = __pa(address);
>  
> -	return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru,
> -			 paddr >> 32, paddr & 0xffffffff, length);
> +	pm_rtas_token = rtas_token("ibm,cbe-perftools");
> +
> +	if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
> +		printk(KERN_ERR
> +		       "%s: rtas token ibm,cbe-perftools unknown\n",
> +		       __FUNCTION__);
> +		return -EFWCALL;
> +	} else {
> +
> +		return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, 
> +			 passthru, paddr >> 32, paddr & 0xffffffff, length); 
> +	}
>  }

Are you now reading the rtas token every time you call rtas? that seems
like a waste of time.


> +#define size 24
> +#define ENTRIES  (0x1<<8) /* 256 */
> +#define MAXLFSR  0xFFFFFF
> +
> +int initial_lfsr[] =
> +{16777215, 3797240, 13519805, 11602690, 6497030, 7614675, 2328937, 2889445,
> + 12364575, 8723156, 2450594, 16280864, 14742496, 10904589, 6434212, 4996256,
> + 5814270, 13014041, 9825245, 410260, 904096, 15151047, 15487695, 3061843,
> + 16482682, 7938572, 4893279, 9390321, 4320879, 5686402, 1711063, 10176714,
> + 4512270, 1057359, 16700434, 5731602, 2070114, 16030890, 1208230, 15603106,
> + 11857845, 6470172, 1362790, 7316876, 8534496, 1629197, 10003072, 1714539,
> + 1814669, 7106700, 5427154, 3395151, 3683327, 12950450, 16620273, 12122372,
> + 7194999, 9952750, 3608260, 13604295, 2266835, 14943567, 7079230, 777380,
> + 4516801, 1737661, 8730333, 13796927, 3247181, 9950017, 3481896, 16527555,
> + 13116123, 14505033, 9781119, 4860212, 7403253, 13264219, 12269980, 100120,
> + 664506, 607795, 8274553, 13133688, 6215305, 13208866, 16439693, 3320753,
> + 8773582, 13874619, 1784784, 4513501, 11002978, 9318515, 3038856, 14254582,
> + 15484958, 15967857, 13504461, 13657322, 14724513, 13955736, 5695315, 7330509,
> + 12630101, 6826854, 439712, 4609055, 13288878, 1309632, 4996398, 11392266,
> + 793740, 7653789, 2472670, 14641200, 5164364, 5482529, 10415855, 1629108,
> + 2012376, 13661123, 14655718, 9534083, 16637925, 2537745, 9787923, 12750103,
> + 4660370, 3283461, 14862772, 7034955, 6679872, 8918232, 6506913, 103649,
> + 6085577, 13324033, 14251613, 11058220, 11998181, 3100233, 468898, 7104918,
> + 12498413, 14408165, 1208514, 15712321, 3088687, 14778333, 3632503, 11151952,
> + 98896, 9159367, 8866146, 4780737, 4925758, 12362320, 4122783, 8543358,
> + 7056879, 10876914, 6282881, 1686625, 5100373, 4573666, 9265515, 13593840,
> + 5853060, 1188880, 4237111, 15765555, 14344137, 4608332, 6590210, 13745050,
> + 10916568, 12340402, 7145275, 4417153, 2300360, 12079643, 7608534, 15238251,
> + 4947424, 7014722, 3984546, 7168073, 10759589, 16293080, 3757181, 4577717,
> + 5163790, 2488841, 4650617, 3650022, 5440654, 1814617, 6939232, 15540909,
> + 501788, 1060986, 5058235, 5078222, 3734500, 10762065, 390862, 5172712,
> + 1070780, 7904429, 1669757, 3439997, 2956788, 14944927, 12496638, 994152,
> + 8901173, 11827497, 4268056, 15725859, 1694506, 5451950, 2892428, 1434298,
> + 9048323, 13558747, 15083840, 8154495, 15830901, 391127, 14970070, 2451434,
> + 2080347, 10775644, 14599429, 12540753, 4813943, 16140655, 2421772, 12724304,
> + 12935733, 7206473, 5697333, 10328104, 2418008, 13547986, 284246, 1732363,
> + 16375319, 8109554, 16372365, 14346072, 1835890, 13059499, 2442500, 4110674};
> +
> +/*
> + * The hardware uses an LFSR counting sequence to determine when to capture
> + * the SPU PCs.  The SPU PC capture is done when the LFSR sequence reaches the
> + * last value in the sequence.  An LFSR sequence is like a puesdo random
> + * number sequence where each number occurs once in the sequence but the
> + * sequence is not in numerical order.  To reduce the calculation time, a
> + * sequence of 256 precomputed values in the LFSR sequence are stored in a
> + * table.  The nearest precomputed value is used as the initial point from
> + * which to caculate the desired LFSR value that is n from the end of the
> + * sequence.  The lookup table reduces the maximum number of iterations in
> + * the loop from 2^24 to 2^16.
> + */
> +static int calculate_lfsr(int n)
> +{
> +  int i;
> +
> +  int start_lfsr_index;
> +  unsigned int newlfsr0;
> +  unsigned int lfsr = MAXLFSR;
> +  unsigned int binsize = (MAXLFSR+1)/ENTRIES;
> +  unsigned int howmany;
> +
> +  start_lfsr_index = (MAXLFSR - n) / binsize;
> +  lfsr = initial_lfsr[start_lfsr_index];
> +  howmany = (MAXLFSR - n) - (start_lfsr_index * (binsize));
> +
> +  for (i = 2; i < howmany+2; i++) {
> +    newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^
> +		((lfsr >> (size - 1 - 1)) & 1) ^
> +		(((lfsr >> (size - 1 - 6)) & 1) ^
> +		 ((lfsr >> (size - 1 - 23)) & 1)));
> +
> +    lfsr >>= 1;
> +    lfsr = lfsr | (newlfsr0 << (size - 1));
> +  }
> +  return lfsr;
> +}

I agree with Milton that it would be far nicer even to calculate
the value from user space, but since you say that would
violate the oprofile interface conventions, let's not go there.
In order to make this code nicer on the user, you should probably
insert a 'cond_resched()' somewhere in the loop, maybe every
500 iterations or so.

it also looks like there is whitespace damage in the code here.

> +
> +/* This interface allows a profiler (e.g., OProfile) to store
> + * spu_context information needed for profiling, allowing it to
> + * be saved across context save/restore operation.
> + *
> + * Assumes the caller has already incremented the ref count to
> + * profile_info; then spu_context_destroy must call kref_put
> + * on prof_info_kref.
> + */
> +void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
> +			     struct kref * prof_info_kref,
> +			     void (* prof_info_release) (struct kref * kref))
> +{
> +	ctx->profile_private = profile_info;
> +	ctx->prof_priv_kref = prof_info_kref;
> +	ctx->prof_priv_release = prof_info_release;
> +}
> +EXPORT_SYMBOL_GPL(spu_set_profile_private);

I think you don't need the profile_private member here, if you just use
container_of with ctx->prof_priv_kref in all users.

	Arnd <><

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-15 14:37   ` Arnd Bergmann
  0 siblings, 0 replies; 66+ messages in thread
From: Arnd Bergmann @ 2007-02-15 14:37 UTC (permalink / raw)
  To: cbe-oss-dev; +Cc: linuxppc-dev, linux-kernel, oprofile-list, Carl Love

On Thursday 15 February 2007 00:52, Carl Love wrote:


> --- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Kconfig	2007-01-18 16:43:14.000000000 -0600
> +++ linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig	2007-02-13 19:04:46.271028904 -0600
> @@ -7,7 +7,8 @@
>  
>  config OPROFILE
>  	tristate "OProfile system profiling (EXPERIMENTAL)"
> -	depends on PROFILING
> +	default m
> +	depends on SPU_FS && PROFILING
>  	help
>  	  OProfile is a profiling system capable of profiling the
>  	  whole system, include the kernel, kernel modules, libraries,

Milton already commented on this being wrong. I think what you want
is
	depends on PROFILING && (SPU_FS = n || SPU_FS)

that should make sure that when SPU_FS=y that OPROFILE can not be 'm'.

> @@ -15,3 +16,10 @@
>  
>  	  If unsure, say N.
>  
> +config OPROFILE_CELL
> +	bool "OProfile for Cell Broadband Engine"
> +	depends on SPU_FS && OPROFILE
> +	default y
> +	help
> +	  OProfile for Cell BE requires special support enabled
> +	  by this option.

You should at least mention that this allows profiling the spus.
  
> +#define EFWCALL  ENOSYS         /* Use an existing error number that is as
> +				 * close as possible for a FW call that failed.
> +				 * The probability of the call failing is
> +				 * very low.  Passing up the error number
> +				 * ensures that the user will see an error
> +				 * message saying OProfile did not start.
> +				 * Dmesg will contain an accurate message
> +				 * about the failure.
> +				 */

ENOSYS looks wrong though. It would appear to the user as if the oprofile
function in the kernel was not present. I'd suggest EIO, and not use 
an extra define for that.


>  static int
>  rtas_ibm_cbe_perftools(int subfunc, int passthru,
>  		       void *address, unsigned long length)
>  {
>  	u64 paddr = __pa(address);
>  
> -	return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru,
> -			 paddr >> 32, paddr & 0xffffffff, length);
> +	pm_rtas_token = rtas_token("ibm,cbe-perftools");
> +
> +	if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
> +		printk(KERN_ERR
> +		       "%s: rtas token ibm,cbe-perftools unknown\n",
> +		       __FUNCTION__);
> +		return -EFWCALL;
> +	} else {
> +
> +		return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, 
> +			 passthru, paddr >> 32, paddr & 0xffffffff, length); 
> +	}
>  }

Are you now reading the rtas token every time you call rtas? that seems
like a waste of time.


> +#define size 24
> +#define ENTRIES  (0x1<<8) /* 256 */
> +#define MAXLFSR  0xFFFFFF
> +
> +int initial_lfsr[] =
> +{16777215, 3797240, 13519805, 11602690, 6497030, 7614675, 2328937, 2889445,
> + 12364575, 8723156, 2450594, 16280864, 14742496, 10904589, 6434212, 4996256,
> + 5814270, 13014041, 9825245, 410260, 904096, 15151047, 15487695, 3061843,
> + 16482682, 7938572, 4893279, 9390321, 4320879, 5686402, 1711063, 10176714,
> + 4512270, 1057359, 16700434, 5731602, 2070114, 16030890, 1208230, 15603106,
> + 11857845, 6470172, 1362790, 7316876, 8534496, 1629197, 10003072, 1714539,
> + 1814669, 7106700, 5427154, 3395151, 3683327, 12950450, 16620273, 12122372,
> + 7194999, 9952750, 3608260, 13604295, 2266835, 14943567, 7079230, 777380,
> + 4516801, 1737661, 8730333, 13796927, 3247181, 9950017, 3481896, 16527555,
> + 13116123, 14505033, 9781119, 4860212, 7403253, 13264219, 12269980, 100120,
> + 664506, 607795, 8274553, 13133688, 6215305, 13208866, 16439693, 3320753,
> + 8773582, 13874619, 1784784, 4513501, 11002978, 9318515, 3038856, 14254582,
> + 15484958, 15967857, 13504461, 13657322, 14724513, 13955736, 5695315, 7330509,
> + 12630101, 6826854, 439712, 4609055, 13288878, 1309632, 4996398, 11392266,
> + 793740, 7653789, 2472670, 14641200, 5164364, 5482529, 10415855, 1629108,
> + 2012376, 13661123, 14655718, 9534083, 16637925, 2537745, 9787923, 12750103,
> + 4660370, 3283461, 14862772, 7034955, 6679872, 8918232, 6506913, 103649,
> + 6085577, 13324033, 14251613, 11058220, 11998181, 3100233, 468898, 7104918,
> + 12498413, 14408165, 1208514, 15712321, 3088687, 14778333, 3632503, 11151952,
> + 98896, 9159367, 8866146, 4780737, 4925758, 12362320, 4122783, 8543358,
> + 7056879, 10876914, 6282881, 1686625, 5100373, 4573666, 9265515, 13593840,
> + 5853060, 1188880, 4237111, 15765555, 14344137, 4608332, 6590210, 13745050,
> + 10916568, 12340402, 7145275, 4417153, 2300360, 12079643, 7608534, 15238251,
> + 4947424, 7014722, 3984546, 7168073, 10759589, 16293080, 3757181, 4577717,
> + 5163790, 2488841, 4650617, 3650022, 5440654, 1814617, 6939232, 15540909,
> + 501788, 1060986, 5058235, 5078222, 3734500, 10762065, 390862, 5172712,
> + 1070780, 7904429, 1669757, 3439997, 2956788, 14944927, 12496638, 994152,
> + 8901173, 11827497, 4268056, 15725859, 1694506, 5451950, 2892428, 1434298,
> + 9048323, 13558747, 15083840, 8154495, 15830901, 391127, 14970070, 2451434,
> + 2080347, 10775644, 14599429, 12540753, 4813943, 16140655, 2421772, 12724304,
> + 12935733, 7206473, 5697333, 10328104, 2418008, 13547986, 284246, 1732363,
> + 16375319, 8109554, 16372365, 14346072, 1835890, 13059499, 2442500, 4110674};
> +
> +/*
> + * The hardware uses an LFSR counting sequence to determine when to capture
> + * the SPU PCs.  The SPU PC capture is done when the LFSR sequence reaches the
> + * last value in the sequence.  An LFSR sequence is like a puesdo random
> + * number sequence where each number occurs once in the sequence but the
> + * sequence is not in numerical order.  To reduce the calculation time, a
> + * sequence of 256 precomputed values in the LFSR sequence are stored in a
> + * table.  The nearest precomputed value is used as the initial point from
> + * which to caculate the desired LFSR value that is n from the end of the
> + * sequence.  The lookup table reduces the maximum number of iterations in
> + * the loop from 2^24 to 2^16.
> + */
> +static int calculate_lfsr(int n)
> +{
> +  int i;
> +
> +  int start_lfsr_index;
> +  unsigned int newlfsr0;
> +  unsigned int lfsr = MAXLFSR;
> +  unsigned int binsize = (MAXLFSR+1)/ENTRIES;
> +  unsigned int howmany;
> +
> +  start_lfsr_index = (MAXLFSR - n) / binsize;
> +  lfsr = initial_lfsr[start_lfsr_index];
> +  howmany = (MAXLFSR - n) - (start_lfsr_index * (binsize));
> +
> +  for (i = 2; i < howmany+2; i++) {
> +    newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^
> +		((lfsr >> (size - 1 - 1)) & 1) ^
> +		(((lfsr >> (size - 1 - 6)) & 1) ^
> +		 ((lfsr >> (size - 1 - 23)) & 1)));
> +
> +    lfsr >>= 1;
> +    lfsr = lfsr | (newlfsr0 << (size - 1));
> +  }
> +  return lfsr;
> +}

I agree with Milton that it would be far nicer even to calculate
the value from user space, but since you say that would
violate the oprofile interface conventions, let's not go there.
In order to make this code nicer on the user, you should probably
insert a 'cond_resched()' somewhere in the loop, maybe every
500 iterations or so.

it also looks like there is whitespace damage in the code here.

> +
> +/* This interface allows a profiler (e.g., OProfile) to store
> + * spu_context information needed for profiling, allowing it to
> + * be saved across context save/restore operation.
> + *
> + * Assumes the caller has already incremented the ref count to
> + * profile_info; then spu_context_destroy must call kref_put
> + * on prof_info_kref.
> + */
> +void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
> +			     struct kref * prof_info_kref,
> +			     void (* prof_info_release) (struct kref * kref))
> +{
> +	ctx->profile_private = profile_info;
> +	ctx->prof_priv_kref = prof_info_kref;
> +	ctx->prof_priv_release = prof_info_release;
> +}
> +EXPORT_SYMBOL_GPL(spu_set_profile_private);

I think you don't need the profile_private member here, if you just use
container_of with ctx->prof_priv_kref in all users.

	Arnd <><

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-15 14:37   ` Arnd Bergmann
@ 2007-02-15 16:15     ` Maynard Johnson
  -1 siblings, 0 replies; 66+ messages in thread
From: Maynard Johnson @ 2007-02-15 16:15 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: cbe-oss-dev, linuxppc-dev, linux-kernel, oprofile-list, Carl Love

Arnd Bergmann wrote:

>On Thursday 15 February 2007 00:52, Carl Love wrote:
>
>
>  
>
>>--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Kconfig	2007-01-18 16:43:14.000000000 -0600
>>+++ linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig	2007-02-13 19:04:46.271028904 -0600
>>@@ -7,7 +7,8 @@
>> 
>> config OPROFILE
>> 	tristate "OProfile system profiling (EXPERIMENTAL)"
>>-	depends on PROFILING
>>+	default m
>>+	depends on SPU_FS && PROFILING
>> 	help
>> 	  OProfile is a profiling system capable of profiling the
>> 	  whole system, include the kernel, kernel modules, libraries,
>>    
>>
>
>Milton already commented on this being wrong. I think what you want
>is
>	depends on PROFILING && (SPU_FS = n || SPU_FS)
>
>that should make sure that when SPU_FS=y that OPROFILE can not be 'm'.
>  
>
Blast it!  I did this right on our development system, but neglected to 
update the patch correctly to remove this dependency and 'default m'.  
I'll fix in the next patch.

>  
>
>>@@ -15,3 +16,10 @@
>> 
>> 	  If unsure, say N.
>> 
>>+config OPROFILE_CELL
>>+	bool "OProfile for Cell Broadband Engine"
>>+	depends on SPU_FS && OPROFILE
>>+	default y
>>+	help
>>+	  OProfile for Cell BE requires special support enabled
>>+	  by this option.
>>    
>>
>
>You should at least mention that this allows profiling the spus.
>  
>
OK.

>  
>
>>+#define EFWCALL  ENOSYS         /* Use an existing error number that is as
>>+				 * close as possible for a FW call that failed.
>>+				 * The probability of the call failing is
>>+				 * very low.  Passing up the error number
>>+				 * ensures that the user will see an error
>>+				 * message saying OProfile did not start.
>>+				 * Dmesg will contain an accurate message
>>+				 * about the failure.
>>+				 */
>>    
>>
>
>ENOSYS looks wrong though. It would appear to the user as if the oprofile
>function in the kernel was not present. I'd suggest EIO, and not use 
>an extra define for that.
>  
>
Carl will reply to this.

>
>  
>
>> static int
>> rtas_ibm_cbe_perftools(int subfunc, int passthru,
>> 		       void *address, unsigned long length)
>> {
>> 	u64 paddr = __pa(address);
>> 
>>-	return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru,
>>-			 paddr >> 32, paddr & 0xffffffff, length);
>>+	pm_rtas_token = rtas_token("ibm,cbe-perftools");
>>+
>>+	if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
>>+		printk(KERN_ERR
>>+		       "%s: rtas token ibm,cbe-perftools unknown\n",
>>+		       __FUNCTION__);
>>+		return -EFWCALL;
>>+	} else {
>>+
>>+		return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, 
>>+			 passthru, paddr >> 32, paddr & 0xffffffff, length); 
>>+	}
>> }
>>    
>>
>
>Are you now reading the rtas token every time you call rtas? that seems
>like a waste of time.
>  
>
Carl will reply.

>
>  
>
>>+#define size 24
>>+#define ENTRIES  (0x1<<8) /* 256 */
>>+#define MAXLFSR  0xFFFFFF
>>+
>>+int initial_lfsr[] =
>>+{16777215, 3797240, 13519805, 11602690, 6497030, 7614675, 2328937, 2889445,
>>+ 12364575, 8723156, 2450594, 16280864, 14742496, 10904589, 6434212, 4996256,
>>+ 5814270, 13014041, 9825245, 410260, 904096, 15151047, 15487695, 3061843,
>>+ 16482682, 7938572, 4893279, 9390321, 4320879, 5686402, 1711063, 10176714,
>>+ 4512270, 1057359, 16700434, 5731602, 2070114, 16030890, 1208230, 15603106,
>>+ 11857845, 6470172, 1362790, 7316876, 8534496, 1629197, 10003072, 1714539,
>>+ 1814669, 7106700, 5427154, 3395151, 3683327, 12950450, 16620273, 12122372,
>>+ 7194999, 9952750, 3608260, 13604295, 2266835, 14943567, 7079230, 777380,
>>+ 4516801, 1737661, 8730333, 13796927, 3247181, 9950017, 3481896, 16527555,
>>+ 13116123, 14505033, 9781119, 4860212, 7403253, 13264219, 12269980, 100120,
>>+ 664506, 607795, 8274553, 13133688, 6215305, 13208866, 16439693, 3320753,
>>+ 8773582, 13874619, 1784784, 4513501, 11002978, 9318515, 3038856, 14254582,
>>+ 15484958, 15967857, 13504461, 13657322, 14724513, 13955736, 5695315, 7330509,
>>+ 12630101, 6826854, 439712, 4609055, 13288878, 1309632, 4996398, 11392266,
>>+ 793740, 7653789, 2472670, 14641200, 5164364, 5482529, 10415855, 1629108,
>>+ 2012376, 13661123, 14655718, 9534083, 16637925, 2537745, 9787923, 12750103,
>>+ 4660370, 3283461, 14862772, 7034955, 6679872, 8918232, 6506913, 103649,
>>+ 6085577, 13324033, 14251613, 11058220, 11998181, 3100233, 468898, 7104918,
>>+ 12498413, 14408165, 1208514, 15712321, 3088687, 14778333, 3632503, 11151952,
>>+ 98896, 9159367, 8866146, 4780737, 4925758, 12362320, 4122783, 8543358,
>>+ 7056879, 10876914, 6282881, 1686625, 5100373, 4573666, 9265515, 13593840,
>>+ 5853060, 1188880, 4237111, 15765555, 14344137, 4608332, 6590210, 13745050,
>>+ 10916568, 12340402, 7145275, 4417153, 2300360, 12079643, 7608534, 15238251,
>>+ 4947424, 7014722, 3984546, 7168073, 10759589, 16293080, 3757181, 4577717,
>>+ 5163790, 2488841, 4650617, 3650022, 5440654, 1814617, 6939232, 15540909,
>>+ 501788, 1060986, 5058235, 5078222, 3734500, 10762065, 390862, 5172712,
>>+ 1070780, 7904429, 1669757, 3439997, 2956788, 14944927, 12496638, 994152,
>>+ 8901173, 11827497, 4268056, 15725859, 1694506, 5451950, 2892428, 1434298,
>>+ 9048323, 13558747, 15083840, 8154495, 15830901, 391127, 14970070, 2451434,
>>+ 2080347, 10775644, 14599429, 12540753, 4813943, 16140655, 2421772, 12724304,
>>+ 12935733, 7206473, 5697333, 10328104, 2418008, 13547986, 284246, 1732363,
>>+ 16375319, 8109554, 16372365, 14346072, 1835890, 13059499, 2442500, 4110674};
>>+
>>+/*
>>+ * The hardware uses an LFSR counting sequence to determine when to capture
>>+ * the SPU PCs.  The SPU PC capture is done when the LFSR sequence reaches the
>>+ * last value in the sequence.  An LFSR sequence is like a puesdo random
>>+ * number sequence where each number occurs once in the sequence but the
>>+ * sequence is not in numerical order.  To reduce the calculation time, a
>>+ * sequence of 256 precomputed values in the LFSR sequence are stored in a
>>+ * table.  The nearest precomputed value is used as the initial point from
>>+ * which to caculate the desired LFSR value that is n from the end of the
>>+ * sequence.  The lookup table reduces the maximum number of iterations in
>>+ * the loop from 2^24 to 2^16.
>>+ */
>>+static int calculate_lfsr(int n)
>>+{
>>+  int i;
>>+
>>+  int start_lfsr_index;
>>+  unsigned int newlfsr0;
>>+  unsigned int lfsr = MAXLFSR;
>>+  unsigned int binsize = (MAXLFSR+1)/ENTRIES;
>>+  unsigned int howmany;
>>+
>>+  start_lfsr_index = (MAXLFSR - n) / binsize;
>>+  lfsr = initial_lfsr[start_lfsr_index];
>>+  howmany = (MAXLFSR - n) - (start_lfsr_index * (binsize));
>>+
>>+  for (i = 2; i < howmany+2; i++) {
>>+    newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^
>>+		((lfsr >> (size - 1 - 1)) & 1) ^
>>+		(((lfsr >> (size - 1 - 6)) & 1) ^
>>+		 ((lfsr >> (size - 1 - 23)) & 1)));
>>+
>>+    lfsr >>= 1;
>>+    lfsr = lfsr | (newlfsr0 << (size - 1));
>>+  }
>>+  return lfsr;
>>+}
>>    
>>
>
>I agree with Milton that it would be far nicer even to calculate
>the value from user space, but since you say that would
>violate the oprofile interface conventions, let's not go there.
>In order to make this code nicer on the user, you should probably
>insert a 'cond_resched()' somewhere in the loop, maybe every
>500 iterations or so.
>
>it also looks like there is whitespace damage in the code here.
>  
>
Carl will reply.

>  
>
>>+
>>+/* This interface allows a profiler (e.g., OProfile) to store
>>+ * spu_context information needed for profiling, allowing it to
>>+ * be saved across context save/restore operation.
>>+ *
>>+ * Assumes the caller has already incremented the ref count to
>>+ * profile_info; then spu_context_destroy must call kref_put
>>+ * on prof_info_kref.
>>+ */
>>+void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
>>+			     struct kref * prof_info_kref,
>>+			     void (* prof_info_release) (struct kref * kref))
>>+{
>>+	ctx->profile_private = profile_info;
>>+	ctx->prof_priv_kref = prof_info_kref;
>>+	ctx->prof_priv_release = prof_info_release;
>>+}
>>+EXPORT_SYMBOL_GPL(spu_set_profile_private);
>>    
>>
>
>I think you don't need the profile_private member here, if you just use
>container_of with ctx->prof_priv_kref in all users.
>  
>
Sorry, I don't follow. We want the profile_private to be stored in the 
spu_context, don't we?  How else would I be able to do that?  And 
besides, wouldn't container_of need the struct name of profile_private?  
SPUFS doesn't have access to the type.

-Maynard

>	Arnd <><
>
>-------------------------------------------------------------------------
>Take Surveys. Earn Cash. Influence the Future of IT
>Join SourceForge.net's Techsay panel and you'll get the chance to share your
>opinions on IT & business topics through brief surveys-and earn cash
>http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
>_______________________________________________
>oprofile-list mailing list
>oprofile-list@lists.sourceforge.net
>https://lists.sourceforge.net/lists/listinfo/oprofile-list
>  
>



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-15 16:15     ` Maynard Johnson
  0 siblings, 0 replies; 66+ messages in thread
From: Maynard Johnson @ 2007-02-15 16:15 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: linuxppc-dev, Carl Love, cbe-oss-dev, oprofile-list, linux-kernel

Arnd Bergmann wrote:

>On Thursday 15 February 2007 00:52, Carl Love wrote:
>
>
>  
>
>>--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Kconfig	2007-01-18 16:43:14.000000000 -0600
>>+++ linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig	2007-02-13 19:04:46.271028904 -0600
>>@@ -7,7 +7,8 @@
>> 
>> config OPROFILE
>> 	tristate "OProfile system profiling (EXPERIMENTAL)"
>>-	depends on PROFILING
>>+	default m
>>+	depends on SPU_FS && PROFILING
>> 	help
>> 	  OProfile is a profiling system capable of profiling the
>> 	  whole system, include the kernel, kernel modules, libraries,
>>    
>>
>
>Milton already commented on this being wrong. I think what you want
>is
>	depends on PROFILING && (SPU_FS = n || SPU_FS)
>
>that should make sure that when SPU_FS=y that OPROFILE can not be 'm'.
>  
>
Blast it!  I did this right on our development system, but neglected to 
update the patch correctly to remove this dependency and 'default m'.  
I'll fix in the next patch.

>  
>
>>@@ -15,3 +16,10 @@
>> 
>> 	  If unsure, say N.
>> 
>>+config OPROFILE_CELL
>>+	bool "OProfile for Cell Broadband Engine"
>>+	depends on SPU_FS && OPROFILE
>>+	default y
>>+	help
>>+	  OProfile for Cell BE requires special support enabled
>>+	  by this option.
>>    
>>
>
>You should at least mention that this allows profiling the spus.
>  
>
OK.

>  
>
>>+#define EFWCALL  ENOSYS         /* Use an existing error number that is as
>>+				 * close as possible for a FW call that failed.
>>+				 * The probability of the call failing is
>>+				 * very low.  Passing up the error number
>>+				 * ensures that the user will see an error
>>+				 * message saying OProfile did not start.
>>+				 * Dmesg will contain an accurate message
>>+				 * about the failure.
>>+				 */
>>    
>>
>
>ENOSYS looks wrong though. It would appear to the user as if the oprofile
>function in the kernel was not present. I'd suggest EIO, and not use 
>an extra define for that.
>  
>
Carl will reply to this.

>
>  
>
>> static int
>> rtas_ibm_cbe_perftools(int subfunc, int passthru,
>> 		       void *address, unsigned long length)
>> {
>> 	u64 paddr = __pa(address);
>> 
>>-	return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru,
>>-			 paddr >> 32, paddr & 0xffffffff, length);
>>+	pm_rtas_token = rtas_token("ibm,cbe-perftools");
>>+
>>+	if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
>>+		printk(KERN_ERR
>>+		       "%s: rtas token ibm,cbe-perftools unknown\n",
>>+		       __FUNCTION__);
>>+		return -EFWCALL;
>>+	} else {
>>+
>>+		return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, 
>>+			 passthru, paddr >> 32, paddr & 0xffffffff, length); 
>>+	}
>> }
>>    
>>
>
>Are you now reading the rtas token every time you call rtas? that seems
>like a waste of time.
>  
>
Carl will reply.

>
>  
>
>>+#define size 24
>>+#define ENTRIES  (0x1<<8) /* 256 */
>>+#define MAXLFSR  0xFFFFFF
>>+
>>+int initial_lfsr[] =
>>+{16777215, 3797240, 13519805, 11602690, 6497030, 7614675, 2328937, 2889445,
>>+ 12364575, 8723156, 2450594, 16280864, 14742496, 10904589, 6434212, 4996256,
>>+ 5814270, 13014041, 9825245, 410260, 904096, 15151047, 15487695, 3061843,
>>+ 16482682, 7938572, 4893279, 9390321, 4320879, 5686402, 1711063, 10176714,
>>+ 4512270, 1057359, 16700434, 5731602, 2070114, 16030890, 1208230, 15603106,
>>+ 11857845, 6470172, 1362790, 7316876, 8534496, 1629197, 10003072, 1714539,
>>+ 1814669, 7106700, 5427154, 3395151, 3683327, 12950450, 16620273, 12122372,
>>+ 7194999, 9952750, 3608260, 13604295, 2266835, 14943567, 7079230, 777380,
>>+ 4516801, 1737661, 8730333, 13796927, 3247181, 9950017, 3481896, 16527555,
>>+ 13116123, 14505033, 9781119, 4860212, 7403253, 13264219, 12269980, 100120,
>>+ 664506, 607795, 8274553, 13133688, 6215305, 13208866, 16439693, 3320753,
>>+ 8773582, 13874619, 1784784, 4513501, 11002978, 9318515, 3038856, 14254582,
>>+ 15484958, 15967857, 13504461, 13657322, 14724513, 13955736, 5695315, 7330509,
>>+ 12630101, 6826854, 439712, 4609055, 13288878, 1309632, 4996398, 11392266,
>>+ 793740, 7653789, 2472670, 14641200, 5164364, 5482529, 10415855, 1629108,
>>+ 2012376, 13661123, 14655718, 9534083, 16637925, 2537745, 9787923, 12750103,
>>+ 4660370, 3283461, 14862772, 7034955, 6679872, 8918232, 6506913, 103649,
>>+ 6085577, 13324033, 14251613, 11058220, 11998181, 3100233, 468898, 7104918,
>>+ 12498413, 14408165, 1208514, 15712321, 3088687, 14778333, 3632503, 11151952,
>>+ 98896, 9159367, 8866146, 4780737, 4925758, 12362320, 4122783, 8543358,
>>+ 7056879, 10876914, 6282881, 1686625, 5100373, 4573666, 9265515, 13593840,
>>+ 5853060, 1188880, 4237111, 15765555, 14344137, 4608332, 6590210, 13745050,
>>+ 10916568, 12340402, 7145275, 4417153, 2300360, 12079643, 7608534, 15238251,
>>+ 4947424, 7014722, 3984546, 7168073, 10759589, 16293080, 3757181, 4577717,
>>+ 5163790, 2488841, 4650617, 3650022, 5440654, 1814617, 6939232, 15540909,
>>+ 501788, 1060986, 5058235, 5078222, 3734500, 10762065, 390862, 5172712,
>>+ 1070780, 7904429, 1669757, 3439997, 2956788, 14944927, 12496638, 994152,
>>+ 8901173, 11827497, 4268056, 15725859, 1694506, 5451950, 2892428, 1434298,
>>+ 9048323, 13558747, 15083840, 8154495, 15830901, 391127, 14970070, 2451434,
>>+ 2080347, 10775644, 14599429, 12540753, 4813943, 16140655, 2421772, 12724304,
>>+ 12935733, 7206473, 5697333, 10328104, 2418008, 13547986, 284246, 1732363,
>>+ 16375319, 8109554, 16372365, 14346072, 1835890, 13059499, 2442500, 4110674};
>>+
>>+/*
>>+ * The hardware uses an LFSR counting sequence to determine when to capture
>>+ * the SPU PCs.  The SPU PC capture is done when the LFSR sequence reaches the
>>+ * last value in the sequence.  An LFSR sequence is like a puesdo random
>>+ * number sequence where each number occurs once in the sequence but the
>>+ * sequence is not in numerical order.  To reduce the calculation time, a
>>+ * sequence of 256 precomputed values in the LFSR sequence are stored in a
>>+ * table.  The nearest precomputed value is used as the initial point from
>>+ * which to caculate the desired LFSR value that is n from the end of the
>>+ * sequence.  The lookup table reduces the maximum number of iterations in
>>+ * the loop from 2^24 to 2^16.
>>+ */
>>+static int calculate_lfsr(int n)
>>+{
>>+  int i;
>>+
>>+  int start_lfsr_index;
>>+  unsigned int newlfsr0;
>>+  unsigned int lfsr = MAXLFSR;
>>+  unsigned int binsize = (MAXLFSR+1)/ENTRIES;
>>+  unsigned int howmany;
>>+
>>+  start_lfsr_index = (MAXLFSR - n) / binsize;
>>+  lfsr = initial_lfsr[start_lfsr_index];
>>+  howmany = (MAXLFSR - n) - (start_lfsr_index * (binsize));
>>+
>>+  for (i = 2; i < howmany+2; i++) {
>>+    newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^
>>+		((lfsr >> (size - 1 - 1)) & 1) ^
>>+		(((lfsr >> (size - 1 - 6)) & 1) ^
>>+		 ((lfsr >> (size - 1 - 23)) & 1)));
>>+
>>+    lfsr >>= 1;
>>+    lfsr = lfsr | (newlfsr0 << (size - 1));
>>+  }
>>+  return lfsr;
>>+}
>>    
>>
>
>I agree with Milton that it would be far nicer even to calculate
>the value from user space, but since you say that would
>violate the oprofile interface conventions, let's not go there.
>In order to make this code nicer on the user, you should probably
>insert a 'cond_resched()' somewhere in the loop, maybe every
>500 iterations or so.
>
>it also looks like there is whitespace damage in the code here.
>  
>
Carl will reply.

>  
>
>>+
>>+/* This interface allows a profiler (e.g., OProfile) to store
>>+ * spu_context information needed for profiling, allowing it to
>>+ * be saved across context save/restore operation.
>>+ *
>>+ * Assumes the caller has already incremented the ref count to
>>+ * profile_info; then spu_context_destroy must call kref_put
>>+ * on prof_info_kref.
>>+ */
>>+void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
>>+			     struct kref * prof_info_kref,
>>+			     void (* prof_info_release) (struct kref * kref))
>>+{
>>+	ctx->profile_private = profile_info;
>>+	ctx->prof_priv_kref = prof_info_kref;
>>+	ctx->prof_priv_release = prof_info_release;
>>+}
>>+EXPORT_SYMBOL_GPL(spu_set_profile_private);
>>    
>>
>
>I think you don't need the profile_private member here, if you just use
>container_of with ctx->prof_priv_kref in all users.
>  
>
Sorry, I don't follow. We want the profile_private to be stored in the 
spu_context, don't we?  How else would I be able to do that?  And 
besides, wouldn't container_of need the struct name of profile_private?  
SPUFS doesn't have access to the type.

-Maynard

>	Arnd <><
>
>-------------------------------------------------------------------------
>Take Surveys. Earn Cash. Influence the Future of IT
>Join SourceForge.net's Techsay panel and you'll get the chance to share your
>opinions on IT & business topics through brief surveys-and earn cash
>http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
>_______________________________________________
>oprofile-list mailing list
>oprofile-list@lists.sourceforge.net
>https://lists.sourceforge.net/lists/listinfo/oprofile-list
>  
>

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-15 16:15     ` Maynard Johnson
@ 2007-02-15 18:13       ` Arnd Bergmann
  -1 siblings, 0 replies; 66+ messages in thread
From: Arnd Bergmann @ 2007-02-15 18:13 UTC (permalink / raw)
  To: Maynard Johnson
  Cc: cbe-oss-dev, linuxppc-dev, linux-kernel, oprofile-list, Carl Love

On Thursday 15 February 2007 17:15, Maynard Johnson wrote:
> >>+void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
> >>+                          struct kref * prof_info_kref,
> >>+                          void (* prof_info_release) (struct kref * kref))
> >>+{
> >>+     ctx->profile_private = profile_info;
> >>+     ctx->prof_priv_kref = prof_info_kref;
> >>+     ctx->prof_priv_release = prof_info_release;
> >>+}
> >>+EXPORT_SYMBOL_GPL(spu_set_profile_private);
> >>    
> >>
> >
> >I think you don't need the profile_private member here, if you just use
> >container_of with ctx->prof_priv_kref in all users.
> >  
> >
> Sorry, I don't follow. We want the profile_private to be stored in the 
> spu_context, don't we?  How else would I be able to do that?  And 
> besides, wouldn't container_of need the struct name of profile_private?  
> SPUFS doesn't have access to the type.

The idea was to have spu_get_profile_private return the kref pointer,
and then change the user of that to do

+       if (!spu_info[spu_num] && the_spu) {
+               spu_info[spu_num] = container_of(
+				spu_get_profile_private(the_spu->ctx),
+				struct cached_info, cache_kref);
+               if (spu_info[spu_num])
+                       kref_get(&spu_info[spu_num]->cache_ref);

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-15 18:13       ` Arnd Bergmann
  0 siblings, 0 replies; 66+ messages in thread
From: Arnd Bergmann @ 2007-02-15 18:13 UTC (permalink / raw)
  To: Maynard Johnson
  Cc: linuxppc-dev, Carl Love, cbe-oss-dev, oprofile-list, linux-kernel

On Thursday 15 February 2007 17:15, Maynard Johnson wrote:
> >>+void spu_set_profile_private(struct spu_context * ctx, void * profile_=
info,
> >>+=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0 =A0 =
=A0 struct kref * prof_info_kref,
> >>+=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0 =A0 =
=A0 void (* prof_info_release) (struct kref * kref))
> >>+{
> >>+=A0=A0=A0=A0=A0ctx->profile_private =3D profile_info;
> >>+=A0=A0=A0=A0=A0ctx->prof_priv_kref =3D prof_info_kref;
> >>+=A0=A0=A0=A0=A0ctx->prof_priv_release =3D prof_info_release;
> >>+}
> >>+EXPORT_SYMBOL_GPL(spu_set_profile_private);
> >> =A0 =A0
> >>
> >
> >I think you don't need the profile_private member here, if you just use
> >container_of with ctx->prof_priv_kref in all users.
> > =A0
> >
> Sorry, I don't follow. We want the profile_private to be stored in the=20
> spu_context, don't we? =A0How else would I be able to do that? =A0And=20
> besides, wouldn't container_of need the struct name of profile_private? =
=A0
> SPUFS doesn't have access to the type.

The idea was to have spu_get_profile_private return the kref pointer,
and then change the user of that to do

+       if (!spu_info[spu_num] && the_spu) {
+               spu_info[spu_num] =3D container_of(
+				spu_get_profile_private(the_spu->ctx),
+				struct cached_info, cache_kref);
+               if (spu_info[spu_num])
+                       kref_get(&spu_info[spu_num]->cache_ref);

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-15 14:37   ` Arnd Bergmann
@ 2007-02-15 20:21     ` Carl Love
  -1 siblings, 0 replies; 66+ messages in thread
From: Carl Love @ 2007-02-15 20:21 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: cbe-oss-dev, linuxppc-dev, linux-kernel, oprofile-list

On Thu, 2007-02-15 at 15:37 +0100, Arnd Bergmann wrote:
> On Thursday 15 February 2007 00:52, Carl Love wrote:
> 
> 
> > --- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Kconfig	2007-01-18 16:43:14.000000000 -0600
> > +++ linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig	2007-02-13 19:04:46.271028904 -0600
> > @@ -7,7 +7,8 @@
> >  
> >  config OPROFILE
> >  	tristate "OProfile system profiling (EXPERIMENTAL)"
> > -	depends on PROFILING
> > +	default m
> > +	depends on SPU_FS && PROFILING
> >  	help
> >  	  OProfile is a profiling system capable of profiling the
> >  	  whole system, include the kernel, kernel modules, libraries,
> 
> Milton already commented on this being wrong. I think what you want
> is
> 	depends on PROFILING && (SPU_FS = n || SPU_FS)
> 
> that should make sure that when SPU_FS=y that OPROFILE can not be 'm'.
> 
> > @@ -15,3 +16,10 @@
> >  
> >  	  If unsure, say N.
> >  
> > +config OPROFILE_CELL
> > +	bool "OProfile for Cell Broadband Engine"
> > +	depends on SPU_FS && OPROFILE
> > +	default y
> > +	help
> > +	  OProfile for Cell BE requires special support enabled
> > +	  by this option.
> 
> You should at least mention that this allows profiling the spus.
> 
> > +#define EFWCALL  ENOSYS         /* Use an existing error number that is as
> > +				 * close as possible for a FW call that failed.
> > +				 * The probability of the call failing is
> > +				 * very low.  Passing up the error number
> > +				 * ensures that the user will see an error
> > +				 * message saying OProfile did not start.
> > +				 * Dmesg will contain an accurate message
> > +				 * about the failure.
> > +				 */
> 
> ENOSYS looks wrong though. It would appear to the user as if the oprofile
> function in the kernel was not present. I'd suggest EIO, and not use 
> an extra define for that.
> 

OK, will do. 

> 
> >  static int
> >  rtas_ibm_cbe_perftools(int subfunc, int passthru,
> >  		       void *address, unsigned long length)
> >  {
> >  	u64 paddr = __pa(address);
> >  
> > -	return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru,
> > -			 paddr >> 32, paddr & 0xffffffff, length);
> > +	pm_rtas_token = rtas_token("ibm,cbe-perftools");
> > +
> > +	if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
> > +		printk(KERN_ERR
> > +		       "%s: rtas token ibm,cbe-perftools unknown\n",
> > +		       __FUNCTION__);
> > +		return -EFWCALL;
> > +	} else {
> > +
> > +		return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, 
> > +			 passthru, paddr >> 32, paddr & 0xffffffff, length); 
> > +	}
> >  }
> 
> Are you now reading the rtas token every time you call rtas? that seems
> like a waste of time.

There are actually two RTAS calls, i.e. two tokens.  Once for setting up
the debug bus.  The other to do the SPU PC collection.  Yes, we are
getting the token each time using the single global pm_rtas_token.  To
make sure we had the correct token, I made sure to call it each time.
As you point out it is very wasteful.  It probably would be best to just
have a second global variable say spu_rtas_token.  Then do a single call
for each global variable.  Then we just use the global variable in the
appropriate rtas_call.  This would eliminate a significant number of
calls to look up the token.  I should have thought of that earlier.  

> 
> 
> > +#define size 24
> > +#define ENTRIES  (0x1<<8) /* 256 */
> > +#define MAXLFSR  0xFFFFFF
> > +
> > +int initial_lfsr[] =
> > +{16777215, 3797240, 13519805, 11602690, 6497030, 7614675, 2328937, 2889445,
> > + 12364575, 8723156, 2450594, 16280864, 14742496, 10904589, 6434212, 4996256,
> > + 5814270, 13014041, 9825245, 410260, 904096, 15151047, 15487695, 3061843,
> > + 16482682, 7938572, 4893279, 9390321, 4320879, 5686402, 1711063, 10176714,
> > + 4512270, 1057359, 16700434, 5731602, 2070114, 16030890, 1208230, 15603106,
> > + 11857845, 6470172, 1362790, 7316876, 8534496, 1629197, 10003072, 1714539,
> > + 1814669, 7106700, 5427154, 3395151, 3683327, 12950450, 16620273, 12122372,
> > + 7194999, 9952750, 3608260, 13604295, 2266835, 14943567, 7079230, 777380,
> > + 4516801, 1737661, 8730333, 13796927, 3247181, 9950017, 3481896, 16527555,
> > + 13116123, 14505033, 9781119, 4860212, 7403253, 13264219, 12269980, 100120,
> > + 664506, 607795, 8274553, 13133688, 6215305, 13208866, 16439693, 3320753,
> > + 8773582, 13874619, 1784784, 4513501, 11002978, 9318515, 3038856, 14254582,
> > + 15484958, 15967857, 13504461, 13657322, 14724513, 13955736, 5695315, 7330509,
> > + 12630101, 6826854, 439712, 4609055, 13288878, 1309632, 4996398, 11392266,
> > + 793740, 7653789, 2472670, 14641200, 5164364, 5482529, 10415855, 1629108,
> > + 2012376, 13661123, 14655718, 9534083, 16637925, 2537745, 9787923, 12750103,
> > + 4660370, 3283461, 14862772, 7034955, 6679872, 8918232, 6506913, 103649,
> > + 6085577, 13324033, 14251613, 11058220, 11998181, 3100233, 468898, 7104918,
> > + 12498413, 14408165, 1208514, 15712321, 3088687, 14778333, 3632503, 11151952,
> > + 98896, 9159367, 8866146, 4780737, 4925758, 12362320, 4122783, 8543358,
> > + 7056879, 10876914, 6282881, 1686625, 5100373, 4573666, 9265515, 13593840,
> > + 5853060, 1188880, 4237111, 15765555, 14344137, 4608332, 6590210, 13745050,
> > + 10916568, 12340402, 7145275, 4417153, 2300360, 12079643, 7608534, 15238251,
> > + 4947424, 7014722, 3984546, 7168073, 10759589, 16293080, 3757181, 4577717,
> > + 5163790, 2488841, 4650617, 3650022, 5440654, 1814617, 6939232, 15540909,
> > + 501788, 1060986, 5058235, 5078222, 3734500, 10762065, 390862, 5172712,
> > + 1070780, 7904429, 1669757, 3439997, 2956788, 14944927, 12496638, 994152,
> > + 8901173, 11827497, 4268056, 15725859, 1694506, 5451950, 2892428, 1434298,
> > + 9048323, 13558747, 15083840, 8154495, 15830901, 391127, 14970070, 2451434,
> > + 2080347, 10775644, 14599429, 12540753, 4813943, 16140655, 2421772, 12724304,
> > + 12935733, 7206473, 5697333, 10328104, 2418008, 13547986, 284246, 1732363,
> > + 16375319, 8109554, 16372365, 14346072, 1835890, 13059499, 2442500, 4110674};
> > +
> > +/*
> > + * The hardware uses an LFSR counting sequence to determine when to capture
> > + * the SPU PCs.  The SPU PC capture is done when the LFSR sequence reaches the
> > + * last value in the sequence.  An LFSR sequence is like a puesdo random
> > + * number sequence where each number occurs once in the sequence but the
> > + * sequence is not in numerical order.  To reduce the calculation time, a
> > + * sequence of 256 precomputed values in the LFSR sequence are stored in a
> > + * table.  The nearest precomputed value is used as the initial point from
> > + * which to caculate the desired LFSR value that is n from the end of the
> > + * sequence.  The lookup table reduces the maximum number of iterations in
> > + * the loop from 2^24 to 2^16.
> > + */
> > +static int calculate_lfsr(int n)
> > +{
> > +  int i;
> > +
> > +  int start_lfsr_index;
> > +  unsigned int newlfsr0;
> > +  unsigned int lfsr = MAXLFSR;
> > +  unsigned int binsize = (MAXLFSR+1)/ENTRIES;
> > +  unsigned int howmany;
> > +
> > +  start_lfsr_index = (MAXLFSR - n) / binsize;
> > +  lfsr = initial_lfsr[start_lfsr_index];
> > +  howmany = (MAXLFSR - n) - (start_lfsr_index * (binsize));
> > +
> > +  for (i = 2; i < howmany+2; i++) {
> > +    newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^
> > +		((lfsr >> (size - 1 - 1)) & 1) ^
> > +		(((lfsr >> (size - 1 - 6)) & 1) ^
> > +		 ((lfsr >> (size - 1 - 23)) & 1)));
> > +
> > +    lfsr >>= 1;
> > +    lfsr = lfsr | (newlfsr0 << (size - 1));
> > +  }
> > +  return lfsr;
> > +}
> 
> I agree with Milton that it would be far nicer even to calculate
> the value from user space, but since you say that would
> violate the oprofile interface conventions, let's not go there.
> In order to make this code nicer on the user, you should probably
> insert a 'cond_resched()' somewhere in the loop, maybe every
> 500 iterations or so.
> 
> it also looks like there is whitespace damage in the code here.

I will double check on the whitespace damage.  I thought I had gotten
all that out.  

I have done some quick measurements.  The above method limits the loop
to at most 2^16 iterations.  Based on running the algorithm in user
space, it takes about 3ms of computation time to do the loop 2^16 times.

At the vary least, we need to put the resched in say every 10,000
iterations which would be about every 0.5ms.  Should we do a resched
more often?  

Additionally we could up the size of the table to 512 which would reduce
the maximum time to about 1.5ms.  What do people think about increasing
the table size?

A little more general discussion about the logarithmic algorithm and
limiting the range.  The hardware supports a 24 bit LFSR value. This
means the user can say is capture a sample every N cycles, where N is in
the range of 1 to 2^24.  The OProfile user tool enforces a minimum value
of N to make sure the overhead of OProfile doesn't bring the machine to
its knees.  The minimum values is not intended to guarantee the
performance impact of OProfile will not be significant.  It is left as
an exercise for the user to pick an N that will give minimal performance
impact.  We set the lower limit for N for SPU profiling to 100,000. This
is actually high enough that we don't seem to see much performance
impact when running OProfile.  If the user picked N=2^24 then for a
3.2GHz machine you would get about 200 samples per second on each node.
Where a sample consists of the PC value for all 8 SPUs on the node.  If
the user wanted to do a relatively long OProfile run, I can see where
they might use N=2^24 to avoid gathering too much data.  My gut feeling
is that the sampling frequency for N=2^24 is not low enough that someone
would never want to use it when doing long runs.  Hence, we should not
arbitrarily reduce the maximum value for N.  Although I would expect
that the typical value for N will be in the range of several hundred
thousand to a few million.

As for using a logarithmic spacing of the precomputed values, this
approach means that the space between the precomputed values at the high
end would be much larger then 2^14, assuming 256 precomputed values.
That means it could take much longer then 3ms to get the needed LFSR
value for a large N.  By evenly spacing the precomputed values, we can
ensure that for all N it will take less then 3ms to get the value.
Personally, I am more comfortable with a hard limit on the compute time
then a variable time that could get much bigger then the 1ms threshold
that Arnd wants for resched.  Any thoughts?

> 
> > +
> > +/* This interface allows a profiler (e.g., OProfile) to store
> > + * spu_context information needed for profiling, allowing it to
> > + * be saved across context save/restore operation.
> > + *
> > + * Assumes the caller has already incremented the ref count to
> > + * profile_info; then spu_context_destroy must call kref_put
> > + * on prof_info_kref.
> > + */
> > +void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
> > +			     struct kref * prof_info_kref,
> > +			     void (* prof_info_release) (struct kref * kref))
> > +{
> > +	ctx->profile_private = profile_info;
> > +	ctx->prof_priv_kref = prof_info_kref;
> > +	ctx->prof_priv_release = prof_info_release;
> > +}
> > +EXPORT_SYMBOL_GPL(spu_set_profile_private);
> 
> I think you don't need the profile_private member here, if you just use
> container_of with ctx->prof_priv_kref in all users.
> 
> 	Arnd <><


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-15 20:21     ` Carl Love
  0 siblings, 0 replies; 66+ messages in thread
From: Carl Love @ 2007-02-15 20:21 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linuxppc-dev, cbe-oss-dev, oprofile-list, linux-kernel

On Thu, 2007-02-15 at 15:37 +0100, Arnd Bergmann wrote:
> On Thursday 15 February 2007 00:52, Carl Love wrote:
> 
> 
> > --- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Kconfig	2007-01-18 16:43:14.000000000 -0600
> > +++ linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig	2007-02-13 19:04:46.271028904 -0600
> > @@ -7,7 +7,8 @@
> >  
> >  config OPROFILE
> >  	tristate "OProfile system profiling (EXPERIMENTAL)"
> > -	depends on PROFILING
> > +	default m
> > +	depends on SPU_FS && PROFILING
> >  	help
> >  	  OProfile is a profiling system capable of profiling the
> >  	  whole system, include the kernel, kernel modules, libraries,
> 
> Milton already commented on this being wrong. I think what you want
> is
> 	depends on PROFILING && (SPU_FS = n || SPU_FS)
> 
> that should make sure that when SPU_FS=y that OPROFILE can not be 'm'.
> 
> > @@ -15,3 +16,10 @@
> >  
> >  	  If unsure, say N.
> >  
> > +config OPROFILE_CELL
> > +	bool "OProfile for Cell Broadband Engine"
> > +	depends on SPU_FS && OPROFILE
> > +	default y
> > +	help
> > +	  OProfile for Cell BE requires special support enabled
> > +	  by this option.
> 
> You should at least mention that this allows profiling the spus.
> 
> > +#define EFWCALL  ENOSYS         /* Use an existing error number that is as
> > +				 * close as possible for a FW call that failed.
> > +				 * The probability of the call failing is
> > +				 * very low.  Passing up the error number
> > +				 * ensures that the user will see an error
> > +				 * message saying OProfile did not start.
> > +				 * Dmesg will contain an accurate message
> > +				 * about the failure.
> > +				 */
> 
> ENOSYS looks wrong though. It would appear to the user as if the oprofile
> function in the kernel was not present. I'd suggest EIO, and not use 
> an extra define for that.
> 

OK, will do. 

> 
> >  static int
> >  rtas_ibm_cbe_perftools(int subfunc, int passthru,
> >  		       void *address, unsigned long length)
> >  {
> >  	u64 paddr = __pa(address);
> >  
> > -	return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru,
> > -			 paddr >> 32, paddr & 0xffffffff, length);
> > +	pm_rtas_token = rtas_token("ibm,cbe-perftools");
> > +
> > +	if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
> > +		printk(KERN_ERR
> > +		       "%s: rtas token ibm,cbe-perftools unknown\n",
> > +		       __FUNCTION__);
> > +		return -EFWCALL;
> > +	} else {
> > +
> > +		return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, 
> > +			 passthru, paddr >> 32, paddr & 0xffffffff, length); 
> > +	}
> >  }
> 
> Are you now reading the rtas token every time you call rtas? that seems
> like a waste of time.

There are actually two RTAS calls, i.e. two tokens.  Once for setting up
the debug bus.  The other to do the SPU PC collection.  Yes, we are
getting the token each time using the single global pm_rtas_token.  To
make sure we had the correct token, I made sure to call it each time.
As you point out it is very wasteful.  It probably would be best to just
have a second global variable say spu_rtas_token.  Then do a single call
for each global variable.  Then we just use the global variable in the
appropriate rtas_call.  This would eliminate a significant number of
calls to look up the token.  I should have thought of that earlier.  

> 
> 
> > +#define size 24
> > +#define ENTRIES  (0x1<<8) /* 256 */
> > +#define MAXLFSR  0xFFFFFF
> > +
> > +int initial_lfsr[] =
> > +{16777215, 3797240, 13519805, 11602690, 6497030, 7614675, 2328937, 2889445,
> > + 12364575, 8723156, 2450594, 16280864, 14742496, 10904589, 6434212, 4996256,
> > + 5814270, 13014041, 9825245, 410260, 904096, 15151047, 15487695, 3061843,
> > + 16482682, 7938572, 4893279, 9390321, 4320879, 5686402, 1711063, 10176714,
> > + 4512270, 1057359, 16700434, 5731602, 2070114, 16030890, 1208230, 15603106,
> > + 11857845, 6470172, 1362790, 7316876, 8534496, 1629197, 10003072, 1714539,
> > + 1814669, 7106700, 5427154, 3395151, 3683327, 12950450, 16620273, 12122372,
> > + 7194999, 9952750, 3608260, 13604295, 2266835, 14943567, 7079230, 777380,
> > + 4516801, 1737661, 8730333, 13796927, 3247181, 9950017, 3481896, 16527555,
> > + 13116123, 14505033, 9781119, 4860212, 7403253, 13264219, 12269980, 100120,
> > + 664506, 607795, 8274553, 13133688, 6215305, 13208866, 16439693, 3320753,
> > + 8773582, 13874619, 1784784, 4513501, 11002978, 9318515, 3038856, 14254582,
> > + 15484958, 15967857, 13504461, 13657322, 14724513, 13955736, 5695315, 7330509,
> > + 12630101, 6826854, 439712, 4609055, 13288878, 1309632, 4996398, 11392266,
> > + 793740, 7653789, 2472670, 14641200, 5164364, 5482529, 10415855, 1629108,
> > + 2012376, 13661123, 14655718, 9534083, 16637925, 2537745, 9787923, 12750103,
> > + 4660370, 3283461, 14862772, 7034955, 6679872, 8918232, 6506913, 103649,
> > + 6085577, 13324033, 14251613, 11058220, 11998181, 3100233, 468898, 7104918,
> > + 12498413, 14408165, 1208514, 15712321, 3088687, 14778333, 3632503, 11151952,
> > + 98896, 9159367, 8866146, 4780737, 4925758, 12362320, 4122783, 8543358,
> > + 7056879, 10876914, 6282881, 1686625, 5100373, 4573666, 9265515, 13593840,
> > + 5853060, 1188880, 4237111, 15765555, 14344137, 4608332, 6590210, 13745050,
> > + 10916568, 12340402, 7145275, 4417153, 2300360, 12079643, 7608534, 15238251,
> > + 4947424, 7014722, 3984546, 7168073, 10759589, 16293080, 3757181, 4577717,
> > + 5163790, 2488841, 4650617, 3650022, 5440654, 1814617, 6939232, 15540909,
> > + 501788, 1060986, 5058235, 5078222, 3734500, 10762065, 390862, 5172712,
> > + 1070780, 7904429, 1669757, 3439997, 2956788, 14944927, 12496638, 994152,
> > + 8901173, 11827497, 4268056, 15725859, 1694506, 5451950, 2892428, 1434298,
> > + 9048323, 13558747, 15083840, 8154495, 15830901, 391127, 14970070, 2451434,
> > + 2080347, 10775644, 14599429, 12540753, 4813943, 16140655, 2421772, 12724304,
> > + 12935733, 7206473, 5697333, 10328104, 2418008, 13547986, 284246, 1732363,
> > + 16375319, 8109554, 16372365, 14346072, 1835890, 13059499, 2442500, 4110674};
> > +
> > +/*
> > + * The hardware uses an LFSR counting sequence to determine when to capture
> > + * the SPU PCs.  The SPU PC capture is done when the LFSR sequence reaches the
> > + * last value in the sequence.  An LFSR sequence is like a puesdo random
> > + * number sequence where each number occurs once in the sequence but the
> > + * sequence is not in numerical order.  To reduce the calculation time, a
> > + * sequence of 256 precomputed values in the LFSR sequence are stored in a
> > + * table.  The nearest precomputed value is used as the initial point from
> > + * which to caculate the desired LFSR value that is n from the end of the
> > + * sequence.  The lookup table reduces the maximum number of iterations in
> > + * the loop from 2^24 to 2^16.
> > + */
> > +static int calculate_lfsr(int n)
> > +{
> > +  int i;
> > +
> > +  int start_lfsr_index;
> > +  unsigned int newlfsr0;
> > +  unsigned int lfsr = MAXLFSR;
> > +  unsigned int binsize = (MAXLFSR+1)/ENTRIES;
> > +  unsigned int howmany;
> > +
> > +  start_lfsr_index = (MAXLFSR - n) / binsize;
> > +  lfsr = initial_lfsr[start_lfsr_index];
> > +  howmany = (MAXLFSR - n) - (start_lfsr_index * (binsize));
> > +
> > +  for (i = 2; i < howmany+2; i++) {
> > +    newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^
> > +		((lfsr >> (size - 1 - 1)) & 1) ^
> > +		(((lfsr >> (size - 1 - 6)) & 1) ^
> > +		 ((lfsr >> (size - 1 - 23)) & 1)));
> > +
> > +    lfsr >>= 1;
> > +    lfsr = lfsr | (newlfsr0 << (size - 1));
> > +  }
> > +  return lfsr;
> > +}
> 
> I agree with Milton that it would be far nicer even to calculate
> the value from user space, but since you say that would
> violate the oprofile interface conventions, let's not go there.
> In order to make this code nicer on the user, you should probably
> insert a 'cond_resched()' somewhere in the loop, maybe every
> 500 iterations or so.
> 
> it also looks like there is whitespace damage in the code here.

I will double check on the whitespace damage.  I thought I had gotten
all that out.  

I have done some quick measurements.  The above method limits the loop
to at most 2^16 iterations.  Based on running the algorithm in user
space, it takes about 3ms of computation time to do the loop 2^16 times.

At the vary least, we need to put the resched in say every 10,000
iterations which would be about every 0.5ms.  Should we do a resched
more often?  

Additionally we could up the size of the table to 512 which would reduce
the maximum time to about 1.5ms.  What do people think about increasing
the table size?

A little more general discussion about the logarithmic algorithm and
limiting the range.  The hardware supports a 24 bit LFSR value. This
means the user can say is capture a sample every N cycles, where N is in
the range of 1 to 2^24.  The OProfile user tool enforces a minimum value
of N to make sure the overhead of OProfile doesn't bring the machine to
its knees.  The minimum values is not intended to guarantee the
performance impact of OProfile will not be significant.  It is left as
an exercise for the user to pick an N that will give minimal performance
impact.  We set the lower limit for N for SPU profiling to 100,000. This
is actually high enough that we don't seem to see much performance
impact when running OProfile.  If the user picked N=2^24 then for a
3.2GHz machine you would get about 200 samples per second on each node.
Where a sample consists of the PC value for all 8 SPUs on the node.  If
the user wanted to do a relatively long OProfile run, I can see where
they might use N=2^24 to avoid gathering too much data.  My gut feeling
is that the sampling frequency for N=2^24 is not low enough that someone
would never want to use it when doing long runs.  Hence, we should not
arbitrarily reduce the maximum value for N.  Although I would expect
that the typical value for N will be in the range of several hundred
thousand to a few million.

As for using a logarithmic spacing of the precomputed values, this
approach means that the space between the precomputed values at the high
end would be much larger then 2^14, assuming 256 precomputed values.
That means it could take much longer then 3ms to get the needed LFSR
value for a large N.  By evenly spacing the precomputed values, we can
ensure that for all N it will take less then 3ms to get the value.
Personally, I am more comfortable with a hard limit on the compute time
then a variable time that could get much bigger then the 1ms threshold
that Arnd wants for resched.  Any thoughts?

> 
> > +
> > +/* This interface allows a profiler (e.g., OProfile) to store
> > + * spu_context information needed for profiling, allowing it to
> > + * be saved across context save/restore operation.
> > + *
> > + * Assumes the caller has already incremented the ref count to
> > + * profile_info; then spu_context_destroy must call kref_put
> > + * on prof_info_kref.
> > + */
> > +void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
> > +			     struct kref * prof_info_kref,
> > +			     void (* prof_info_release) (struct kref * kref))
> > +{
> > +	ctx->profile_private = profile_info;
> > +	ctx->prof_priv_kref = prof_info_kref;
> > +	ctx->prof_priv_release = prof_info_release;
> > +}
> > +EXPORT_SYMBOL_GPL(spu_set_profile_private);
> 
> I think you don't need the profile_private member here, if you just use
> container_of with ctx->prof_priv_kref in all users.
> 
> 	Arnd <><

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-15 20:21     ` Carl Love
@ 2007-02-15 21:03       ` Arnd Bergmann
  -1 siblings, 0 replies; 66+ messages in thread
From: Arnd Bergmann @ 2007-02-15 21:03 UTC (permalink / raw)
  To: Carl Love; +Cc: cbe-oss-dev, linuxppc-dev, linux-kernel, oprofile-list

On Thursday 15 February 2007 21:21, Carl Love wrote:

> I have done some quick measurements.  The above method limits the loop
> to at most 2^16 iterations.  Based on running the algorithm in user
> space, it takes about 3ms of computation time to do the loop 2^16 times.
> 
> At the vary least, we need to put the resched in say every 10,000
> iterations which would be about every 0.5ms.  Should we do a resched
> more often?  

Yes, just to be on the safe side, I'd suggest to do it every 1000
iterations.
 
> Additionally we could up the size of the table to 512 which would reduce
> the maximum time to about 1.5ms.  What do people think about increasing
> the table size?

No, that won't help too much. I'd say 256 or 128 entries is the most
we should have.

> As for using a logarithmic spacing of the precomputed values, this
> approach means that the space between the precomputed values at the high
> end would be much larger then 2^14, assuming 256 precomputed values.
> That means it could take much longer then 3ms to get the needed LFSR
> value for a large N.  By evenly spacing the precomputed values, we can
> ensure that for all N it will take less then 3ms to get the value.
> Personally, I am more comfortable with a hard limit on the compute time
> then a variable time that could get much bigger then the 1ms threshold
> that Arnd wants for resched.  Any thoughts?

When using precomputed values on a logarithmic scale, I'd recommend
just rounding to the closest value and accepting the relative inaccuracy,
instead of using the precomputed value as the base and then calculating
from there.

	Arnd <><

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-15 21:03       ` Arnd Bergmann
  0 siblings, 0 replies; 66+ messages in thread
From: Arnd Bergmann @ 2007-02-15 21:03 UTC (permalink / raw)
  To: Carl Love; +Cc: linuxppc-dev, cbe-oss-dev, oprofile-list, linux-kernel

On Thursday 15 February 2007 21:21, Carl Love wrote:

> I have done some quick measurements. =A0The above method limits the loop
> to at most 2^16 iterations. =A0Based on running the algorithm in user
> space, it takes about 3ms of computation time to do the loop 2^16 times.
>=20
> At the vary least, we need to put the resched in say every 10,000
> iterations which would be about every 0.5ms. =A0Should we do a resched
> more often? =A0

Yes, just to be on the safe side, I'd suggest to do it every 1000
iterations.
=20
> Additionally we could up the size of the table to 512 which would reduce
> the maximum time to about 1.5ms. =A0What do people think about increasing
> the table size?

No, that won't help too much. I'd say 256 or 128 entries is the most
we should have.

> As for using a logarithmic spacing of the precomputed values, this
> approach means that the space between the precomputed values at the high
> end would be much larger then 2^14, assuming 256 precomputed values.
> That means it could take much longer then 3ms to get the needed LFSR
> value for a large N. =A0By evenly spacing the precomputed values, we can
> ensure that for all N it will take less then 3ms to get the value.
> Personally, I am more comfortable with a hard limit on the compute time
> then a variable time that could get much bigger then the 1ms threshold
> that Arnd wants for resched. =A0Any thoughts?

When using precomputed values on a logarithmic scale, I'd recommend
just rounding to the closest value and accepting the relative inaccuracy,
instead of using the precomputed value as the base and then calculating
from there.

	Arnd <><

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-15 20:21     ` Carl Love
@ 2007-02-15 21:50       ` Paul E. McKenney
  -1 siblings, 0 replies; 66+ messages in thread
From: Paul E. McKenney @ 2007-02-15 21:50 UTC (permalink / raw)
  To: Carl Love
  Cc: Arnd Bergmann, linuxppc-dev, cbe-oss-dev, oprofile-list, linux-kernel

On Thu, Feb 15, 2007 at 12:21:58PM -0800, Carl Love wrote:
> On Thu, 2007-02-15 at 15:37 +0100, Arnd Bergmann wrote:

[ . . . ]

> > I agree with Milton that it would be far nicer even to calculate
> > the value from user space, but since you say that would
> > violate the oprofile interface conventions, let's not go there.
> > In order to make this code nicer on the user, you should probably
> > insert a 'cond_resched()' somewhere in the loop, maybe every
> > 500 iterations or so.
> > 
> > it also looks like there is whitespace damage in the code here.
> 
> I will double check on the whitespace damage.  I thought I had gotten
> all that out.  
> 
> I have done some quick measurements.  The above method limits the loop
> to at most 2^16 iterations.  Based on running the algorithm in user
> space, it takes about 3ms of computation time to do the loop 2^16 times.
> 
> At the vary least, we need to put the resched in say every 10,000
> iterations which would be about every 0.5ms.  Should we do a resched
> more often?  
> 
> Additionally we could up the size of the table to 512 which would reduce
> the maximum time to about 1.5ms.  What do people think about increasing
> the table size?

Is this 1.5ms with interrupts disabled?  This time period is problematic
from a realtime perspective if so -- need to be able to preempt.

						Thanx, Paul

> A little more general discussion about the logarithmic algorithm and
> limiting the range.  The hardware supports a 24 bit LFSR value. This
> means the user can say is capture a sample every N cycles, where N is in
> the range of 1 to 2^24.  The OProfile user tool enforces a minimum value
> of N to make sure the overhead of OProfile doesn't bring the machine to
> its knees.  The minimum values is not intended to guarantee the
> performance impact of OProfile will not be significant.  It is left as
> an exercise for the user to pick an N that will give minimal performance
> impact.  We set the lower limit for N for SPU profiling to 100,000. This
> is actually high enough that we don't seem to see much performance
> impact when running OProfile.  If the user picked N=2^24 then for a
> 3.2GHz machine you would get about 200 samples per second on each node.
> Where a sample consists of the PC value for all 8 SPUs on the node.  If
> the user wanted to do a relatively long OProfile run, I can see where
> they might use N=2^24 to avoid gathering too much data.  My gut feeling
> is that the sampling frequency for N=2^24 is not low enough that someone
> would never want to use it when doing long runs.  Hence, we should not
> arbitrarily reduce the maximum value for N.  Although I would expect
> that the typical value for N will be in the range of several hundred
> thousand to a few million.
> 
> As for using a logarithmic spacing of the precomputed values, this
> approach means that the space between the precomputed values at the high
> end would be much larger then 2^14, assuming 256 precomputed values.
> That means it could take much longer then 3ms to get the needed LFSR
> value for a large N.  By evenly spacing the precomputed values, we can
> ensure that for all N it will take less then 3ms to get the value.
> Personally, I am more comfortable with a hard limit on the compute time
> then a variable time that could get much bigger then the 1ms threshold
> that Arnd wants for resched.  Any thoughts?
> 
> > 
> > > +
> > > +/* This interface allows a profiler (e.g., OProfile) to store
> > > + * spu_context information needed for profiling, allowing it to
> > > + * be saved across context save/restore operation.
> > > + *
> > > + * Assumes the caller has already incremented the ref count to
> > > + * profile_info; then spu_context_destroy must call kref_put
> > > + * on prof_info_kref.
> > > + */
> > > +void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
> > > +			     struct kref * prof_info_kref,
> > > +			     void (* prof_info_release) (struct kref * kref))
> > > +{
> > > +	ctx->profile_private = profile_info;
> > > +	ctx->prof_priv_kref = prof_info_kref;
> > > +	ctx->prof_priv_release = prof_info_release;
> > > +}
> > > +EXPORT_SYMBOL_GPL(spu_set_profile_private);
> > 
> > I think you don't need the profile_private member here, if you just use
> > container_of with ctx->prof_priv_kref in all users.
> > 
> > 	Arnd <><
> 
> _______________________________________________
> cbe-oss-dev mailing list
> cbe-oss-dev@ozlabs.org
> https://ozlabs.org/mailman/listinfo/cbe-oss-dev

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-15 21:50       ` Paul E. McKenney
  0 siblings, 0 replies; 66+ messages in thread
From: Paul E. McKenney @ 2007-02-15 21:50 UTC (permalink / raw)
  To: Carl Love
  Cc: linuxppc-dev, oprofile-list, cbe-oss-dev, Arnd Bergmann, linux-kernel

On Thu, Feb 15, 2007 at 12:21:58PM -0800, Carl Love wrote:
> On Thu, 2007-02-15 at 15:37 +0100, Arnd Bergmann wrote:

[ . . . ]

> > I agree with Milton that it would be far nicer even to calculate
> > the value from user space, but since you say that would
> > violate the oprofile interface conventions, let's not go there.
> > In order to make this code nicer on the user, you should probably
> > insert a 'cond_resched()' somewhere in the loop, maybe every
> > 500 iterations or so.
> > 
> > it also looks like there is whitespace damage in the code here.
> 
> I will double check on the whitespace damage.  I thought I had gotten
> all that out.  
> 
> I have done some quick measurements.  The above method limits the loop
> to at most 2^16 iterations.  Based on running the algorithm in user
> space, it takes about 3ms of computation time to do the loop 2^16 times.
> 
> At the vary least, we need to put the resched in say every 10,000
> iterations which would be about every 0.5ms.  Should we do a resched
> more often?  
> 
> Additionally we could up the size of the table to 512 which would reduce
> the maximum time to about 1.5ms.  What do people think about increasing
> the table size?

Is this 1.5ms with interrupts disabled?  This time period is problematic
from a realtime perspective if so -- need to be able to preempt.

						Thanx, Paul

> A little more general discussion about the logarithmic algorithm and
> limiting the range.  The hardware supports a 24 bit LFSR value. This
> means the user can say is capture a sample every N cycles, where N is in
> the range of 1 to 2^24.  The OProfile user tool enforces a minimum value
> of N to make sure the overhead of OProfile doesn't bring the machine to
> its knees.  The minimum values is not intended to guarantee the
> performance impact of OProfile will not be significant.  It is left as
> an exercise for the user to pick an N that will give minimal performance
> impact.  We set the lower limit for N for SPU profiling to 100,000. This
> is actually high enough that we don't seem to see much performance
> impact when running OProfile.  If the user picked N=2^24 then for a
> 3.2GHz machine you would get about 200 samples per second on each node.
> Where a sample consists of the PC value for all 8 SPUs on the node.  If
> the user wanted to do a relatively long OProfile run, I can see where
> they might use N=2^24 to avoid gathering too much data.  My gut feeling
> is that the sampling frequency for N=2^24 is not low enough that someone
> would never want to use it when doing long runs.  Hence, we should not
> arbitrarily reduce the maximum value for N.  Although I would expect
> that the typical value for N will be in the range of several hundred
> thousand to a few million.
> 
> As for using a logarithmic spacing of the precomputed values, this
> approach means that the space between the precomputed values at the high
> end would be much larger then 2^14, assuming 256 precomputed values.
> That means it could take much longer then 3ms to get the needed LFSR
> value for a large N.  By evenly spacing the precomputed values, we can
> ensure that for all N it will take less then 3ms to get the value.
> Personally, I am more comfortable with a hard limit on the compute time
> then a variable time that could get much bigger then the 1ms threshold
> that Arnd wants for resched.  Any thoughts?
> 
> > 
> > > +
> > > +/* This interface allows a profiler (e.g., OProfile) to store
> > > + * spu_context information needed for profiling, allowing it to
> > > + * be saved across context save/restore operation.
> > > + *
> > > + * Assumes the caller has already incremented the ref count to
> > > + * profile_info; then spu_context_destroy must call kref_put
> > > + * on prof_info_kref.
> > > + */
> > > +void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
> > > +			     struct kref * prof_info_kref,
> > > +			     void (* prof_info_release) (struct kref * kref))
> > > +{
> > > +	ctx->profile_private = profile_info;
> > > +	ctx->prof_priv_kref = prof_info_kref;
> > > +	ctx->prof_priv_release = prof_info_release;
> > > +}
> > > +EXPORT_SYMBOL_GPL(spu_set_profile_private);
> > 
> > I think you don't need the profile_private member here, if you just use
> > container_of with ctx->prof_priv_kref in all users.
> > 
> > 	Arnd <><
> 
> _______________________________________________
> cbe-oss-dev mailing list
> cbe-oss-dev@ozlabs.org
> https://ozlabs.org/mailman/listinfo/cbe-oss-dev

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-15 14:37   ` Arnd Bergmann
@ 2007-02-16  0:32     ` Maynard Johnson
  -1 siblings, 0 replies; 66+ messages in thread
From: Maynard Johnson @ 2007-02-16  0:32 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: cbe-oss-dev, linuxppc-dev, linux-kernel, oprofile-list, Carl Love

Arnd Bergmann wrote:

>On Thursday 15 February 2007 00:52, Carl Love wrote:
>
>
>  
>
>>--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Kconfig	2007-01-18 16:43:14.000000000 -0600
>>+++ linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig	2007-02-13 19:04:46.271028904 -0600
>>@@ -7,7 +7,8 @@
>> 
>> config OPROFILE
>> 	tristate "OProfile system profiling (EXPERIMENTAL)"
>>-	depends on PROFILING
>>+	default m
>>+	depends on SPU_FS && PROFILING
>> 	help
>> 	  OProfile is a profiling system capable of profiling the
>> 	  whole system, include the kernel, kernel modules, libraries,
>>    
>>
>
>Milton already commented on this being wrong. I think what you want
>is
>	depends on PROFILING && (SPU_FS = n || SPU_FS)
>
>that should make sure that when SPU_FS=y that OPROFILE can not be 'm'.
>  
>
The above suggestion would not work if SPU_FS is not defined, since the 
entire config option is ignored if an undefined symbol is used.  So, 
here's what I propose instead: 
    - Leave the existing 'config OPROFILE' unchanged from its current 
form in mainline (shown below)
    - Add the new 'config OPROFILE_CELL' (shown below)
    - In arch/powerpc/configs/cell-defconfig, set CONFIG_OPROFILE=m, to 
correspond to setting for CONFIG_SPU_FS
    - In arch/powerpc/oprofile/Makefile, do the following:
                oprofile-$(CONFIG_OPROFILE_CELL) += op_model_cell.o \
                                                            
cell/spu_profiler.o cell/vma_map.o cell/spu_task_sync.o

===========
config OPROFILE
        tristate "OProfile system profiling (EXPERIMENTAL)"
        depends on PROFILING
        help
          OProfile is a profiling system capable of profiling the
          whole system, include the kernel, kernel modules, libraries,
          and applications.

          If unsure, say N.

config OPROFILE_CELL
        bool "OProfile for Cell Broadband Engine"
        depends on OPROFILE && SPU_FS
        default y if ((SPU_FS = y && OPROFILE = y) || (SPU_FS = m && 
OPROFILE = m))
        help
          Profiling of Cell BE SPUs requires special support enabled
          by this option.  Both SPU_FS and OPROFILE options must be
          set 'y' or both be set 'm'.
=============

Can anyone see a problem with any of this . . . or perhaps a suggestion 
of a better way?

Thanks.

-Maynard


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-16  0:32     ` Maynard Johnson
  0 siblings, 0 replies; 66+ messages in thread
From: Maynard Johnson @ 2007-02-16  0:32 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: linuxppc-dev, Carl Love, cbe-oss-dev, oprofile-list, linux-kernel

Arnd Bergmann wrote:

>On Thursday 15 February 2007 00:52, Carl Love wrote:
>
>
>  
>
>>--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Kconfig	2007-01-18 16:43:14.000000000 -0600
>>+++ linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig	2007-02-13 19:04:46.271028904 -0600
>>@@ -7,7 +7,8 @@
>> 
>> config OPROFILE
>> 	tristate "OProfile system profiling (EXPERIMENTAL)"
>>-	depends on PROFILING
>>+	default m
>>+	depends on SPU_FS && PROFILING
>> 	help
>> 	  OProfile is a profiling system capable of profiling the
>> 	  whole system, include the kernel, kernel modules, libraries,
>>    
>>
>
>Milton already commented on this being wrong. I think what you want
>is
>	depends on PROFILING && (SPU_FS = n || SPU_FS)
>
>that should make sure that when SPU_FS=y that OPROFILE can not be 'm'.
>  
>
The above suggestion would not work if SPU_FS is not defined, since the 
entire config option is ignored if an undefined symbol is used.  So, 
here's what I propose instead: 
    - Leave the existing 'config OPROFILE' unchanged from its current 
form in mainline (shown below)
    - Add the new 'config OPROFILE_CELL' (shown below)
    - In arch/powerpc/configs/cell-defconfig, set CONFIG_OPROFILE=m, to 
correspond to setting for CONFIG_SPU_FS
    - In arch/powerpc/oprofile/Makefile, do the following:
                oprofile-$(CONFIG_OPROFILE_CELL) += op_model_cell.o \
                                                            
cell/spu_profiler.o cell/vma_map.o cell/spu_task_sync.o

===========
config OPROFILE
        tristate "OProfile system profiling (EXPERIMENTAL)"
        depends on PROFILING
        help
          OProfile is a profiling system capable of profiling the
          whole system, include the kernel, kernel modules, libraries,
          and applications.

          If unsure, say N.

config OPROFILE_CELL
        bool "OProfile for Cell Broadband Engine"
        depends on OPROFILE && SPU_FS
        default y if ((SPU_FS = y && OPROFILE = y) || (SPU_FS = m && 
OPROFILE = m))
        help
          Profiling of Cell BE SPUs requires special support enabled
          by this option.  Both SPU_FS and OPROFILE options must be
          set 'y' or both be set 'm'.
=============

Can anyone see a problem with any of this . . . or perhaps a suggestion 
of a better way?

Thanks.

-Maynard

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-15 21:50       ` Paul E. McKenney
@ 2007-02-16  0:33         ` Arnd Bergmann
  -1 siblings, 0 replies; 66+ messages in thread
From: Arnd Bergmann @ 2007-02-16  0:33 UTC (permalink / raw)
  To: paulmck; +Cc: Carl Love, linuxppc-dev, cbe-oss-dev, oprofile-list, linux-kernel

On Thursday 15 February 2007 22:50, Paul E. McKenney wrote:
> Is this 1.5ms with interrupts disabled?  This time period is problematic
> from a realtime perspective if so -- need to be able to preempt.

No, interrupts should be enabled here. Still, 1.5ms is probably a little
too long without a cond_resched() in case kernel preemption is disabled.

	Arnd <><

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-16  0:33         ` Arnd Bergmann
  0 siblings, 0 replies; 66+ messages in thread
From: Arnd Bergmann @ 2007-02-16  0:33 UTC (permalink / raw)
  To: paulmck; +Cc: linuxppc-dev, linux-kernel, cbe-oss-dev, oprofile-list, Carl Love

On Thursday 15 February 2007 22:50, Paul E. McKenney wrote:
> Is this 1.5ms with interrupts disabled? =A0This time period is problematic
> from a realtime perspective if so -- need to be able to preempt.

No, interrupts should be enabled here. Still, 1.5ms is probably a little
too long without a cond_resched() in case kernel preemption is disabled.

	Arnd <><

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-16  0:32     ` Maynard Johnson
@ 2007-02-16 17:14       ` Arnd Bergmann
  -1 siblings, 0 replies; 66+ messages in thread
From: Arnd Bergmann @ 2007-02-16 17:14 UTC (permalink / raw)
  To: Maynard Johnson
  Cc: cbe-oss-dev, linuxppc-dev, linux-kernel, oprofile-list, Carl Love

On Friday 16 February 2007 01:32, Maynard Johnson wrote:
> config OPROFILE_CELL
>         bool "OProfile for Cell Broadband Engine"
>         depends on OPROFILE && SPU_FS
>         default y if ((SPU_FS = y && OPROFILE = y) || (SPU_FS = m && 
> OPROFILE = m))
>         help
>           Profiling of Cell BE SPUs requires special support enabled
>           by this option.  Both SPU_FS and OPROFILE options must be
>           set 'y' or both be set 'm'.
> =============
> 
> Can anyone see a problem with any of this . . . or perhaps a suggestion 
> of a better way?

The text suggests it doesn't allow SPU_FS=y with OPROFILE=m, which I think
should be allowed. I also don't see any place in the code where you actually
use CONFIG_OPROFILE_CELL.

Ideally, you should be able to have an oprofile_spu module that can be
loaded after spufs.ko and oprofile.ko. In that case you only need

config OPROFILE_SPU
	depends on OPROFILE && SPU_FS
	default y

and it will automatically build oprofile_spu as a module if one of the two
is a module and won't build it if one of them is disabled.

	Arnd <><

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-16 17:14       ` Arnd Bergmann
  0 siblings, 0 replies; 66+ messages in thread
From: Arnd Bergmann @ 2007-02-16 17:14 UTC (permalink / raw)
  To: Maynard Johnson
  Cc: linuxppc-dev, Carl Love, cbe-oss-dev, oprofile-list, linux-kernel

On Friday 16 February 2007 01:32, Maynard Johnson wrote:
> config OPROFILE_CELL
> =A0 =A0 =A0 =A0 bool "OProfile for Cell Broadband Engine"
> =A0 =A0 =A0 =A0 depends on OPROFILE && SPU_FS
> =A0 =A0 =A0 =A0 default y if ((SPU_FS =3D y && OPROFILE =3D y) || (SPU_FS=
 =3D m &&=20
> OPROFILE =3D m))
> =A0 =A0 =A0 =A0 help
> =A0 =A0 =A0 =A0 =A0 Profiling of Cell BE SPUs requires special support en=
abled
> =A0 =A0 =A0 =A0 =A0 by this option. =A0Both SPU_FS and OPROFILE options m=
ust be
> =A0 =A0 =A0 =A0 =A0 set 'y' or both be set 'm'.
> =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
>=20
> Can anyone see a problem with any of this . . . or perhaps a suggestion=20
> of a better way?

The text suggests it doesn't allow SPU_FS=3Dy with OPROFILE=3Dm, which I th=
ink
should be allowed. I also don't see any place in the code where you actually
use CONFIG_OPROFILE_CELL.

Ideally, you should be able to have an oprofile_spu module that can be
loaded after spufs.ko and oprofile.ko. In that case you only need

config OPROFILE_SPU
	depends on OPROFILE && SPU_FS
	default y

and it will automatically build oprofile_spu as a module if one of the two
is a module and won't build it if one of them is disabled.

	Arnd <><

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-16 17:14       ` Arnd Bergmann
@ 2007-02-16 21:43         ` Maynard Johnson
  -1 siblings, 0 replies; 66+ messages in thread
From: Maynard Johnson @ 2007-02-16 21:43 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: cbe-oss-dev, linuxppc-dev, linux-kernel, oprofile-list, Carl Love

Arnd Bergmann wrote:

> On Friday 16 February 2007 01:32, Maynard Johnson wrote:
> 
>>config OPROFILE_CELL
>>        bool "OProfile for Cell Broadband Engine"
>>        depends on OPROFILE && SPU_FS
>>        default y if ((SPU_FS = y && OPROFILE = y) || (SPU_FS = m && 
>>OPROFILE = m))
>>        help
>>          Profiling of Cell BE SPUs requires special support enabled
>>          by this option.  Both SPU_FS and OPROFILE options must be
>>          set 'y' or both be set 'm'.
>>=============
>>
>>Can anyone see a problem with any of this . . . or perhaps a suggestion 
>>of a better way?
> 
> 
> The text suggests it doesn't allow SPU_FS=y with OPROFILE=m, which I think
> should be allowed. 
Right, good catch.  I'll add another OR to the 'default y' and correct 
the text.

 > I also don't see any place in the code where you actually
> use CONFIG_OPROFILE_CELL.
As I mentioned, I will use CONFIG_OPROFILE_CELL in the 
arch/powerpc/oprofile/Makefile as follows:
      oprofile-$(CONFIG_OPROFILE_CELL) += op_model_cell.o \
               cell/spu_profiler.o cell/vma_map.o cell/spu_task_sync.o

> 
> Ideally, you should be able to have an oprofile_spu module that can be
> loaded after spufs.ko and oprofile.ko. In that case you only need
> 
> config OPROFILE_SPU
> 	depends on OPROFILE && SPU_FS
> 	default y
> 
> and it will automatically build oprofile_spu as a module if one of the two
> is a module and won't build it if one of them is disabled.
Hmmm . . . I guess that would entail splitting out the SPU-related stuff 
from op_model_cell.c into a new file.  Maybe more -- that's just what 
comes to mind right now.  Could be very tricky, and I wonder if it's 
worth the bother.
> 
> 	Arnd <><



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-16 21:43         ` Maynard Johnson
  0 siblings, 0 replies; 66+ messages in thread
From: Maynard Johnson @ 2007-02-16 21:43 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: linuxppc-dev, Carl Love, cbe-oss-dev, oprofile-list, linux-kernel

Arnd Bergmann wrote:

> On Friday 16 February 2007 01:32, Maynard Johnson wrote:
> 
>>config OPROFILE_CELL
>>        bool "OProfile for Cell Broadband Engine"
>>        depends on OPROFILE && SPU_FS
>>        default y if ((SPU_FS = y && OPROFILE = y) || (SPU_FS = m && 
>>OPROFILE = m))
>>        help
>>          Profiling of Cell BE SPUs requires special support enabled
>>          by this option.  Both SPU_FS and OPROFILE options must be
>>          set 'y' or both be set 'm'.
>>=============
>>
>>Can anyone see a problem with any of this . . . or perhaps a suggestion 
>>of a better way?
> 
> 
> The text suggests it doesn't allow SPU_FS=y with OPROFILE=m, which I think
> should be allowed. 
Right, good catch.  I'll add another OR to the 'default y' and correct 
the text.

 > I also don't see any place in the code where you actually
> use CONFIG_OPROFILE_CELL.
As I mentioned, I will use CONFIG_OPROFILE_CELL in the 
arch/powerpc/oprofile/Makefile as follows:
      oprofile-$(CONFIG_OPROFILE_CELL) += op_model_cell.o \
               cell/spu_profiler.o cell/vma_map.o cell/spu_task_sync.o

> 
> Ideally, you should be able to have an oprofile_spu module that can be
> loaded after spufs.ko and oprofile.ko. In that case you only need
> 
> config OPROFILE_SPU
> 	depends on OPROFILE && SPU_FS
> 	default y
> 
> and it will automatically build oprofile_spu as a module if one of the two
> is a module and won't build it if one of them is disabled.
Hmmm . . . I guess that would entail splitting out the SPU-related stuff 
from op_model_cell.c into a new file.  Maybe more -- that's just what 
comes to mind right now.  Could be very tricky, and I wonder if it's 
worth the bother.
> 
> 	Arnd <><

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-16 21:43         ` Maynard Johnson
@ 2007-02-18 23:18           ` Maynard Johnson
  -1 siblings, 0 replies; 66+ messages in thread
From: Maynard Johnson @ 2007-02-18 23:18 UTC (permalink / raw)
  To: maynardj
  Cc: Arnd Bergmann, linuxppc-dev, Carl Love, cbe-oss-dev,
	oprofile-list, linux-kernel

Maynard Johnson wrote:

>Arnd Bergmann wrote:
>
>  
>
>>On Friday 16 February 2007 01:32, Maynard Johnson wrote:
>>
>>    
>>
>>>config OPROFILE_CELL
>>>       bool "OProfile for Cell Broadband Engine"
>>>       depends on OPROFILE && SPU_FS
>>>       default y if ((SPU_FS = y && OPROFILE = y) || (SPU_FS = m && 
>>>OPROFILE = m))
>>>       help
>>>         Profiling of Cell BE SPUs requires special support enabled
>>>         by this option.  Both SPU_FS and OPROFILE options must be
>>>         set 'y' or both be set 'm'.
>>>=============
>>>
>>>Can anyone see a problem with any of this . . . or perhaps a suggestion 
>>>of a better way?
>>>      
>>>
>>The text suggests it doesn't allow SPU_FS=y with OPROFILE=m, which I think
>>should be allowed. 
>>    
>>
>Right, good catch.  I'll add another OR to the 'default y' and correct 
>the text.
>  
>
Actually, it makes more sense to do the following:

config OPROFILE_CELL
       bool "OProfile for Cell Broadband Engine"
       depends on  (SPU_FS = y && OPROFILE = m) || (SPU_FS = y && 
OPROFILE = y) || (SPU_FS = m && OPROFILE = m)
       default y
       help
         Profiling of Cell BE SPUs requires special support enabled by 
this option.

> > I also don't see any place in the code where you actually
>  
>
>>use CONFIG_OPROFILE_CELL.
>>    
>>
>As I mentioned, I will use CONFIG_OPROFILE_CELL in the 
>arch/powerpc/oprofile/Makefile as follows:
>      oprofile-$(CONFIG_OPROFILE_CELL) += op_model_cell.o \
>               cell/spu_profiler.o cell/vma_map.o cell/spu_task_sync.o
>
>  
>
>
[snip]

>>	Arnd <><
>>    
>>
>
>
>_______________________________________________
>Linuxppc-dev mailing list
>Linuxppc-dev@ozlabs.org
>https://ozlabs.org/mailman/listinfo/linuxppc-dev
>  
>



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-18 23:18           ` Maynard Johnson
  0 siblings, 0 replies; 66+ messages in thread
From: Maynard Johnson @ 2007-02-18 23:18 UTC (permalink / raw)
  To: maynardj
  Cc: Arnd Bergmann, linux-kernel, linuxppc-dev, oprofile-list,
	cbe-oss-dev, Carl Love

Maynard Johnson wrote:

>Arnd Bergmann wrote:
>
>  
>
>>On Friday 16 February 2007 01:32, Maynard Johnson wrote:
>>
>>    
>>
>>>config OPROFILE_CELL
>>>       bool "OProfile for Cell Broadband Engine"
>>>       depends on OPROFILE && SPU_FS
>>>       default y if ((SPU_FS = y && OPROFILE = y) || (SPU_FS = m && 
>>>OPROFILE = m))
>>>       help
>>>         Profiling of Cell BE SPUs requires special support enabled
>>>         by this option.  Both SPU_FS and OPROFILE options must be
>>>         set 'y' or both be set 'm'.
>>>=============
>>>
>>>Can anyone see a problem with any of this . . . or perhaps a suggestion 
>>>of a better way?
>>>      
>>>
>>The text suggests it doesn't allow SPU_FS=y with OPROFILE=m, which I think
>>should be allowed. 
>>    
>>
>Right, good catch.  I'll add another OR to the 'default y' and correct 
>the text.
>  
>
Actually, it makes more sense to do the following:

config OPROFILE_CELL
       bool "OProfile for Cell Broadband Engine"
       depends on  (SPU_FS = y && OPROFILE = m) || (SPU_FS = y && 
OPROFILE = y) || (SPU_FS = m && OPROFILE = m)
       default y
       help
         Profiling of Cell BE SPUs requires special support enabled by 
this option.

> > I also don't see any place in the code where you actually
>  
>
>>use CONFIG_OPROFILE_CELL.
>>    
>>
>As I mentioned, I will use CONFIG_OPROFILE_CELL in the 
>arch/powerpc/oprofile/Makefile as follows:
>      oprofile-$(CONFIG_OPROFILE_CELL) += op_model_cell.o \
>               cell/spu_profiler.o cell/vma_map.o cell/spu_task_sync.o
>
>  
>
>
[snip]

>>	Arnd <><
>>    
>>
>
>
>_______________________________________________
>Linuxppc-dev mailing list
>Linuxppc-dev@ozlabs.org
>https://ozlabs.org/mailman/listinfo/linuxppc-dev
>  
>

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-27 16:52     ` Maynard Johnson
@ 2007-02-28  1:44       ` Arnd Bergmann
  -1 siblings, 0 replies; 66+ messages in thread
From: Arnd Bergmann @ 2007-02-28  1:44 UTC (permalink / raw)
  To: cbe-oss-dev
  Cc: Maynard Johnson, Gerhard Stenzel, linux-kernel, linuxppc-dev,
	Mike Perks, oprofile-list

On Tuesday 27 February 2007, Maynard Johnson wrote:
> I have applied the "cleanup" patch that Arnd sent, but had to fix up a 
> few things:
>    -  Bug fix:  Initialize retval in spu_task_sync.c, line 95, otherwise 
> OProfile this function returns non-zero and OProfile fails.
>    -  Remove unused codes in include/linux/oprofile.h
>    -  Compile warnings:  Initialize offset and spu_cookie at lines 283 
> and 284 in spu_task_sync.c
> 
> With these changes and some userspace changes that were necessary to 
> correspond with Arnd's changes, our testing was successful.
> 
> A fixup patch is attached.
> 

The patch does not contain any of the metadata I need to apply it
(subject, description, signed-off-by).

> @@ -280,8 +280,8 @@ static int process_context_switch(struct
>  {
>         unsigned long flags;
>         int retval;
> -       unsigned int offset;
> -       unsigned long spu_cookie, app_dcookie;
> +       unsigned int offset = 0;
> +       unsigned long spu_cookie = 0, app_dcookie;
>         retval = prepare_cached_spu_info(spu, objectId);
>         if (retval)
>                 goto out;

No, this is wrong. Leaving the variables uninitialized at least warns
you about the bug you have in this function: when there is anything wrong,
you just continue writing the record with zero offset and dcookie values
in it. Instead, you should get handle the error condition somewhere
down the code.

It's harmless most of the time, but you really should not be painting
over your bugs by blindly initializing variables.

> diff -paur linux-orig/include/linux/oprofile.h linux-new/include/linux/oprofile.h
> --- linux-orig/include/linux/oprofile.h 2007-02-27 14:41:29.000000000 -0600
> +++ linux-new/include/linux/oprofile.h  2007-02-27 14:43:18.000000000 -0600
> @@ -36,9 +36,6 @@
>  #define XEN_ENTER_SWITCH_CODE          10
>  #define SPU_PROFILING_CODE             11
>  #define SPU_CTX_SWITCH_CODE            12
> -#define SPU_OFFSET_CODE                13
> -#define SPU_COOKIE_CODE                14
> -#define SPU_SHLIB_COOKIE_CODE          15
>  
>  struct super_block;
>  struct dentry;
> 
Right, I forgot about this.

	Arnd <><


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-28  1:44       ` Arnd Bergmann
  0 siblings, 0 replies; 66+ messages in thread
From: Arnd Bergmann @ 2007-02-28  1:44 UTC (permalink / raw)
  To: cbe-oss-dev
  Cc: Gerhard Stenzel, linux-kernel, linuxppc-dev, Mike Perks, oprofile-list

On Tuesday 27 February 2007, Maynard Johnson wrote:
> I have applied the "cleanup" patch that Arnd sent, but had to fix up a=20
> few things:
> =A0 =A0- =A0Bug fix: =A0Initialize retval in spu_task_sync.c, line 95, ot=
herwise=20
> OProfile this function returns non-zero and OProfile fails.
> =A0 =A0- =A0Remove unused codes in include/linux/oprofile.h
> =A0 =A0- =A0Compile warnings: =A0Initialize offset and spu_cookie at line=
s 283=20
> and 284 in spu_task_sync.c
>=20
> With these changes and some userspace changes that were necessary to=20
> correspond with Arnd's changes, our testing was successful.
>=20
> A fixup patch is attached.
>=20

The patch does not contain any of the metadata I need to apply it
(subject, description, signed-off-by).

> @@ -280,8 +280,8 @@ static int process_context_switch(struct
>  {
>         unsigned long flags;
>         int retval;
> -       unsigned int offset;
> -       unsigned long spu_cookie, app_dcookie;
> +       unsigned int offset =3D 0;
> +       unsigned long spu_cookie =3D 0, app_dcookie;
>         retval =3D prepare_cached_spu_info(spu, objectId);
>         if (retval)
>                 goto out;

No, this is wrong. Leaving the variables uninitialized at least warns
you about the bug you have in this function: when there is anything wrong,
you just continue writing the record with zero offset and dcookie values
in it. Instead, you should get handle the error condition somewhere
down the code.

It's harmless most of the time, but you really should not be painting
over your bugs by blindly initializing variables.

> diff -paur linux-orig/include/linux/oprofile.h linux-new/include/linux/op=
rofile.h
> --- linux-orig/include/linux/oprofile.h 2007-02-27 14:41:29.000000000 -06=
00
> +++ linux-new/include/linux/oprofile.h  2007-02-27 14:43:18.000000000 -06=
00
> @@ -36,9 +36,6 @@
>  #define XEN_ENTER_SWITCH_CODE          10
>  #define SPU_PROFILING_CODE             11
>  #define SPU_CTX_SWITCH_CODE            12
> -#define SPU_OFFSET_CODE                13
> -#define SPU_COOKIE_CODE                14
> -#define SPU_SHLIB_COOKIE_CODE          15
> =20
>  struct super_block;
>  struct dentry;
>=20
Right, I forgot about this.

	Arnd <><

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-26 23:50   ` Arnd Bergmann
@ 2007-02-27 16:52     ` Maynard Johnson
  -1 siblings, 0 replies; 66+ messages in thread
From: Maynard Johnson @ 2007-02-27 16:52 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: linuxppc-dev, linux-kernel, cbe-oss-dev, oprofile-list,
	Gerhard Stenzel, Mike Perks

[-- Attachment #1: Type: text/plain, Size: 1279 bytes --]

I have applied the "cleanup" patch that Arnd sent, but had to fix up a 
few things:
   -  Bug fix:  Initialize retval in spu_task_sync.c, line 95, otherwise 
OProfile this function returns non-zero and OProfile fails.
   -  Remove unused codes in include/linux/oprofile.h
   -  Compile warnings:  Initialize offset and spu_cookie at lines 283 
and 284 in spu_task_sync.c

With these changes and some userspace changes that were necessary to 
correspond with Arnd's changes, our testing was successful.

A fixup patch is attached.

P.S.  We have a single patch with all these changes applied if anyone 
would like us to post it.

-Maynard


Arnd Bergmann wrote:

>On Thursday 22 February 2007, Carl Love wrote:
>  
>
>>This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
>>to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
>>was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
>>code.
>>    
>>
>
>There was a significant amount of whitespace breakage in this patch,
>which I cleaned up. The patch below consists of the other things
>I changed as a further cleanup. Note that I changed the format
>of the context switch record, which I found too complicated, as
>I described on IRC last week.
>
>	Arnd <><
>
>  
>


[-- Attachment #2: fixups-to-arnd-oprof_spu.patch --]
[-- Type: text/x-diff, Size: 1416 bytes --]

diff -paur linux-orig/arch/powerpc/oprofile/cell/spu_task_sync.c linux-new/arch/powerpc/oprofile/cell/spu_task_sync.c
--- linux-orig/arch/powerpc/oprofile/cell/spu_task_sync.c	2007-02-27 17:10:24.000000000 -0600
+++ linux-new/arch/powerpc/oprofile/cell/spu_task_sync.c	2007-02-27 17:08:57.000000000 -0600
@@ -92,7 +92,7 @@ prepare_cached_spu_info(struct spu * spu
 {
 	unsigned long flags;
 	struct vma_to_fileoffset_map * new_map;
-	int retval;
+	int retval = 0;
 	struct cached_info * info;
 
         /* We won't bother getting cache_lock here since
@@ -280,8 +280,8 @@ static int process_context_switch(struct
 {
 	unsigned long flags;
 	int retval;
-	unsigned int offset;
-	unsigned long spu_cookie, app_dcookie;
+	unsigned int offset = 0;
+	unsigned long spu_cookie = 0, app_dcookie;
 	retval = prepare_cached_spu_info(spu, objectId);
 	if (retval)
 		goto out;
diff -paur linux-orig/include/linux/oprofile.h linux-new/include/linux/oprofile.h
--- linux-orig/include/linux/oprofile.h	2007-02-27 14:41:29.000000000 -0600
+++ linux-new/include/linux/oprofile.h	2007-02-27 14:43:18.000000000 -0600
@@ -36,9 +36,6 @@
 #define XEN_ENTER_SWITCH_CODE          10
 #define SPU_PROFILING_CODE             11
 #define SPU_CTX_SWITCH_CODE            12
-#define SPU_OFFSET_CODE                13
-#define SPU_COOKIE_CODE                14
-#define SPU_SHLIB_COOKIE_CODE          15
 
 struct super_block;
 struct dentry;

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-27 16:52     ` Maynard Johnson
  0 siblings, 0 replies; 66+ messages in thread
From: Maynard Johnson @ 2007-02-27 16:52 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Gerhard Stenzel, linux-kernel, linuxppc-dev, Mike Perks,
	oprofile-list, cbe-oss-dev

[-- Attachment #1: Type: text/plain, Size: 1279 bytes --]

I have applied the "cleanup" patch that Arnd sent, but had to fix up a 
few things:
   -  Bug fix:  Initialize retval in spu_task_sync.c, line 95, otherwise 
OProfile this function returns non-zero and OProfile fails.
   -  Remove unused codes in include/linux/oprofile.h
   -  Compile warnings:  Initialize offset and spu_cookie at lines 283 
and 284 in spu_task_sync.c

With these changes and some userspace changes that were necessary to 
correspond with Arnd's changes, our testing was successful.

A fixup patch is attached.

P.S.  We have a single patch with all these changes applied if anyone 
would like us to post it.

-Maynard


Arnd Bergmann wrote:

>On Thursday 22 February 2007, Carl Love wrote:
>  
>
>>This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
>>to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
>>was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
>>code.
>>    
>>
>
>There was a significant amount of whitespace breakage in this patch,
>which I cleaned up. The patch below consists of the other things
>I changed as a further cleanup. Note that I changed the format
>of the context switch record, which I found too complicated, as
>I described on IRC last week.
>
>	Arnd <><
>
>  
>


[-- Attachment #2: fixups-to-arnd-oprof_spu.patch --]
[-- Type: text/x-diff, Size: 1416 bytes --]

diff -paur linux-orig/arch/powerpc/oprofile/cell/spu_task_sync.c linux-new/arch/powerpc/oprofile/cell/spu_task_sync.c
--- linux-orig/arch/powerpc/oprofile/cell/spu_task_sync.c	2007-02-27 17:10:24.000000000 -0600
+++ linux-new/arch/powerpc/oprofile/cell/spu_task_sync.c	2007-02-27 17:08:57.000000000 -0600
@@ -92,7 +92,7 @@ prepare_cached_spu_info(struct spu * spu
 {
 	unsigned long flags;
 	struct vma_to_fileoffset_map * new_map;
-	int retval;
+	int retval = 0;
 	struct cached_info * info;
 
         /* We won't bother getting cache_lock here since
@@ -280,8 +280,8 @@ static int process_context_switch(struct
 {
 	unsigned long flags;
 	int retval;
-	unsigned int offset;
-	unsigned long spu_cookie, app_dcookie;
+	unsigned int offset = 0;
+	unsigned long spu_cookie = 0, app_dcookie;
 	retval = prepare_cached_spu_info(spu, objectId);
 	if (retval)
 		goto out;
diff -paur linux-orig/include/linux/oprofile.h linux-new/include/linux/oprofile.h
--- linux-orig/include/linux/oprofile.h	2007-02-27 14:41:29.000000000 -0600
+++ linux-new/include/linux/oprofile.h	2007-02-27 14:43:18.000000000 -0600
@@ -36,9 +36,6 @@
 #define XEN_ENTER_SWITCH_CODE          10
 #define SPU_PROFILING_CODE             11
 #define SPU_CTX_SWITCH_CODE            12
-#define SPU_OFFSET_CODE                13
-#define SPU_COOKIE_CODE                14
-#define SPU_SHLIB_COOKIE_CODE          15
 
 struct super_block;
 struct dentry;

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-26 23:50   ` Arnd Bergmann
@ 2007-02-27  1:31     ` Michael Ellerman
  -1 siblings, 0 replies; 66+ messages in thread
From: Michael Ellerman @ 2007-02-27  1:31 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linuxppc-dev, linux-kernel, cbe-oss-dev, oprofile-list

[-- Attachment #1: Type: text/plain, Size: 1551 bytes --]

On Tue, 2007-02-27 at 00:50 +0100, Arnd Bergmann wrote:
> On Thursday 22 February 2007, Carl Love wrote:
> > This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
> > to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
> > was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
> > code.
> 
> There was a significant amount of whitespace breakage in this patch,
> which I cleaned up. The patch below consists of the other things
> I changed as a further cleanup. Note that I changed the format
> of the context switch record, which I found too complicated, as
> I described on IRC last week.
> 
> 	Arnd <><
> 
> --
> Subject: cleanup spu oprofile code
> 
> From: Arnd Bergmann <arnd.bergmann@de.ibm.com>
> This cleans up some of the new oprofile code. It's mostly
> cosmetic changes, like way multi-line comments are formatted.
> The most significant change is a simplification of the
> context-switch record format.
> 
> It does mean the oprofile report tool needs to be adapted,
> but I'm sure that it pays off in the end.

I hate to be a stickler, but this patch is quite large, contains
multiple changes, and mixes formatting changes with functional
changes ... makes it a little hard to review :/

cheers

-- 
Michael Ellerman
OzLabs, IBM Australia Development Lab

wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)

We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-27  1:31     ` Michael Ellerman
  0 siblings, 0 replies; 66+ messages in thread
From: Michael Ellerman @ 2007-02-27  1:31 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linuxppc-dev, linux-kernel, oprofile-list, cbe-oss-dev

[-- Attachment #1: Type: text/plain, Size: 1551 bytes --]

On Tue, 2007-02-27 at 00:50 +0100, Arnd Bergmann wrote:
> On Thursday 22 February 2007, Carl Love wrote:
> > This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
> > to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
> > was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
> > code.
> 
> There was a significant amount of whitespace breakage in this patch,
> which I cleaned up. The patch below consists of the other things
> I changed as a further cleanup. Note that I changed the format
> of the context switch record, which I found too complicated, as
> I described on IRC last week.
> 
> 	Arnd <><
> 
> --
> Subject: cleanup spu oprofile code
> 
> From: Arnd Bergmann <arnd.bergmann@de.ibm.com>
> This cleans up some of the new oprofile code. It's mostly
> cosmetic changes, like way multi-line comments are formatted.
> The most significant change is a simplification of the
> context-switch record format.
> 
> It does mean the oprofile report tool needs to be adapted,
> but I'm sure that it pays off in the end.

I hate to be a stickler, but this patch is quite large, contains
multiple changes, and mixes formatting changes with functional
changes ... makes it a little hard to review :/

cheers

-- 
Michael Ellerman
OzLabs, IBM Australia Development Lab

wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)

We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-22  0:02 Carl Love
@ 2007-02-26 23:50   ` Arnd Bergmann
  0 siblings, 0 replies; 66+ messages in thread
From: Arnd Bergmann @ 2007-02-26 23:50 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Carl Love, cbe-oss-dev, linux-kernel, oprofile-list

On Thursday 22 February 2007, Carl Love wrote:
> This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
> to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
> was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
> code.

There was a significant amount of whitespace breakage in this patch,
which I cleaned up. The patch below consists of the other things
I changed as a further cleanup. Note that I changed the format
of the context switch record, which I found too complicated, as
I described on IRC last week.

	Arnd <><

--
Subject: cleanup spu oprofile code

From: Arnd Bergmann <arnd.bergmann@de.ibm.com>
This cleans up some of the new oprofile code. It's mostly
cosmetic changes, like way multi-line comments are formatted.
The most significant change is a simplification of the
context-switch record format.

It does mean the oprofile report tool needs to be adapted,
but I'm sure that it pays off in the end.

Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
Index: linux-2.6/arch/powerpc/oprofile/cell/spu_task_sync.c
===================================================================
--- linux-2.6.orig/arch/powerpc/oprofile/cell/spu_task_sync.c
+++ linux-2.6/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -61,11 +61,12 @@ static void destroy_cached_info(struct k
 static struct cached_info * get_cached_info(struct spu * the_spu, int spu_num)
 {
 	struct kref * ref;
-	struct cached_info * ret_info = NULL;
+	struct cached_info * ret_info;
 	if (spu_num >= num_spu_nodes) {
 		printk(KERN_ERR "SPU_PROF: "
 		       "%s, line %d: Invalid index %d into spu info cache\n",
 		       __FUNCTION__, __LINE__, spu_num);
+		ret_info = NULL;
 		goto out;
 	}
 	if (!spu_info[spu_num] && the_spu) {
@@ -89,9 +90,9 @@ static struct cached_info * get_cached_i
 static int
 prepare_cached_spu_info(struct spu * spu, unsigned int objectId)
 {
-	unsigned long flags = 0;
+	unsigned long flags;
 	struct vma_to_fileoffset_map * new_map;
-	int retval = 0;
+	int retval;
 	struct cached_info * info;
 
 	/* We won't bother getting cache_lock here since
@@ -112,6 +113,7 @@ prepare_cached_spu_info(struct spu * spu
 		printk(KERN_ERR "SPU_PROF: "
 		       "%s, line %d: create vma_map failed\n",
 		       __FUNCTION__, __LINE__);
+		retval = -ENOMEM;
 		goto err_alloc;
 	}
 	new_map = create_vma_map(spu, objectId);
@@ -119,6 +121,7 @@ prepare_cached_spu_info(struct spu * spu
 		printk(KERN_ERR "SPU_PROF: "
 		       "%s, line %d: create vma_map failed\n",
 		       __FUNCTION__, __LINE__);
+		retval = -ENOMEM;
 		goto err_alloc;
 	}
 
@@ -144,7 +147,7 @@ prepare_cached_spu_info(struct spu * spu
 	goto out;
 
 err_alloc:
-	retval = -1;
+	kfree(info);
 out:
 	return retval;
 }
@@ -215,11 +218,9 @@ static inline unsigned long fast_get_dco
 static unsigned long
 get_exec_dcookie_and_offset(struct spu * spu, unsigned int * offsetp,
 			    unsigned long * spu_bin_dcookie,
-			    unsigned long * shlib_dcookie,
 			    unsigned int spu_ref)
 {
 	unsigned long app_cookie = 0;
-	unsigned long * image_cookie = NULL;
 	unsigned int my_offset = 0;
 	struct file * app = NULL;
 	struct vm_area_struct * vma;
@@ -252,24 +253,17 @@ get_exec_dcookie_and_offset(struct spu *
 			 my_offset, spu_ref,
 			 vma->vm_file->f_dentry->d_name.name);
 		*offsetp = my_offset;
-		if (my_offset == 0)
-			image_cookie = spu_bin_dcookie;
-		else if (vma->vm_file != app)
-			image_cookie = shlib_dcookie;
 		break;
 	}
 
-	if (image_cookie) {
-		*image_cookie = fast_get_dcookie(vma->vm_file->f_dentry,
+	*spu_bin_dcookie = fast_get_dcookie(vma->vm_file->f_dentry,
 						 vma->vm_file->f_vfsmnt);
-		pr_debug("got dcookie for %s\n",
-			 vma->vm_file->f_dentry->d_name.name);
-	}
+	pr_debug("got dcookie for %s\n", vma->vm_file->f_dentry->d_name.name);
 
- out:
+out:
 	return app_cookie;
 
- fail_no_image_cookie:
+fail_no_image_cookie:
 	printk(KERN_ERR "SPU_PROF: "
 		"%s, line %d: Cannot find dcookie for SPU binary\n",
 		__FUNCTION__, __LINE__);
@@ -285,18 +279,18 @@ get_exec_dcookie_and_offset(struct spu *
 static int process_context_switch(struct spu * spu, unsigned int objectId)
 {
 	unsigned long flags;
-	int retval = 0;
-	unsigned int offset = 0;
-	unsigned long spu_cookie = 0, app_dcookie = 0, shlib_cookie = 0;
+	int retval;
+	unsigned int offset;
+	unsigned long spu_cookie, app_dcookie;
+
 	retval = prepare_cached_spu_info(spu, objectId);
-	if (retval == -1) {
+	if (retval)
 		goto out;
-	}
+
 	/* Get dcookie first because a mutex_lock is taken in that
 	 * code path, so interrupts must not be disabled.
 	 */
-	app_dcookie = get_exec_dcookie_and_offset(spu, &offset, &spu_cookie,
-						  &shlib_cookie, objectId);
+	app_dcookie = get_exec_dcookie_and_offset(spu, &offset, &spu_cookie, objectId);
 
 	/* Record context info in event buffer */
 	spin_lock_irqsave(&buffer_lock, flags);
@@ -306,27 +300,8 @@ static int process_context_switch(struct
 	add_event_entry(spu->pid);
 	add_event_entry(spu->tgid);
 	add_event_entry(app_dcookie);
-
-	if (offset) {
-		/* When offset is non-zero, the SPU ELF was embedded;
-		 * otherwise, it was loaded from a separate binary file. For
-		 * embedded case, we record the offset into the embedding file
-		 * where the SPU ELF was placed.  The embedding file may be
-		 * either the executable application binary or shared library.
-		 * For the non-embedded case, we record a dcookie that
-		 * points to the location of the separate SPU binary that was
-		 * loaded.
-		 */
-		if (shlib_cookie) {
-			add_event_entry(SPU_SHLIB_COOKIE_CODE);
-			add_event_entry(shlib_cookie);
-		}
-		add_event_entry(SPU_OFFSET_CODE);
-		add_event_entry(offset);
-	} else {
-		add_event_entry(SPU_COOKIE_CODE);
-		add_event_entry(spu_cookie);
-	}
+	add_event_entry(spu_cookie);
+	add_event_entry(offset);
 	spin_unlock_irqrestore(&buffer_lock, flags);
 	smp_wmb();
 out:
@@ -343,8 +318,8 @@ static int spu_active_notify(struct noti
 				void * data)
 {
 	int retval;
-	unsigned long flags = 0;
-	struct spu * the_spu = data;
+	unsigned long flags;
+	struct spu *the_spu = data;
 	pr_debug("SPU event notification arrived\n");
 	if (!val){
 		spin_lock_irqsave(&cache_lock, flags);
@@ -403,8 +378,7 @@ void spu_sync_buffer(int spu_num, unsign
 		     int num_samples)
 {
 	unsigned long long file_offset;
-	unsigned long cache_lock_flags = 0;
-	unsigned long buffer_lock_flags = 0;
+	unsigned long flags;
 	int i;
 	struct vma_to_fileoffset_map * map;
 	struct spu * the_spu;
@@ -417,29 +391,27 @@ void spu_sync_buffer(int spu_num, unsign
 	 * corresponding to this cached_info may end, thus resulting
 	 * in the destruction of the cached_info.
 	 */
-	spin_lock_irqsave(&cache_lock, cache_lock_flags);
+	spin_lock_irqsave(&cache_lock, flags);
 	c_info = get_cached_info(NULL, spu_num);
-	if (c_info == NULL) {
+	if (!c_info) {
 	/* This legitimately happens when the SPU task ends before all
 	 * samples are recorded.  No big deal -- so we just drop a few samples.
 	 */
 		pr_debug("SPU_PROF: No cached SPU contex "
 			  "for SPU #%d. Dropping samples.\n", spu_num);
-		spin_unlock_irqrestore(&cache_lock, cache_lock_flags);
-		return ;
+		goto out;
 	}
 
 	map = c_info->map;
 	the_spu = c_info->the_spu;
-	spin_lock_irqsave(&buffer_lock, buffer_lock_flags);
+	spin_lock(&buffer_lock);
 	for (i = 0; i < num_samples; i++) {
 		unsigned int sample = *(samples+i);
 		int grd_val = 0;
 		file_offset = 0;
 		if (sample == 0)
 			continue;
-		file_offset = vma_map_lookup(
-			map, sample, the_spu, &grd_val);
+		file_offset = vma_map_lookup( map, sample, the_spu, &grd_val);
 
 		/* If overlays are used by this SPU application, the guard
 		 * value is non-zero, indicating which overlay section is in
@@ -460,8 +432,9 @@ void spu_sync_buffer(int spu_num, unsign
 			continue;
 		add_event_entry(file_offset | spu_num_shifted);
 	}
-	spin_unlock_irqrestore(&buffer_lock, buffer_lock_flags);
-	spin_unlock_irqrestore(&cache_lock, cache_lock_flags);
+	spin_unlock(&buffer_lock);
+out:
+	spin_unlock_irqrestore(&cache_lock, flags);
 }
 
 
Index: linux-2.6/arch/powerpc/oprofile/op_model_cell.c
===================================================================
--- linux-2.6.orig/arch/powerpc/oprofile/op_model_cell.c
+++ linux-2.6/arch/powerpc/oprofile/op_model_cell.c
@@ -40,7 +40,8 @@
 #include "../platforms/cell/cbe_regs.h"
 #include "cell/pr_util.h"
 
-/* spu_cycle_reset is the number of cycles between samples.
+/*
+ * spu_cycle_reset is the number of cycles between samples.
  * This variable is used for SPU profiling and should ONLY be set
  * at the beginning of cell_reg_setup; otherwise, it's read-only.
  */
@@ -73,7 +74,6 @@ struct pmc_cntrl_data {
 /*
  * ibm,cbe-perftools rtas parameters
  */
-
 struct pm_signal {
 	u16 cpu;		/* Processor to modify */
 	u16 sub_unit;		/* hw subunit this applies to (if applicable)*/
@@ -123,7 +123,8 @@ static DEFINE_PER_CPU(unsigned long[NR_P
 
 static struct pmc_cntrl_data pmc_cntrl[NUM_THREADS][NR_PHYS_CTRS];
 
-/* The CELL profiling code makes rtas calls to setup the debug bus to
+/*
+ * The CELL profiling code makes rtas calls to setup the debug bus to
  * route the performance signals.  Additionally, SPU profiling requires
  * a second rtas call to setup the hardware to capture the SPU PCs.
  * The EIO error value is returned if the token lookups or the rtas
@@ -137,16 +138,21 @@ static struct pmc_cntrl_data pmc_cntrl[N
  * either.
  */
 
-/* Interpetation of hdw_thread:
+/*
+ * Interpetation of hdw_thread:
  * 0 - even virtual cpus 0, 2, 4,...
  * 1 - odd virtual cpus 1, 3, 5, ...
+ *
+ * FIXME: this is strictly wrong, we need to clean this up in a number
+ * of places. It works for now. -arnd
  */
 static u32 hdw_thread;
 
 static u32 virt_cntr_inter_mask;
 static struct timer_list timer_virt_cntr;
 
-/* pm_signal needs to be global since it is initialized in
+/*
+ * pm_signal needs to be global since it is initialized in
  * cell_reg_setup at the time when the necessary information
  * is available.
  */
@@ -167,7 +173,6 @@ static unsigned char input_bus[NUM_INPUT
 /*
  * Firmware interface functions
  */
-
 static int
 rtas_ibm_cbe_perftools(int subfunc, int passthru,
 		       void *address, unsigned long length)
@@ -183,12 +188,13 @@ static void pm_rtas_reset_signals(u32 no
 	int ret;
 	struct pm_signal pm_signal_local;
 
-	/*  The debug bus is being set to the passthru disable state.
-	 *  However, the FW still expects atleast one legal signal routing
-	 *  entry or it will return an error on the arguments.	If we don't
-	 *  supply a valid entry, we must ignore all return values.  Ignoring
-	 *  all return values means we might miss an error we should be
-	 *  concerned about.
+	/*
+	 * The debug bus is being set to the passthru disable state.
+	 * However, the FW still expects atleast one legal signal routing
+	 * entry or it will return an error on the arguments.	If we don't
+	 * supply a valid entry, we must ignore all return values.  Ignoring
+	 * all return values means we might miss an error we should be
+	 * concerned about.
 	 */
 
 	/*  fw expects physical cpu #. */
@@ -203,7 +209,8 @@ static void pm_rtas_reset_signals(u32 no
 				     sizeof(struct pm_signal));
 
 	if (unlikely(ret))
-		/* Not a fatal error. For Oprofile stop, the oprofile
+		/*
+		 * Not a fatal error. For Oprofile stop, the oprofile
 		 * functions do not support returning an error for
 		 * failure to stop OProfile.
 		 */
@@ -217,7 +224,8 @@ static int pm_rtas_activate_signals(u32 
 	int i, j;
 	struct pm_signal pm_signal_local[NR_PHYS_CTRS];
 
-	/* There is no debug setup required for the cycles event.
+	/*
+	 * There is no debug setup required for the cycles event.
 	 * Note that only events in the same group can be used.
 	 * Otherwise, there will be conflicts in correctly routing
 	 * the signals on the debug bus.  It is the responsiblity
@@ -295,7 +303,8 @@ static void set_pm_event(u32 ctr, int ev
 	pm_regs.pm07_cntrl[ctr] |= PM07_CTR_POLARITY(polarity);
 	pm_regs.pm07_cntrl[ctr] |= PM07_CTR_INPUT_CONTROL(input_control);
 
-	/* Some of the islands signal selection is based on 64 bit words.
+	/*
+	 * Some of the islands signal selection is based on 64 bit words.
 	 * The debug bus words are 32 bits, the input words to the performance
 	 * counters are defined as 32 bits.  Need to convert the 64 bit island
 	 * specification to the appropriate 32 input bit and bus word for the
@@ -345,7 +354,8 @@ out:
 
 static void write_pm_cntrl(int cpu)
 {
-	/* Oprofile will use 32 bit counters, set bits 7:10 to 0
+	/*
+	 * Oprofile will use 32 bit counters, set bits 7:10 to 0
 	 * pmregs.pm_cntrl is a global
 	 */
 
@@ -362,7 +372,8 @@ static void write_pm_cntrl(int cpu)
 	if (pm_regs.pm_cntrl.freeze == 1)
 		val |= CBE_PM_FREEZE_ALL_CTRS;
 
-	/* Routine set_count_mode must be called previously to set
+	/*
+	 * Routine set_count_mode must be called previously to set
 	 * the count mode based on the user selection of user and kernel.
 	 */
 	val |= CBE_PM_COUNT_MODE_SET(pm_regs.pm_cntrl.count_mode);
@@ -372,7 +383,8 @@ static void write_pm_cntrl(int cpu)
 static inline void
 set_count_mode(u32 kernel, u32 user)
 {
-	/* The user must specify user and kernel if they want them. If
+	/*
+	 * The user must specify user and kernel if they want them. If
 	 *  neither is specified, OProfile will count in hypervisor mode.
 	 *  pm_regs.pm_cntrl is a global
 	 */
@@ -413,17 +425,18 @@ static inline void enable_ctr(u32 cpu, u
  * pair of per-cpu arrays is used for storing the previous and next
  * pmc values for a given node.
  * NOTE: We use the per-cpu variable to improve cache performance.
+ *
+ * This routine will alternate loading the virtual counters for
+ * virtual CPUs
  */
 static void cell_virtual_cntr(unsigned long data)
 {
-	/* This routine will alternate loading the virtual counters for
-	 * virtual CPUs
-	 */
 	int i, prev_hdw_thread, next_hdw_thread;
 	u32 cpu;
 	unsigned long flags;
 
-	/* Make sure that the interrupt_hander and the virt counter are
+	/*
+	 * Make sure that the interrupt_hander and the virt counter are
 	 * not both playing with the counters on the same node.
 	 */
 
@@ -435,22 +448,25 @@ static void cell_virtual_cntr(unsigned l
 	hdw_thread = 1 ^ hdw_thread;
 	next_hdw_thread = hdw_thread;
 
-	for (i = 0; i < num_counters; i++)
-	/* There are some per thread events.  Must do the
+	/*
+	 * There are some per thread events.  Must do the
 	 * set event, for the thread that is being started
 	 */
+	for (i = 0; i < num_counters; i++)
 		set_pm_event(i,
 			pmc_cntrl[next_hdw_thread][i].evnts,
 			pmc_cntrl[next_hdw_thread][i].masks);
 
-	/* The following is done only once per each node, but
+	/*
+	 * The following is done only once per each node, but
 	 * we need cpu #, not node #, to pass to the cbe_xxx functions.
 	 */
 	for_each_online_cpu(cpu) {
 		if (cbe_get_hw_thread_id(cpu))
 			continue;
 
-		/* stop counters, save counter values, restore counts
+		/*
+		 * stop counters, save counter values, restore counts
 		 * for previous thread
 		 */
 		cbe_disable_pm(cpu);
@@ -479,13 +495,15 @@ static void cell_virtual_cntr(unsigned l
 						      next_hdw_thread)[i]);
 		}
 
-		/* Switch to the other thread. Change the interrupt
+		/*
+		 * Switch to the other thread. Change the interrupt
 		 * and control regs to be scheduled on the CPU
 		 * corresponding to the thread to execute.
 		 */
 		for (i = 0; i < num_counters; i++) {
 			if (pmc_cntrl[next_hdw_thread][i].enabled) {
-				/* There are some per thread events.
+				/*
+				 * There are some per thread events.
 				 * Must do the set event, enable_cntr
 				 * for each cpu.
 				 */
@@ -517,9 +535,8 @@ static void start_virt_cntrs(void)
 }
 
 /* This function is called once for all cpus combined */
-static int
-cell_reg_setup(struct op_counter_config *ctr,
-	       struct op_system_config *sys, int num_ctrs)
+static int cell_reg_setup(struct op_counter_config *ctr,
+			struct op_system_config *sys, int num_ctrs)
 {
 	int i, j, cpu;
 	spu_cycle_reset = 0;
@@ -527,7 +544,8 @@ cell_reg_setup(struct op_counter_config 
 	if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
 		spu_cycle_reset = ctr[0].count;
 
-		/* Each node will need to make the rtas call to start
+		/*
+		 * Each node will need to make the rtas call to start
 		 * and stop SPU profiling.  Get the token once and store it.
 		 */
 		spu_rtas_token = rtas_token("ibm,cbe-spu-perftools");
@@ -542,7 +560,8 @@ cell_reg_setup(struct op_counter_config 
 
 	pm_rtas_token = rtas_token("ibm,cbe-perftools");
 
-	/* For all events excetp PPU CYCLEs, each node will need to make
+	/*
+	 * For all events excetp PPU CYCLEs, each node will need to make
 	 * the rtas cbe-perftools call to setup and reset the debug bus.
 	 * Make the token lookup call once and store it in the global
 	 * variable pm_rtas_token.
@@ -579,7 +598,8 @@ cell_reg_setup(struct op_counter_config 
 			per_cpu(pmc_values, j)[i] = 0;
 	}
 
-	/* Setup the thread 1 events, map the thread 0 event to the
+	/*
+	 * Setup the thread 1 events, map the thread 0 event to the
 	 * equivalent thread 1 event.
 	 */
 	for (i = 0; i < num_ctrs; ++i) {
@@ -603,7 +623,8 @@ cell_reg_setup(struct op_counter_config 
 	for (i = 0; i < NUM_INPUT_BUS_WORDS; i++)
 		input_bus[i] = 0xff;
 
-	/* Our counters count up, and "count" refers to
+	/*
+	 * Our counters count up, and "count" refers to
 	 * how much before the next interrupt, and we interrupt
 	 * on overflow.	 So we calculate the starting value
 	 * which will give us "count" until overflow.
@@ -667,19 +688,19 @@ static int cell_cpu_setup(struct op_coun
 		}
 	}
 
-	/* the pm_rtas_activate_signals will return -EIO if the FW
+	/*
+	 * The pm_rtas_activate_signals will return -EIO if the FW
 	 * call failed.
 	 */
-	return (pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled));
-
+	return pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled);
 }
 
 #define ENTRIES	 303
 #define MAXLFSR	 0xFFFFFF
 
 /* precomputed table of 24 bit LFSR values */
-int initial_lfsr[] =
-{8221349, 12579195, 5379618, 10097839, 7512963, 7519310, 3955098, 10753424,
+static int initial_lfsr[] = {
+ 8221349, 12579195, 5379618, 10097839, 7512963, 7519310, 3955098, 10753424,
  15507573, 7458917, 285419, 2641121, 9780088, 3915503, 6668768, 1548716,
  4885000, 8774424, 9650099, 2044357, 2304411, 9326253, 10332526, 4421547,
  3440748, 10179459, 13332843, 10375561, 1313462, 8375100, 5198480, 6071392,
@@ -716,7 +737,8 @@ int initial_lfsr[] =
  3258216, 12505185, 6007317, 9218111, 14661019, 10537428, 11731949, 9027003,
  6641507, 9490160, 200241, 9720425, 16277895, 10816638, 1554761, 10431375,
  7467528, 6790302, 3429078, 14633753, 14428997, 11463204, 3576212, 2003426,
- 6123687, 820520, 9992513, 15784513, 5778891, 6428165, 8388607};
+ 6123687, 820520, 9992513, 15784513, 5778891, 6428165, 8388607
+};
 
 /*
  * The hardware uses an LFSR counting sequence to determine when to capture
@@ -777,28 +799,25 @@ int initial_lfsr[] =
 
 static int calculate_lfsr(int n)
 {
-	/* The ranges and steps are in powers of 2 so the calculations
+	/*
+	 * The ranges and steps are in powers of 2 so the calculations
 	 * can be done using shifts rather then divide.
 	 */
 	int index;
 
-	if ((n >> 16) == 0) {
+	if ((n >> 16) == 0)
 		index = 0;
-
-	} else if (((n - V2_16) >> 19) == 0) {
+	else if (((n - V2_16) >> 19) == 0)
 		index = ((n - V2_16) >> 12) + 1;
-
-	} else if (((n - V2_16 - V2_19) >> 22) == 0) {
+	else if (((n - V2_16 - V2_19) >> 22) == 0)
 		index = ((n - V2_16 - V2_19) >> 15 ) + 1 + 128;
+	else if (((n - V2_16 - V2_19 - V2_22) >> 24) == 0)
+		index = ((n - V2_16 - V2_19 - V2_22) >> 18 ) + 1 + 256;
+	else
+		index = ENTRIES-1;
 
-	} else if (((n - V2_16 - V2_19 - V2_22) >> 24) == 0) {
-		index = ((n - V2_16 - V2_19 - V2_22) >> 18 )
-			+ 1 + 256;
-	}
-
-	if ((index > ENTRIES) || (index < 0))	/* make sure index is
-						 * valid
-						 */
+	/* make sure index is valid */
+	if ((index > ENTRIES) || (index < 0))
 		index = ENTRIES-1;
 
 	return initial_lfsr[index];
@@ -809,15 +828,17 @@ static int pm_rtas_activate_spu_profilin
 	int ret, i;
 	struct pm_signal pm_signal_local[NR_PHYS_CTRS];
 
-	/* Set up the rtas call to configure the debug bus to
-	 * route the SPU PCs.  Setup the pm_signal for each SPU */
+	/*
+	 * Set up the rtas call to configure the debug bus to
+	 * route the SPU PCs.  Setup the pm_signal for each SPU
+	 */
 	for (i = 0; i < NUM_SPUS_PER_NODE; i++) {
 		pm_signal_local[i].cpu = node;
 		pm_signal_local[i].signal_group = 41;
-		pm_signal_local[i].bus_word = 1 << i / 2; /* spu i on
-							   * word (i/2)
-							   */
-		pm_signal_local[i].sub_unit = i;	/* spu i */
+		/* spu i on word (i/2) */
+		pm_signal_local[i].bus_word = 1 << i / 2;
+		/* spu i */
+		pm_signal_local[i].sub_unit = i;
 		pm_signal_local[i].bit = 63;
 	}
 
@@ -858,8 +879,8 @@ static int cell_global_start_spu(struct 
 	int subfunc, rtn_value;
 	unsigned int lfsr_value;
 	int cpu;
-	int ret = 0;
-	int rtas_error = 0;
+	int ret;
+	int rtas_error;
 	unsigned int cpu_khzfreq = 0;
 
 	/* The SPU profiling uses time-based profiling based on
@@ -884,24 +905,23 @@ static int cell_global_start_spu(struct 
 	for_each_online_cpu(cpu) {
 		if (cbe_get_hw_thread_id(cpu))
 			continue;
-		/* Setup SPU cycle-based profiling.
+
+		/*
+		 * Setup SPU cycle-based profiling.
 		 * Set perf_mon_control bit 0 to a zero before
 		 * enabling spu collection hardware.
 		 */
 		cbe_write_pm(cpu, pm_control, 0);
 
 		if (spu_cycle_reset > MAX_SPU_COUNT)
-			/* use largest possible value
-			 */
+			/* use largest possible value */
 			lfsr_value = calculate_lfsr(MAX_SPU_COUNT-1);
 		else
-		    lfsr_value = calculate_lfsr(spu_cycle_reset);
+			lfsr_value = calculate_lfsr(spu_cycle_reset);
 
-		if (lfsr_value == 0) {	/* must use a non zero value.  Zero
-					 * disables data collection.
-					 */
-				lfsr_value = calculate_lfsr(1);
-		}
+		/* must use a non zero value. Zero disables data collection. */
+		if (lfsr_value == 0)
+			lfsr_value = calculate_lfsr(1);
 
 		lfsr_value = lfsr_value << 8; /* shift lfsr to correct
 						* register location
@@ -916,7 +936,7 @@ static int cell_global_start_spu(struct 
 		}
 
 
-		subfunc = 2;	// 2 - activate SPU tracing, 3 - deactivate
+		subfunc = 2;	/* 2 - activate SPU tracing, 3 - deactivate */
 
 		/* start profiling */
 		rtn_value = rtas_call(spu_rtas_token, 3, 1, NULL, subfunc,
@@ -976,7 +996,8 @@ static int cell_global_start_ppu(struct 
 	oprofile_running = 1;
 	smp_wmb();
 
-	/* NOTE: start_virt_cntrs will result in cell_virtual_cntr() being
+	/*
+	 * NOTE: start_virt_cntrs will result in cell_virtual_cntr() being
 	 * executed which manipulates the PMU.	We start the "virtual counter"
 	 * here so that we do not need to synchronize access to the PMU in
 	 * the above for-loop.
@@ -986,7 +1007,6 @@ static int cell_global_start_ppu(struct 
 	return 0;
 }
 
-
 static int cell_global_start(struct op_counter_config *ctr)
 {
 	if (spu_cycle_reset) {
@@ -996,14 +1016,15 @@ static int cell_global_start(struct op_c
 	}
 }
 
-static void cell_global_stop_spu(void)
-/* Note the generic OProfile stop calls do not support returning
+/*
+ * Note the generic OProfile stop calls do not support returning
  * an error on stop.  Hence, will not return an error if the FW
  * calls fail on stop.	Failure to reset the debug bus is not an issue.
  * Failure to disable the SPU profiling is not an issue.  The FW calls
  * to enable the performance counters and debug bus will work even if
  * the hardware was not cleanly reset.
  */
+static void cell_global_stop_spu(void)
 {
 	int subfunc, rtn_value;
 	unsigned int lfsr_value;
@@ -1020,7 +1041,8 @@ static void cell_global_stop_spu(void)
 		if (cbe_get_hw_thread_id(cpu))
 			continue;
 
-		subfunc = 3;	/* 2 - activate SPU tracing,
+		subfunc = 3;	/*
+				 * 2 - activate SPU tracing,
 				 * 3 - deactivate
 				 */
 		lfsr_value = 0x8f100000;
@@ -1046,7 +1068,8 @@ static void cell_global_stop_ppu(void)
 {
 	int cpu;
 
-	/* This routine will be called once for the system.
+	/*
+	 * This routine will be called once for the system.
 	 * There is one performance monitor per node, so we
 	 * only need to perform this function once per node.
 	 */
@@ -1079,8 +1102,8 @@ static void cell_global_stop(void)
 	}
 }
 
-static void
-cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
+static void cell_handle_interrupt(struct pt_regs *regs,
+				struct op_counter_config *ctr)
 {
 	u32 cpu;
 	u64 pc;
@@ -1091,13 +1114,15 @@ cell_handle_interrupt(struct pt_regs *re
 
 	cpu = smp_processor_id();
 
-	/* Need to make sure the interrupt handler and the virt counter
+	/*
+	 * Need to make sure the interrupt handler and the virt counter
 	 * routine are not running at the same time. See the
 	 * cell_virtual_cntr() routine for additional comments.
 	 */
 	spin_lock_irqsave(&virt_cntr_lock, flags);
 
-	/* Need to disable and reenable the performance counters
+	/*
+	 * Need to disable and reenable the performance counters
 	 * to get the desired behavior from the hardware.  This
 	 * is hardware specific.
 	 */
@@ -1106,7 +1131,8 @@ cell_handle_interrupt(struct pt_regs *re
 
 	interrupt_mask = cbe_get_and_clear_pm_interrupts(cpu);
 
-	/* If the interrupt mask has been cleared, then the virt cntr
+	/*
+	 * If the interrupt mask has been cleared, then the virt cntr
 	 * has cleared the interrupt.  When the thread that generated
 	 * the interrupt is restored, the data count will be restored to
 	 * 0xffffff0 to cause the interrupt to be regenerated.
@@ -1124,7 +1150,8 @@ cell_handle_interrupt(struct pt_regs *re
 			}
 		}
 
-		/* The counters were frozen by the interrupt.
+		/*
+		 * The counters were frozen by the interrupt.
 		 * Reenable the interrupt and restart the counters.
 		 * If there was a race between the interrupt handler and
 		 * the virtual counter routine.	 The virutal counter
@@ -1134,7 +1161,8 @@ cell_handle_interrupt(struct pt_regs *re
 		cbe_enable_pm_interrupts(cpu, hdw_thread,
 					 virt_cntr_inter_mask);
 
-		/* The writes to the various performance counters only writes
+		/*
+		 * The writes to the various performance counters only writes
 		 * to a latch.	The new values (interrupt setting bits, reset
 		 * counter value etc.) are not copied to the actual registers
 		 * until the performance monitor is enabled.  In order to get
@@ -1147,7 +1175,8 @@ cell_handle_interrupt(struct pt_regs *re
 	spin_unlock_irqrestore(&virt_cntr_lock, flags);
 }
 
-/* This function is called from the generic OProfile
+/*
+ * This function is called from the generic OProfile
  * driver.  When profiling PPUs, we need to do the
  * generic sync start; otherwise, do spu_sync_start.
  */
@@ -1167,7 +1196,6 @@ static int cell_sync_stop(void)
 		return 1;
 }
 
-
 struct op_powerpc_model op_model_cell = {
 	.reg_setup = cell_reg_setup,
 	.cpu_setup = cell_cpu_setup,

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-26 23:50   ` Arnd Bergmann
  0 siblings, 0 replies; 66+ messages in thread
From: Arnd Bergmann @ 2007-02-26 23:50 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: linux-kernel, cbe-oss-dev, oprofile-list, Carl Love

On Thursday 22 February 2007, Carl Love wrote:
> This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
> to add in the SPU profiling capabilities. =A0In addition, a 'cell' subdir=
ectory
> was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
> code.

There was a significant amount of whitespace breakage in this patch,
which I cleaned up. The patch below consists of the other things
I changed as a further cleanup. Note that I changed the format
of the context switch record, which I found too complicated, as
I described on IRC last week.

	Arnd <><

=2D-
Subject: cleanup spu oprofile code

=46rom: Arnd Bergmann <arnd.bergmann@de.ibm.com>
This cleans up some of the new oprofile code. It's mostly
cosmetic changes, like way multi-line comments are formatted.
The most significant change is a simplification of the
context-switch record format.

It does mean the oprofile report tool needs to be adapted,
but I'm sure that it pays off in the end.

Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
Index: linux-2.6/arch/powerpc/oprofile/cell/spu_task_sync.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
=2D-- linux-2.6.orig/arch/powerpc/oprofile/cell/spu_task_sync.c
+++ linux-2.6/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -61,11 +61,12 @@ static void destroy_cached_info(struct k
 static struct cached_info * get_cached_info(struct spu * the_spu, int spu_=
num)
 {
 	struct kref * ref;
=2D	struct cached_info * ret_info =3D NULL;
+	struct cached_info * ret_info;
 	if (spu_num >=3D num_spu_nodes) {
 		printk(KERN_ERR "SPU_PROF: "
 		       "%s, line %d: Invalid index %d into spu info cache\n",
 		       __FUNCTION__, __LINE__, spu_num);
+		ret_info =3D NULL;
 		goto out;
 	}
 	if (!spu_info[spu_num] && the_spu) {
@@ -89,9 +90,9 @@ static struct cached_info * get_cached_i
 static int
 prepare_cached_spu_info(struct spu * spu, unsigned int objectId)
 {
=2D	unsigned long flags =3D 0;
+	unsigned long flags;
 	struct vma_to_fileoffset_map * new_map;
=2D	int retval =3D 0;
+	int retval;
 	struct cached_info * info;
=20
 	/* We won't bother getting cache_lock here since
@@ -112,6 +113,7 @@ prepare_cached_spu_info(struct spu * spu
 		printk(KERN_ERR "SPU_PROF: "
 		       "%s, line %d: create vma_map failed\n",
 		       __FUNCTION__, __LINE__);
+		retval =3D -ENOMEM;
 		goto err_alloc;
 	}
 	new_map =3D create_vma_map(spu, objectId);
@@ -119,6 +121,7 @@ prepare_cached_spu_info(struct spu * spu
 		printk(KERN_ERR "SPU_PROF: "
 		       "%s, line %d: create vma_map failed\n",
 		       __FUNCTION__, __LINE__);
+		retval =3D -ENOMEM;
 		goto err_alloc;
 	}
=20
@@ -144,7 +147,7 @@ prepare_cached_spu_info(struct spu * spu
 	goto out;
=20
 err_alloc:
=2D	retval =3D -1;
+	kfree(info);
 out:
 	return retval;
 }
@@ -215,11 +218,9 @@ static inline unsigned long fast_get_dco
 static unsigned long
 get_exec_dcookie_and_offset(struct spu * spu, unsigned int * offsetp,
 			    unsigned long * spu_bin_dcookie,
=2D			    unsigned long * shlib_dcookie,
 			    unsigned int spu_ref)
 {
 	unsigned long app_cookie =3D 0;
=2D	unsigned long * image_cookie =3D NULL;
 	unsigned int my_offset =3D 0;
 	struct file * app =3D NULL;
 	struct vm_area_struct * vma;
@@ -252,24 +253,17 @@ get_exec_dcookie_and_offset(struct spu *
 			 my_offset, spu_ref,
 			 vma->vm_file->f_dentry->d_name.name);
 		*offsetp =3D my_offset;
=2D		if (my_offset =3D=3D 0)
=2D			image_cookie =3D spu_bin_dcookie;
=2D		else if (vma->vm_file !=3D app)
=2D			image_cookie =3D shlib_dcookie;
 		break;
 	}
=20
=2D	if (image_cookie) {
=2D		*image_cookie =3D fast_get_dcookie(vma->vm_file->f_dentry,
+	*spu_bin_dcookie =3D fast_get_dcookie(vma->vm_file->f_dentry,
 						 vma->vm_file->f_vfsmnt);
=2D		pr_debug("got dcookie for %s\n",
=2D			 vma->vm_file->f_dentry->d_name.name);
=2D	}
+	pr_debug("got dcookie for %s\n", vma->vm_file->f_dentry->d_name.name);
=20
=2D out:
+out:
 	return app_cookie;
=20
=2D fail_no_image_cookie:
+fail_no_image_cookie:
 	printk(KERN_ERR "SPU_PROF: "
 		"%s, line %d: Cannot find dcookie for SPU binary\n",
 		__FUNCTION__, __LINE__);
@@ -285,18 +279,18 @@ get_exec_dcookie_and_offset(struct spu *
 static int process_context_switch(struct spu * spu, unsigned int objectId)
 {
 	unsigned long flags;
=2D	int retval =3D 0;
=2D	unsigned int offset =3D 0;
=2D	unsigned long spu_cookie =3D 0, app_dcookie =3D 0, shlib_cookie =3D 0;
+	int retval;
+	unsigned int offset;
+	unsigned long spu_cookie, app_dcookie;
+
 	retval =3D prepare_cached_spu_info(spu, objectId);
=2D	if (retval =3D=3D -1) {
+	if (retval)
 		goto out;
=2D	}
+
 	/* Get dcookie first because a mutex_lock is taken in that
 	 * code path, so interrupts must not be disabled.
 	 */
=2D	app_dcookie =3D get_exec_dcookie_and_offset(spu, &offset, &spu_cookie,
=2D						  &shlib_cookie, objectId);
+	app_dcookie =3D get_exec_dcookie_and_offset(spu, &offset, &spu_cookie, ob=
jectId);
=20
 	/* Record context info in event buffer */
 	spin_lock_irqsave(&buffer_lock, flags);
@@ -306,27 +300,8 @@ static int process_context_switch(struct
 	add_event_entry(spu->pid);
 	add_event_entry(spu->tgid);
 	add_event_entry(app_dcookie);
=2D
=2D	if (offset) {
=2D		/* When offset is non-zero, the SPU ELF was embedded;
=2D		 * otherwise, it was loaded from a separate binary file. For
=2D		 * embedded case, we record the offset into the embedding file
=2D		 * where the SPU ELF was placed.  The embedding file may be
=2D		 * either the executable application binary or shared library.
=2D		 * For the non-embedded case, we record a dcookie that
=2D		 * points to the location of the separate SPU binary that was
=2D		 * loaded.
=2D		 */
=2D		if (shlib_cookie) {
=2D			add_event_entry(SPU_SHLIB_COOKIE_CODE);
=2D			add_event_entry(shlib_cookie);
=2D		}
=2D		add_event_entry(SPU_OFFSET_CODE);
=2D		add_event_entry(offset);
=2D	} else {
=2D		add_event_entry(SPU_COOKIE_CODE);
=2D		add_event_entry(spu_cookie);
=2D	}
+	add_event_entry(spu_cookie);
+	add_event_entry(offset);
 	spin_unlock_irqrestore(&buffer_lock, flags);
 	smp_wmb();
 out:
@@ -343,8 +318,8 @@ static int spu_active_notify(struct noti
 				void * data)
 {
 	int retval;
=2D	unsigned long flags =3D 0;
=2D	struct spu * the_spu =3D data;
+	unsigned long flags;
+	struct spu *the_spu =3D data;
 	pr_debug("SPU event notification arrived\n");
 	if (!val){
 		spin_lock_irqsave(&cache_lock, flags);
@@ -403,8 +378,7 @@ void spu_sync_buffer(int spu_num, unsign
 		     int num_samples)
 {
 	unsigned long long file_offset;
=2D	unsigned long cache_lock_flags =3D 0;
=2D	unsigned long buffer_lock_flags =3D 0;
+	unsigned long flags;
 	int i;
 	struct vma_to_fileoffset_map * map;
 	struct spu * the_spu;
@@ -417,29 +391,27 @@ void spu_sync_buffer(int spu_num, unsign
 	 * corresponding to this cached_info may end, thus resulting
 	 * in the destruction of the cached_info.
 	 */
=2D	spin_lock_irqsave(&cache_lock, cache_lock_flags);
+	spin_lock_irqsave(&cache_lock, flags);
 	c_info =3D get_cached_info(NULL, spu_num);
=2D	if (c_info =3D=3D NULL) {
+	if (!c_info) {
 	/* This legitimately happens when the SPU task ends before all
 	 * samples are recorded.  No big deal -- so we just drop a few samples.
 	 */
 		pr_debug("SPU_PROF: No cached SPU contex "
 			  "for SPU #%d. Dropping samples.\n", spu_num);
=2D		spin_unlock_irqrestore(&cache_lock, cache_lock_flags);
=2D		return ;
+		goto out;
 	}
=20
 	map =3D c_info->map;
 	the_spu =3D c_info->the_spu;
=2D	spin_lock_irqsave(&buffer_lock, buffer_lock_flags);
+	spin_lock(&buffer_lock);
 	for (i =3D 0; i < num_samples; i++) {
 		unsigned int sample =3D *(samples+i);
 		int grd_val =3D 0;
 		file_offset =3D 0;
 		if (sample =3D=3D 0)
 			continue;
=2D		file_offset =3D vma_map_lookup(
=2D			map, sample, the_spu, &grd_val);
+		file_offset =3D vma_map_lookup( map, sample, the_spu, &grd_val);
=20
 		/* If overlays are used by this SPU application, the guard
 		 * value is non-zero, indicating which overlay section is in
@@ -460,8 +432,9 @@ void spu_sync_buffer(int spu_num, unsign
 			continue;
 		add_event_entry(file_offset | spu_num_shifted);
 	}
=2D	spin_unlock_irqrestore(&buffer_lock, buffer_lock_flags);
=2D	spin_unlock_irqrestore(&cache_lock, cache_lock_flags);
+	spin_unlock(&buffer_lock);
+out:
+	spin_unlock_irqrestore(&cache_lock, flags);
 }
=20
=20
Index: linux-2.6/arch/powerpc/oprofile/op_model_cell.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
=2D-- linux-2.6.orig/arch/powerpc/oprofile/op_model_cell.c
+++ linux-2.6/arch/powerpc/oprofile/op_model_cell.c
@@ -40,7 +40,8 @@
 #include "../platforms/cell/cbe_regs.h"
 #include "cell/pr_util.h"
=20
=2D/* spu_cycle_reset is the number of cycles between samples.
+/*
+ * spu_cycle_reset is the number of cycles between samples.
  * This variable is used for SPU profiling and should ONLY be set
  * at the beginning of cell_reg_setup; otherwise, it's read-only.
  */
@@ -73,7 +74,6 @@ struct pmc_cntrl_data {
 /*
  * ibm,cbe-perftools rtas parameters
  */
=2D
 struct pm_signal {
 	u16 cpu;		/* Processor to modify */
 	u16 sub_unit;		/* hw subunit this applies to (if applicable)*/
@@ -123,7 +123,8 @@ static DEFINE_PER_CPU(unsigned long[NR_P
=20
 static struct pmc_cntrl_data pmc_cntrl[NUM_THREADS][NR_PHYS_CTRS];
=20
=2D/* The CELL profiling code makes rtas calls to setup the debug bus to
+/*
+ * The CELL profiling code makes rtas calls to setup the debug bus to
  * route the performance signals.  Additionally, SPU profiling requires
  * a second rtas call to setup the hardware to capture the SPU PCs.
  * The EIO error value is returned if the token lookups or the rtas
@@ -137,16 +138,21 @@ static struct pmc_cntrl_data pmc_cntrl[N
  * either.
  */
=20
=2D/* Interpetation of hdw_thread:
+/*
+ * Interpetation of hdw_thread:
  * 0 - even virtual cpus 0, 2, 4,...
  * 1 - odd virtual cpus 1, 3, 5, ...
+ *
+ * FIXME: this is strictly wrong, we need to clean this up in a number
+ * of places. It works for now. -arnd
  */
 static u32 hdw_thread;
=20
 static u32 virt_cntr_inter_mask;
 static struct timer_list timer_virt_cntr;
=20
=2D/* pm_signal needs to be global since it is initialized in
+/*
+ * pm_signal needs to be global since it is initialized in
  * cell_reg_setup at the time when the necessary information
  * is available.
  */
@@ -167,7 +173,6 @@ static unsigned char input_bus[NUM_INPUT
 /*
  * Firmware interface functions
  */
=2D
 static int
 rtas_ibm_cbe_perftools(int subfunc, int passthru,
 		       void *address, unsigned long length)
@@ -183,12 +188,13 @@ static void pm_rtas_reset_signals(u32 no
 	int ret;
 	struct pm_signal pm_signal_local;
=20
=2D	/*  The debug bus is being set to the passthru disable state.
=2D	 *  However, the FW still expects atleast one legal signal routing
=2D	 *  entry or it will return an error on the arguments.	If we don't
=2D	 *  supply a valid entry, we must ignore all return values.  Ignoring
=2D	 *  all return values means we might miss an error we should be
=2D	 *  concerned about.
+	/*
+	 * The debug bus is being set to the passthru disable state.
+	 * However, the FW still expects atleast one legal signal routing
+	 * entry or it will return an error on the arguments.	If we don't
+	 * supply a valid entry, we must ignore all return values.  Ignoring
+	 * all return values means we might miss an error we should be
+	 * concerned about.
 	 */
=20
 	/*  fw expects physical cpu #. */
@@ -203,7 +209,8 @@ static void pm_rtas_reset_signals(u32 no
 				     sizeof(struct pm_signal));
=20
 	if (unlikely(ret))
=2D		/* Not a fatal error. For Oprofile stop, the oprofile
+		/*
+		 * Not a fatal error. For Oprofile stop, the oprofile
 		 * functions do not support returning an error for
 		 * failure to stop OProfile.
 		 */
@@ -217,7 +224,8 @@ static int pm_rtas_activate_signals(u32=20
 	int i, j;
 	struct pm_signal pm_signal_local[NR_PHYS_CTRS];
=20
=2D	/* There is no debug setup required for the cycles event.
+	/*
+	 * There is no debug setup required for the cycles event.
 	 * Note that only events in the same group can be used.
 	 * Otherwise, there will be conflicts in correctly routing
 	 * the signals on the debug bus.  It is the responsiblity
@@ -295,7 +303,8 @@ static void set_pm_event(u32 ctr, int ev
 	pm_regs.pm07_cntrl[ctr] |=3D PM07_CTR_POLARITY(polarity);
 	pm_regs.pm07_cntrl[ctr] |=3D PM07_CTR_INPUT_CONTROL(input_control);
=20
=2D	/* Some of the islands signal selection is based on 64 bit words.
+	/*
+	 * Some of the islands signal selection is based on 64 bit words.
 	 * The debug bus words are 32 bits, the input words to the performance
 	 * counters are defined as 32 bits.  Need to convert the 64 bit island
 	 * specification to the appropriate 32 input bit and bus word for the
@@ -345,7 +354,8 @@ out:
=20
 static void write_pm_cntrl(int cpu)
 {
=2D	/* Oprofile will use 32 bit counters, set bits 7:10 to 0
+	/*
+	 * Oprofile will use 32 bit counters, set bits 7:10 to 0
 	 * pmregs.pm_cntrl is a global
 	 */
=20
@@ -362,7 +372,8 @@ static void write_pm_cntrl(int cpu)
 	if (pm_regs.pm_cntrl.freeze =3D=3D 1)
 		val |=3D CBE_PM_FREEZE_ALL_CTRS;
=20
=2D	/* Routine set_count_mode must be called previously to set
+	/*
+	 * Routine set_count_mode must be called previously to set
 	 * the count mode based on the user selection of user and kernel.
 	 */
 	val |=3D CBE_PM_COUNT_MODE_SET(pm_regs.pm_cntrl.count_mode);
@@ -372,7 +383,8 @@ static void write_pm_cntrl(int cpu)
 static inline void
 set_count_mode(u32 kernel, u32 user)
 {
=2D	/* The user must specify user and kernel if they want them. If
+	/*
+	 * The user must specify user and kernel if they want them. If
 	 *  neither is specified, OProfile will count in hypervisor mode.
 	 *  pm_regs.pm_cntrl is a global
 	 */
@@ -413,17 +425,18 @@ static inline void enable_ctr(u32 cpu, u
  * pair of per-cpu arrays is used for storing the previous and next
  * pmc values for a given node.
  * NOTE: We use the per-cpu variable to improve cache performance.
+ *
+ * This routine will alternate loading the virtual counters for
+ * virtual CPUs
  */
 static void cell_virtual_cntr(unsigned long data)
 {
=2D	/* This routine will alternate loading the virtual counters for
=2D	 * virtual CPUs
=2D	 */
 	int i, prev_hdw_thread, next_hdw_thread;
 	u32 cpu;
 	unsigned long flags;
=20
=2D	/* Make sure that the interrupt_hander and the virt counter are
+	/*
+	 * Make sure that the interrupt_hander and the virt counter are
 	 * not both playing with the counters on the same node.
 	 */
=20
@@ -435,22 +448,25 @@ static void cell_virtual_cntr(unsigned l
 	hdw_thread =3D 1 ^ hdw_thread;
 	next_hdw_thread =3D hdw_thread;
=20
=2D	for (i =3D 0; i < num_counters; i++)
=2D	/* There are some per thread events.  Must do the
+	/*
+	 * There are some per thread events.  Must do the
 	 * set event, for the thread that is being started
 	 */
+	for (i =3D 0; i < num_counters; i++)
 		set_pm_event(i,
 			pmc_cntrl[next_hdw_thread][i].evnts,
 			pmc_cntrl[next_hdw_thread][i].masks);
=20
=2D	/* The following is done only once per each node, but
+	/*
+	 * The following is done only once per each node, but
 	 * we need cpu #, not node #, to pass to the cbe_xxx functions.
 	 */
 	for_each_online_cpu(cpu) {
 		if (cbe_get_hw_thread_id(cpu))
 			continue;
=20
=2D		/* stop counters, save counter values, restore counts
+		/*
+		 * stop counters, save counter values, restore counts
 		 * for previous thread
 		 */
 		cbe_disable_pm(cpu);
@@ -479,13 +495,15 @@ static void cell_virtual_cntr(unsigned l
 						      next_hdw_thread)[i]);
 		}
=20
=2D		/* Switch to the other thread. Change the interrupt
+		/*
+		 * Switch to the other thread. Change the interrupt
 		 * and control regs to be scheduled on the CPU
 		 * corresponding to the thread to execute.
 		 */
 		for (i =3D 0; i < num_counters; i++) {
 			if (pmc_cntrl[next_hdw_thread][i].enabled) {
=2D				/* There are some per thread events.
+				/*
+				 * There are some per thread events.
 				 * Must do the set event, enable_cntr
 				 * for each cpu.
 				 */
@@ -517,9 +535,8 @@ static void start_virt_cntrs(void)
 }
=20
 /* This function is called once for all cpus combined */
=2Dstatic int
=2Dcell_reg_setup(struct op_counter_config *ctr,
=2D	       struct op_system_config *sys, int num_ctrs)
+static int cell_reg_setup(struct op_counter_config *ctr,
+			struct op_system_config *sys, int num_ctrs)
 {
 	int i, j, cpu;
 	spu_cycle_reset =3D 0;
@@ -527,7 +544,8 @@ cell_reg_setup(struct op_counter_config=20
 	if (ctr[0].event =3D=3D SPU_CYCLES_EVENT_NUM) {
 		spu_cycle_reset =3D ctr[0].count;
=20
=2D		/* Each node will need to make the rtas call to start
+		/*
+		 * Each node will need to make the rtas call to start
 		 * and stop SPU profiling.  Get the token once and store it.
 		 */
 		spu_rtas_token =3D rtas_token("ibm,cbe-spu-perftools");
@@ -542,7 +560,8 @@ cell_reg_setup(struct op_counter_config=20
=20
 	pm_rtas_token =3D rtas_token("ibm,cbe-perftools");
=20
=2D	/* For all events excetp PPU CYCLEs, each node will need to make
+	/*
+	 * For all events excetp PPU CYCLEs, each node will need to make
 	 * the rtas cbe-perftools call to setup and reset the debug bus.
 	 * Make the token lookup call once and store it in the global
 	 * variable pm_rtas_token.
@@ -579,7 +598,8 @@ cell_reg_setup(struct op_counter_config=20
 			per_cpu(pmc_values, j)[i] =3D 0;
 	}
=20
=2D	/* Setup the thread 1 events, map the thread 0 event to the
+	/*
+	 * Setup the thread 1 events, map the thread 0 event to the
 	 * equivalent thread 1 event.
 	 */
 	for (i =3D 0; i < num_ctrs; ++i) {
@@ -603,7 +623,8 @@ cell_reg_setup(struct op_counter_config=20
 	for (i =3D 0; i < NUM_INPUT_BUS_WORDS; i++)
 		input_bus[i] =3D 0xff;
=20
=2D	/* Our counters count up, and "count" refers to
+	/*
+	 * Our counters count up, and "count" refers to
 	 * how much before the next interrupt, and we interrupt
 	 * on overflow.	 So we calculate the starting value
 	 * which will give us "count" until overflow.
@@ -667,19 +688,19 @@ static int cell_cpu_setup(struct op_coun
 		}
 	}
=20
=2D	/* the pm_rtas_activate_signals will return -EIO if the FW
+	/*
+	 * The pm_rtas_activate_signals will return -EIO if the FW
 	 * call failed.
 	 */
=2D	return (pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled));
=2D
+	return pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled);
 }
=20
 #define ENTRIES	 303
 #define MAXLFSR	 0xFFFFFF
=20
 /* precomputed table of 24 bit LFSR values */
=2Dint initial_lfsr[] =3D
=2D{8221349, 12579195, 5379618, 10097839, 7512963, 7519310, 3955098, 107534=
24,
+static int initial_lfsr[] =3D {
+ 8221349, 12579195, 5379618, 10097839, 7512963, 7519310, 3955098, 10753424,
  15507573, 7458917, 285419, 2641121, 9780088, 3915503, 6668768, 1548716,
  4885000, 8774424, 9650099, 2044357, 2304411, 9326253, 10332526, 4421547,
  3440748, 10179459, 13332843, 10375561, 1313462, 8375100, 5198480, 6071392,
@@ -716,7 +737,8 @@ int initial_lfsr[] =3D
  3258216, 12505185, 6007317, 9218111, 14661019, 10537428, 11731949, 902700=
3,
  6641507, 9490160, 200241, 9720425, 16277895, 10816638, 1554761, 10431375,
  7467528, 6790302, 3429078, 14633753, 14428997, 11463204, 3576212, 2003426,
=2D 6123687, 820520, 9992513, 15784513, 5778891, 6428165, 8388607};
+ 6123687, 820520, 9992513, 15784513, 5778891, 6428165, 8388607
+};
=20
 /*
  * The hardware uses an LFSR counting sequence to determine when to capture
@@ -777,28 +799,25 @@ int initial_lfsr[] =3D
=20
 static int calculate_lfsr(int n)
 {
=2D	/* The ranges and steps are in powers of 2 so the calculations
+	/*
+	 * The ranges and steps are in powers of 2 so the calculations
 	 * can be done using shifts rather then divide.
 	 */
 	int index;
=20
=2D	if ((n >> 16) =3D=3D 0) {
+	if ((n >> 16) =3D=3D 0)
 		index =3D 0;
=2D
=2D	} else if (((n - V2_16) >> 19) =3D=3D 0) {
+	else if (((n - V2_16) >> 19) =3D=3D 0)
 		index =3D ((n - V2_16) >> 12) + 1;
=2D
=2D	} else if (((n - V2_16 - V2_19) >> 22) =3D=3D 0) {
+	else if (((n - V2_16 - V2_19) >> 22) =3D=3D 0)
 		index =3D ((n - V2_16 - V2_19) >> 15 ) + 1 + 128;
+	else if (((n - V2_16 - V2_19 - V2_22) >> 24) =3D=3D 0)
+		index =3D ((n - V2_16 - V2_19 - V2_22) >> 18 ) + 1 + 256;
+	else
+		index =3D ENTRIES-1;
=20
=2D	} else if (((n - V2_16 - V2_19 - V2_22) >> 24) =3D=3D 0) {
=2D		index =3D ((n - V2_16 - V2_19 - V2_22) >> 18 )
=2D			+ 1 + 256;
=2D	}
=2D
=2D	if ((index > ENTRIES) || (index < 0))	/* make sure index is
=2D						 * valid
=2D						 */
+	/* make sure index is valid */
+	if ((index > ENTRIES) || (index < 0))
 		index =3D ENTRIES-1;
=20
 	return initial_lfsr[index];
@@ -809,15 +828,17 @@ static int pm_rtas_activate_spu_profilin
 	int ret, i;
 	struct pm_signal pm_signal_local[NR_PHYS_CTRS];
=20
=2D	/* Set up the rtas call to configure the debug bus to
=2D	 * route the SPU PCs.  Setup the pm_signal for each SPU */
+	/*
+	 * Set up the rtas call to configure the debug bus to
+	 * route the SPU PCs.  Setup the pm_signal for each SPU
+	 */
 	for (i =3D 0; i < NUM_SPUS_PER_NODE; i++) {
 		pm_signal_local[i].cpu =3D node;
 		pm_signal_local[i].signal_group =3D 41;
=2D		pm_signal_local[i].bus_word =3D 1 << i / 2; /* spu i on
=2D							   * word (i/2)
=2D							   */
=2D		pm_signal_local[i].sub_unit =3D i;	/* spu i */
+		/* spu i on word (i/2) */
+		pm_signal_local[i].bus_word =3D 1 << i / 2;
+		/* spu i */
+		pm_signal_local[i].sub_unit =3D i;
 		pm_signal_local[i].bit =3D 63;
 	}
=20
@@ -858,8 +879,8 @@ static int cell_global_start_spu(struct=20
 	int subfunc, rtn_value;
 	unsigned int lfsr_value;
 	int cpu;
=2D	int ret =3D 0;
=2D	int rtas_error =3D 0;
+	int ret;
+	int rtas_error;
 	unsigned int cpu_khzfreq =3D 0;
=20
 	/* The SPU profiling uses time-based profiling based on
@@ -884,24 +905,23 @@ static int cell_global_start_spu(struct=20
 	for_each_online_cpu(cpu) {
 		if (cbe_get_hw_thread_id(cpu))
 			continue;
=2D		/* Setup SPU cycle-based profiling.
+
+		/*
+		 * Setup SPU cycle-based profiling.
 		 * Set perf_mon_control bit 0 to a zero before
 		 * enabling spu collection hardware.
 		 */
 		cbe_write_pm(cpu, pm_control, 0);
=20
 		if (spu_cycle_reset > MAX_SPU_COUNT)
=2D			/* use largest possible value
=2D			 */
+			/* use largest possible value */
 			lfsr_value =3D calculate_lfsr(MAX_SPU_COUNT-1);
 		else
=2D		    lfsr_value =3D calculate_lfsr(spu_cycle_reset);
+			lfsr_value =3D calculate_lfsr(spu_cycle_reset);
=20
=2D		if (lfsr_value =3D=3D 0) {	/* must use a non zero value.  Zero
=2D					 * disables data collection.
=2D					 */
=2D				lfsr_value =3D calculate_lfsr(1);
=2D		}
+		/* must use a non zero value. Zero disables data collection. */
+		if (lfsr_value =3D=3D 0)
+			lfsr_value =3D calculate_lfsr(1);
=20
 		lfsr_value =3D lfsr_value << 8; /* shift lfsr to correct
 						* register location
@@ -916,7 +936,7 @@ static int cell_global_start_spu(struct=20
 		}
=20
=20
=2D		subfunc =3D 2;	// 2 - activate SPU tracing, 3 - deactivate
+		subfunc =3D 2;	/* 2 - activate SPU tracing, 3 - deactivate */
=20
 		/* start profiling */
 		rtn_value =3D rtas_call(spu_rtas_token, 3, 1, NULL, subfunc,
@@ -976,7 +996,8 @@ static int cell_global_start_ppu(struct=20
 	oprofile_running =3D 1;
 	smp_wmb();
=20
=2D	/* NOTE: start_virt_cntrs will result in cell_virtual_cntr() being
+	/*
+	 * NOTE: start_virt_cntrs will result in cell_virtual_cntr() being
 	 * executed which manipulates the PMU.	We start the "virtual counter"
 	 * here so that we do not need to synchronize access to the PMU in
 	 * the above for-loop.
@@ -986,7 +1007,6 @@ static int cell_global_start_ppu(struct=20
 	return 0;
 }
=20
=2D
 static int cell_global_start(struct op_counter_config *ctr)
 {
 	if (spu_cycle_reset) {
@@ -996,14 +1016,15 @@ static int cell_global_start(struct op_c
 	}
 }
=20
=2Dstatic void cell_global_stop_spu(void)
=2D/* Note the generic OProfile stop calls do not support returning
+/*
+ * Note the generic OProfile stop calls do not support returning
  * an error on stop.  Hence, will not return an error if the FW
  * calls fail on stop.	Failure to reset the debug bus is not an issue.
  * Failure to disable the SPU profiling is not an issue.  The FW calls
  * to enable the performance counters and debug bus will work even if
  * the hardware was not cleanly reset.
  */
+static void cell_global_stop_spu(void)
 {
 	int subfunc, rtn_value;
 	unsigned int lfsr_value;
@@ -1020,7 +1041,8 @@ static void cell_global_stop_spu(void)
 		if (cbe_get_hw_thread_id(cpu))
 			continue;
=20
=2D		subfunc =3D 3;	/* 2 - activate SPU tracing,
+		subfunc =3D 3;	/*
+				 * 2 - activate SPU tracing,
 				 * 3 - deactivate
 				 */
 		lfsr_value =3D 0x8f100000;
@@ -1046,7 +1068,8 @@ static void cell_global_stop_ppu(void)
 {
 	int cpu;
=20
=2D	/* This routine will be called once for the system.
+	/*
+	 * This routine will be called once for the system.
 	 * There is one performance monitor per node, so we
 	 * only need to perform this function once per node.
 	 */
@@ -1079,8 +1102,8 @@ static void cell_global_stop(void)
 	}
 }
=20
=2Dstatic void
=2Dcell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ct=
r)
+static void cell_handle_interrupt(struct pt_regs *regs,
+				struct op_counter_config *ctr)
 {
 	u32 cpu;
 	u64 pc;
@@ -1091,13 +1114,15 @@ cell_handle_interrupt(struct pt_regs *re
=20
 	cpu =3D smp_processor_id();
=20
=2D	/* Need to make sure the interrupt handler and the virt counter
+	/*
+	 * Need to make sure the interrupt handler and the virt counter
 	 * routine are not running at the same time. See the
 	 * cell_virtual_cntr() routine for additional comments.
 	 */
 	spin_lock_irqsave(&virt_cntr_lock, flags);
=20
=2D	/* Need to disable and reenable the performance counters
+	/*
+	 * Need to disable and reenable the performance counters
 	 * to get the desired behavior from the hardware.  This
 	 * is hardware specific.
 	 */
@@ -1106,7 +1131,8 @@ cell_handle_interrupt(struct pt_regs *re
=20
 	interrupt_mask =3D cbe_get_and_clear_pm_interrupts(cpu);
=20
=2D	/* If the interrupt mask has been cleared, then the virt cntr
+	/*
+	 * If the interrupt mask has been cleared, then the virt cntr
 	 * has cleared the interrupt.  When the thread that generated
 	 * the interrupt is restored, the data count will be restored to
 	 * 0xffffff0 to cause the interrupt to be regenerated.
@@ -1124,7 +1150,8 @@ cell_handle_interrupt(struct pt_regs *re
 			}
 		}
=20
=2D		/* The counters were frozen by the interrupt.
+		/*
+		 * The counters were frozen by the interrupt.
 		 * Reenable the interrupt and restart the counters.
 		 * If there was a race between the interrupt handler and
 		 * the virtual counter routine.	 The virutal counter
@@ -1134,7 +1161,8 @@ cell_handle_interrupt(struct pt_regs *re
 		cbe_enable_pm_interrupts(cpu, hdw_thread,
 					 virt_cntr_inter_mask);
=20
=2D		/* The writes to the various performance counters only writes
+		/*
+		 * The writes to the various performance counters only writes
 		 * to a latch.	The new values (interrupt setting bits, reset
 		 * counter value etc.) are not copied to the actual registers
 		 * until the performance monitor is enabled.  In order to get
@@ -1147,7 +1175,8 @@ cell_handle_interrupt(struct pt_regs *re
 	spin_unlock_irqrestore(&virt_cntr_lock, flags);
 }
=20
=2D/* This function is called from the generic OProfile
+/*
+ * This function is called from the generic OProfile
  * driver.  When profiling PPUs, we need to do the
  * generic sync start; otherwise, do spu_sync_start.
  */
@@ -1167,7 +1196,6 @@ static int cell_sync_stop(void)
 		return 1;
 }
=20
=2D
 struct op_powerpc_model op_model_cell =3D {
 	.reg_setup =3D cell_reg_setup,
 	.cpu_setup =3D cell_cpu_setup,

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-22  0:02 Carl Love
  2007-02-26 23:50   ` Arnd Bergmann
  0 siblings, 1 reply; 66+ messages in thread
From: Carl Love @ 2007-02-22  0:02 UTC (permalink / raw)
  To: linuxppc-dev, cbe-oss-dev, linux-kernel, oprofile-list

This is the third update to the patch previously posted by Maynard
Johnson as "PATCH 4/4. Add support to OProfile for profiling CELL".  

This posting has the following changes:
- Added lib support but it is untested. Waiting on a test case.
- LFSR calculation is completely table driven
- Detecting overlay switches and discarding samples collected when the 
  overlay occured.
- Fixed the Kconfig file.
- RTAS token call and returning error value reworked
- Added lock around the samples array access.
- SPU overlay support validated (there wasn't a bug)
- Misc changes per other minor review commnets

The following are still outstanding issues:
- Samples from dynamic code on the stack (stubs) are still being 
  silently dropped.  Still plan on putting them into anonymous bucket.
- Working on draining samples when context switch occurs.
- File renaming and refactoring suggestions have not been done.  There 
  is still ongoing discussions about this.
- Moving file offset code from kernel to user space has not been done.  
  This is still being discussed.

Would really like to have Anton Blanchard or similar person look over
the non CELL specific OProfile code changes.  

Subject: Add support to OProfile for profiling Cell BE SPUs

From: Maynard Johnson <maynardj@us.ibm.com>

This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
code.

Signed-off-by: Carl Love <carll@us.ibm.com>
Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com>

Index: linux-2.6.20/arch/powerpc/configs/cell_defconfig
===================================================================
--- linux-2.6.20.orig/arch/powerpc/configs/cell_defconfig	2007-02-20 13:49:02.021236368 -0600
+++ linux-2.6.20/arch/powerpc/configs/cell_defconfig	2007-02-20 13:49:52.760242968 -0600
@@ -1415,7 +1415,7 @@
 # Instrumentation Support
 #
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
 # CONFIG_KPROBES is not set
 
 #
Index: linux-2.6.20/arch/powerpc/oprofile/cell/pr_util.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20/arch/powerpc/oprofile/cell/pr_util.h	2007-02-21 17:28:54.609263688 -0600
@@ -0,0 +1,88 @@
+ /*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef PR_UTIL_H
+#define PR_UTIL_H
+
+#include <linux/cpumask.h>
+#include <linux/oprofile.h>
+#include <asm/cell-pmu.h>
+#include <asm/spu.h>
+
+static inline int number_of_online_nodes(void)
+{
+	u32 cpu; u32 tmp;
+	int nodes = 0;
+	for_each_online_cpu(cpu) {
+		tmp = cbe_cpu_to_node(cpu) + 1;
+		if (tmp > nodes)
+			nodes++;
+	}
+	return nodes;
+}
+
+/* Defines used for sync_start */
+#define SKIP_GENERIC_SYNC 0
+#define SYNC_START_ERROR -1
+#define DO_GENERIC_SYNC 1
+
+struct  spu_overlay_info
+{
+	unsigned int vma;
+	unsigned int size;
+	unsigned int offset;
+	unsigned int buf;
+};
+
+struct vma_to_fileoffset_map
+{
+	struct vma_to_fileoffset_map *next;
+	unsigned int vma;
+	unsigned int size;
+	unsigned int offset;
+	unsigned int guard_ptr;
+	unsigned int guard_val;
+};
+
+/* The three functions below are for maintaining and accessing
+ * the vma-to-fileoffset map.
+ */
+struct vma_to_fileoffset_map * create_vma_map(const struct spu * spu,
+					      u64 objectid);
+unsigned int vma_map_lookup(struct vma_to_fileoffset_map *map,
+			    unsigned int vma, const struct spu * aSpu,
+			    int * grd_val);
+void vma_map_free(struct vma_to_fileoffset_map *map);
+
+/*
+ * Entry point for SPU profiling.
+ * cycles_reset is the SPU_CYCLES count value specified by the user.
+ */
+void start_spu_profiling(unsigned int cycles_reset);
+
+void stop_spu_profiling(void);
+
+
+/* add the necessary profiling hooks */
+int spu_sync_start(void);
+
+/* remove the hooks */
+int spu_sync_stop(void);
+
+/* Record SPU program counter samples to the oprofile event buffer. */
+void spu_sync_buffer(int spu_num, unsigned int * samples,
+		     int num_samples);
+
+void set_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset);
+
+#endif    // PR_UTIL_H
Index: linux-2.6.20/arch/powerpc/oprofile/cell/spu_profiler.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20/arch/powerpc/oprofile/cell/spu_profiler.c	2007-02-21 17:28:54.610263536 -0600
@@ -0,0 +1,220 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Authors: Maynard Johnson <maynardj@us.ibm.com>
+ *          Carl Love <carll@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/hrtimer.h>
+#include <linux/smp.h>
+#include <linux/slab.h>
+#include <asm/cell-pmu.h>
+#include <asm/time.h>
+#include "pr_util.h"
+
+#define TRACE_ARRAY_SIZE 1024
+#define SCALE_SHIFT 14
+
+static u32 * samples;
+
+static int spu_prof_running = 0;
+static unsigned int profiling_interval = 0;
+
+extern int spu_prof_num_nodes;
+
+
+#define NUM_SPU_BITS_TRBUF 16
+#define SPUS_PER_TB_ENTRY   4
+#define SPUS_PER_NODE       8
+
+#define SPU_PC_MASK         0xFFFF
+
+static spinlock_t sample_array_lock=SPIN_LOCK_UNLOCKED;
+unsigned long sample_array_lock_flags;
+
+void set_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset)
+{
+	unsigned long nsPerCyc;
+	if (!freq_khz)
+		freq_khz = ppc_proc_freq/1000;
+
+	/* To calculate a timeout in nanoseconds, the basic
+	 * formula is ns = cycles_reset * (NSEC_PER_SEC / cpu frequency).
+	 * To avoid floating point math, we use the scale math
+	 * technique as described in linux/jiffies.h.  We use
+	 * a scale factor of SCALE_SHIFT,which provides 4 decimal places
+	 * of precision, which is close enough for the purpose at hand.
+	 *
+	 * The value of the timeout should be small enough that the hw
+	 * trace buffer will not get more then a bout 1/3 full for the
+	 * maximum user specified (the LFSR value) hw sampling frequency.
+	 * This is to ensure the trace buffer will never fill even if the
+	 * kernel thread scheduling varies under a heavey system load.
+	 */
+
+	nsPerCyc = (USEC_PER_SEC << SCALE_SHIFT)/freq_khz;
+	profiling_interval = (nsPerCyc * cycles_reset) >> SCALE_SHIFT;
+
+}
+
+/*
+ * Extract SPU PC from trace buffer entry
+ */
+static void spu_pc_extract(int cpu, int entry)
+{
+	/* the trace buffer is 128 bits */
+	u64 trace_buffer[2];
+	u64 spu_mask;
+	int spu;
+
+	spu_mask = SPU_PC_MASK;
+
+	/* Each SPU PC is 16 bits; hence, four spus in each of
+	 * the two 64-bit buffer entries that make up the
+	 * 128-bit trace_buffer entry.  Process two 64-bit values
+	 * simultaneously.
+	 * trace[0] SPU PC contents are: 0 1 2 3
+	 * trace[1] SPU PC contents are: 4 5 6 7
+	 */
+
+	cbe_read_trace_buffer(cpu, trace_buffer);
+
+	for (spu = SPUS_PER_TB_ENTRY-1; spu >= 0; spu--) {
+		/* spu PC trace entry is upper 16 bits of the
+		 * 18 bit SPU program counter
+		 */
+		samples[spu * TRACE_ARRAY_SIZE + entry]
+			= (spu_mask & trace_buffer[0]) << 2;
+		samples[(spu + SPUS_PER_TB_ENTRY) * TRACE_ARRAY_SIZE + entry]
+			= (spu_mask & trace_buffer[1]) << 2;
+
+		trace_buffer[0] = trace_buffer[0] >> NUM_SPU_BITS_TRBUF;
+		trace_buffer[1] = trace_buffer[1] >> NUM_SPU_BITS_TRBUF;
+	}
+}
+
+static int cell_spu_pc_collection(int cpu)
+{
+	u32 trace_addr;
+	int entry;
+
+	/* process the collected SPU PC for the node */
+
+	entry = 0;
+
+	trace_addr = cbe_read_pm(cpu, trace_address);
+	while (!(trace_addr & CBE_PM_TRACE_BUF_EMPTY))
+	{
+		/* there is data in the trace buffer to process */
+		spu_pc_extract(cpu, entry);
+
+		entry++;
+
+		if (entry >= TRACE_ARRAY_SIZE)
+			/* spu_samples is full */
+			break;
+
+		trace_addr = cbe_read_pm(cpu, trace_address);
+	}
+
+	return(entry);
+}
+
+
+static int profile_spus(struct hrtimer * timer)
+{
+	ktime_t kt;
+	int cpu, node, k, num_samples, spu_num;
+
+	if (!spu_prof_running)
+		goto stop;
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+
+		node = cbe_cpu_to_node(cpu);
+
+		/* There should only be on kernel thread at a time processing
+		 * the samples.  In the very unlikely case that the processing
+		 * is taking a very long time and multiple kernel threads are
+		 * started to process the samples.  Make sure only one kernel
+		 * thread is working on the samples array at a time.  The
+		 * sample array must be loaded and then processed for a given
+		 * cpu.  The sample array is not per cpu.
+		 */
+		spin_lock_irqsave(&sample_array_lock,
+				  sample_array_lock_flags);
+		num_samples = cell_spu_pc_collection(cpu);
+
+		if (num_samples == 0) {
+			spin_unlock_irqrestore(&sample_array_lock,
+					       sample_array_lock_flags);
+			continue;
+		}
+
+		for (k = 0; k < SPUS_PER_NODE; k++) {
+			spu_num = k + (node * SPUS_PER_NODE);
+			spu_sync_buffer(spu_num,
+					samples + (k * TRACE_ARRAY_SIZE),
+					num_samples);
+		}
+
+		spin_unlock_irqrestore(&sample_array_lock,
+				       sample_array_lock_flags);
+
+	}
+	smp_wmb();
+
+	kt = ktime_set(0, profiling_interval);
+	if (!spu_prof_running)
+		goto stop;
+	hrtimer_forward(timer, timer->base->get_time(), kt);
+	return HRTIMER_RESTART;
+
+ stop:
+	printk(KERN_INFO "SPU_PROF: spu-prof timer ending\n");
+	return HRTIMER_NORESTART;
+}
+
+static struct hrtimer timer;
+/*
+ * Entry point for SPU profiling.
+ * NOTE:  SPU profiling is done system-wide, not per-CPU.
+ *
+ * cycles_reset is the count value specified by the user when
+ * setting up OProfile to count SPU_CYCLES.
+ */
+void start_spu_profiling(unsigned int cycles_reset) {
+
+	ktime_t kt;
+
+	pr_debug("timer resolution: %lu\n",
+		 TICK_NSEC);
+	kt = ktime_set(0, profiling_interval);
+	hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_REL);
+	timer.expires = kt;
+	timer.function = profile_spus;
+
+	/* Allocate arrays for collecting SPU PC samples */
+	samples = (u32 *) kzalloc(SPUS_PER_NODE *
+				  TRACE_ARRAY_SIZE * sizeof(u32), GFP_KERNEL);
+
+	spu_prof_running = 1;
+	hrtimer_start(&timer, kt, HRTIMER_REL);
+}
+
+void stop_spu_profiling(void)
+{
+	spu_prof_running = 0;
+	hrtimer_cancel(&timer);
+	kfree(samples);
+	pr_debug("SPU_PROF: stop_spu_profiling issued\n");
+}
Index: linux-2.6.20/arch/powerpc/oprofile/cell/spu_task_sync.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20/arch/powerpc/oprofile/cell/spu_task_sync.c	2007-02-21 17:28:54.610263536 -0600
@@ -0,0 +1,487 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* The purpose of this file is to handle SPU event task switching
+ * and to record SPU context information into the OProfile
+ * event buffer.
+ *
+ * Additionally, the spu_sync_buffer function is provided as a helper
+ * for recoding actual SPU program counter samples to the event buffer.
+ */
+#include <linux/dcookies.h>
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/numa.h>
+#include <linux/oprofile.h>
+#include <linux/spinlock.h>
+#include "pr_util.h"
+
+#define RELEASE_ALL 9999
+
+static spinlock_t buffer_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t cache_lock = SPIN_LOCK_UNLOCKED;
+static int num_spu_nodes;
+int spu_prof_num_nodes;
+int last_guard_val[MAX_NUMNODES * 8];
+
+/* Container for caching information about an active SPU task. */
+struct cached_info {
+	struct vma_to_fileoffset_map * map;
+	struct spu * the_spu;   /* needed to access pointer to local_store */
+	struct kref cache_ref;
+};
+
+static struct cached_info * spu_info[MAX_NUMNODES * 8];
+
+static void destroy_cached_info(struct kref * kref)
+{
+	struct cached_info * info;
+	info = container_of(kref, struct cached_info, cache_ref);
+	vma_map_free(info->map);
+	kfree(info);
+	module_put(THIS_MODULE);
+}
+
+/* Return the cached_info for the passed SPU number.
+ * ATTENTION:  Callers are responsible for obtaining the
+ *             cache_lock if needed prior to invoking this function.
+ */
+static struct cached_info * get_cached_info(struct spu * the_spu, int spu_num)
+{
+	struct kref * ref;
+	struct cached_info * ret_info = NULL;
+	if (spu_num >= num_spu_nodes) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Invalid index %d into spu info cache\n",
+		       __FUNCTION__, __LINE__, spu_num);
+		goto out;
+	}
+	if (!spu_info[spu_num] && the_spu) {
+		ref = spu_get_profile_private_kref(the_spu->ctx);
+		if (ref) {
+			spu_info[spu_num] = container_of(ref, struct cached_info, cache_ref);
+			kref_get(&spu_info[spu_num]->cache_ref);
+		}
+	}
+
+	ret_info = spu_info[spu_num];
+ out:
+	return ret_info;
+}
+
+
+/* Looks for cached info for the passed spu.  If not found, the
+ * cached info is created for the passed spu.
+ * Returns 0 for success; otherwise, -1 for error.
+ */
+static int
+prepare_cached_spu_info(struct spu * spu, unsigned int objectId)
+{
+	unsigned long flags = 0;
+	struct vma_to_fileoffset_map * new_map;
+	int retval = 0;
+	struct cached_info * info;
+
+        /* We won't bother getting cache_lock here since
+	 * don't do anything with the cached_info that's returned.
+	 */
+	info = get_cached_info(spu, spu->number);
+
+	if (info) {
+		pr_debug("Found cached SPU info.\n");
+		goto out;
+	}
+
+	/* Create cached_info and set spu_info[spu->number] to point to it.
+	 * spu->number is a system-wide value, not a per-node value.
+	 */
+	info = kzalloc(sizeof(struct cached_info), GFP_KERNEL);
+	if (!info) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: create vma_map failed\n",
+		       __FUNCTION__, __LINE__);
+		goto err_alloc;
+	}
+	new_map = create_vma_map(spu, objectId);
+	if (!new_map) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: create vma_map failed\n",
+		       __FUNCTION__, __LINE__);
+		goto err_alloc;
+	}
+
+	pr_debug("Created vma_map\n");
+	info->map = new_map;
+	info->the_spu = spu;
+	kref_init(&info->cache_ref);
+	spin_lock_irqsave(&cache_lock, flags);
+	spu_info[spu->number] = info;
+	/* Increment count before passing off ref to SPUFS. */
+	kref_get(&info->cache_ref);
+
+        /* We increment the module refcount here since SPUFS is
+	 * responsible for the final destruction of the cached_info,
+	 * and it must be able to access the destroy_cached_info()
+	 * function defined in the OProfile module.  We decrement
+	 * the module refcount in destroy_cached_info.
+	 */
+	try_module_get(THIS_MODULE);
+	spu_set_profile_private_kref(spu->ctx, &info->cache_ref,
+				destroy_cached_info);
+	spin_unlock_irqrestore(&cache_lock, flags);
+	goto out;
+
+err_alloc:
+	retval = -1;
+out:
+	return retval;
+}
+
+/*
+ * NOTE:  The caller is responsible for locking the
+ *	  cache_lock prior to calling this function.
+ */
+static int release_cached_info(int spu_index)
+{
+	int index, end;
+	if (spu_index == RELEASE_ALL) {
+		end = num_spu_nodes;
+		index = 0;
+	} else {
+	        if (spu_index >= num_spu_nodes) {
+        	        printk(KERN_ERR "SPU_PROF: "
+			       "%s, line %d: "
+			       "Invalid index %d into spu info cache\n",
+               	               __FUNCTION__, __LINE__, spu_index);
+	                goto out;
+	        }
+		end = spu_index +1;
+		index = spu_index;
+	}
+	for (; index < end; index++) {
+		if (spu_info[index]) {
+			kref_put(&spu_info[index]->cache_ref,
+				 destroy_cached_info);
+			spu_info[index] = NULL;
+		}
+	}
+
+out:
+	return 0;
+}
+
+/* The source code for fast_get_dcookie was "borrowed"
+ * from drivers/oprofile/buffer_sync.c.
+ */
+
+/* Optimisation. We can manage without taking the dcookie sem
+ * because we cannot reach this code without at least one
+ * dcookie user still being registered (namely, the reader
+ * of the event buffer).
+ */
+static inline unsigned long fast_get_dcookie(struct dentry * dentry,
+					     struct vfsmount * vfsmnt)
+{
+	unsigned long cookie;
+
+	if (dentry->d_cookie)
+		return (unsigned long)dentry;
+	get_dcookie(dentry, vfsmnt, &cookie);
+	return cookie;
+}
+
+/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
+ * which corresponds loosely to "application name". Also, determine
+ * the offset for the SPU ELF object.  If computed offset is
+ * non-zero, it implies an embedded SPU object; otherwise, it's a
+ * separate SPU binary, in which case we retrieve it's dcookie.
+ * For the embedded case, we must determine if SPU ELF is embedded
+ * in the executable application or another file (i.e., shared lib).
+ * If embedded in a shared lib, we must get the dcookie and return
+ * that to the caller.
+ */
+static unsigned long
+get_exec_dcookie_and_offset(struct spu * spu, unsigned int * offsetp,
+			    unsigned long * spu_bin_dcookie,
+			    unsigned long * shlib_dcookie, 
+			    unsigned int spu_ref)
+{
+	unsigned long app_cookie = 0;
+	unsigned long * image_cookie = NULL;
+	unsigned int my_offset = 0;
+	struct file * app = NULL;
+	struct vm_area_struct * vma;
+	struct mm_struct * mm = spu->mm;
+
+	if (!mm)
+		goto out;
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (!vma->vm_file)
+			continue;
+		if (!(vma->vm_flags & VM_EXECUTABLE))
+			continue;
+		app_cookie = fast_get_dcookie(vma->vm_file->f_dentry,
+					  vma->vm_file->f_vfsmnt);
+		pr_debug("got dcookie for %s\n",
+			 vma->vm_file->f_dentry->d_name.name);
+		app = vma->vm_file;
+		break;
+	}
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->vm_start > spu_ref || vma->vm_end <= spu_ref)
+			continue;
+		my_offset = spu_ref - vma->vm_start;
+		if (!vma->vm_file)
+			goto fail_no_image_cookie;
+
+		pr_debug("Found spu ELF at %X(object-id:%X) for file %s\n",
+			 my_offset, spu_ref,
+			 vma->vm_file->f_dentry->d_name.name);
+		*offsetp = my_offset;
+		if (my_offset == 0)
+			image_cookie = spu_bin_dcookie;
+		else if (vma->vm_file != app)
+			image_cookie = shlib_dcookie;
+		break;
+	}
+
+	if (image_cookie) {
+		*image_cookie = fast_get_dcookie(vma->vm_file->f_dentry,
+						 vma->vm_file->f_vfsmnt);
+		pr_debug("got dcookie for %s\n",
+			 vma->vm_file->f_dentry->d_name.name);
+	}
+
+ out:
+	return app_cookie;
+
+ fail_no_image_cookie:
+	printk(KERN_ERR "SPU_PROF: "
+	       "%s, line %d: Cannot find dcookie for SPU binary\n",
+	       __FUNCTION__, __LINE__);
+	goto out;
+}
+
+
+
+/* This function finds or creates cached context information for the
+ * passed SPU and records SPU context information into the OProfile
+ * event buffer.
+ */
+static int process_context_switch(struct spu * spu, unsigned int objectId)
+{
+	unsigned long flags;
+	int retval = 0;
+	unsigned int offset = 0;
+	unsigned long spu_cookie = 0, app_dcookie = 0, shlib_cookie = 0;
+	retval = prepare_cached_spu_info(spu, objectId);
+	if (retval == -1) {
+		goto out;
+	}
+        /* Get dcookie first because a mutex_lock is taken in that
+	 * code path, so interrupts must not be disabled.
+	 */
+	app_dcookie = get_exec_dcookie_and_offset(spu, &offset, &spu_cookie,
+						  &shlib_cookie, objectId);
+
+        /* Record context info in event buffer */
+	spin_lock_irqsave(&buffer_lock, flags);
+	add_event_entry(ESCAPE_CODE);
+	add_event_entry(SPU_CTX_SWITCH_CODE);
+	add_event_entry(spu->number);
+	add_event_entry(spu->pid);
+	add_event_entry(spu->tgid);
+	add_event_entry(app_dcookie);
+
+	if (offset) {
+		/* When offset is non-zero, the SPU ELF was embedded;
+		 * otherwise, it was loaded from a separate binary file. For
+		 * embedded case, we record the offset into the embedding file
+		 * where the SPU ELF was placed.  The embedding file may be
+		 * either the executable application binary or shared library.
+		 * For the non-embedded case, we record a dcookie that
+		 * points to the location of the separate SPU binary that was
+		 * loaded.
+		 */
+		if (shlib_cookie) {
+			add_event_entry(SPU_SHLIB_COOKIE_CODE);
+			add_event_entry(shlib_cookie);
+		}
+		add_event_entry(SPU_OFFSET_CODE);
+		add_event_entry(offset);
+	} else {
+		add_event_entry(SPU_COOKIE_CODE);
+		add_event_entry(spu_cookie);
+	}
+	spin_unlock_irqrestore(&buffer_lock, flags);
+	smp_wmb();
+out:
+	return retval;
+}
+
+/*
+ * This function is invoked on either a bind_context or unbind_context.
+ * If called for an unbind_context, the val arg is 0; otherwise,
+ * it is the object-id value for the spu context.
+ * The data arg is of type 'struct spu *'.
+ */
+static int spu_active_notify(struct notifier_block * self, unsigned long val,
+			     void * data)
+{
+	int retval;
+	unsigned long flags = 0;
+	struct spu * the_spu = data;
+	pr_debug("SPU event notification arrived\n");
+	if (!val){
+		spin_lock_irqsave(&cache_lock, flags);
+		retval = release_cached_info(the_spu->number);
+		spin_unlock_irqrestore(&cache_lock, flags);
+	} else {
+		retval = process_context_switch(the_spu, val);
+	}
+	return retval;
+}
+
+static struct notifier_block spu_active = {
+	.notifier_call = spu_active_notify,
+};
+
+/* The main purpose of this function is to synchronize
+ * OProfile with SPUFS by registering to be notified of
+ * SPU task switches.
+ *
+ * NOTE: When profiling SPUs, we must ensure that only
+ * spu_sync_start is invoked and not the generic sync_start
+ * in drivers/oprofile/oprof.c.  A return value of
+ * SKIP_GENERIC_SYNC or SYNC_START_ERROR will
+ * accomplish this.
+ */
+int spu_sync_start(void) {
+	int k;
+	int ret = SKIP_GENERIC_SYNC;
+	int register_ret;
+	unsigned long flags = 0;
+	spu_prof_num_nodes = number_of_online_nodes();
+	num_spu_nodes = spu_prof_num_nodes * 8;
+
+	spin_lock_irqsave(&buffer_lock, flags);
+	add_event_entry(ESCAPE_CODE);
+	add_event_entry(SPU_PROFILING_CODE);
+	add_event_entry(num_spu_nodes);
+	spin_unlock_irqrestore(&buffer_lock, flags);
+
+        /* Register for SPU events  */
+	register_ret = spu_switch_event_register(&spu_active);
+	if (register_ret) {
+		ret = SYNC_START_ERROR;
+		goto out;
+	}
+
+	for (k = 0; k < (MAX_NUMNODES * 8); k++)
+		last_guard_val[k] = 0;
+	pr_debug("spu_sync_start -- running.\n");
+out:
+	return ret;
+}
+
+/* Record SPU program counter samples to the oprofile event buffer. */
+void spu_sync_buffer(int spu_num, unsigned int * samples,
+		     int num_samples)
+{
+	unsigned long long file_offset;
+	unsigned long cache_lock_flags = 0;
+	unsigned long buffer_lock_flags = 0;
+	int i;
+	struct vma_to_fileoffset_map * map;
+	struct spu * the_spu;
+	unsigned long long spu_num_ll = spu_num;
+	unsigned long long spu_num_shifted = spu_num_ll << 32;
+	struct cached_info * c_info;
+
+        /* We need to obtain the cache_lock here because it's
+	 * possible that after getting the cached_info, the SPU job
+	 * corresponding to this cached_info may end, thus resulting
+	 * in the destruction of the cached_info.
+	 */
+	spin_lock_irqsave(&cache_lock, cache_lock_flags);
+	c_info = get_cached_info(NULL, spu_num);
+	if (c_info == NULL) {
+        /* This legitimately happens when the SPU task ends before all
+	 * samples are recorded.  No big deal -- so we just drop a few samples.
+	 */
+		pr_debug("SPU_PROF: No cached SPU contex "
+			  "for SPU #%d. Dropping samples.\n", spu_num);
+		spin_unlock_irqrestore(&cache_lock, cache_lock_flags);
+		return ;
+	}
+
+	map = c_info->map;
+	the_spu = c_info->the_spu;
+	spin_lock_irqsave(&buffer_lock, buffer_lock_flags);
+	for (i = 0; i < num_samples; i++) {
+		unsigned int sample = *(samples+i);
+		int grd_val = 0;
+		file_offset = 0;
+		if (sample == 0)
+			continue;
+                file_offset = vma_map_lookup(
+                        map, sample, the_spu, &grd_val);
+
+                /* If overlays are used by this SPU application, the guard
+		 * value is non-zero, indicating which overlay section is in
+		 * use.  We need to discard samples taken during the time
+		 * period which an overlay occurs (i.e., guard value changes).
+		 */
+		if (grd_val && grd_val != last_guard_val[spu_num]) {
+			last_guard_val[spu_num] = grd_val;
+			/* Drop the rest of the samples. */
+			break;
+		}
+			
+		/* For now, we'll drop samples that can't be mapped.
+		 * This can happen for generated stubs executed from
+		 * the SPU stack.  Do we need to record these somehow?
+		 */
+		if (unlikely(file_offset == 0xffffffff))
+			continue;
+		add_event_entry(file_offset | spu_num_shifted);
+	}
+	spin_unlock_irqrestore(&buffer_lock, buffer_lock_flags);
+	spin_unlock_irqrestore(&cache_lock, cache_lock_flags);
+}
+
+
+int spu_sync_stop(void)
+{
+	unsigned long flags = 0;
+	int ret = spu_switch_event_unregister(&spu_active);
+	if (ret) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: spu_switch_event_unregister returned %d\n",
+		       __FUNCTION__, __LINE__, ret);
+		goto out;
+	}
+
+	spin_lock_irqsave(&cache_lock, flags);
+	ret = release_cached_info(RELEASE_ALL);
+	spin_unlock_irqrestore(&cache_lock, flags);
+out:
+	pr_debug("spu_sync_stop -- done.\n");
+	return ret;
+}
+
+
Index: linux-2.6.20/arch/powerpc/oprofile/cell/vma_map.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20/arch/powerpc/oprofile/cell/vma_map.c	2007-02-20 13:49:52.776240536 -0600
@@ -0,0 +1,279 @@
+ /*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* The code in this source file is responsible for generating
+ * vma-to-fileOffset maps for both overlay and non-overlay SPU
+ * applications.
+ */
+
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/elf.h>
+#include "pr_util.h"
+
+
+void vma_map_free(struct vma_to_fileoffset_map *map)
+{
+	while (map) {
+		struct vma_to_fileoffset_map *next = map->next;
+		kfree(map);
+		map = next;
+	}
+}
+
+unsigned int
+vma_map_lookup(struct vma_to_fileoffset_map *map, unsigned int vma,
+	       const struct spu * aSpu, int * grd_val)
+{
+	u32 offset = 0xffffffff;
+	u32 ovly_grd;
+	for (; map; map = map->next) {
+		if (vma < map->vma || vma >= map->vma + map->size)
+			continue;
+
+		if (map->guard_ptr) {
+			ovly_grd = *(u32 *)(aSpu->local_store + map->guard_ptr);
+			if (ovly_grd != map->guard_val)
+				continue;
+			*grd_val = ovly_grd;
+		}
+		offset = vma - map->vma + map->offset;
+		break;
+	}
+
+	return offset;
+}
+
+static struct vma_to_fileoffset_map *
+vma_map_add(struct vma_to_fileoffset_map * map, unsigned int vma,
+	    unsigned int size, unsigned int offset, unsigned int guard_ptr,
+	    unsigned int guard_val)
+{
+	struct vma_to_fileoffset_map * new =
+		kzalloc(sizeof(struct vma_to_fileoffset_map), GFP_KERNEL);
+	if (!new) {
+		printk(KERN_ERR "SPU_PROF: %s, line %d: malloc failed\n",
+		       __FUNCTION__, __LINE__);
+		vma_map_free(map);
+		return NULL;
+	}
+
+	new->next = map;
+	new->vma = vma;
+	new->size = size;
+	new->offset = offset;
+	new->guard_ptr = guard_ptr;
+	new->guard_val = guard_val;
+
+	return new;
+}
+
+
+/* Parse SPE ELF header and generate a list of vma_maps.
+ * A pointer to the first vma_map in the generated list
+ * of vma_maps is returned.  */
+struct vma_to_fileoffset_map * create_vma_map(const struct spu * aSpu,
+					      unsigned long spu_elf_start)
+{
+	static const unsigned char expected[EI_PAD] = {
+		[EI_MAG0] = ELFMAG0,
+		[EI_MAG1] = ELFMAG1,
+		[EI_MAG2] = ELFMAG2,
+		[EI_MAG3] = ELFMAG3,
+		[EI_CLASS] = ELFCLASS32,
+		[EI_DATA] = ELFDATA2MSB,
+		[EI_VERSION] = EV_CURRENT,
+		[EI_OSABI] = ELFOSABI_NONE
+	};
+
+	int grd_val;
+	struct vma_to_fileoffset_map * map = NULL;
+	struct spu_overlay_info ovly;
+	unsigned int overlay_tbl_offset = -1;
+	unsigned long phdr_start, shdr_start;
+	Elf32_Ehdr ehdr;
+	Elf32_Phdr phdr;
+	Elf32_Shdr shdr, shdr_str;
+	Elf32_Sym sym;
+	int i, j;
+	char name[32];
+
+	unsigned int ovly_table_sym = 0;
+	unsigned int ovly_buf_table_sym = 0;
+	unsigned int ovly_table_end_sym = 0;
+	unsigned int ovly_buf_table_end_sym = 0;
+	unsigned long ovly_table;
+	unsigned int n_ovlys;
+
+	/* Get and validate ELF header.  */
+
+	if (copy_from_user(&ehdr, (void *) spu_elf_start, sizeof (ehdr)))
+		goto fail;
+
+	if (memcmp(ehdr.e_ident, expected, EI_PAD) != 0) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected e_ident parsing SPU ELF\n",
+		       __FUNCTION__, __LINE__);
+		goto fail;
+	}
+	if (ehdr.e_machine != EM_SPU) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected e_machine parsing SPU ELF\n",
+		       __FUNCTION__,  __LINE__);
+		goto fail;
+	}
+	if (ehdr.e_type != ET_EXEC) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected e_type parsing SPU ELF\n",
+		       __FUNCTION__, __LINE__);
+		goto fail;
+	}
+	phdr_start = spu_elf_start + ehdr.e_phoff;
+	shdr_start = spu_elf_start + ehdr.e_shoff;
+
+	/* Traverse program headers.  */
+	for (i = 0; i < ehdr.e_phnum; i++) {
+		if (copy_from_user(&phdr,
+				   (void *) (phdr_start + i * sizeof(phdr)),
+				   sizeof(phdr)))
+			goto fail;
+
+		if (phdr.p_type != PT_LOAD)
+			continue;
+		if (phdr.p_flags & (1 << 27))
+			continue;
+
+		map = vma_map_add(map, phdr.p_vaddr, phdr.p_memsz,
+				  phdr.p_offset, 0, 0);
+		if (!map)
+			goto fail;
+	}
+
+	pr_debug("SPU_PROF: Created non-overlay maps\n");
+	/* Traverse section table and search for overlay-related symbols.  */
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		if (copy_from_user(&shdr,
+				   (void *) (shdr_start + i * sizeof(shdr)),
+				   sizeof(shdr)))
+			goto fail;
+
+		if (shdr.sh_type != SHT_SYMTAB)
+			continue;
+		if (shdr.sh_entsize != sizeof (sym))
+			continue;
+
+		if (copy_from_user(&shdr_str,
+				   (void *) (shdr_start + shdr.sh_link *
+					     sizeof(shdr)),
+				   sizeof(shdr)))
+			goto fail;
+
+		if (shdr_str.sh_type != SHT_STRTAB)
+			goto fail;;
+
+		for (j = 0; j < shdr.sh_size / sizeof (sym); j++) {
+			if (copy_from_user(&sym, (void *) (spu_elf_start +
+						       shdr.sh_offset + j *
+							   sizeof (sym)),
+					   sizeof (sym)))
+				goto fail;
+
+			if (copy_from_user(name, (void *)
+					   (spu_elf_start + shdr_str.sh_offset +
+					    sym.st_name),
+					   20))
+				goto fail;
+
+			if (memcmp(name, "_ovly_table", 12) == 0)
+				ovly_table_sym = sym.st_value;
+			if (memcmp(name, "_ovly_buf_table", 16) == 0)
+				ovly_buf_table_sym = sym.st_value;
+			if (memcmp(name, "_ovly_table_end", 16) == 0)
+				ovly_table_end_sym = sym.st_value;
+			if (memcmp(name, "_ovly_buf_table_end", 20) == 0)
+				ovly_buf_table_end_sym = sym.st_value;
+		}
+	}
+
+	/* If we don't have overlays, we're done.  */
+	if (ovly_table_sym == 0 || ovly_buf_table_sym == 0
+	    || ovly_table_end_sym == 0 || ovly_buf_table_end_sym == 0) {
+		pr_debug("SPU_PROF: No overlay table found\n");
+		goto out;
+	}
+	else {
+		pr_debug("SPU_PROF: Overlay table found\n");
+	}
+
+        /* The _ovly_table symbol represents a table with one entry
+	 * per overlay section.  The _ovly_buf_table symbol represents
+	 * a table with one entry per overlay region.
+         * The struct spu_overlay_info gives the structure of the _ovly_table
+	 * entries.  The structure of _ovly_table_buf is simply one
+	 * u32 word per entry.
+	 */
+	overlay_tbl_offset = vma_map_lookup(map, ovly_table_sym, aSpu, &grd_val);
+	if (overlay_tbl_offset < 0) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Error finding SPU overlay table\n",
+		       __FUNCTION__, __LINE__);
+		goto fail;
+	}
+	ovly_table = spu_elf_start + overlay_tbl_offset;
+
+	n_ovlys = (ovly_table_end_sym -
+		   ovly_table_sym) / sizeof (ovly);
+
+	/* Traverse overlay table.  */
+	for (i = 0; i < n_ovlys; i++) {
+		if (copy_from_user(&ovly, (void *)
+				   (ovly_table + i * sizeof (ovly)),
+				   sizeof (ovly)))
+			goto fail;
+
+		/* The ovly.vma/size/offset arguments are analogous to the same
+		 * arguments used above for non-overlay maps.  The final two
+		 * args are referred to as the guard pointer and the guard
+		 * value.
+		 * The guard pointer is an entry in the _ovly_buf_table,
+		 * computed using ovly.buf as the index into the table.  Since
+		 * ovly.buf values begin at '1' to reference the first (or 0th)
+		 * entry in the _ovly_buf_table, the computation subtracts 1
+		 * from ovly.buf.
+		 * The guard value is stored in the _ovly_buf_table entry and
+		 * is an index (starting at 1) back to the _ovly_table entry
+		 * that is pointing at this _ovly_buf_table entry.  So, for
+		 * example, for an overlay scenario with one overlay segment
+		 * and two overlay sections:
+		 *      - Section 1 points to the first entry of the
+		 *        _ovly_buf_table, which contains a guard value
+		 *        of '1', referencing the first (index=0) entry of
+		 *        _ovly_table.
+		 *      - Section 2 points to the second entry of the
+		 *        _ovly_buf_table, which contains a guard value
+		 *        of '2', referencing the second (index=1) entry of
+		 *        _ovly_table.
+		 */
+		map = vma_map_add(map, ovly.vma, ovly.size, ovly.offset,
+				   ovly_buf_table_sym + (ovly.buf - 1) * 4, i + 1);
+		if (!map)
+			goto fail;
+	}
+	goto out;
+
+ fail:
+	map = NULL;
+ out:
+	return map;
+}
Index: linux-2.6.20/arch/powerpc/oprofile/common.c
===================================================================
--- linux-2.6.20.orig/arch/powerpc/oprofile/common.c	2007-02-20 13:49:02.029235152 -0600
+++ linux-2.6.20/arch/powerpc/oprofile/common.c	2007-02-20 16:42:26.626176048 -0600
@@ -29,6 +29,8 @@
 static struct op_counter_config ctr[OP_MAX_COUNTER];
 static struct op_system_config sys;
 
+static int op_powerpc_flag;
+
 static void op_handle_interrupt(struct pt_regs *regs)
 {
 	model->handle_interrupt(regs, ctr);
@@ -36,25 +38,41 @@
 
 static void op_powerpc_cpu_setup(void *dummy)
 {
-	model->cpu_setup(ctr);
+	int ret;
+
+	ret = model->cpu_setup(ctr);
+
+	if (ret != 0)
+		op_powerpc_flag = ret;
 }
 
 static int op_powerpc_setup(void)
 {
 	int err;
 
+	op_powerpc_flag = 0;
+
 	/* Grab the hardware */
 	err = reserve_pmc_hardware(op_handle_interrupt);
 	if (err)
 		return err;
 
 	/* Pre-compute the values to stuff in the hardware registers.  */
-	model->reg_setup(ctr, &sys, model->num_counters);
+	op_powerpc_flag = model->reg_setup(ctr, &sys, model->num_counters);
 
-	/* Configure the registers on all cpus.  */
+	if (op_powerpc_flag)
+		goto out;
+
+	/* Configure the registers on all cpus.  If an error occurs on one 
+	 * of the cpus, op_powerpc_flag will be set to the error */
 	on_each_cpu(op_powerpc_cpu_setup, NULL, 0, 1);
 
-	return 0;
+out:    if (op_powerpc_flag) {
+		/* error on setup release the performance counter hardware */
+		release_pmc_hardware();
+	}
+
+	return op_powerpc_flag;
 }
 
 static void op_powerpc_shutdown(void)
@@ -64,16 +82,29 @@
 
 static void op_powerpc_cpu_start(void *dummy)
 {
-	model->start(ctr);
+	/* If any of the cpus have return an error, set the
+	 * global flag to the error so it can be returned
+	 * to the generic OProfile caller.
+	 */
+	int ret;
+
+	ret = model->start(ctr);
+	if (ret != 0)
+		op_powerpc_flag = ret;
 }
 
 static int op_powerpc_start(void)
 {
+	op_powerpc_flag = 0;
+
 	if (model->global_start)
-		model->global_start(ctr);
-	if (model->start)
+		return model->global_start(ctr);
+	if (model->start) {
 		on_each_cpu(op_powerpc_cpu_start, NULL, 0, 1);
-	return 0;
+		return op_powerpc_flag;
+	}
+	return -EIO; /* No start function is defined for this
+			power architecture */
 }
 
 static inline void op_powerpc_cpu_stop(void *dummy)
@@ -150,6 +181,8 @@
 #ifdef CONFIG_PPC_CELL_NATIVE
 		case PPC_OPROFILE_CELL:
 			model = &op_model_cell;
+			ops->sync_start = model->sync_start;
+			ops->sync_stop = model->sync_stop;
 			break;
 #endif
 		case PPC_OPROFILE_RS64:
Index: linux-2.6.20/arch/powerpc/oprofile/Kconfig
===================================================================
--- linux-2.6.20.orig/arch/powerpc/oprofile/Kconfig	2007-02-20 13:49:02.028235304 -0600
+++ linux-2.6.20/arch/powerpc/oprofile/Kconfig	2007-02-20 13:49:52.779240080 -0600
@@ -7,7 +7,7 @@
 
 config OPROFILE
 	tristate "OProfile system profiling (EXPERIMENTAL)"
-	depends on PROFILING
+        depends on PROFILING
 	help
 	  OProfile is a profiling system capable of profiling the
 	  whole system, include the kernel, kernel modules, libraries,
@@ -15,3 +15,10 @@
 
 	  If unsure, say N.
 
+config OPROFILE_CELL
+	bool "OProfile for Cell Broadband Engine"
+	depends on (SPU_FS = y && OPROFILE = m) || (SPU_FS = y && OPROFILE = y) || (SPU_FS = m && OPROFILE = m)
+	default y
+	help
+	  Profiling of Cell BE SPUs requires special support enabled
+	  by this option.
Index: linux-2.6.20/arch/powerpc/oprofile/Makefile
===================================================================
--- linux-2.6.20.orig/arch/powerpc/oprofile/Makefile	2007-02-20 13:49:02.027235456 -0600
+++ linux-2.6.20/arch/powerpc/oprofile/Makefile	2007-02-20 13:49:52.781239776 -0600
@@ -11,7 +11,8 @@
 		timer_int.o )
 
 oprofile-y := $(DRIVER_OBJS) common.o backtrace.o
-oprofile-$(CONFIG_PPC_CELL_NATIVE) += op_model_cell.o
+oprofile-$(CONFIG_OPROFILE_CELL) += op_model_cell.o \
+					cell/spu_profiler.o cell/vma_map.o cell/spu_task_sync.o
 oprofile-$(CONFIG_PPC64) += op_model_rs64.o op_model_power4.o
 oprofile-$(CONFIG_FSL_BOOKE) += op_model_fsl_booke.o
 oprofile-$(CONFIG_6xx) += op_model_7450.o
Index: linux-2.6.20/arch/powerpc/oprofile/op_model_cell.c
===================================================================
--- linux-2.6.20.orig/arch/powerpc/oprofile/op_model_cell.c	2007-02-20 13:49:02.030235000 -0600
+++ linux-2.6.20/arch/powerpc/oprofile/op_model_cell.c	2007-02-20 16:49:48.719198544 -0600
@@ -37,11 +37,21 @@
 #include <asm/system.h>
 
 #include "../platforms/cell/interrupt.h"
+#include "cell/pr_util.h"
+
+/* spu_cycle_reset is the number of cycles between samples.
+ * This variable is used for SPU profiling and should ONLY be set
+ * at the beginning of cell_reg_setup; otherwise, it's read-only.
+ */
+static unsigned int spu_cycle_reset = 0;
+
+#define NUM_SPUS_PER_NODE    8
+#define SPU_CYCLES_EVENT_NUM 2  /*  event number for SPU_CYCLES */
 
 #define PPU_CYCLES_EVENT_NUM 1	/*  event number for CYCLES */
 #define PPU_CYCLES_GRP_NUM   1  /* special group number for identifying
-                                 * PPU_CYCLES event
-                                 */
+				 * PPU_CYCLES event
+				 */
 #define CBE_COUNT_ALL_CYCLES 0x42800000	/* PPU cycle event specifier */
 
 #define NUM_THREADS 2         /* number of physical threads in
@@ -50,6 +60,7 @@
 #define NUM_TRACE_BUS_WORDS 4
 #define NUM_INPUT_BUS_WORDS 2
 
+#define MAX_SPU_COUNT 0xFFFFFF  /* maximum 24 bit LFSR value */
 
 struct pmc_cntrl_data {
 	unsigned long vcntr;
@@ -64,7 +75,7 @@
 
 struct pm_signal {
 	u16 cpu;		/* Processor to modify */
-	u16 sub_unit;		/* hw subunit this applies to (if applicable) */
+	u16 sub_unit;		/* hw subunit this applies to (if applicable)*/
 	short int signal_group;	/* Signal Group to Enable/Disable */
 	u8 bus_word;		/* Enable/Disable on this Trace/Trigger/Event
 				 * Bus Word(s) (bitmask)
@@ -111,6 +122,20 @@
 
 static struct pmc_cntrl_data pmc_cntrl[NUM_THREADS][NR_PHYS_CTRS];
 
+/* The CELL profiling code makes rtas calls to setup the debug bus to
+ * route the performance signals.  Additionally, SPU profiling requires
+ * a second rtas call to setup the hardware to capture the SPU PCs.
+ * The EIO error value is returned if the token lookups or the rtas
+ * call fail.  The EIO error number is the best choice of the existing
+ * error numbers.  The probability of rtas related error is very low.  But
+ * by returning EIO and printing additional information to dmsg the user
+ * will know that OProfile did not start and dmesg will tell them why.
+ * OProfile does not support returning errors on Stop.  Not a huge issue
+ * since failure to reset the debug bus or stop the SPU PC collection is
+ * not a fatel issue.  Chances are if the Stop failed, Start doesn't work
+ * either.
+ */
+
 /* Interpetation of hdw_thread:
  * 0 - even virtual cpus 0, 2, 4,...
  * 1 - odd virtual cpus 1, 3, 5, ...
@@ -125,7 +150,8 @@
  * is available.
  */
 static struct pm_signal pm_signal[NR_PHYS_CTRS];
-static int pm_rtas_token;
+static int pm_rtas_token;    /* token for debug bus setup call */
+static int spu_rtas_token;   /* token for SPU cycle profiling */
 
 static u32 reset_value[NR_PHYS_CTRS];
 static int num_counters;
@@ -140,14 +166,15 @@
 /*
  * Firmware interface functions
  */
+
 static int
 rtas_ibm_cbe_perftools(int subfunc, int passthru,
 		       void *address, unsigned long length)
 {
 	u64 paddr = __pa(address);
 
-	return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru,
-			 paddr >> 32, paddr & 0xffffffff, length);
+	return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc,
+			 passthru, paddr >> 32, paddr & 0xffffffff, length);
 }
 
 static void pm_rtas_reset_signals(u32 node)
@@ -174,24 +201,28 @@
 				     &pm_signal_local,
 				     sizeof(struct pm_signal));
 
-	if (ret)
+	if (unlikely(ret))
+		/* Not a fatal error. For Oprofile stop, the oprofile
+		 * functions do not support returning an error for
+		 * failure to stop OProfile.
+		 */
 		printk(KERN_WARNING "%s: rtas returned: %d\n",
 		       __FUNCTION__, ret);
 }
 
-static void pm_rtas_activate_signals(u32 node, u32 count)
+static int pm_rtas_activate_signals(u32 node, u32 count)
 {
 	int ret;
 	int i, j;
 	struct pm_signal pm_signal_local[NR_PHYS_CTRS];
 
 	/* There is no debug setup required for the cycles event.
-	* Note that only events in the same group can be used.
-        * Otherwise, there will be conflicts in correctly routing
-        * the signals on the debug bus.  It is the responsiblity
-        * of the OProfile user tool to check the events are in
-        * the same group.
-        */
+	 * Note that only events in the same group can be used.
+	 * Otherwise, there will be conflicts in correctly routing
+	 * the signals on the debug bus.  It is the responsiblity
+	 * of the OProfile user tool to check the events are in
+	 * the same group.
+	 */
 
 	i = 0;
 	for (j = 0; j < count; j++) {
@@ -212,10 +243,14 @@
 					     pm_signal_local,
 					     i * sizeof(struct pm_signal));
 
-		if (ret)
+		if (unlikely(ret)) {
 			printk(KERN_WARNING "%s: rtas returned: %d\n",
 			       __FUNCTION__, ret);
+			return -EIO;
+		}
 	}
+
+	return 0;
 }
 
 /*
@@ -297,6 +332,7 @@
 					input_bus[j] = i;
 					pm_regs.group_control |=
 					    (i << (31 - i));
+
 					break;
 				}
 			}
@@ -386,9 +422,8 @@
 	u32 cpu;
 	unsigned long flags;
 
-	/* Make sure that the interrupt_hander and
-	 * the virt counter are not both playing with
-	 * the counters on the same node.
+	/* Make sure that the interrupt_hander and the virt counter are
+	 * not both playing with the counters on the same node.
 	 */
 
 	spin_lock_irqsave(&virt_cntr_lock, flags);
@@ -481,17 +516,41 @@
 }
 
 /* This function is called once for all cpus combined */
-static void
+static int
 cell_reg_setup(struct op_counter_config *ctr,
 	       struct op_system_config *sys, int num_ctrs)
 {
 	int i, j, cpu;
+	spu_cycle_reset = 0;
+
+	if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
+		spu_cycle_reset = ctr[0].count;
+
+		/* Each node will need to make the rtas call to start
+		 * and stop SPU profiling.  Get the token once and store it.
+		 */
+		spu_rtas_token = rtas_token("ibm,cbe-spu-perftools");
+
+		if (unlikely(spu_rtas_token == RTAS_UNKNOWN_SERVICE)) {
+			printk(KERN_ERR
+			       "%s: rtas token ibm,cbe-spu-perftools unknown\n",
+			       __FUNCTION__);
+			return -EIO;
+		}
+	}
 
 	pm_rtas_token = rtas_token("ibm,cbe-perftools");
-	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
-		printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE\n",
+
+	/* For all events excetp PPU CYCLEs, each node will need to make
+	 * the rtas cbe-perftools call to setup and reset the debug bus.
+	 * Make the token lookup call once and store it in the global
+	 * variable pm_rtas_token.
+	 */
+	if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
+		printk(KERN_ERR
+		       "%s: rtas token ibm,cbe-perftools unknown\n",
 		       __FUNCTION__);
-		goto out;
+		return -EIO;
 	}
 
 	num_counters = num_ctrs;
@@ -568,28 +627,27 @@
 		for (i = 0; i < num_counters; ++i) {
 			per_cpu(pmc_values, cpu)[i] = reset_value[i];
 		}
-out:
-	;
+
+	return 0;
 }
 
+
+
 /* This function is called once for each cpu */
-static void cell_cpu_setup(struct op_counter_config *cntr)
+static int cell_cpu_setup(struct op_counter_config *cntr)
 {
 	u32 cpu = smp_processor_id();
 	u32 num_enabled = 0;
 	int i;
 
+	if (spu_cycle_reset)
+		return 0;
+
 	/* There is one performance monitor per processor chip (i.e. node),
 	 * so we only need to perform this function once per node.
 	 */
 	if (cbe_get_hw_thread_id(cpu))
-		goto out;
-
-	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
-		printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE\n",
-		       __FUNCTION__);
-		goto out;
-	}
+		return 0;
 
 	/* Stop all counters */
 	cbe_disable_pm(cpu);
@@ -608,16 +666,283 @@
 		}
 	}
 
-	pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled);
+	/* the pm_rtas_activate_signals will return -EIO if the FW
+	 * call failed. 
+	 */
+	return (pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled));
+	
+}
+
+#define ENTRIES  303
+#define MAXLFSR  0xFFFFFF
+
+/* precomputed table of 24 bit LFSR values */
+int initial_lfsr[] =
+{8221349, 12579195, 5379618, 10097839, 7512963, 7519310, 3955098, 10753424,
+ 15507573, 7458917, 285419, 2641121, 9780088, 3915503, 6668768, 1548716,
+ 4885000, 8774424, 9650099, 2044357, 2304411, 9326253, 10332526, 4421547,
+ 3440748, 10179459, 13332843, 10375561, 1313462, 8375100, 5198480, 6071392,
+ 9341783, 1526887, 3985002, 1439429, 13923762, 7010104, 11969769, 4547026,
+ 2040072, 4025602, 3437678, 7939992, 11444177, 4496094, 9803157, 10745556,
+ 3671780, 4257846, 5662259, 13196905, 3237343, 12077182, 16222879, 7587769,
+ 14706824, 2184640, 12591135, 10420257, 7406075, 3648978, 11042541, 15906893,
+ 11914928, 4732944, 10695697, 12928164, 11980531, 4430912, 11939291, 2917017,
+ 6119256, 4172004, 9373765, 8410071, 14788383, 5047459, 5474428, 1737756,
+ 15967514, 13351758, 6691285, 8034329, 2856544, 14394753, 11310160, 12149558,
+ 7487528, 7542781, 15668898, 12525138, 12790975, 3707933, 9106617, 1965401,
+ 16219109, 12801644, 2443203, 4909502, 8762329, 3120803, 6360315, 9309720,
+ 15164599, 10844842, 4456529, 6667610, 14924259, 884312, 6234963, 3326042,
+ 15973422, 13919464, 5272099, 6414643, 3909029, 2764324, 5237926, 4774955,
+ 10445906, 4955302, 5203726, 10798229, 11443419, 2303395, 333836, 9646934,
+ 3464726, 4159182, 568492, 995747, 10318756, 13299332, 4836017, 8237783,
+ 3878992, 2581665, 11394667, 5672745, 14412947, 3159169, 9094251, 16467278,
+ 8671392, 15230076, 4843545, 7009238, 15504095, 1494895, 9627886, 14485051,
+ 8304291, 252817, 12421642, 16085736, 4774072, 2456177, 4160695, 15409741,
+ 4902868, 5793091, 13162925, 16039714, 782255, 11347835, 14884586, 366972,
+ 16308990, 11913488, 13390465, 2958444, 10340278, 1177858, 1319431, 10426302,
+ 2868597, 126119, 5784857, 5245324, 10903900, 16436004, 3389013, 1742384,
+ 14674502, 10279218, 8536112, 10364279, 6877778, 14051163, 1025130, 6072469,
+ 1988305, 8354440, 8216060, 16342977, 13112639, 3976679, 5913576, 8816697,
+ 6879995, 14043764, 3339515, 9364420, 15808858, 12261651, 2141560, 5636398,
+ 10345425, 10414756, 781725, 6155650, 4746914, 5078683, 7469001, 6799140,
+ 10156444, 9667150, 10116470, 4133858, 2121972, 1124204, 1003577, 1611214,
+ 14304602, 16221850, 13878465, 13577744, 3629235, 8772583, 10881308, 2410386,
+ 7300044, 5378855, 9301235, 12755149, 4977682, 8083074, 10327581, 6395087,
+ 9155434, 15501696, 7514362, 14520507, 15808945, 3244584, 4741962, 9658130,
+ 14336147, 8654727, 7969093, 15759799, 14029445, 5038459, 9894848, 8659300,
+ 13699287, 8834306, 10712885, 14753895, 10410465, 3373251, 309501, 9561475,
+ 5526688, 14647426, 14209836, 5339224, 207299, 14069911, 8722990, 2290950,
+ 3258216, 12505185, 6007317, 9218111, 14661019, 10537428, 11731949, 9027003,
+ 6641507, 9490160, 200241, 9720425, 16277895, 10816638, 1554761, 10431375,
+ 7467528, 6790302, 3429078, 14633753, 14428997, 11463204, 3576212, 2003426,
+ 6123687, 820520, 9992513, 15784513, 5778891, 6428165, 8388607};
+
+/*
+ * The hardware uses an LFSR counting sequence to determine when to capture
+ * the SPU PCs.  An LFSR sequence is like a puesdo random number sequence
+ * where each number occurs once in the sequence but the sequence is not in
+ * numerical order. The SPU PC capture is done when the LFSR sequence reaches
+ * the last value in the sequence.  Hence the user specified value N
+ * corresponds to the LFSR number that is N from the end of the sequence.
+ * 
+ * To avoid the time to compute the LFSR, a lookup table is used.  The 24 bit
+ * LFSR sequence is broken into four ranges.  The spacing of the precomputed
+ * values is adjusted in each range so the error between the user specifed
+ * number (N) of events between samples and the actual number of events based
+ * on the precomputed value will be les then about 6.2%.  Note, if the user
+ * specifies N < 2^16, the LFSR value that is 2^16 from the end will be used.
+ * This is to prevent the loss of samples because the trace buffer is full.
+ *
+ *        User specified N                  Step between          Index in
+ *                                      precomputed values      precomputed
+ *                                                                 table
+ * 0               to  2^16-1                  ----                  0
+ * 2^16            to  2^16+2^19-1             2^12                1 to 128
+ * 2^16+2^19       to  2^16+2^19+2^22-1        2^15              129 to 256
+ * 2^16+2^19+2^22  to  2^24-1                  2^18              257 to 302
+ *
+ *
+ * For example, the LFSR values in the second range are computed for 2^16,
+ * 2^16+2^12, ... , 2^19-2^16, 2^19 and stored in the table at indicies
+ * 1, 2,..., 127, 128.
+ * 
+ * The 24 bit LFSR value for the nth number in the sequence can be
+ * calculated using the following code:
+ *
+ * #define size 24
+ * int calculate_lfsr(int n)
+ * {
+ *   int i;
+ *   unsigned int newlfsr0;
+ *   unsigned int lfsr = 0xFFFFFF;
+ *   unsigned int howmany = n;
+ * 
+ *   for (i = 2; i < howmany + 2; i++) {
+ *     newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^
+ *     ((lfsr >> (size - 1 - 1)) & 1) ^
+ *     (((lfsr >> (size - 1 - 6)) & 1) ^
+ *     ((lfsr >> (size - 1 - 23)) & 1)));
+ *
+ *     lfsr >>= 1;
+ *     lfsr = lfsr | (newlfsr0 << (size - 1));
+ *   }
+ *   return lfsr;
+ * }
+ */
+
+#define V2_16  (0x1 <<16)
+#define V2_19  (0x1 <<19)
+#define V2_22  (0x1 <<22)
+
+static int calculate_lfsr(int n)
+{
+	/* The ranges and steps are in powers of 2 so the calculations
+	 * can be done using shifts rather then divide.
+	 */
+	int index;
+
+	if ((n >> 16) == 0) {
+		index = 0;
+
+	} else if (((n - V2_16) >> 19) == 0) {
+		index = ((n - V2_16) >> 12) + 1;
+
+	} else if (((n - V2_16 - V2_19) >> 22) == 0) {
+		index = ((n - V2_16 - V2_19) >> 15 ) + 1 + 128;
+
+	} else if (((n - V2_16 - V2_19 - V2_22) >> 24) == 0) {
+		index = ((n - V2_16 - V2_19 - V2_22) >> 18 ) 
+			+ 1 + 256;
+	} 
+
+	if ((index > ENTRIES) || (index < 0))   /* make sure index is 
+						 * valid
+						 */
+		index = ENTRIES-1;
+
+	return initial_lfsr[index];
+}
+
+static int pm_rtas_activate_spu_profiling(u32 node)
+{
+	int ret, i;
+	struct pm_signal pm_signal_local[NR_PHYS_CTRS];
+
+	/* Set up the rtas call to configure the debug bus to
+	 * route the SPU PCs.  Setup the pm_signal for each SPU */
+	for (i = 0; i < NUM_SPUS_PER_NODE; i++) {
+		pm_signal_local[i].cpu = node;
+		pm_signal_local[i].signal_group = 41;
+		pm_signal_local[i].bus_word = 1 << i / 2; /* spu i on
+							   * word (i/2)
+							   */
+		pm_signal_local[i].sub_unit = i;	/* spu i */
+		pm_signal_local[i].bit = 63;
+	}
+
+	ret = rtas_ibm_cbe_perftools(SUBFUNC_ACTIVATE,
+				     PASSTHRU_ENABLE, pm_signal_local,
+				     (NUM_SPUS_PER_NODE
+				      * sizeof(struct pm_signal)));
+
+	if (unlikely(ret)) {
+		printk(KERN_WARNING "%s: rtas returned: %d\n",
+		       __FUNCTION__, ret);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_CPU_FREQ
+static int
+oprof_cpufreq_notify(struct notifier_block *nb, unsigned long val, void *data)
+{
+	int ret = 0;
+	struct cpufreq_freqs * frq = data;
+	if ((val == CPUFREQ_PRECHANGE && frq->old < frq->new) ||
+	    (val == CPUFREQ_POSTCHANGE && frq->old > frq->new) ||
+	    (val == CPUFREQ_RESUMECHANGE || val == CPUFREQ_SUSPENDCHANGE))
+		set_profiling_frequency(frq->new, spu_cycle_reset);
+	return ret;
+}
+
+static struct notifier_block cpu_freq_notifier_block = {
+	.notifier_call	= oprof_cpufreq_notify
+};
+#endif
+
+static int cell_global_start_spu(struct op_counter_config *ctr)
+{
+	int subfunc, rtn_value;
+	unsigned int lfsr_value;
+	int cpu;
+	int ret = 0;
+	int rtas_error = 0;
+	unsigned int cpu_khzfreq = 0;
+
+	/* The SPU profiling uses time-based profiling based on
+	 * cpu frequency, so if configured with the CPU_FREQ
+	 * option, we should detect frequency changes and react
+	 * accordingly.
+	 */
+#ifdef CONFIG_CPU_FREQ
+	ret = cpufreq_register_notifier(&cpu_freq_notifier_block,
+					CPUFREQ_TRANSITION_NOTIFIER);
+	if (ret < 0) 
+		/* this is not a fatal error */
+		printk(KERN_ERR "CPU freq change registration failed: %d\n",
+		       ret);
+
+	else
+		cpu_khzfreq = cpufreq_quick_get(smp_processor_id());
+#endif
+
+	set_profiling_frequency(cpu_khzfreq, spu_cycle_reset);
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+		/* Setup SPU cycle-based profiling.
+		 * Set perf_mon_control bit 0 to a zero before
+		 * enabling spu collection hardware.
+		 */
+		cbe_write_pm(cpu, pm_control, 0);
+
+		if (spu_cycle_reset > MAX_SPU_COUNT)
+			/* use largest possible value
+			 */
+			lfsr_value = calculate_lfsr(MAX_SPU_COUNT-1);
+		else
+		    lfsr_value = calculate_lfsr(spu_cycle_reset);
+
+		if (lfsr_value == 0) {  /* must use a non zero value.  Zero
+					 * disables data collection.
+					 */
+				lfsr_value = calculate_lfsr(1);  
+		}
+
+		lfsr_value = lfsr_value << 8; /* shift lfsr to correct
+					       * register location
+					       */
+
+		/* debug bus setup */
+		ret = pm_rtas_activate_spu_profiling(cbe_cpu_to_node(cpu));
+
+		if (unlikely(ret)) {
+			rtas_error = ret;
+			goto out;
+		}
+
+
+		subfunc = 2;	// 2 - activate SPU tracing, 3 - deactivate
+
+		/* start profiling */
+		rtn_value = rtas_call(spu_rtas_token, 3, 1, NULL, subfunc,
+		  cbe_cpu_to_node(cpu), lfsr_value); 
+
+		if (unlikely(rtn_value != 0)) {
+			printk(KERN_ERR
+			       "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n",
+			       __FUNCTION__, rtn_value);
+			rtas_error = -EIO;
+			goto out;
+		}
+	}
+
+	start_spu_profiling(spu_cycle_reset);
+
+	oprofile_running = 1;
+	return 0;
+
 out:
-	;
+	return rtas_error;
 }
 
-static void cell_global_start(struct op_counter_config *ctr)
+static int cell_global_start_ppu(struct op_counter_config *ctr)
 {
-	u32 cpu;
+	u32 cpu, i;
 	u32 interrupt_mask = 0;
-	u32 i;
 
 	/* This routine gets called once for the system.
 	 * There is one performance monitor per node, so we
@@ -656,9 +981,67 @@
 	 * the above for-loop.
 	 */
 	start_virt_cntrs();
+
+	return 0;
 }
 
-static void cell_global_stop(void)
+
+static int cell_global_start(struct op_counter_config *ctr)
+{
+	if (spu_cycle_reset) {
+		return cell_global_start_spu(ctr);
+	} else {
+		return cell_global_start_ppu(ctr);
+	}
+}
+
+static void cell_global_stop_spu(void)
+/* Note the generic OProfile stop calls do not support returning
+ * an error on stop.  Hence, will not return an error if the FW
+ * calls fail on stop.  Failure to reset the debug bus is not an issue.
+ * Failure to disable the SPU profiling is not an issue.  The FW calls
+ * to enable the performance counters and debug bus will work even if
+ * the hardware was not cleanly reset.
+ */
+{
+	int subfunc, rtn_value;
+	unsigned int lfsr_value;
+	int cpu;
+
+	oprofile_running = 0;
+
+#ifdef CONFIG_CPU_FREQ
+	cpufreq_unregister_notifier(&cpu_freq_notifier_block,
+				    CPUFREQ_TRANSITION_NOTIFIER);
+#endif
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+
+		subfunc = 3;	/* 2 - activate SPU tracing,
+				 * 3 - deactivate
+				 */
+		lfsr_value = 0x8f100000;
+
+		rtn_value = rtas_call(spu_rtas_token, 3, 1, NULL,
+				      subfunc, cbe_cpu_to_node(cpu),
+				      lfsr_value);
+
+		if (unlikely(rtn_value != 0)) {
+			printk(KERN_ERR
+			       "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n",
+			       __FUNCTION__, rtn_value);
+		}
+
+		/* Deactivate the signals */
+		pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
+	}
+
+	stop_spu_profiling();
+}
+
+static void cell_global_stop_ppu(void)
 {
 	int cpu;
 
@@ -686,6 +1069,15 @@
 	}
 }
 
+static void cell_global_stop(void)
+{
+	if (spu_cycle_reset) {
+		cell_global_stop_spu();
+	} else {
+		cell_global_stop_ppu();
+	}
+}
+
 static void
 cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
 {
@@ -754,10 +1146,33 @@
 	spin_unlock_irqrestore(&virt_cntr_lock, flags);
 }
 
+/* This function is called from the generic OProfile
+ * driver.  When profiling PPUs, we need to do the
+ * generic sync start; otherwise, do spu_sync_start.
+ */
+static int cell_sync_start(void)
+{
+	if (spu_cycle_reset)
+		return spu_sync_start();
+	else
+		return DO_GENERIC_SYNC;
+}
+
+static int cell_sync_stop(void)
+{
+	if (spu_cycle_reset)
+		return spu_sync_stop();
+	else
+		return 1;
+}
+
+
 struct op_powerpc_model op_model_cell = {
 	.reg_setup = cell_reg_setup,
 	.cpu_setup = cell_cpu_setup,
 	.global_start = cell_global_start,
 	.global_stop = cell_global_stop,
+	.sync_start = cell_sync_start,
+	.sync_stop = cell_sync_stop,
 	.handle_interrupt = cell_handle_interrupt,
 };
Index: linux-2.6.20/arch/powerpc/platforms/cell/spufs/sched.c
===================================================================
--- linux-2.6.20.orig/arch/powerpc/platforms/cell/spufs/sched.c	2007-02-20 13:49:02.023236064 -0600
+++ linux-2.6.20/arch/powerpc/platforms/cell/spufs/sched.c	2007-02-20 13:49:52.793237952 -0600
@@ -194,6 +194,7 @@
 	ctx->spu = spu;
 	ctx->ops = &spu_hw_ops;
 	spu->pid = current->pid;
+	spu->tgid = current->tgid;
 	spu->mm = ctx->owner;
 	mm_needs_global_tlbie(spu->mm);
 	spu->ibox_callback = spufs_ibox_callback;
@@ -238,6 +239,7 @@
 	spu->dma_callback = NULL;
 	spu->mm = NULL;
 	spu->pid = 0;
+	spu->tgid = 0;
 	ctx->ops = &spu_backing_ops;
 	ctx->spu = NULL;
 	spu->flags = 0;
Index: linux-2.6.20/drivers/oprofile/buffer_sync.c
===================================================================
--- linux-2.6.20.orig/drivers/oprofile/buffer_sync.c	2007-02-20 13:49:02.031234848 -0600
+++ linux-2.6.20/drivers/oprofile/buffer_sync.c	2007-02-20 13:49:52.795237648 -0600
@@ -26,6 +26,7 @@
 #include <linux/profile.h>
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/oprofile.h>
  
 #include "oprofile_stats.h"
 #include "event_buffer.h"
Index: linux-2.6.20/drivers/oprofile/event_buffer.h
===================================================================
--- linux-2.6.20.orig/drivers/oprofile/event_buffer.h	2007-02-20 13:49:02.031234848 -0600
+++ linux-2.6.20/drivers/oprofile/event_buffer.h	2007-02-20 13:49:52.797237344 -0600
@@ -19,28 +19,10 @@
  
 /* wake up the process sleeping on the event file */
 void wake_up_buffer_waiter(void);
- 
-/* Each escaped entry is prefixed by ESCAPE_CODE
- * then one of the following codes, then the
- * relevant data.
- */
-#define ESCAPE_CODE			~0UL
-#define CTX_SWITCH_CODE 		1
-#define CPU_SWITCH_CODE 		2
-#define COOKIE_SWITCH_CODE 		3
-#define KERNEL_ENTER_SWITCH_CODE	4
-#define KERNEL_EXIT_SWITCH_CODE		5
-#define MODULE_LOADED_CODE		6
-#define CTX_TGID_CODE			7
-#define TRACE_BEGIN_CODE		8
-#define TRACE_END_CODE			9
- 
+
 #define INVALID_COOKIE ~0UL
 #define NO_COOKIE 0UL
 
-/* add data to the event buffer */
-void add_event_entry(unsigned long data);
- 
 extern struct file_operations event_buffer_fops;
  
 /* mutex between sync_cpu_buffers() and the
Index: linux-2.6.20/drivers/oprofile/oprof.c
===================================================================
--- linux-2.6.20.orig/drivers/oprofile/oprof.c	2007-02-20 13:49:02.032234696 -0600
+++ linux-2.6.20/drivers/oprofile/oprof.c	2007-02-20 13:49:52.798237192 -0600
@@ -53,9 +53,23 @@
 	 * us missing task deaths and eventually oopsing
 	 * when trying to process the event buffer.
 	 */
+	if (oprofile_ops.sync_start) {
+		int sync_ret = oprofile_ops.sync_start();
+		switch (sync_ret) {
+			case 0: goto post_sync;
+				break;
+			case 1: goto do_generic;
+				break;
+			case -1: goto out3;
+				break;
+			default: goto out3;
+		}
+	}
+do_generic:
 	if ((err = sync_start()))
 		goto out3;
 
+post_sync:
 	is_setup = 1;
 	mutex_unlock(&start_mutex);
 	return 0;
@@ -118,7 +132,19 @@
 void oprofile_shutdown(void)
 {
 	mutex_lock(&start_mutex);
+        if (oprofile_ops.sync_stop) {
+                int sync_ret = oprofile_ops.sync_stop();
+                switch (sync_ret) {
+                        case 0: goto post_sync;
+                                break;
+                        case 1: goto do_generic;
+                                break;
+			default: goto post_sync;
+                }
+        }
+do_generic:
 	sync_stop();
+post_sync:
 	if (oprofile_ops.shutdown)
 		oprofile_ops.shutdown();
 	is_setup = 0;
Index: linux-2.6.20/include/asm-powerpc/oprofile_impl.h
===================================================================
--- linux-2.6.20.orig/include/asm-powerpc/oprofile_impl.h	2007-02-20 13:49:02.036234088 -0600
+++ linux-2.6.20/include/asm-powerpc/oprofile_impl.h	2007-02-20 13:49:52.800236888 -0600
@@ -39,14 +39,16 @@
 
 /* Per-arch configuration */
 struct op_powerpc_model {
-	void (*reg_setup) (struct op_counter_config *,
+	int (*reg_setup) (struct op_counter_config *,
 			   struct op_system_config *,
 			   int num_counters);
-	void (*cpu_setup) (struct op_counter_config *);
-	void (*start) (struct op_counter_config *);
-        void (*global_start) (struct op_counter_config *);
+	int  (*cpu_setup) (struct op_counter_config *);
+	int  (*start) (struct op_counter_config *);
+        int  (*global_start) (struct op_counter_config *);
 	void (*stop) (void);
 	void (*global_stop) (void);
+	int (*sync_start)(void);
+	int (*sync_stop)(void);
 	void (*handle_interrupt) (struct pt_regs *,
 				  struct op_counter_config *);
 	int num_counters;
Index: linux-2.6.20/include/asm-powerpc/spu.h
===================================================================
--- linux-2.6.20.orig/include/asm-powerpc/spu.h	2007-02-20 13:49:02.036234088 -0600
+++ linux-2.6.20/include/asm-powerpc/spu.h	2007-02-20 13:49:52.803236432 -0600
@@ -129,6 +129,7 @@
 	struct spu_runqueue *rq;
 	unsigned long long timestamp;
 	pid_t pid;
+	pid_t tgid;
 	int class_0_pending;
 	spinlock_t register_lock;
 
@@ -167,6 +168,20 @@
 int spu_irq_class_1_bottom(struct spu *spu);
 void spu_irq_setaffinity(struct spu *spu, int cpu);
 
+/* This interface allows a profiler (e.g., OProfile) to store a ref
+ * to spu context information that it creates.  This caching technique
+ * avoids the need to recreate this information after a save/restore operation.
+ *
+ * Assumes the caller has already incremented the ref count to
+ * profile_info; then spu_context_destroy must call kref_put
+ * on prof_info_kref.
+ */
+void spu_set_profile_private_kref(struct spu_context * ctx, 
+				  struct kref * prof_info_kref,
+				  void (* prof_info_release) (struct kref * kref));
+
+void * spu_get_profile_private_kref(struct spu_context * ctx);
+
 /* system callbacks from the SPU */
 struct spu_syscall_block {
 	u64 nr_ret;
Index: linux-2.6.20/include/linux/oprofile.h
===================================================================
--- linux-2.6.20.orig/include/linux/oprofile.h	2007-02-20 13:49:02.035234240 -0600
+++ linux-2.6.20/include/linux/oprofile.h	2007-02-20 19:27:06.824221368 -0600
@@ -17,6 +17,29 @@
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
  
+/* Each escaped entry is prefixed by ESCAPE_CODE
+ * then one of the following codes, then the
+ * relevant data.
+ * These #defines live in this file so that arch-specific
+ * buffer sync'ing code can access them.
+ */
+#define ESCAPE_CODE                     ~0UL
+#define CTX_SWITCH_CODE                 1
+#define CPU_SWITCH_CODE                 2
+#define COOKIE_SWITCH_CODE              3
+#define KERNEL_ENTER_SWITCH_CODE        4
+#define KERNEL_EXIT_SWITCH_CODE         5
+#define MODULE_LOADED_CODE              6
+#define CTX_TGID_CODE                   7
+#define TRACE_BEGIN_CODE                8
+#define TRACE_END_CODE                  9
+#define XEN_ENTER_SWITCH_CODE          10
+#define SPU_PROFILING_CODE             11
+#define SPU_CTX_SWITCH_CODE            12
+#define SPU_OFFSET_CODE                13
+#define SPU_COOKIE_CODE                14
+#define SPU_SHLIB_COOKIE_CODE          15
+
 struct super_block;
 struct dentry;
 struct file_operations;
@@ -35,6 +58,14 @@
 	int (*start)(void);
 	/* Stop delivering interrupts. */
 	void (*stop)(void);
+	/* Arch-specific buffer sync functions.
+	 * Return value = 0:  Success
+	 * Return value = -1: Failure
+	 * Return value = 1:  Run generic sync function
+	 */
+	int (*sync_start)(void);
+	int (*sync_stop)(void);
+
 	/* Initiate a stack backtrace. Optional. */
 	void (*backtrace)(struct pt_regs * const regs, unsigned int depth);
 	/* CPU identification string. */
@@ -56,6 +87,13 @@
 void oprofile_arch_exit(void);
 
 /**
+ * Add data to the event buffer.
+ * The data passed is free-form, but typically consists of
+ * file offsets, dcookies, context information, and ESCAPE codes.
+ */
+void add_event_entry(unsigned long data);
+
+/**
  * Add a sample. This may be called from any context. Pass
  * smp_processor_id() as cpu.
  */
Index: linux-2.6.20/kernel/hrtimer.c
===================================================================
--- linux-2.6.20.orig/kernel/hrtimer.c	2007-02-20 13:49:02.033234544 -0600
+++ linux-2.6.20/kernel/hrtimer.c	2007-02-20 13:49:52.807235824 -0600
@@ -335,6 +335,7 @@
 
 	return orun;
 }
+EXPORT_SYMBOL_GPL(hrtimer_forward);
 
 /*
  * enqueue_hrtimer - internal function to (re)start a timer
Index: linux-2.6.20/arch/powerpc/kernel/time.c
===================================================================
--- linux-2.6.20.orig/arch/powerpc/kernel/time.c	2007-02-20 13:49:02.025235760 -0600
+++ linux-2.6.20/arch/powerpc/kernel/time.c	2007-02-20 13:49:52.811235216 -0600
@@ -122,6 +122,7 @@
 static long timezone_offset;
 
 unsigned long ppc_proc_freq;
+EXPORT_SYMBOL(ppc_proc_freq);
 unsigned long ppc_tb_freq;
 
 static u64 tb_last_jiffy __cacheline_aligned_in_smp;
Index: linux-2.6.20/arch/powerpc/platforms/cell/spufs/spufs.h
===================================================================
--- linux-2.6.20.orig/arch/powerpc/platforms/cell/spufs/spufs.h	2007-02-20 13:49:02.023236064 -0600
+++ linux-2.6.20/arch/powerpc/platforms/cell/spufs/spufs.h	2007-02-20 13:49:52.812235064 -0600
@@ -80,6 +80,8 @@
 
 	struct list_head gang_list;
 	struct spu_gang *gang;
+	struct kref * prof_priv_kref;
+	void (* prof_priv_release) (struct kref *kref);
 
 	/* scheduler fields */
  	struct list_head rq;
Index: linux-2.6.20/arch/powerpc/platforms/cell/spufs/context.c
===================================================================
--- linux-2.6.20.orig/arch/powerpc/platforms/cell/spufs/context.c	2007-02-20 13:49:02.024235912 -0600
+++ linux-2.6.20/arch/powerpc/platforms/cell/spufs/context.c	2007-02-20 13:49:52.814234760 -0600
@@ -22,6 +22,7 @@
 
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 #include <linux/slab.h>
 #include <asm/spu.h>
 #include <asm/spu_csa.h>
@@ -76,6 +77,8 @@
 	spu_fini_csa(&ctx->csa);
 	if (ctx->gang)
 		spu_gang_remove_ctx(ctx->gang, ctx);
+	if (ctx->prof_priv_kref)
+		kref_put(ctx->prof_priv_kref, ctx->prof_priv_release);
 	kfree(ctx);
 }
 
@@ -202,3 +205,20 @@
 	if (ctx->state != SPU_STATE_SAVED)
 		spu_deactivate(ctx);
 }
+
+void spu_set_profile_private_kref(struct spu_context * ctx, 
+				  struct kref * prof_info_kref,
+				  void (* prof_info_release) (struct kref * kref))
+{
+	ctx->prof_priv_kref = prof_info_kref;
+	ctx->prof_priv_release = prof_info_release;
+}
+EXPORT_SYMBOL_GPL(spu_set_profile_private_kref);
+
+void * spu_get_profile_private_kref(struct spu_context * ctx)
+{
+	return ctx->prof_priv_kref;
+}
+EXPORT_SYMBOL_GPL(spu_get_profile_private_kref);
+
+
Index: linux-2.6.20/include/linux/dcookies.h
===================================================================
--- linux-2.6.20.orig/include/linux/dcookies.h	2007-02-20 13:49:02.034234392 -0600
+++ linux-2.6.20/include/linux/dcookies.h	2007-02-20 13:49:52.815234608 -0600
@@ -12,6 +12,7 @@
 
 #ifdef CONFIG_PROFILING
  
+#include <linux/dcache.h>
 #include <linux/types.h>
  
 struct dcookie_user;
Index: linux-2.6.20/include/linux/elf-em.h
===================================================================
--- linux-2.6.20.orig/include/linux/elf-em.h	2007-02-20 13:49:02.034234392 -0600
+++ linux-2.6.20/include/linux/elf-em.h	2007-02-20 13:49:52.816234456 -0600
@@ -21,6 +21,7 @@
 #define EM_SPARC32PLUS	18	/* Sun's "v8plus" */
 #define EM_PPC		20	/* PowerPC */
 #define EM_PPC64	21       /* PowerPC64 */
+#define EM_SPU		23	/* Cell BE SPU */
 #define EM_SH		42	/* SuperH */
 #define EM_SPARCV9	43	/* SPARC v9 64-bit */
 #define EM_IA_64	50	/* HP/Intel IA-64 */
Index: linux-2.6.20/arch/powerpc/oprofile/op_model_rs64.c
===================================================================
--- linux-2.6.20.orig/arch/powerpc/oprofile/op_model_rs64.c	2007-02-20 13:49:02.027235456 -0600
+++ linux-2.6.20/arch/powerpc/oprofile/op_model_rs64.c	2007-02-20 13:49:52.818234152 -0600
@@ -88,9 +88,9 @@
 
 static int num_counters;
 
-static void rs64_reg_setup(struct op_counter_config *ctr,
-			   struct op_system_config *sys,
-			   int num_ctrs)
+static int rs64_reg_setup(struct op_counter_config *ctr,
+			  struct op_system_config *sys,
+			  int num_ctrs)
 {
 	int i;
 
@@ -100,9 +100,10 @@
 		reset_value[i] = 0x80000000UL - ctr[i].count;
 
 	/* XXX setup user and kernel profiling */
+	return 0;
 }
 
-static void rs64_cpu_setup(struct op_counter_config *ctr)
+static int rs64_cpu_setup(struct op_counter_config *ctr)
 {
 	unsigned int mmcr0;
 
@@ -125,9 +126,11 @@
 	    mfspr(SPRN_MMCR0));
 	dbg("setup on cpu %d, mmcr1 %lx\n", smp_processor_id(),
 	    mfspr(SPRN_MMCR1));
+
+	return 0;
 }
 
-static void rs64_start(struct op_counter_config *ctr)
+static int rs64_start(struct op_counter_config *ctr)
 {
 	int i;
 	unsigned int mmcr0;
@@ -155,6 +158,7 @@
 	mtspr(SPRN_MMCR0, mmcr0);
 
 	dbg("start on cpu %d, mmcr0 %x\n", smp_processor_id(), mmcr0);
+	return 0;
 }
 
 static void rs64_stop(void)
Index: linux-2.6.20/arch/powerpc/oprofile/op_model_power4.c
===================================================================
--- linux-2.6.20.orig/arch/powerpc/oprofile/op_model_power4.c	2007-02-20 13:49:02.029235152 -0600
+++ linux-2.6.20/arch/powerpc/oprofile/op_model_power4.c	2007-02-20 13:49:52.820233848 -0600
@@ -30,7 +30,7 @@
 static u64 mmcr1_val;
 static u64 mmcra_val;
 
-static void power4_reg_setup(struct op_counter_config *ctr,
+static int power4_reg_setup(struct op_counter_config *ctr,
 			     struct op_system_config *sys,
 			     int num_ctrs)
 {
@@ -58,6 +58,8 @@
 		mmcr0_val &= ~MMCR0_PROBLEM_DISABLE;
 	else
 		mmcr0_val |= MMCR0_PROBLEM_DISABLE;
+	
+	return 0;
 }
 
 extern void ppc64_enable_pmcs(void);
@@ -82,7 +84,7 @@
 	return 0;
 }
 
-static void power4_cpu_setup(struct op_counter_config *ctr)
+static int power4_cpu_setup(struct op_counter_config *ctr)
 {
 	unsigned int mmcr0 = mmcr0_val;
 	unsigned long mmcra = mmcra_val;
@@ -109,9 +111,11 @@
 	    mfspr(SPRN_MMCR1));
 	dbg("setup on cpu %d, mmcra %lx\n", smp_processor_id(),
 	    mfspr(SPRN_MMCRA));
+
+	return 0;
 }
 
-static void power4_start(struct op_counter_config *ctr)
+static int power4_start(struct op_counter_config *ctr)
 {
 	int i;
 	unsigned int mmcr0;
@@ -146,6 +150,7 @@
 	oprofile_running = 1;
 
 	dbg("start on cpu %d, mmcr0 %x\n", smp_processor_id(), mmcr0);
+	return 0;
 }
 
 static void power4_stop(void)



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-11 22:46               ` Milton Miller
@ 2007-02-12 16:38                 ` Carl Love
  -1 siblings, 0 replies; 66+ messages in thread
From: Carl Love @ 2007-02-12 16:38 UTC (permalink / raw)
  To: Milton Miller
  Cc: cbe-oss-dev, Arnd Bergmann, LKML, linuxppc-dev, oprofile-list

On Sun, 2007-02-11 at 16:46 -0600, Milton Miller wrote:

[cut]

> 
> As far as I understand, you are providing access to a completely new
> hardware that is related to the PMU hardware by the fact that it
> collects a program counter.   It doesn't use the PMU counters nor the
> PMU event selection.
> 
> In fact, why can the existing op_model_cell profiling not run while
> the SPU profiling runs?   Is there a shared debug bus inside the
> chip?   Or just the data stream with your buffer_sync code?
> 

There are two reasons you cannot do SPU profiling and profiling on non
SPU events at the sametime.  1) the SPU PC values are routed on the
debug bus.  You cannot also route the signals for other non SPU events
on the debug bus since they will conflict.  Specifically, the signals
would get logically OR'd on the bus.  The exception is PPU cycles which
does not use the debug bus.  2) the hardware that captures the SPU
program counters has some shared components with the HW performance
counters.  To use the SPU program counter hardware, the performance
counter hardware must be disabled.

In summary, we cannot do SPU cycle profiling and non SPU event profiling
at the same time due to limitations in the hardware.


[cut]


> milton
> 


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-12 16:38                 ` Carl Love
  0 siblings, 0 replies; 66+ messages in thread
From: Carl Love @ 2007-02-12 16:38 UTC (permalink / raw)
  To: Milton Miller
  Cc: linuxppc-dev, oprofile-list, cbe-oss-dev, Arnd Bergmann, LKML

On Sun, 2007-02-11 at 16:46 -0600, Milton Miller wrote:

[cut]

> 
> As far as I understand, you are providing access to a completely new
> hardware that is related to the PMU hardware by the fact that it
> collects a program counter.   It doesn't use the PMU counters nor the
> PMU event selection.
> 
> In fact, why can the existing op_model_cell profiling not run while
> the SPU profiling runs?   Is there a shared debug bus inside the
> chip?   Or just the data stream with your buffer_sync code?
> 

There are two reasons you cannot do SPU profiling and profiling on non
SPU events at the sametime.  1) the SPU PC values are routed on the
debug bus.  You cannot also route the signals for other non SPU events
on the debug bus since they will conflict.  Specifically, the signals
would get logically OR'd on the bus.  The exception is PPU cycles which
does not use the debug bus.  2) the hardware that captures the SPU
program counters has some shared components with the HW performance
counters.  To use the SPU program counter hardware, the performance
counter hardware must be disabled.

In summary, we cannot do SPU cycle profiling and non SPU event profiling
at the same time due to limitations in the hardware.


[cut]


> milton
> 

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-09 16:17             ` Carl Love
@ 2007-02-11 22:46               ` Milton Miller
  -1 siblings, 0 replies; 66+ messages in thread
From: Milton Miller @ 2007-02-11 22:46 UTC (permalink / raw)
  To: Carl Love; +Cc: cbe-oss-dev, Arnd Bergmann, LKML, linuxppc-dev, oprofile-list


On Feb 9, 2007, at 10:17 AM, Carl Love wrote:

> On Thu, 2007-02-08 at 20:46 -0600, Milton Miller wrote:
>>  On Feb 8, 2007, at 4:51 PM, Carl Love wrote:
>>
>>> On Thu, 2007-02-08 at 18:21 +0100, Arnd Bergmann wrote:
>>>> On Thursday 08 February 2007 15:18, Milton Miller wrote:
>>>>
>>>>> 1) sample rate setup
>>>>>
>>>>>     In the current patch, the user specifies a sample rate as a 
>>>>> time
>>>>> interval.
>>>>>     The kernel is (a) calling cpufreq to get the current cpu
>>>>> frequency,
>>>>> (b)
>>>>>     converting the rate to a cycle count, (c) converting this to a
>>>>> 24 bit
>>>>>     LFSR count, an iterative algorithm (in this patch, starting 
>>>>> from
>>>>>     one of 256 values so a max of 2^16 or 64k iterations), (d)
>>>>> calculating
>>>>>     an trace unload interval.   In addition, a cpufreq notifier is
>>>>> registered
>>>>>     to recalculate on frequency changes.
>>>
>>> No.  The user issues the command opcontrol --event:N  where N is the
>>> number of events (cycles, l2 cache misses, instructions retired etc)
>>> that are to elapse between collecting the samples.
>>
>> So you are saying that b and c are primary, and a is used to calculate
>> a safe value for d.   All of the above work is dont, just from a
>> different starting point?
>
> There are two things 1) setup the LFSR to control how often the 
> Hardware
> puts samples into the trace buffer.  2) setup the kernel timer to read
> the trace buffer (your d, d is a function of the cpu freq) and process
> the samples.
>
> (c is nonsense)


Well, its cacluate the rate from the count vs count from the rate.

>
> The kernel timer was set with the goal the the hardware trace buffer
> would not get more then half full to ensure we would not lose samples
> even for the maximum rate that the hardware would be adding samples to
> the trace buffer (user specified N=100,000).

That should be well commented.

>>
>>> The OProfile passes
>>> the value N to the kernel via the variable ctr[i].count.  Where i is
>>> the
>>> performance counter entry for that event.
>>
>> Ok I haven't looked a the api closely.


Actually, the oprofile userspace fills out files in a file system to 
tell
the kernel what it needs to know.   The powerpc code defines the 
reources
needed to use the PMU hardware, which is (1) common mux selects, for
processors that need them, and (2) a set of directories, one for each 
of the
pmu counters, each of which contain the controls for that counter (such
as enable kernel space, enable user space, event timeout to interrupt or
sample collection, etc.   The ctr[i].count is one of these files.

>>
>>> Specifically with SPU
>>> profiling, we do not use performance counters because the CELL HW 
>>> does
>>> not allow the normal the PPU to read the SPU PC when a performance
>>> counter interrupt occurs.  We are using some additional hw support in
>>> the chip that allows us to periodically capture the SPU PC.  There is
>>> an
>>> LFSR hardware counter that can be started at an arbitrary LFSR value.
>>> When the last LFSR value in the sequence is reached, a sample is 
>>> taken
>>> and stored in the trace buffer.  Hence, the value of N specified by 
>>> the
>>> user must get converted to the LFSR value that is N from the end of 
>>> the
>>> sequence.
>>
>> Ok so its arbitray load count to max vs count and compare.   A 
>> critical
>> detail when computing the value to load, but the net result is the
>> same; the value for the count it hard to determine.
>
> The above statement makes no sense to me.

I think I talked past you.  Your description of hadrware vs mine was
different in that the counter always ends at a specified point and is
loaded with the variable count, where mine had it comparing to the
count as it incremented.

However, I now see that you were referring to the fact that what the
user specifes, the count, has to be converted to a LFSR value.  My point
is this can be done in the user-space oprofile code.  It already has
to look up magic numbers for setting up event selection muxes for other
hardware, adding a lfsr calculation is not beyond resoon.  Nor is having
it provide two values in two files.

>
> Determining the initial LFSR value that is N values from the last value
> in the sequence is not easy to do.

Well, its easy, its just order(N).

>>> The same clock that the processor is running at is used to
>>> control the LFSR count.  Hence the LFSR counter increments once per 
>>> CPU
>>> clock cycle regardless of the CPU frequency or changes in the
>>> frequency.
>>> There is no calculation for the LFSR value that is a function of the
>>> processor frequency.  There is no need to adjust the LFSR when the
>>> processor frequency changes.
>>
>> Oh, so the lfsr doesn't have to be recomputed, only the time
>> between unloads.
>
> The LFSR value is computed ONCE when you start OProfile.  The value is
> setup in the hardware once when OProfile starts.  The hardware will
> always restart with the value given to it after it reaches the last
> value in the sequence.  What you call the "time between unloads" is the
> time at which you schedule the kernel routine to empty the trace 
> buffer.
> It is calculated once.  It would only need to be recomputed if the cpu
> frequency changed.



>>>>>
>>>>>     The obvious problem is step (c), running a loop potentially 64
>>>>> thousand
>>>>>     times in kernel space will have a noticeable impact on other
>>>>> threads.
>>>>>
>>>>>     I propose instead that user space perform the above 4 steps, 
>>>>> and
>>>>> provide
>>>>>     the kernel with two inputs: (1) the value to load in the LFSR
>>>>> and (2)
>>>>>     the periodic frequency / time interval at which to empty the
>>>>> hardware
>>>>>     trace buffer, perform sample analysis, and send the data to the
>>>>> oprofile
>>>>>     subsystem.
>>>>>
>>>>>     There should be no security issues with this approach.   If the
>>>>> LFSR
>>>>> value
>>>>>     is calculated incorrectly, either it will be too short, causing
>>>>> the
>>>>> trace
>>>>>     array to overfill and data to be dropped, or it will be too
>>>>> long, and
>>>>>     there will be fewer samples.   Likewise, the kernel periodic 
>>>>> poll
>>>>> can be
>>>>>     too long, again causing overflow, or too frequent, causing only
>>>>> timer
>>>>>     execution overhead.
>>>>>
>>>>>     Various data is collected by the kernel while processing the
>>>>> periodic timer,
>>>>>     this approach would also allow the profiling tools to control 
>>>>> the
>>>>>     frequency of this collection.   More frequent collection 
>>>>> results
>>>>> in
>>>>> more
>>>>>     accurate sample data, with the linear cost of poll execution
>>>>> overhead.
>>>>>
>>>>>     Frequency changes can be handled either by the profile code
>>>>> setting
>>>>>     collection at a higher than necessary rate, or by interacting
>>>>> with
>>>>> the
>>>>>     governor to limit the speeds.
>>>>>
>>>>>     Optionally, the kernel can add a record indicating that some
>>>>> data was
>>>>>     likely dropped if it is able to read all 256 entries without
>>>>> underflowing
>>>>>     the array.  This can be used as hint to user space that the
>>>>> kernel
>>>>> time
>>>>>     was too long for the collection rate.
>>>>
>>>> Moving the sample rate computation to user space sounds like the 
>>>> right
>>>> idea, but why not have a more drastic version of it:
>>>
>>> No, I do not agree.  The user/kernel API pass N where N is the number
>>> of
>>> events between samples.  We are not at liberty to just change the 
>>> API.
>>> We we did do this, we fully expect that John Levon will push back
>>> saying
>>> why make an architecture specific API change when it isn't necessary.
>>
>> [So you have not asked.]
>>
>> <Kludge> If you want to overlaod the existing array, one
>> event could be the lfsr sample rate, and another event be
>> the collection time.  That would stay within the framework.
>> But that is a kludge. </Kludge>
>>
>> [Me goes and reads kernel profile driver and skims powerpc code].
>>
>> You are confusing the user interface (what the user specifies on the
>> command line) with the kernel API.
>>
>> It is somewhat hidden by the PowerPC specific common code in
>> arch/powerpc/oprofile/common.c.  That is where the counter
>> array is exposd.
>>
>> The user to kernel api is not fill out an array of counter
>> event names and sampling intervals.
>>
>> The user to kernel interface is a file system that contains a
>> heirachy of files.  Each file consists of the hex ascii
>> represntation of a unsigned long.   The filesystem interfaces
>> to the kernel by provding an API to create directorys and files,
>> specifing the name of the directory or file.  Theere are helper
>> routines to connect a file to a ulong and read access to an atomic_t.
>> The common driver (in drivers/oprofile) creates the file system
>> and some common files that implement the control interface, which
>> interfaces to the architecture specific driver through the ops
>> array.
>>
>> The powerpc common driver creates a heiarchy exposing the
>> values to be placed in the performance monitor registers,
>> and directory of counters with the event selection.
>>
>> Since this architeture code does not seem to match the
>> capabilitys of the hardware, it would seem that this is
>> the area to change.   This driver does not seem to use
>> the actual PMU interrput or sprs.   Lets make it its
>> own directory with its own controls.
>>
>> I don't see how exposing the sample collection to
>> rate and the computation of the LFSR create a userspace
>> api change;  I think its totally within the framework.
>
> I was being a bit simplistic in my explination.  I am well aware of the
> file system.  The point is the USER specifies the rate (every N events)
> that they want to have sampling done.  We are using the existing
> mechanism to pass the value of N to the kernel.  So from that
> standpoint, we are trying to be consistent in how it is done with the
> PPU. I feel that this is best to try to handle the N value in the same
> way rather then having a completely different way.


>
> If we were to put the LFSR into the user space, you would pass the N
> into the kernel for the PPU profiling case.  To make the API clean, you
> would have to create a new file entry to pass the LFSR value.  For SPU
> profiling you would not pass N instead you would pass LFSR.  I think it
> is a bad idea to have these two different things for PPU versus SPU
> profiling.  I really feel it is best to consistent to use pass N for 
> PPU
> and SPU.  Then deal with converting N to the LFSR for the special case
> of SPU profiling.
>

As far as I understand, you are providing access to a completely new
hardware that is related to the PMU hardware by the fact that it
collects a program counter.   It doesn't use the PMU counters nor the
PMU event selection.

In fact, why can the existing op_model_cell profiling not run while
the SPU profiling runs?   Is there a shared debug bus inside the
chip?   Or just the data stream with your buffer_sync code?

> Unlike POWER 4/5 support where we absolutely had to add entries to the
> API to pass the three values for the control registers.  We already 
> have
> an established mechanism for passing N from user to kernel.  Just use
> it.  We are very sure that John Levon, the OProfile user maintainer,
> will say the same thing and refuse to accept adding to the API to pass
> the LFSR when the whole thing can be handled in a more consistent way
> that does not require an architecture specific change.  And I also feel
> that we really don't need or want an additional architecture specific
> API change.
>
>>
>>> Please define "drastic" in this context.  Do you mean make the table
>>> bigger i.e. more then 256 precomputed elements?  I did 256 since Arnd
>>> seemed to think that would be a reasonable size. Based on his example
>>> How much kernel space are we willing to use to same some computation?
>>> Keep in mind only one of the entries in the table will ever be used.
>>>
>>> I think if we found the LFSR that was with in 2^10 of the desired 
>>> value
>>> that would be good enough. It would be within 1% of the minimum N the
>>> user can specify.  That would require a table with 2^14 entries.  
>>> That
>>> seems unreasonably large.
>>
>> Why does the table have to be linear?  Compute samples at various
>> logrimathic staring points.   Determine how many significant bits
>> you want to keep, and have an array for each sample length.
>>
>> ie for 3 bits of significance, you could have
>> F00000, E00000, ... 800000,   0F0000, ......
>> this would take (24-n) * 2^(n-1) slots, while maintaing user
>> control of the range.
>>
>
> I don't see any advantage in this log approach.  If we do this with a
> linear table with a reasonable number of pre calculated values.  I 
> think
> that a table with no more then 1024 entries would be reasonable.  The
> overhead for calculating the desired LFSR value would be going through
> the for loop 16K times, for 1024 entries in the table, is not
> unreasonable.  I think this whole discussion of moving the LFSR to the
> user space is not needed.  The overhead of the for loop does not 
> justify
> pushing the LFSR determination to user space.  But that is just my
> opinion.  I am open to suggestions on how big to make the lookup table
> in the kernel.  But I really am apposed to putting the LFSR into the
> user space.
>
>>>
>>> Anyway, the user controls how often sampling is done by setting N.
>>
>> When calling a user space program.   The events are converted to
>> a series of control register values that are communicated to the
>> kernel by writing to files in the file system.   The writes are
>> interpreted (converted from ascii hex to binary longs) and stored
>> until the control files are written, at which point callbacks
>> copy and interpret the controls and start the hardware collection.
>>
>> Frankly, I would expect a lot more resistance to the event data
>> stream generation changes and duplicaton.
>
> I don't agree.  But that is just my opinion based on my experience
> working on OProfile.
>

Well, I haven't worked with oprofile in the past, but I have worked
on the kernel.  And I stay by my statement.

milton


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-11 22:46               ` Milton Miller
  0 siblings, 0 replies; 66+ messages in thread
From: Milton Miller @ 2007-02-11 22:46 UTC (permalink / raw)
  To: Carl Love; +Cc: linuxppc-dev, oprofile-list, cbe-oss-dev, Arnd Bergmann, LKML


On Feb 9, 2007, at 10:17 AM, Carl Love wrote:

> On Thu, 2007-02-08 at 20:46 -0600, Milton Miller wrote:
>>  On Feb 8, 2007, at 4:51 PM, Carl Love wrote:
>>
>>> On Thu, 2007-02-08 at 18:21 +0100, Arnd Bergmann wrote:
>>>> On Thursday 08 February 2007 15:18, Milton Miller wrote:
>>>>
>>>>> 1) sample rate setup
>>>>>
>>>>>     In the current patch, the user specifies a sample rate as a 
>>>>> time
>>>>> interval.
>>>>>     The kernel is (a) calling cpufreq to get the current cpu
>>>>> frequency,
>>>>> (b)
>>>>>     converting the rate to a cycle count, (c) converting this to a
>>>>> 24 bit
>>>>>     LFSR count, an iterative algorithm (in this patch, starting 
>>>>> from
>>>>>     one of 256 values so a max of 2^16 or 64k iterations), (d)
>>>>> calculating
>>>>>     an trace unload interval.   In addition, a cpufreq notifier is
>>>>> registered
>>>>>     to recalculate on frequency changes.
>>>
>>> No.  The user issues the command opcontrol --event:N  where N is the
>>> number of events (cycles, l2 cache misses, instructions retired etc)
>>> that are to elapse between collecting the samples.
>>
>> So you are saying that b and c are primary, and a is used to calculate
>> a safe value for d.   All of the above work is dont, just from a
>> different starting point?
>
> There are two things 1) setup the LFSR to control how often the 
> Hardware
> puts samples into the trace buffer.  2) setup the kernel timer to read
> the trace buffer (your d, d is a function of the cpu freq) and process
> the samples.
>
> (c is nonsense)


Well, its cacluate the rate from the count vs count from the rate.

>
> The kernel timer was set with the goal the the hardware trace buffer
> would not get more then half full to ensure we would not lose samples
> even for the maximum rate that the hardware would be adding samples to
> the trace buffer (user specified N=100,000).

That should be well commented.

>>
>>> The OProfile passes
>>> the value N to the kernel via the variable ctr[i].count.  Where i is
>>> the
>>> performance counter entry for that event.
>>
>> Ok I haven't looked a the api closely.


Actually, the oprofile userspace fills out files in a file system to 
tell
the kernel what it needs to know.   The powerpc code defines the 
reources
needed to use the PMU hardware, which is (1) common mux selects, for
processors that need them, and (2) a set of directories, one for each 
of the
pmu counters, each of which contain the controls for that counter (such
as enable kernel space, enable user space, event timeout to interrupt or
sample collection, etc.   The ctr[i].count is one of these files.

>>
>>> Specifically with SPU
>>> profiling, we do not use performance counters because the CELL HW 
>>> does
>>> not allow the normal the PPU to read the SPU PC when a performance
>>> counter interrupt occurs.  We are using some additional hw support in
>>> the chip that allows us to periodically capture the SPU PC.  There is
>>> an
>>> LFSR hardware counter that can be started at an arbitrary LFSR value.
>>> When the last LFSR value in the sequence is reached, a sample is 
>>> taken
>>> and stored in the trace buffer.  Hence, the value of N specified by 
>>> the
>>> user must get converted to the LFSR value that is N from the end of 
>>> the
>>> sequence.
>>
>> Ok so its arbitray load count to max vs count and compare.   A 
>> critical
>> detail when computing the value to load, but the net result is the
>> same; the value for the count it hard to determine.
>
> The above statement makes no sense to me.

I think I talked past you.  Your description of hadrware vs mine was
different in that the counter always ends at a specified point and is
loaded with the variable count, where mine had it comparing to the
count as it incremented.

However, I now see that you were referring to the fact that what the
user specifes, the count, has to be converted to a LFSR value.  My point
is this can be done in the user-space oprofile code.  It already has
to look up magic numbers for setting up event selection muxes for other
hardware, adding a lfsr calculation is not beyond resoon.  Nor is having
it provide two values in two files.

>
> Determining the initial LFSR value that is N values from the last value
> in the sequence is not easy to do.

Well, its easy, its just order(N).

>>> The same clock that the processor is running at is used to
>>> control the LFSR count.  Hence the LFSR counter increments once per 
>>> CPU
>>> clock cycle regardless of the CPU frequency or changes in the
>>> frequency.
>>> There is no calculation for the LFSR value that is a function of the
>>> processor frequency.  There is no need to adjust the LFSR when the
>>> processor frequency changes.
>>
>> Oh, so the lfsr doesn't have to be recomputed, only the time
>> between unloads.
>
> The LFSR value is computed ONCE when you start OProfile.  The value is
> setup in the hardware once when OProfile starts.  The hardware will
> always restart with the value given to it after it reaches the last
> value in the sequence.  What you call the "time between unloads" is the
> time at which you schedule the kernel routine to empty the trace 
> buffer.
> It is calculated once.  It would only need to be recomputed if the cpu
> frequency changed.



>>>>>
>>>>>     The obvious problem is step (c), running a loop potentially 64
>>>>> thousand
>>>>>     times in kernel space will have a noticeable impact on other
>>>>> threads.
>>>>>
>>>>>     I propose instead that user space perform the above 4 steps, 
>>>>> and
>>>>> provide
>>>>>     the kernel with two inputs: (1) the value to load in the LFSR
>>>>> and (2)
>>>>>     the periodic frequency / time interval at which to empty the
>>>>> hardware
>>>>>     trace buffer, perform sample analysis, and send the data to the
>>>>> oprofile
>>>>>     subsystem.
>>>>>
>>>>>     There should be no security issues with this approach.   If the
>>>>> LFSR
>>>>> value
>>>>>     is calculated incorrectly, either it will be too short, causing
>>>>> the
>>>>> trace
>>>>>     array to overfill and data to be dropped, or it will be too
>>>>> long, and
>>>>>     there will be fewer samples.   Likewise, the kernel periodic 
>>>>> poll
>>>>> can be
>>>>>     too long, again causing overflow, or too frequent, causing only
>>>>> timer
>>>>>     execution overhead.
>>>>>
>>>>>     Various data is collected by the kernel while processing the
>>>>> periodic timer,
>>>>>     this approach would also allow the profiling tools to control 
>>>>> the
>>>>>     frequency of this collection.   More frequent collection 
>>>>> results
>>>>> in
>>>>> more
>>>>>     accurate sample data, with the linear cost of poll execution
>>>>> overhead.
>>>>>
>>>>>     Frequency changes can be handled either by the profile code
>>>>> setting
>>>>>     collection at a higher than necessary rate, or by interacting
>>>>> with
>>>>> the
>>>>>     governor to limit the speeds.
>>>>>
>>>>>     Optionally, the kernel can add a record indicating that some
>>>>> data was
>>>>>     likely dropped if it is able to read all 256 entries without
>>>>> underflowing
>>>>>     the array.  This can be used as hint to user space that the
>>>>> kernel
>>>>> time
>>>>>     was too long for the collection rate.
>>>>
>>>> Moving the sample rate computation to user space sounds like the 
>>>> right
>>>> idea, but why not have a more drastic version of it:
>>>
>>> No, I do not agree.  The user/kernel API pass N where N is the number
>>> of
>>> events between samples.  We are not at liberty to just change the 
>>> API.
>>> We we did do this, we fully expect that John Levon will push back
>>> saying
>>> why make an architecture specific API change when it isn't necessary.
>>
>> [So you have not asked.]
>>
>> <Kludge> If you want to overlaod the existing array, one
>> event could be the lfsr sample rate, and another event be
>> the collection time.  That would stay within the framework.
>> But that is a kludge. </Kludge>
>>
>> [Me goes and reads kernel profile driver and skims powerpc code].
>>
>> You are confusing the user interface (what the user specifies on the
>> command line) with the kernel API.
>>
>> It is somewhat hidden by the PowerPC specific common code in
>> arch/powerpc/oprofile/common.c.  That is where the counter
>> array is exposd.
>>
>> The user to kernel api is not fill out an array of counter
>> event names and sampling intervals.
>>
>> The user to kernel interface is a file system that contains a
>> heirachy of files.  Each file consists of the hex ascii
>> represntation of a unsigned long.   The filesystem interfaces
>> to the kernel by provding an API to create directorys and files,
>> specifing the name of the directory or file.  Theere are helper
>> routines to connect a file to a ulong and read access to an atomic_t.
>> The common driver (in drivers/oprofile) creates the file system
>> and some common files that implement the control interface, which
>> interfaces to the architecture specific driver through the ops
>> array.
>>
>> The powerpc common driver creates a heiarchy exposing the
>> values to be placed in the performance monitor registers,
>> and directory of counters with the event selection.
>>
>> Since this architeture code does not seem to match the
>> capabilitys of the hardware, it would seem that this is
>> the area to change.   This driver does not seem to use
>> the actual PMU interrput or sprs.   Lets make it its
>> own directory with its own controls.
>>
>> I don't see how exposing the sample collection to
>> rate and the computation of the LFSR create a userspace
>> api change;  I think its totally within the framework.
>
> I was being a bit simplistic in my explination.  I am well aware of the
> file system.  The point is the USER specifies the rate (every N events)
> that they want to have sampling done.  We are using the existing
> mechanism to pass the value of N to the kernel.  So from that
> standpoint, we are trying to be consistent in how it is done with the
> PPU. I feel that this is best to try to handle the N value in the same
> way rather then having a completely different way.


>
> If we were to put the LFSR into the user space, you would pass the N
> into the kernel for the PPU profiling case.  To make the API clean, you
> would have to create a new file entry to pass the LFSR value.  For SPU
> profiling you would not pass N instead you would pass LFSR.  I think it
> is a bad idea to have these two different things for PPU versus SPU
> profiling.  I really feel it is best to consistent to use pass N for 
> PPU
> and SPU.  Then deal with converting N to the LFSR for the special case
> of SPU profiling.
>

As far as I understand, you are providing access to a completely new
hardware that is related to the PMU hardware by the fact that it
collects a program counter.   It doesn't use the PMU counters nor the
PMU event selection.

In fact, why can the existing op_model_cell profiling not run while
the SPU profiling runs?   Is there a shared debug bus inside the
chip?   Or just the data stream with your buffer_sync code?

> Unlike POWER 4/5 support where we absolutely had to add entries to the
> API to pass the three values for the control registers.  We already 
> have
> an established mechanism for passing N from user to kernel.  Just use
> it.  We are very sure that John Levon, the OProfile user maintainer,
> will say the same thing and refuse to accept adding to the API to pass
> the LFSR when the whole thing can be handled in a more consistent way
> that does not require an architecture specific change.  And I also feel
> that we really don't need or want an additional architecture specific
> API change.
>
>>
>>> Please define "drastic" in this context.  Do you mean make the table
>>> bigger i.e. more then 256 precomputed elements?  I did 256 since Arnd
>>> seemed to think that would be a reasonable size. Based on his example
>>> How much kernel space are we willing to use to same some computation?
>>> Keep in mind only one of the entries in the table will ever be used.
>>>
>>> I think if we found the LFSR that was with in 2^10 of the desired 
>>> value
>>> that would be good enough. It would be within 1% of the minimum N the
>>> user can specify.  That would require a table with 2^14 entries.  
>>> That
>>> seems unreasonably large.
>>
>> Why does the table have to be linear?  Compute samples at various
>> logrimathic staring points.   Determine how many significant bits
>> you want to keep, and have an array for each sample length.
>>
>> ie for 3 bits of significance, you could have
>> F00000, E00000, ... 800000,   0F0000, ......
>> this would take (24-n) * 2^(n-1) slots, while maintaing user
>> control of the range.
>>
>
> I don't see any advantage in this log approach.  If we do this with a
> linear table with a reasonable number of pre calculated values.  I 
> think
> that a table with no more then 1024 entries would be reasonable.  The
> overhead for calculating the desired LFSR value would be going through
> the for loop 16K times, for 1024 entries in the table, is not
> unreasonable.  I think this whole discussion of moving the LFSR to the
> user space is not needed.  The overhead of the for loop does not 
> justify
> pushing the LFSR determination to user space.  But that is just my
> opinion.  I am open to suggestions on how big to make the lookup table
> in the kernel.  But I really am apposed to putting the LFSR into the
> user space.
>
>>>
>>> Anyway, the user controls how often sampling is done by setting N.
>>
>> When calling a user space program.   The events are converted to
>> a series of control register values that are communicated to the
>> kernel by writing to files in the file system.   The writes are
>> interpreted (converted from ascii hex to binary longs) and stored
>> until the control files are written, at which point callbacks
>> copy and interpret the controls and start the hardware collection.
>>
>> Frankly, I would expect a lot more resistance to the event data
>> stream generation changes and duplicaton.
>
> I don't agree.  But that is just my opinion based on my experience
> working on OProfile.
>

Well, I haven't worked with oprofile in the past, but I have worked
on the kernel.  And I stay by my statement.

milton

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-09 19:10           ` Arnd Bergmann
@ 2007-02-09 19:46             ` Milton Miller
  -1 siblings, 0 replies; 66+ messages in thread
From: Milton Miller @ 2007-02-09 19:46 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: cbe-oss-dev, LKML, linuxppc-dev, Carl Love, oprofile-list


On Feb 9, 2007, at 1:10 PM, Arnd Bergmann wrote:

> On Friday 09 February 2007 19:47, Milton Miller wrote:
>> On Feb 8, 2007, at 11:21 AM, Arnd Bergmann wrote:
>>
>
>>> Doing the translation in two stages in user space, as you
>>> suggest here, definitely makes sense to me. I think it
>>> can be done a little simpler though:
>>>
>>> Why would you need the accurate dcookie information to be
>>> provided by the kernel? The ELF loader is done in user
>>> space, and the kernel only reproduces what it thinks that
>>> came up with. If the kernel only gives the dcookie information
>>> about the SPU ELF binary to the oprofile user space, then
>>> that can easily recreate the same mapping.
>>
>> Actually, I was trying to cover issues such as anonymous
>> memory.   If the kernel doesn't generate dcookies for
>> the load segments the information is not recoverable once
>> the task exits.  This would also allow the loader to create
>> an artifical elf header that covered both the base executable
>> and a dynamicly linked one.
>>
>> Other alternatives exist, such as a structure for context-id
>> that would have its own identifing magic and an array of elf
>> header pointers.
>
> But _why_ do you want to solve that problem? we don't have
> dynamically linked binaries and I really don't see why the loader
> would want to create artificial elf headers...

I'm explainign how they could be handled.   Actully I think
the other proposal (an identified structure that points to
sevear elf headers) would be more approprate.  As you point
out, if there are presently no dynamic libraries in use, it
doesn't have to be solved today.  I'm just trying to make
the code future proof, or at least a clear path forward.

>
>>> The kernel still needs to provide the overlay identifiers
>>> though.
>>
>> Yes, which means it needs to parse the header (or libpse
>> be enhanced to write the monitor info to a spufs file).
>
> we thought about this in the past and discarded it because of
> the complexity of an spufs interface that would handle this
> correctly.

Not sure what would be difficuult, and it would allow other
binary formats.   But parsing the headers in the kernel
means existing userspace doesn't have to be upgraded, so I
am not proposing this requirement.

>
>>> yes, this sounds nice. But tt does not at all help accuracy,
>>> only performance, right?
>>
>> It allows the user space to know when the sample was taken
>> and  be aware of the ambiguity.   If the sample rate is
>> high enough in relation to the overlay switch rate, user space
>> could decide to discard the ambiguous samples.
>
> yes, good point.
>
>>>> This approach allows multiple objects by its nature.  A new
>>>> elf header could be constructed in memory that contained
>>>> the union of the elf objects load segments, and the tools
>>>> will magically work.   Alternatively the object id could
>>>> point to a new structure, identified via a new header, that
>>>> it points to other elf headers (easily differentiated by the
>>>> elf magic headers).   Other binary formats, including several
>>>> objects in a ar archive, could be supported.
>>>
>>> Yes, that would be a new feature if the kernel passed dcookie
>>> information for every section, but I doubt that it is worth
>>> it. I have not seen any program that allows loading code
>>> from more than one ELF file. In particular, the ELF format
>>> on the SPU is currently lacking the relocation mechanisms
>>> that you would need for resolving spu-side symbols at load
>>> time
>>
>> Actually, It could check all load segments, and only report
>> those where the dcookie changes (as opposed to the offset).
>
> I'm not really following you here, but probably you misunderstood
> my point as well.

I was thinking in terms of dyanmic libraries, and totally skipped
your comment about the relocaiton info being missing.   My reply
point was that the table could be compressed to the current
entry if all hased to the same vm area.

>
>>> This seems to incur a run-time overhead on the SPU even if not
>>> profiling, I would consider that not acceptable.
>>
>> It definitely is overhead.  Which means it would have to be
>> optional, like gprof.
>
> There is some work going on for another profiler independent
> of the hardware feature that only relies on instrumenting the
> spu executable for things like DMA transfers and overlay
> changes.

Regardless, its beyond the current scope.

milton


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-09 19:46             ` Milton Miller
  0 siblings, 0 replies; 66+ messages in thread
From: Milton Miller @ 2007-02-09 19:46 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linuxppc-dev, Carl Love, cbe-oss-dev, oprofile-list, LKML


On Feb 9, 2007, at 1:10 PM, Arnd Bergmann wrote:

> On Friday 09 February 2007 19:47, Milton Miller wrote:
>> On Feb 8, 2007, at 11:21 AM, Arnd Bergmann wrote:
>>
>
>>> Doing the translation in two stages in user space, as you
>>> suggest here, definitely makes sense to me. I think it
>>> can be done a little simpler though:
>>>
>>> Why would you need the accurate dcookie information to be
>>> provided by the kernel? The ELF loader is done in user
>>> space, and the kernel only reproduces what it thinks that
>>> came up with. If the kernel only gives the dcookie information
>>> about the SPU ELF binary to the oprofile user space, then
>>> that can easily recreate the same mapping.
>>
>> Actually, I was trying to cover issues such as anonymous
>> memory. =A0 If the kernel doesn't generate dcookies for
>> the load segments the information is not recoverable once
>> the task exits. =A0This would also allow the loader to create
>> an artifical elf header that covered both the base executable
>> and a dynamicly linked one.
>>
>> Other alternatives exist, such as a structure for context-id
>> that would have its own identifing magic and an array of elf
>> header pointers.
>
> But _why_ do you want to solve that problem? we don't have
> dynamically linked binaries and I really don't see why the loader
> would want to create artificial elf headers...

I'm explainign how they could be handled.   Actully I think
the other proposal (an identified structure that points to
sevear elf headers) would be more approprate.  As you point
out, if there are presently no dynamic libraries in use, it
doesn't have to be solved today.  I'm just trying to make
the code future proof, or at least a clear path forward.

>
>>> The kernel still needs to provide the overlay identifiers
>>> though.
>>
>> Yes, which means it needs to parse the header (or libpse
>> be enhanced to write the monitor info to a spufs file).
>
> we thought about this in the past and discarded it because of
> the complexity of an spufs interface that would handle this
> correctly.

Not sure what would be difficuult, and it would allow other
binary formats.   But parsing the headers in the kernel
means existing userspace doesn't have to be upgraded, so I
am not proposing this requirement.

>
>>> yes, this sounds nice. But tt does not at all help accuracy,
>>> only performance, right?
>>
>> It allows the user space to know when the sample was taken
>> and =A0be aware of the ambiguity. =A0 If the sample rate is
>> high enough in relation to the overlay switch rate, user space
>> could decide to discard the ambiguous samples.
>
> yes, good point.
>
>>>> This approach allows multiple objects by its nature. =A0A new
>>>> elf header could be constructed in memory that contained
>>>> the union of the elf objects load segments, and the tools
>>>> will magically work. =A0 Alternatively the object id could
>>>> point to a new structure, identified via a new header, that
>>>> it points to other elf headers (easily differentiated by the
>>>> elf magic headers). =A0 Other binary formats, including several
>>>> objects in a ar archive, could be supported.
>>>
>>> Yes, that would be a new feature if the kernel passed dcookie
>>> information for every section, but I doubt that it is worth
>>> it. I have not seen any program that allows loading code
>>> from more than one ELF file. In particular, the ELF format
>>> on the SPU is currently lacking the relocation mechanisms
>>> that you would need for resolving spu-side symbols at load
>>> time
>>
>> Actually, It could check all load segments, and only report
>> those where the dcookie changes (as opposed to the offset).
>
> I'm not really following you here, but probably you misunderstood
> my point as well.

I was thinking in terms of dyanmic libraries, and totally skipped
your comment about the relocaiton info being missing.   My reply
point was that the table could be compressed to the current
entry if all hased to the same vm area.

>
>>> This seems to incur a run-time overhead on the SPU even if not
>>> profiling, I would consider that not acceptable.
>>
>> It definitely is overhead. =A0Which means it would have to be
>> optional, like gprof.
>
> There is some work going on for another profiler independent
> of the hardware feature that only relies on instrumenting the
> spu executable for things like DMA transfers and overlay
> changes.

Regardless, its beyond the current scope.

milton

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-09 18:47         ` Milton Miller
@ 2007-02-09 19:10           ` Arnd Bergmann
  -1 siblings, 0 replies; 66+ messages in thread
From: Arnd Bergmann @ 2007-02-09 19:10 UTC (permalink / raw)
  To: Milton Miller; +Cc: cbe-oss-dev, LKML, linuxppc-dev, Carl Love, oprofile-list

On Friday 09 February 2007 19:47, Milton Miller wrote:
> On Feb 8, 2007, at 11:21 AM, Arnd Bergmann wrote:
> 

> > Doing the translation in two stages in user space, as you
> > suggest here, definitely makes sense to me. I think it
> > can be done a little simpler though:
> >
> > Why would you need the accurate dcookie information to be
> > provided by the kernel? The ELF loader is done in user
> > space, and the kernel only reproduces what it thinks that
> > came up with. If the kernel only gives the dcookie information
> > about the SPU ELF binary to the oprofile user space, then
> > that can easily recreate the same mapping.
> 
> Actually, I was trying to cover issues such as anonymous
> memory.   If the kernel doesn't generate dcookies for
> the load segments the information is not recoverable once
> the task exits.  This would also allow the loader to create
> an artifical elf header that covered both the base executable
> and a dynamicly linked one.
> 
> Other alternatives exist, such as a structure for context-id
> that would have its own identifing magic and an array of elf
> header pointers.

But _why_ do you want to solve that problem? we don't have
dynamically linked binaries and I really don't see why the loader
would want to create artificial elf headers...

> > The kernel still needs to provide the overlay identifiers
> > though.
> 
> Yes, which means it needs to parse the header (or libpse
> be enhanced to write the monitor info to a spufs file).

we thought about this in the past and discarded it because of
the complexity of an spufs interface that would handle this
correctly. 

> > yes, this sounds nice. But tt does not at all help accuracy,
> > only performance, right?
> 
> It allows the user space to know when the sample was taken
> and  be aware of the ambiguity.   If the sample rate is
> high enough in relation to the overlay switch rate, user space
> could decide to discard the ambiguous samples.

yes, good point.

> >> This approach allows multiple objects by its nature.  A new
> >> elf header could be constructed in memory that contained
> >> the union of the elf objects load segments, and the tools
> >> will magically work.   Alternatively the object id could
> >> point to a new structure, identified via a new header, that
> >> it points to other elf headers (easily differentiated by the
> >> elf magic headers).   Other binary formats, including several
> >> objects in a ar archive, could be supported.
> >
> > Yes, that would be a new feature if the kernel passed dcookie
> > information for every section, but I doubt that it is worth
> > it. I have not seen any program that allows loading code
> > from more than one ELF file. In particular, the ELF format
> > on the SPU is currently lacking the relocation mechanisms
> > that you would need for resolving spu-side symbols at load
> > time
> 
> Actually, It could check all load segments, and only report
> those where the dcookie changes (as opposed to the offset).

I'm not really following you here, but probably you misunderstood
my point as well.

> > This seems to incur a run-time overhead on the SPU even if not
> > profiling, I would consider that not acceptable.
> 
> It definitely is overhead.  Which means it would have to be
> optional, like gprof.

There is some work going on for another profiler independent
of the hardware feature that only relies on instrumenting the
spu executable for things like DMA transfers and overlay
changes. 

	Arnd <><

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-09 19:10           ` Arnd Bergmann
  0 siblings, 0 replies; 66+ messages in thread
From: Arnd Bergmann @ 2007-02-09 19:10 UTC (permalink / raw)
  To: Milton Miller; +Cc: linuxppc-dev, Carl Love, cbe-oss-dev, oprofile-list, LKML

On Friday 09 February 2007 19:47, Milton Miller wrote:
> On Feb 8, 2007, at 11:21 AM, Arnd Bergmann wrote:
>=20

> > Doing the translation in two stages in user space, as you
> > suggest here, definitely makes sense to me. I think it
> > can be done a little simpler though:
> >
> > Why would you need the accurate dcookie information to be
> > provided by the kernel? The ELF loader is done in user
> > space, and the kernel only reproduces what it thinks that
> > came up with. If the kernel only gives the dcookie information
> > about the SPU ELF binary to the oprofile user space, then
> > that can easily recreate the same mapping.
>=20
> Actually, I was trying to cover issues such as anonymous
> memory. =A0 If the kernel doesn't generate dcookies for
> the load segments the information is not recoverable once
> the task exits. =A0This would also allow the loader to create
> an artifical elf header that covered both the base executable
> and a dynamicly linked one.
>=20
> Other alternatives exist, such as a structure for context-id
> that would have its own identifing magic and an array of elf
> header pointers.

But _why_ do you want to solve that problem? we don't have
dynamically linked binaries and I really don't see why the loader
would want to create artificial elf headers...

> > The kernel still needs to provide the overlay identifiers
> > though.
>=20
> Yes, which means it needs to parse the header (or libpse
> be enhanced to write the monitor info to a spufs file).

we thought about this in the past and discarded it because of
the complexity of an spufs interface that would handle this
correctly.=20

> > yes, this sounds nice. But tt does not at all help accuracy,
> > only performance, right?
>=20
> It allows the user space to know when the sample was taken
> and =A0be aware of the ambiguity. =A0 If the sample rate is
> high enough in relation to the overlay switch rate, user space
> could decide to discard the ambiguous samples.

yes, good point.

> >> This approach allows multiple objects by its nature. =A0A new
> >> elf header could be constructed in memory that contained
> >> the union of the elf objects load segments, and the tools
> >> will magically work. =A0 Alternatively the object id could
> >> point to a new structure, identified via a new header, that
> >> it points to other elf headers (easily differentiated by the
> >> elf magic headers). =A0 Other binary formats, including several
> >> objects in a ar archive, could be supported.
> >
> > Yes, that would be a new feature if the kernel passed dcookie
> > information for every section, but I doubt that it is worth
> > it. I have not seen any program that allows loading code
> > from more than one ELF file. In particular, the ELF format
> > on the SPU is currently lacking the relocation mechanisms
> > that you would need for resolving spu-side symbols at load
> > time
>=20
> Actually, It could check all load segments, and only report
> those where the dcookie changes (as opposed to the offset).

I'm not really following you here, but probably you misunderstood
my point as well.

> > This seems to incur a run-time overhead on the SPU even if not
> > profiling, I would consider that not acceptable.
>=20
> It definitely is overhead. =A0Which means it would have to be
> optional, like gprof.

There is some work going on for another profiler independent
of the hardware feature that only relies on instrumenting the
spu executable for things like DMA transfers and overlay
changes.=20

	Arnd <><

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-08 17:21       ` Arnd Bergmann
@ 2007-02-09 18:47         ` Milton Miller
  -1 siblings, 0 replies; 66+ messages in thread
From: Milton Miller @ 2007-02-09 18:47 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: cbe-oss-dev, LKML, linuxppc-dev, Carl Love, oprofile-list


On Feb 8, 2007, at 11:21 AM, Arnd Bergmann wrote:

> On Thursday 08 February 2007 15:18, Milton Miller wrote:
>
>> The current patch specifically identifies that only single
>> elf objects are handled.  There is no code to handle dynamic
>> linked libraries or overlays.   Nor is there any method to
>> present samples that may have been collected during context
>> switch processing, they must be discarded.
>
> I thought it already did handle overlays, what did I miss here?

It does, see my reply to Maynard.  Not sure what I was thinking
when I wrote this, possibly I was thinking of the inaccuracies.

>
>> My proposal is to change what is presented to user space.  Instead
>> of trying to translate the SPU address to the backing file
>> as the samples are recorded, store the samples as the SPU
>> context and address.  The context switch would record tid,
>> pid, object id as it does now.   In addition, if this is a
>> new object-id, the kernel would read elf headers as it does
>> today.  However, it would then proceed to provide accurate
>> dcookie information for each loader region and overlay.
>
> Doing the translation in two stages in user space, as you
> suggest here, definitely makes sense to me. I think it
> can be done a little simpler though:
>
> Why would you need the accurate dcookie information to be
> provided by the kernel? The ELF loader is done in user
> space, and the kernel only reproduces what it thinks that
> came up with. If the kernel only gives the dcookie information
> about the SPU ELF binary to the oprofile user space, then
> that can easily recreate the same mapping.

Actually, I was trying to cover issues such as anonymous
memory.   If the kernel doesn't generate dcookies for
the load segments the information is not recoverable once
the task exits.  This would also allow the loader to create
an artifical elf header that covered both the base executable
and a dynamicly linked one.

Other alternatives exist, such as a structure for context-id
that would have its own identifing magic and an array of elf
header pointers.


>
> The kernel still needs to provide the overlay identifiers
> though.

Yes, which means it needs to parse the header (or libpse
be enhanced to write the monitor info to a spufs file).

>
>> To identify which overlays are active, (instead of the present
>> read on use and search the list to translate approach) the
>> kernel would record the location of the overlay identifiers
>> as it parsed the kernel, but would then read the identification
>> word and would record the present value as an sample from
>> a separate but related stream.   The kernel could maintain
>> the last value for each overlay and only send profile events
>> for the deltas.
>
> right.
>
>> This approach trades translation lookup overhead for each
>> recorded sample for a burst of data on new context activation.
>> In addition it exposes the sample point of the overlay identifier
>> vs the address collection.  This allows the ambiguity to be
>> exposed to user space.   In addition, with the above proposed
>> kernel timer vs sample collection, user space could limit the
>> elapsed time between the address collection and the overlay
>> id check.
>
> yes, this sounds nice. But tt does not at all help accuracy,
> only performance, right?

It allows the user space to know when the sample was taken
and  be aware of the ambiguity.   If the sample rate is
high enough in relation to the overlay switch rate, user space
could decide to discard the ambiguous samples.

>
>> This approach allows multiple objects by its nature.  A new
>> elf header could be constructed in memory that contained
>> the union of the elf objects load segments, and the tools
>> will magically work.   Alternatively the object id could
>> point to a new structure, identified via a new header, that
>> it points to other elf headers (easily differentiated by the
>> elf magic headers).   Other binary formats, including several
>> objects in a ar archive, could be supported.
>
> Yes, that would be a new feature if the kernel passed dcookie
> information for every section, but I doubt that it is worth
> it. I have not seen any program that allows loading code
> from more than one ELF file. In particular, the ELF format
> on the SPU is currently lacking the relocation mechanisms
> that you would need for resolving spu-side symbols at load
> time

Actually, It could check all load segments, and only report
those where the dcookie changes (as opposed to the offset).

> .
>
>> If better overlay identification is required, in theory the
>> overlay switch code could be augmented to record the switches
>> (DMA reference time from the PowerPC memory and record a
>> relative decrementer in the SPU), this is obviously a future
>> item.  But it is facilitated by having user space resolve the
>> SPU to source file translation.
>
> This seems to incur a run-time overhead on the SPU even if not
> profiling, I would consider that not acceptable.

It definitely is overhead.  Which means it would have to be
optional, like gprof.


milton


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-09 18:47         ` Milton Miller
  0 siblings, 0 replies; 66+ messages in thread
From: Milton Miller @ 2007-02-09 18:47 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linuxppc-dev, Carl Love, cbe-oss-dev, oprofile-list, LKML


On Feb 8, 2007, at 11:21 AM, Arnd Bergmann wrote:

> On Thursday 08 February 2007 15:18, Milton Miller wrote:
>
>> The current patch specifically identifies that only single
>> elf objects are handled.  There is no code to handle dynamic
>> linked libraries or overlays.   Nor is there any method to
>> present samples that may have been collected during context
>> switch processing, they must be discarded.
>
> I thought it already did handle overlays, what did I miss here?

It does, see my reply to Maynard.  Not sure what I was thinking
when I wrote this, possibly I was thinking of the inaccuracies.

>
>> My proposal is to change what is presented to user space.  Instead
>> of trying to translate the SPU address to the backing file
>> as the samples are recorded, store the samples as the SPU
>> context and address.  The context switch would record tid,
>> pid, object id as it does now.   In addition, if this is a
>> new object-id, the kernel would read elf headers as it does
>> today.  However, it would then proceed to provide accurate
>> dcookie information for each loader region and overlay.
>
> Doing the translation in two stages in user space, as you
> suggest here, definitely makes sense to me. I think it
> can be done a little simpler though:
>
> Why would you need the accurate dcookie information to be
> provided by the kernel? The ELF loader is done in user
> space, and the kernel only reproduces what it thinks that
> came up with. If the kernel only gives the dcookie information
> about the SPU ELF binary to the oprofile user space, then
> that can easily recreate the same mapping.

Actually, I was trying to cover issues such as anonymous
memory.   If the kernel doesn't generate dcookies for
the load segments the information is not recoverable once
the task exits.  This would also allow the loader to create
an artifical elf header that covered both the base executable
and a dynamicly linked one.

Other alternatives exist, such as a structure for context-id
that would have its own identifing magic and an array of elf
header pointers.


>
> The kernel still needs to provide the overlay identifiers
> though.

Yes, which means it needs to parse the header (or libpse
be enhanced to write the monitor info to a spufs file).

>
>> To identify which overlays are active, (instead of the present
>> read on use and search the list to translate approach) the
>> kernel would record the location of the overlay identifiers
>> as it parsed the kernel, but would then read the identification
>> word and would record the present value as an sample from
>> a separate but related stream.   The kernel could maintain
>> the last value for each overlay and only send profile events
>> for the deltas.
>
> right.
>
>> This approach trades translation lookup overhead for each
>> recorded sample for a burst of data on new context activation.
>> In addition it exposes the sample point of the overlay identifier
>> vs the address collection.  This allows the ambiguity to be
>> exposed to user space.   In addition, with the above proposed
>> kernel timer vs sample collection, user space could limit the
>> elapsed time between the address collection and the overlay
>> id check.
>
> yes, this sounds nice. But tt does not at all help accuracy,
> only performance, right?

It allows the user space to know when the sample was taken
and  be aware of the ambiguity.   If the sample rate is
high enough in relation to the overlay switch rate, user space
could decide to discard the ambiguous samples.

>
>> This approach allows multiple objects by its nature.  A new
>> elf header could be constructed in memory that contained
>> the union of the elf objects load segments, and the tools
>> will magically work.   Alternatively the object id could
>> point to a new structure, identified via a new header, that
>> it points to other elf headers (easily differentiated by the
>> elf magic headers).   Other binary formats, including several
>> objects in a ar archive, could be supported.
>
> Yes, that would be a new feature if the kernel passed dcookie
> information for every section, but I doubt that it is worth
> it. I have not seen any program that allows loading code
> from more than one ELF file. In particular, the ELF format
> on the SPU is currently lacking the relocation mechanisms
> that you would need for resolving spu-side symbols at load
> time

Actually, It could check all load segments, and only report
those where the dcookie changes (as opposed to the offset).

> .
>
>> If better overlay identification is required, in theory the
>> overlay switch code could be augmented to record the switches
>> (DMA reference time from the PowerPC memory and record a
>> relative decrementer in the SPU), this is obviously a future
>> item.  But it is facilitated by having user space resolve the
>> SPU to source file translation.
>
> This seems to incur a run-time overhead on the SPU even if not
> profiling, I would consider that not acceptable.

It definitely is overhead.  Which means it would have to be
optional, like gprof.


milton

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-08 23:59       ` Maynard Johnson
@ 2007-02-09 18:03         ` Milton Miller
  -1 siblings, 0 replies; 66+ messages in thread
From: Milton Miller @ 2007-02-09 18:03 UTC (permalink / raw)
  To: Maynard Johnson; +Cc: cbe-oss-dev, LKML, linuxppc-dev, Carl Love, oprofile-list


On Feb 8, 2007, at 5:59 PM, Maynard Johnson wrote:

> Milton,
> Thank you for your comments.  Carl will reply to certain parts of your 
> posting where he's more knowledgeable than I.  See my replies below.
>

Thanks for the pleasant tone and dialog.

> Milton Miller wrote:
>> On Feb 6, 2007, at 5:02 PM, Carl Love wrote:
>>> This is the first update to the patch previously posted by Maynard
>>> Johnson as "PATCH 4/4. Add support to OProfile for profiling CELL".
>>

>> Data collected
>>
>>
>> The current patch starts tackling these translation issues for the
>> presently common case of a static self contained binary from a single
>> file, either single separate source file or embedded in the data of
>> the host application.   When creating the trace entry for a SPU
>> context switch, it records the application owner, pid, tid, and
>> dcookie of the main executable.   It addition, it looks up the
>> object-id as a virtual address and records the offset if it is 
>> non-zero,
>> or the dcookie of the object if it is zero.   The code then creates
>> a data structure by reading the elf headers from the user process
>> (at the address given by the object-id) and building a list of
>> SPU address to elf object offsets, as specified by the ELF loader
>> headers.   In addition to the elf loader section, it processes the
>> overlay headers and records the address, size, and magic number of
>> the overlay.
>>
>> When the hardware trace entries are processed, each address is
>> looked up this structure and translated to the elf offset.  If
>> it is an overlay region, the overlay identify word is read and
>> the list is searched for the matching overlay.  The resulting
>> offset is sent to the oprofile system.
>>
>> The current patch specifically identifies that only single
>> elf objects are handled.  There is no code to handle dynamic
>> linked libraries or overlays.   Nor is there any method to
>>
> Yes, we do handle overlays.  (Note: I'm looking into a bug
> right now in our overlay support.)

I knew you handled overlays, and I did not mean to say that
you did not.   I am not sure how that got there.  I may have
been thinking of the kernel supplied context switch code
discussion, or how the code supplied dcookie or offset but
not both.  Actually, I might have been referring to the
fact that overlays are guessed rather than recorded.

>> present samples that may have been collected during context
>> switch processing, they must be discarded.
>>
>>
>> My proposal is to change what is presented to user space.  Instead
>> of trying to translate the SPU address to the backing file
>> as the samples are recorded, store the samples as the SPU
>> context and address.  The context switch would record tid,
>> pid, object id as it does now.   In addition, if this is a
>> new object-id, the kernel would read elf headers as it does
>> today.  However, it would then proceed to provide accurate
>> dcookie information for each loader region and overlay.  To
>> identify which overlays are active, (instead of the present
>> read on use and search the list to translate approach) the
>> kernel would record the location of the overlay identifiers
>> as it parsed the kernel, but would then read the identification
>> word and would record the present value as an sample from
>> a separate but related stream.   The kernel could maintain
>> the last value for each overlay and only send profile events
>> for the deltas.
>>
> Discussions on this topic in the past have resulted in the
> current implementation precisely because we're able to record
> the samples as fileoffsets, just as the userspace tools expect.

I was not part of the previous discussions, so please forgive me.

> I haven't had time to check out how much this would impact the
> userspace tools, but my gut feel is that it would be quite
> significant.  If we were developing this module with a matching
> newly-created userspace tool, I would be more inclined to agree
> that this makes sense.

I have not yet studied the user space tool.   In fact, when I
made this proposal, I had not studied the kernel oprofile code
either, although I had read the concepts and discussion of the
event buffer when the base patch was added to the kernel.

I have read and now consider myself to have some understanding
of the kernel side.  I note that the user space tool calls itself
alpha and the kernel support experimental.   I only looked at
the user space enough to determine it is written in C++.

I would hope the tool would be modular enough to insert a data
transformation pass to do this conversion that the kernel is
doing.

> But you give no rationale for your
> proposal that justifies the change.  The current implementation
> works, it has no impact on normal, non-profiling behavior,  and
> the overhead during profiling is not noticeable.

I was proposing this for several of reasons.

One, there were expressed limitations  in the current
proposal.  There is a requirement that everything be linked
into one ELF object for it to be profiled seemed significant
to me.  This implies that shared libraries (well, dynamic
linked ones) have no path forward in this framework.

Two, there was a discussion of profiling the kernel context
switch, that seemed to be deferred.

Three, I saw the amount of code added to create the buffer
stream, but had not studied it yet.   It appeared to be
creating a totally new stream to be interpreted by user space.
With that in mind, I was exploring what the right interface
should be with more freedom.

Fourth, the interpretation was being done by heuristics and not
first source data.  By this I mean that both the overlay
identification is delayed, and that the code being executed
is a copy from the file.   While existing users may leave
the code mapped into the host process, there is no inherent
requirement to do so.  Now that I understand overlays, I see
they would have to remain mapped.

Of these, I think the fourth is the most compelling argument,
although the first and the second were what caused me to
try to understand why all this new code was needed.

My proposal was an attempt to expose the raw data, providing
user space what we know vs some guess.  I was trying to provide
the information that was needed to perform the translation
in user space with at least the same accuracy, but with the
ambiguities exposed.


>> This approach trades translation lookup overhead for each
>> recorded sample for a burst of data on new context activation.
>> In addition it exposes the sample point of the overlay identifier
>> vs the address collection.  This allows the ambiguity to be
>> exposed to user space.   In addition, with the above proposed
>> kernel timer vs sample collection, user space could limit the
>> elapsed time between the address collection and the overlay
>> id check.
>>
> Yes, there is a window here where an overlay could occur before
> we finish processing a group of samples that were actually taken
> from a different overlay.  The obvious way to prevent that is
> for the kernel (or SPUFS) to be notified of the overlay and let
> OProfile know that we need to drain (perhaps discard would be
> best) our sample trace buffer.  As you indicate above, your
> proposal faces the same issue, but would just decrease the number
> of bogus samples.

Actually, with my proposal, I let user space decide how to handle
the ambiguity.   If the SPU periodically switches between two
overlays then the sampling will be out of sync.   Flags could be
passed to the user space decoder to influence the overlay; for
instance one flag might say place all samples on each overlay
in turn, or create separate buckets when the overlay was known
to change during sample collection (or just discard those samples).

> I contend that the relative number of bogus samples will be
> quite low in either case.

I think this totally depends on the frequency of overlay changes.
If an application frequently swaps between overlays then the
data could be wrong more than it is right.   If instead
the overlay is only switched when a fundamental phase change
in data processing occurs, then the ambiguity will have
little affect on the sample quality.


> Ideally, we should have a mechanism to eliminate them completely
> so as to avoid confusion the user's part when they're looking at
> a report.  Even a few bogus samples in the wrong place can be
> troubling.  Such a mechanism will be a good future enhancement.

I think the obvious most accurate to resolve this would be to have
the overlay load code record an event in a buffer with a
timestamp, the kernel record a timestamp periodically during
collection, and the user space do the correlation.   However,
this obviously involves both space and time overhead, as Arnd
pointed out.


Actually, if we record in the data stream that the overlay
changed from last read, then user space could know that samples
from that overlay are suspect, during that portion.   We
would need to mark both the beginning and end of that trace
array unload.


When I started to write this reply, I had done my review of
the existing kernel code, and thought I would severely back
off from this proposal.  However, now that I have written
my reasons for the proposal, I think I would still like to
have it explored.

Now that I have read the kernel code and slept on it, let
me also propose, in the alternative, an approach that tries
to reuse the existing sample framework be used instead of
the total re-implemntation.  This idea is still mostly
concept, so please bear with me.  That is to say I read
the headers, and have some understanding of how the stages
of data collection fit together, but have not studied the
actual impact to the implentation.

Even if we do stay with the kernel resolution of the SPU
address to source in the process vm, why do we need to
write new buffer sync code?   Instead of recording the hit
as an ELF object source, why not resolve it to the threads
context vm address, and let the existing oprofile code
lookup the dcookie and offset?

1) The  user space needs to find the elf header of embedded
objects.   Answer: on context switch pass the object-id as
a sample, with appropriate escape code.

2) THe offset will be file relative instead of elf object
relative.   Answer: user space has to account for this
offset either when reading the instructions for decode or
when doing the sample lookup.   Answer b: if the context
id dcookie is for the same file, subtract its offset.
Actually, only do this if its in range of the file size
as indicated in the ELF headers.   Hmmm.. this would imply
a artifical dcookie for the offset object-id case.

3) In addition to the additional records to record the
context-id on switch, the existing interfaces assume the
sample is being recorded from the context of the running
thread, implicitly reading cpu number and active mm.
Answer: Yes, but factoring and code reuse should be
possible.


[I wonder if any other nommu architectures use overlays
like this.  I would not be surprised if the answer is
no, they have enough address space.]

milton
--
miltonm@bga.com   Milton Miller
Speaking for myself only.


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-09 18:03         ` Milton Miller
  0 siblings, 0 replies; 66+ messages in thread
From: Milton Miller @ 2007-02-09 18:03 UTC (permalink / raw)
  To: Maynard Johnson; +Cc: linuxppc-dev, Carl Love, cbe-oss-dev, oprofile-list, LKML


On Feb 8, 2007, at 5:59 PM, Maynard Johnson wrote:

> Milton,
> Thank you for your comments.  Carl will reply to certain parts of your 
> posting where he's more knowledgeable than I.  See my replies below.
>

Thanks for the pleasant tone and dialog.

> Milton Miller wrote:
>> On Feb 6, 2007, at 5:02 PM, Carl Love wrote:
>>> This is the first update to the patch previously posted by Maynard
>>> Johnson as "PATCH 4/4. Add support to OProfile for profiling CELL".
>>

>> Data collected
>>
>>
>> The current patch starts tackling these translation issues for the
>> presently common case of a static self contained binary from a single
>> file, either single separate source file or embedded in the data of
>> the host application.   When creating the trace entry for a SPU
>> context switch, it records the application owner, pid, tid, and
>> dcookie of the main executable.   It addition, it looks up the
>> object-id as a virtual address and records the offset if it is 
>> non-zero,
>> or the dcookie of the object if it is zero.   The code then creates
>> a data structure by reading the elf headers from the user process
>> (at the address given by the object-id) and building a list of
>> SPU address to elf object offsets, as specified by the ELF loader
>> headers.   In addition to the elf loader section, it processes the
>> overlay headers and records the address, size, and magic number of
>> the overlay.
>>
>> When the hardware trace entries are processed, each address is
>> looked up this structure and translated to the elf offset.  If
>> it is an overlay region, the overlay identify word is read and
>> the list is searched for the matching overlay.  The resulting
>> offset is sent to the oprofile system.
>>
>> The current patch specifically identifies that only single
>> elf objects are handled.  There is no code to handle dynamic
>> linked libraries or overlays.   Nor is there any method to
>>
> Yes, we do handle overlays.  (Note: I'm looking into a bug
> right now in our overlay support.)

I knew you handled overlays, and I did not mean to say that
you did not.   I am not sure how that got there.  I may have
been thinking of the kernel supplied context switch code
discussion, or how the code supplied dcookie or offset but
not both.  Actually, I might have been referring to the
fact that overlays are guessed rather than recorded.

>> present samples that may have been collected during context
>> switch processing, they must be discarded.
>>
>>
>> My proposal is to change what is presented to user space.  Instead
>> of trying to translate the SPU address to the backing file
>> as the samples are recorded, store the samples as the SPU
>> context and address.  The context switch would record tid,
>> pid, object id as it does now.   In addition, if this is a
>> new object-id, the kernel would read elf headers as it does
>> today.  However, it would then proceed to provide accurate
>> dcookie information for each loader region and overlay.  To
>> identify which overlays are active, (instead of the present
>> read on use and search the list to translate approach) the
>> kernel would record the location of the overlay identifiers
>> as it parsed the kernel, but would then read the identification
>> word and would record the present value as an sample from
>> a separate but related stream.   The kernel could maintain
>> the last value for each overlay and only send profile events
>> for the deltas.
>>
> Discussions on this topic in the past have resulted in the
> current implementation precisely because we're able to record
> the samples as fileoffsets, just as the userspace tools expect.

I was not part of the previous discussions, so please forgive me.

> I haven't had time to check out how much this would impact the
> userspace tools, but my gut feel is that it would be quite
> significant.  If we were developing this module with a matching
> newly-created userspace tool, I would be more inclined to agree
> that this makes sense.

I have not yet studied the user space tool.   In fact, when I
made this proposal, I had not studied the kernel oprofile code
either, although I had read the concepts and discussion of the
event buffer when the base patch was added to the kernel.

I have read and now consider myself to have some understanding
of the kernel side.  I note that the user space tool calls itself
alpha and the kernel support experimental.   I only looked at
the user space enough to determine it is written in C++.

I would hope the tool would be modular enough to insert a data
transformation pass to do this conversion that the kernel is
doing.

> But you give no rationale for your
> proposal that justifies the change.  The current implementation
> works, it has no impact on normal, non-profiling behavior,  and
> the overhead during profiling is not noticeable.

I was proposing this for several of reasons.

One, there were expressed limitations  in the current
proposal.  There is a requirement that everything be linked
into one ELF object for it to be profiled seemed significant
to me.  This implies that shared libraries (well, dynamic
linked ones) have no path forward in this framework.

Two, there was a discussion of profiling the kernel context
switch, that seemed to be deferred.

Three, I saw the amount of code added to create the buffer
stream, but had not studied it yet.   It appeared to be
creating a totally new stream to be interpreted by user space.
With that in mind, I was exploring what the right interface
should be with more freedom.

Fourth, the interpretation was being done by heuristics and not
first source data.  By this I mean that both the overlay
identification is delayed, and that the code being executed
is a copy from the file.   While existing users may leave
the code mapped into the host process, there is no inherent
requirement to do so.  Now that I understand overlays, I see
they would have to remain mapped.

Of these, I think the fourth is the most compelling argument,
although the first and the second were what caused me to
try to understand why all this new code was needed.

My proposal was an attempt to expose the raw data, providing
user space what we know vs some guess.  I was trying to provide
the information that was needed to perform the translation
in user space with at least the same accuracy, but with the
ambiguities exposed.


>> This approach trades translation lookup overhead for each
>> recorded sample for a burst of data on new context activation.
>> In addition it exposes the sample point of the overlay identifier
>> vs the address collection.  This allows the ambiguity to be
>> exposed to user space.   In addition, with the above proposed
>> kernel timer vs sample collection, user space could limit the
>> elapsed time between the address collection and the overlay
>> id check.
>>
> Yes, there is a window here where an overlay could occur before
> we finish processing a group of samples that were actually taken
> from a different overlay.  The obvious way to prevent that is
> for the kernel (or SPUFS) to be notified of the overlay and let
> OProfile know that we need to drain (perhaps discard would be
> best) our sample trace buffer.  As you indicate above, your
> proposal faces the same issue, but would just decrease the number
> of bogus samples.

Actually, with my proposal, I let user space decide how to handle
the ambiguity.   If the SPU periodically switches between two
overlays then the sampling will be out of sync.   Flags could be
passed to the user space decoder to influence the overlay; for
instance one flag might say place all samples on each overlay
in turn, or create separate buckets when the overlay was known
to change during sample collection (or just discard those samples).

> I contend that the relative number of bogus samples will be
> quite low in either case.

I think this totally depends on the frequency of overlay changes.
If an application frequently swaps between overlays then the
data could be wrong more than it is right.   If instead
the overlay is only switched when a fundamental phase change
in data processing occurs, then the ambiguity will have
little affect on the sample quality.


> Ideally, we should have a mechanism to eliminate them completely
> so as to avoid confusion the user's part when they're looking at
> a report.  Even a few bogus samples in the wrong place can be
> troubling.  Such a mechanism will be a good future enhancement.

I think the obvious most accurate to resolve this would be to have
the overlay load code record an event in a buffer with a
timestamp, the kernel record a timestamp periodically during
collection, and the user space do the correlation.   However,
this obviously involves both space and time overhead, as Arnd
pointed out.


Actually, if we record in the data stream that the overlay
changed from last read, then user space could know that samples
from that overlay are suspect, during that portion.   We
would need to mark both the beginning and end of that trace
array unload.


When I started to write this reply, I had done my review of
the existing kernel code, and thought I would severely back
off from this proposal.  However, now that I have written
my reasons for the proposal, I think I would still like to
have it explored.

Now that I have read the kernel code and slept on it, let
me also propose, in the alternative, an approach that tries
to reuse the existing sample framework be used instead of
the total re-implemntation.  This idea is still mostly
concept, so please bear with me.  That is to say I read
the headers, and have some understanding of how the stages
of data collection fit together, but have not studied the
actual impact to the implentation.

Even if we do stay with the kernel resolution of the SPU
address to source in the process vm, why do we need to
write new buffer sync code?   Instead of recording the hit
as an ELF object source, why not resolve it to the threads
context vm address, and let the existing oprofile code
lookup the dcookie and offset?

1) The  user space needs to find the elf header of embedded
objects.   Answer: on context switch pass the object-id as
a sample, with appropriate escape code.

2) THe offset will be file relative instead of elf object
relative.   Answer: user space has to account for this
offset either when reading the instructions for decode or
when doing the sample lookup.   Answer b: if the context
id dcookie is for the same file, subtract its offset.
Actually, only do this if its in range of the file size
as indicated in the ELF headers.   Hmmm.. this would imply
a artifical dcookie for the offset object-id case.

3) In addition to the additional records to record the
context-id on switch, the existing interfaces assume the
sample is being recorded from the context of the running
thread, implicitly reading cpu number and active mm.
Answer: Yes, but factoring and code reuse should be
possible.


[I wonder if any other nommu architectures use overlays
like this.  I would not be surprised if the answer is
no, they have enough address space.]

milton
--
miltonm@bga.com   Milton Miller
Speaking for myself only.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-09  2:46           ` Milton Miller
@ 2007-02-09 16:17             ` Carl Love
  -1 siblings, 0 replies; 66+ messages in thread
From: Carl Love @ 2007-02-09 16:17 UTC (permalink / raw)
  To: Milton Miller
  Cc: cbe-oss-dev, Arnd Bergmann, LKML, linuxppc-dev, oprofile-list

On Thu, 2007-02-08 at 20:46 -0600, Milton Miller wrote:
>  On Feb 8, 2007, at 4:51 PM, Carl Love wrote:
>  
> > On Thu, 2007-02-08 at 18:21 +0100, Arnd Bergmann wrote:
> >> On Thursday 08 February 2007 15:18, Milton Miller wrote:
> >>
> >>> 1) sample rate setup
> >>>
> >>>     In the current patch, the user specifies a sample rate as a time
> >>> interval.
> >>>     The kernel is (a) calling cpufreq to get the current cpu 
> >>> frequency,
> >>> (b)
> >>>     converting the rate to a cycle count, (c) converting this to a 
> >>> 24 bit
> >>>     LFSR count, an iterative algorithm (in this patch, starting from
> >>>     one of 256 values so a max of 2^16 or 64k iterations), (d)
> >>> calculating
> >>>     an trace unload interval.   In addition, a cpufreq notifier is
> >>> registered
> >>>     to recalculate on frequency changes.
> >
> > No.  The user issues the command opcontrol --event:N  where N is the
> > number of events (cycles, l2 cache misses, instructions retired etc)
> > that are to elapse between collecting the samples.
>  
> So you are saying that b and c are primary, and a is used to calculate
> a safe value for d.   All of the above work is dont, just from a
> different starting point?

There are two things 1) setup the LFSR to control how often the Hardware
puts samples into the trace buffer.  2) setup the kernel timer to read
the trace buffer (your d, d is a function of the cpu freq) and process
the samples.  

(c is nonsense)

The kernel timer was set with the goal the the hardware trace buffer
would not get more then half full to ensure we would not lose samples
even for the maximum rate that the hardware would be adding samples to
the trace buffer (user specified N=100,000).  

>  
> 
> > The OProfile passes
> > the value N to the kernel via the variable ctr[i].count.  Where i is 
> > the
> > performance counter entry for that event.
>  
> Ok I haven't looked a the api closely.
>  
> > Specifically with SPU
> > profiling, we do not use performance counters because the CELL HW does
> > not allow the normal the PPU to read the SPU PC when a performance
> > counter interrupt occurs.  We are using some additional hw support in
> > the chip that allows us to periodically capture the SPU PC.  There is 
> > an
> > LFSR hardware counter that can be started at an arbitrary LFSR value.
> > When the last LFSR value in the sequence is reached, a sample is taken
> > and stored in the trace buffer.  Hence, the value of N specified by the
> > user must get converted to the LFSR value that is N from the end of the
> > sequence.
>  
> Ok so its arbitray load count to max vs count and compare.   A critical
> detail when computing the value to load, but the net result is the
> same; the value for the count it hard to determine.

The above statement makes no sense to me.  

Determining the initial LFSR value that is N values from the last value
in the sequence is not easy to do.  

>  
> > The same clock that the processor is running at is used to
> > control the LFSR count.  Hence the LFSR counter increments once per CPU
> > clock cycle regardless of the CPU frequency or changes in the 
> > frequency.
> > There is no calculation for the LFSR value that is a function of the
> > processor frequency.  There is no need to adjust the LFSR when the
> > processor frequency changes.
>  
> 
> 
> Oh, so the lfsr doesn't have to be recomputed, only the time
> between unloads.

The LFSR value is computed ONCE when you start OProfile.  The value is
setup in the hardware once when OProfile starts.  The hardware will
always restart with the value given to it after it reaches the last
value in the sequence.  What you call the "time between unloads" is the
time at which you schedule the kernel routine to empty the trace buffer.
It is calculated once.  It would only need to be recomputed if the cpu
frequency changed.

>  
> >
> > Milton had a comment about the code
> >
> >  if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
> >> +             spu_cycle_reset = ctr[0].count;
> >> +             return;
> >> +     }
> >
> > Well, given the above description, it is clear that if you are doing 
> > SPU
> > event profiling, the value N is put into the cntr[0].cnt entry since
> > there is only one event.  Thus in cell_global_start_spu() you use
> > spu_cycle_reset as the argument to the lfsr calculation routine to get
> > the LFSR value that is N from the end of the sequence.
>  
> I was looking at the patch and the context was not very good.   You
> might consider adding -p to your diff command, it provides the function
> name after the @@.
>  
> However, in this case, I think I just need to see the final result.
>  
> >
> >>>
> >>>     The obvious problem is step (c), running a loop potentially 64
> >>> thousand
> >>>     times in kernel space will have a noticeable impact on other 
> >>> threads.
> >>>
> >>>     I propose instead that user space perform the above 4 steps, and
> >>> provide
> >>>     the kernel with two inputs: (1) the value to load in the LFSR 
> >>> and (2)
> >>>     the periodic frequency / time interval at which to empty the 
> >>> hardware
> >>>     trace buffer, perform sample analysis, and send the data to the
> >>> oprofile
> >>>     subsystem.
> >>>
> >>>     There should be no security issues with this approach.   If the 
> >>> LFSR
> >>> value
> >>>     is calculated incorrectly, either it will be too short, causing 
> >>> the
> >>> trace
> >>>     array to overfill and data to be dropped, or it will be too 
> >>> long, and
> >>>     there will be fewer samples.   Likewise, the kernel periodic poll
> >>> can be
> >>>     too long, again causing overflow, or too frequent, causing only 
> >>> timer
> >>>     execution overhead.
> >>>
> >>>     Various data is collected by the kernel while processing the
> >>> periodic timer,
> >>>     this approach would also allow the profiling tools to control the
> >>>     frequency of this collection.   More frequent collection results 
> >>> in
> >>> more
> >>>     accurate sample data, with the linear cost of poll execution
> >>> overhead.
> >>>
> >>>     Frequency changes can be handled either by the profile code 
> >>> setting
> >>>     collection at a higher than necessary rate, or by interacting 
> >>> with
> >>> the
> >>>     governor to limit the speeds.
> >>>
> >>>     Optionally, the kernel can add a record indicating that some 
> >>> data was
> >>>     likely dropped if it is able to read all 256 entries without
> >>> underflowing
> >>>     the array.  This can be used as hint to user space that the 
> >>> kernel
> >>> time
> >>>     was too long for the collection rate.
> >>
> >> Moving the sample rate computation to user space sounds like the right
> >> idea, but why not have a more drastic version of it:
> >
> > No, I do not agree.  The user/kernel API pass N where N is the number 
> > of
> > events between samples.  We are not at liberty to just change the API.
> > We we did do this, we fully expect that John Levon will push back 
> > saying
> > why make an architecture specific API change when it isn't necessary.
>  
> [So you have not asked.]
>  
> <Kludge> If you want to overlaod the existing array, one
> event could be the lfsr sample rate, and another event be
> the collection time.  That would stay within the framework.
> But that is a kludge. </Kludge>
>  
> [Me goes and reads kernel profile driver and skims powerpc code].
>  
> You are confusing the user interface (what the user specifies on the
> command line) with the kernel API.
>  
> It is somewhat hidden by the PowerPC specific common code in
> arch/powerpc/oprofile/common.c.  That is where the counter
> array is exposd.
>  
> The user to kernel api is not fill out an array of counter
> event names and sampling intervals.
>  
> The user to kernel interface is a file system that contains a
> heirachy of files.  Each file consists of the hex ascii
> represntation of a unsigned long.   The filesystem interfaces
> to the kernel by provding an API to create directorys and files,
> specifing the name of the directory or file.  Theere are helper
> routines to connect a file to a ulong and read access to an atomic_t.
> The common driver (in drivers/oprofile) creates the file system
> and some common files that implement the control interface, which
> interfaces to the architecture specific driver through the ops
> array.
>  
> The powerpc common driver creates a heiarchy exposing the
> values to be placed in the performance monitor registers,
> and directory of counters with the event selection.
>  
> Since this architeture code does not seem to match the
> capabilitys of the hardware, it would seem that this is
> the area to change.   This driver does not seem to use
> the actual PMU interrput or sprs.   Lets make it its
> own directory with its own controls.
>  
> I don't see how exposing the sample collection to
> rate and the computation of the LFSR create a userspace
> api change;  I think its totally within the framework.

I was being a bit simplistic in my explination.  I am well aware of the
file system.  The point is the USER specifies the rate (every N events)
that they want to have sampling done.  We are using the existing
mechanism to pass the value of N to the kernel.  So from that
standpoint, we are trying to be consistent in how it is done with the
PPU. I feel that this is best to try to handle the N value in the same
way rather then having a completely different way.  

If we were to put the LFSR into the user space, you would pass the N
into the kernel for the PPU profiling case.  To make the API clean, you
would have to create a new file entry to pass the LFSR value.  For SPU
profiling you would not pass N instead you would pass LFSR.  I think it
is a bad idea to have these two different things for PPU versus SPU
profiling.  I really feel it is best to consistent to use pass N for PPU
and SPU.  Then deal with converting N to the LFSR for the special case
of SPU profiling.  

Unlike POWER 4/5 support where we absolutely had to add entries to the
API to pass the three values for the control registers.  We already have
an established mechanism for passing N from user to kernel.  Just use
it.  We are very sure that John Levon, the OProfile user maintainer,
will say the same thing and refuse to accept adding to the API to pass
the LFSR when the whole thing can be handled in a more consistent way
that does not require an architecture specific change.  And I also feel
that we really don't need or want an additional architecture specific
API change. 

>  
> > Please define "drastic" in this context.  Do you mean make the table
> > bigger i.e. more then 256 precomputed elements?  I did 256 since Arnd
> > seemed to think that would be a reasonable size. Based on his example
> > How much kernel space are we willing to use to same some computation?
> > Keep in mind only one of the entries in the table will ever be used.
> >
> > I think if we found the LFSR that was with in 2^10 of the desired value
> > that would be good enough. It would be within 1% of the minimum N the
> > user can specify.  That would require a table with 2^14 entries.  That
> > seems unreasonably large.
>  
> Why does the table have to be linear?  Compute samples at various
> logrimathic staring points.   Determine how many significant bits
> you want to keep, and have an array for each sample length.
>  
> ie for 3 bits of significance, you could have
> F00000, E00000, ... 800000,   0F0000, ......
> this would take (24-n) * 2^(n-1) slots, while maintaing user
> control of the range.
>  

I don't see any advantage in this log approach.  If we do this with a
linear table with a reasonable number of pre calculated values.  I think
that a table with no more then 1024 entries would be reasonable.  The
overhead for calculating the desired LFSR value would be going through
the for loop 16K times, for 1024 entries in the table, is not
unreasonable.  I think this whole discussion of moving the LFSR to the
user space is not needed.  The overhead of the for loop does not justify
pushing the LFSR determination to user space.  But that is just my
opinion.  I am open to suggestions on how big to make the lookup table
in the kernel.  But I really am apposed to putting the LFSR into the
user space.

> >
> > Anyway, the user controls how often sampling is done by setting N.
>  
> When calling a user space program.   The events are converted to
> a series of control register values that are communicated to the
> kernel by writing to files in the file system.   The writes are
> interpreted (converted from ascii hex to binary longs) and stored
> until the control files are written, at which point callbacks
> copy and interpret the controls and start the hardware collection.
>  
> Frankly, I would expect a lot more resistance to the event data
> stream generation changes and duplicaton.

I don't agree.  But that is just my opinion based on my experience
working on OProfile.

>  
> 
> milton
>  


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-09 16:17             ` Carl Love
  0 siblings, 0 replies; 66+ messages in thread
From: Carl Love @ 2007-02-09 16:17 UTC (permalink / raw)
  To: Milton Miller
  Cc: linuxppc-dev, oprofile-list, cbe-oss-dev, Arnd Bergmann, LKML

On Thu, 2007-02-08 at 20:46 -0600, Milton Miller wrote:
>  On Feb 8, 2007, at 4:51 PM, Carl Love wrote:
>  
> > On Thu, 2007-02-08 at 18:21 +0100, Arnd Bergmann wrote:
> >> On Thursday 08 February 2007 15:18, Milton Miller wrote:
> >>
> >>> 1) sample rate setup
> >>>
> >>>     In the current patch, the user specifies a sample rate as a time
> >>> interval.
> >>>     The kernel is (a) calling cpufreq to get the current cpu 
> >>> frequency,
> >>> (b)
> >>>     converting the rate to a cycle count, (c) converting this to a 
> >>> 24 bit
> >>>     LFSR count, an iterative algorithm (in this patch, starting from
> >>>     one of 256 values so a max of 2^16 or 64k iterations), (d)
> >>> calculating
> >>>     an trace unload interval.   In addition, a cpufreq notifier is
> >>> registered
> >>>     to recalculate on frequency changes.
> >
> > No.  The user issues the command opcontrol --event:N  where N is the
> > number of events (cycles, l2 cache misses, instructions retired etc)
> > that are to elapse between collecting the samples.
>  
> So you are saying that b and c are primary, and a is used to calculate
> a safe value for d.   All of the above work is dont, just from a
> different starting point?

There are two things 1) setup the LFSR to control how often the Hardware
puts samples into the trace buffer.  2) setup the kernel timer to read
the trace buffer (your d, d is a function of the cpu freq) and process
the samples.  

(c is nonsense)

The kernel timer was set with the goal the the hardware trace buffer
would not get more then half full to ensure we would not lose samples
even for the maximum rate that the hardware would be adding samples to
the trace buffer (user specified N=100,000).  

>  
> 
> > The OProfile passes
> > the value N to the kernel via the variable ctr[i].count.  Where i is 
> > the
> > performance counter entry for that event.
>  
> Ok I haven't looked a the api closely.
>  
> > Specifically with SPU
> > profiling, we do not use performance counters because the CELL HW does
> > not allow the normal the PPU to read the SPU PC when a performance
> > counter interrupt occurs.  We are using some additional hw support in
> > the chip that allows us to periodically capture the SPU PC.  There is 
> > an
> > LFSR hardware counter that can be started at an arbitrary LFSR value.
> > When the last LFSR value in the sequence is reached, a sample is taken
> > and stored in the trace buffer.  Hence, the value of N specified by the
> > user must get converted to the LFSR value that is N from the end of the
> > sequence.
>  
> Ok so its arbitray load count to max vs count and compare.   A critical
> detail when computing the value to load, but the net result is the
> same; the value for the count it hard to determine.

The above statement makes no sense to me.  

Determining the initial LFSR value that is N values from the last value
in the sequence is not easy to do.  

>  
> > The same clock that the processor is running at is used to
> > control the LFSR count.  Hence the LFSR counter increments once per CPU
> > clock cycle regardless of the CPU frequency or changes in the 
> > frequency.
> > There is no calculation for the LFSR value that is a function of the
> > processor frequency.  There is no need to adjust the LFSR when the
> > processor frequency changes.
>  
> 
> 
> Oh, so the lfsr doesn't have to be recomputed, only the time
> between unloads.

The LFSR value is computed ONCE when you start OProfile.  The value is
setup in the hardware once when OProfile starts.  The hardware will
always restart with the value given to it after it reaches the last
value in the sequence.  What you call the "time between unloads" is the
time at which you schedule the kernel routine to empty the trace buffer.
It is calculated once.  It would only need to be recomputed if the cpu
frequency changed.

>  
> >
> > Milton had a comment about the code
> >
> >  if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
> >> +             spu_cycle_reset = ctr[0].count;
> >> +             return;
> >> +     }
> >
> > Well, given the above description, it is clear that if you are doing 
> > SPU
> > event profiling, the value N is put into the cntr[0].cnt entry since
> > there is only one event.  Thus in cell_global_start_spu() you use
> > spu_cycle_reset as the argument to the lfsr calculation routine to get
> > the LFSR value that is N from the end of the sequence.
>  
> I was looking at the patch and the context was not very good.   You
> might consider adding -p to your diff command, it provides the function
> name after the @@.
>  
> However, in this case, I think I just need to see the final result.
>  
> >
> >>>
> >>>     The obvious problem is step (c), running a loop potentially 64
> >>> thousand
> >>>     times in kernel space will have a noticeable impact on other 
> >>> threads.
> >>>
> >>>     I propose instead that user space perform the above 4 steps, and
> >>> provide
> >>>     the kernel with two inputs: (1) the value to load in the LFSR 
> >>> and (2)
> >>>     the periodic frequency / time interval at which to empty the 
> >>> hardware
> >>>     trace buffer, perform sample analysis, and send the data to the
> >>> oprofile
> >>>     subsystem.
> >>>
> >>>     There should be no security issues with this approach.   If the 
> >>> LFSR
> >>> value
> >>>     is calculated incorrectly, either it will be too short, causing 
> >>> the
> >>> trace
> >>>     array to overfill and data to be dropped, or it will be too 
> >>> long, and
> >>>     there will be fewer samples.   Likewise, the kernel periodic poll
> >>> can be
> >>>     too long, again causing overflow, or too frequent, causing only 
> >>> timer
> >>>     execution overhead.
> >>>
> >>>     Various data is collected by the kernel while processing the
> >>> periodic timer,
> >>>     this approach would also allow the profiling tools to control the
> >>>     frequency of this collection.   More frequent collection results 
> >>> in
> >>> more
> >>>     accurate sample data, with the linear cost of poll execution
> >>> overhead.
> >>>
> >>>     Frequency changes can be handled either by the profile code 
> >>> setting
> >>>     collection at a higher than necessary rate, or by interacting 
> >>> with
> >>> the
> >>>     governor to limit the speeds.
> >>>
> >>>     Optionally, the kernel can add a record indicating that some 
> >>> data was
> >>>     likely dropped if it is able to read all 256 entries without
> >>> underflowing
> >>>     the array.  This can be used as hint to user space that the 
> >>> kernel
> >>> time
> >>>     was too long for the collection rate.
> >>
> >> Moving the sample rate computation to user space sounds like the right
> >> idea, but why not have a more drastic version of it:
> >
> > No, I do not agree.  The user/kernel API pass N where N is the number 
> > of
> > events between samples.  We are not at liberty to just change the API.
> > We we did do this, we fully expect that John Levon will push back 
> > saying
> > why make an architecture specific API change when it isn't necessary.
>  
> [So you have not asked.]
>  
> <Kludge> If you want to overlaod the existing array, one
> event could be the lfsr sample rate, and another event be
> the collection time.  That would stay within the framework.
> But that is a kludge. </Kludge>
>  
> [Me goes and reads kernel profile driver and skims powerpc code].
>  
> You are confusing the user interface (what the user specifies on the
> command line) with the kernel API.
>  
> It is somewhat hidden by the PowerPC specific common code in
> arch/powerpc/oprofile/common.c.  That is where the counter
> array is exposd.
>  
> The user to kernel api is not fill out an array of counter
> event names and sampling intervals.
>  
> The user to kernel interface is a file system that contains a
> heirachy of files.  Each file consists of the hex ascii
> represntation of a unsigned long.   The filesystem interfaces
> to the kernel by provding an API to create directorys and files,
> specifing the name of the directory or file.  Theere are helper
> routines to connect a file to a ulong and read access to an atomic_t.
> The common driver (in drivers/oprofile) creates the file system
> and some common files that implement the control interface, which
> interfaces to the architecture specific driver through the ops
> array.
>  
> The powerpc common driver creates a heiarchy exposing the
> values to be placed in the performance monitor registers,
> and directory of counters with the event selection.
>  
> Since this architeture code does not seem to match the
> capabilitys of the hardware, it would seem that this is
> the area to change.   This driver does not seem to use
> the actual PMU interrput or sprs.   Lets make it its
> own directory with its own controls.
>  
> I don't see how exposing the sample collection to
> rate and the computation of the LFSR create a userspace
> api change;  I think its totally within the framework.

I was being a bit simplistic in my explination.  I am well aware of the
file system.  The point is the USER specifies the rate (every N events)
that they want to have sampling done.  We are using the existing
mechanism to pass the value of N to the kernel.  So from that
standpoint, we are trying to be consistent in how it is done with the
PPU. I feel that this is best to try to handle the N value in the same
way rather then having a completely different way.  

If we were to put the LFSR into the user space, you would pass the N
into the kernel for the PPU profiling case.  To make the API clean, you
would have to create a new file entry to pass the LFSR value.  For SPU
profiling you would not pass N instead you would pass LFSR.  I think it
is a bad idea to have these two different things for PPU versus SPU
profiling.  I really feel it is best to consistent to use pass N for PPU
and SPU.  Then deal with converting N to the LFSR for the special case
of SPU profiling.  

Unlike POWER 4/5 support where we absolutely had to add entries to the
API to pass the three values for the control registers.  We already have
an established mechanism for passing N from user to kernel.  Just use
it.  We are very sure that John Levon, the OProfile user maintainer,
will say the same thing and refuse to accept adding to the API to pass
the LFSR when the whole thing can be handled in a more consistent way
that does not require an architecture specific change.  And I also feel
that we really don't need or want an additional architecture specific
API change. 

>  
> > Please define "drastic" in this context.  Do you mean make the table
> > bigger i.e. more then 256 precomputed elements?  I did 256 since Arnd
> > seemed to think that would be a reasonable size. Based on his example
> > How much kernel space are we willing to use to same some computation?
> > Keep in mind only one of the entries in the table will ever be used.
> >
> > I think if we found the LFSR that was with in 2^10 of the desired value
> > that would be good enough. It would be within 1% of the minimum N the
> > user can specify.  That would require a table with 2^14 entries.  That
> > seems unreasonably large.
>  
> Why does the table have to be linear?  Compute samples at various
> logrimathic staring points.   Determine how many significant bits
> you want to keep, and have an array for each sample length.
>  
> ie for 3 bits of significance, you could have
> F00000, E00000, ... 800000,   0F0000, ......
> this would take (24-n) * 2^(n-1) slots, while maintaing user
> control of the range.
>  

I don't see any advantage in this log approach.  If we do this with a
linear table with a reasonable number of pre calculated values.  I think
that a table with no more then 1024 entries would be reasonable.  The
overhead for calculating the desired LFSR value would be going through
the for loop 16K times, for 1024 entries in the table, is not
unreasonable.  I think this whole discussion of moving the LFSR to the
user space is not needed.  The overhead of the for loop does not justify
pushing the LFSR determination to user space.  But that is just my
opinion.  I am open to suggestions on how big to make the lookup table
in the kernel.  But I really am apposed to putting the LFSR into the
user space.

> >
> > Anyway, the user controls how often sampling is done by setting N.
>  
> When calling a user space program.   The events are converted to
> a series of control register values that are communicated to the
> kernel by writing to files in the file system.   The writes are
> interpreted (converted from ascii hex to binary longs) and stored
> until the control files are written, at which point callbacks
> copy and interpret the controls and start the hardware collection.
>  
> Frankly, I would expect a lot more resistance to the event data
> stream generation changes and duplicaton.

I don't agree.  But that is just my opinion based on my experience
working on OProfile.

>  
> 
> milton
>  

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-08 22:51         ` Carl Love
@ 2007-02-09  2:46           ` Milton Miller
  -1 siblings, 0 replies; 66+ messages in thread
From: Milton Miller @ 2007-02-09  2:46 UTC (permalink / raw)
  To: Carl Love; +Cc: cbe-oss-dev, Arnd Bergmann, LKML, linuxppc-dev, oprofile-list


On Feb 8, 2007, at 4:51 PM, Carl Love wrote:

> On Thu, 2007-02-08 at 18:21 +0100, Arnd Bergmann wrote:
>> On Thursday 08 February 2007 15:18, Milton Miller wrote:
>>
>>> 1) sample rate setup
>>>
>>>     In the current patch, the user specifies a sample rate as a time
>>> interval.
>>>     The kernel is (a) calling cpufreq to get the current cpu 
>>> frequency,
>>> (b)
>>>     converting the rate to a cycle count, (c) converting this to a 
>>> 24 bit
>>>     LFSR count, an iterative algorithm (in this patch, starting from
>>>     one of 256 values so a max of 2^16 or 64k iterations), (d)
>>> calculating
>>>     an trace unload interval.   In addition, a cpufreq notifier is
>>> registered
>>>     to recalculate on frequency changes.
>
> No.  The user issues the command opcontrol --event:N  where N is the
> number of events (cycles, l2 cache misses, instructions retired etc)
> that are to elapse between collecting the samples.

So you are saying that b and c are primary, and a is used to calculate
a safe value for d.   All of the above work is dont, just from a
different starting point?


> The OProfile passes
> the value N to the kernel via the variable ctr[i].count.  Where i is 
> the
> performance counter entry for that event.

Ok I haven't looked a the api closely.

> Specifically with SPU
> profiling, we do not use performance counters because the CELL HW does
> not allow the normal the PPU to read the SPU PC when a performance
> counter interrupt occurs.  We are using some additional hw support in
> the chip that allows us to periodically capture the SPU PC.  There is 
> an
> LFSR hardware counter that can be started at an arbitrary LFSR value.
> When the last LFSR value in the sequence is reached, a sample is taken
> and stored in the trace buffer.  Hence, the value of N specified by the
> user must get converted to the LFSR value that is N from the end of the
> sequence.

Ok so its arbitray load count to max vs count and compare.   A critical
detail when computing the value to load, but the net result is the
same; the value for the count it hard to determine.

> The same clock that the processor is running at is used to
> control the LFSR count.  Hence the LFSR counter increments once per CPU
> clock cycle regardless of the CPU frequency or changes in the 
> frequency.
> There is no calculation for the LFSR value that is a function of the
> processor frequency.  There is no need to adjust the LFSR when the
> processor frequency changes.



Oh, so the lfsr doesn't have to be recomputed, only the time
between unloads.

>
> Milton had a comment about the code
>
>  if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
>> +             spu_cycle_reset = ctr[0].count;
>> +             return;
>> +     }
>
> Well, given the above description, it is clear that if you are doing 
> SPU
> event profiling, the value N is put into the cntr[0].cnt entry since
> there is only one event.  Thus in cell_global_start_spu() you use
> spu_cycle_reset as the argument to the lfsr calculation routine to get
> the LFSR value that is N from the end of the sequence.

I was looking at the patch and the context was not very good.   You
might consider adding -p to your diff command, it provides the function
name after the @@.

However, in this case, I think I just need to see the final result.

>
>>>
>>>     The obvious problem is step (c), running a loop potentially 64
>>> thousand
>>>     times in kernel space will have a noticeable impact on other 
>>> threads.
>>>
>>>     I propose instead that user space perform the above 4 steps, and
>>> provide
>>>     the kernel with two inputs: (1) the value to load in the LFSR 
>>> and (2)
>>>     the periodic frequency / time interval at which to empty the 
>>> hardware
>>>     trace buffer, perform sample analysis, and send the data to the
>>> oprofile
>>>     subsystem.
>>>
>>>     There should be no security issues with this approach.   If the 
>>> LFSR
>>> value
>>>     is calculated incorrectly, either it will be too short, causing 
>>> the
>>> trace
>>>     array to overfill and data to be dropped, or it will be too 
>>> long, and
>>>     there will be fewer samples.   Likewise, the kernel periodic poll
>>> can be
>>>     too long, again causing overflow, or too frequent, causing only 
>>> timer
>>>     execution overhead.
>>>
>>>     Various data is collected by the kernel while processing the
>>> periodic timer,
>>>     this approach would also allow the profiling tools to control the
>>>     frequency of this collection.   More frequent collection results 
>>> in
>>> more
>>>     accurate sample data, with the linear cost of poll execution
>>> overhead.
>>>
>>>     Frequency changes can be handled either by the profile code 
>>> setting
>>>     collection at a higher than necessary rate, or by interacting 
>>> with
>>> the
>>>     governor to limit the speeds.
>>>
>>>     Optionally, the kernel can add a record indicating that some 
>>> data was
>>>     likely dropped if it is able to read all 256 entries without
>>> underflowing
>>>     the array.  This can be used as hint to user space that the 
>>> kernel
>>> time
>>>     was too long for the collection rate.
>>
>> Moving the sample rate computation to user space sounds like the right
>> idea, but why not have a more drastic version of it:
>
> No, I do not agree.  The user/kernel API pass N where N is the number 
> of
> events between samples.  We are not at liberty to just change the API.
> We we did do this, we fully expect that John Levon will push back 
> saying
> why make an architecture specific API change when it isn't necessary.

[So you have not asked.]

<Kludge> If you want to overlaod the existing array, one
event could be the lfsr sample rate, and another event be
the collection time.  That would stay within the framework.
But that is a kludge. </Kludge>

[Me goes and reads kernel profile driver and skims powerpc code].

You are confusing the user interface (what the user specifies on the
command line) with the kernel API.

It is somewhat hidden by the PowerPC specific common code in
arch/powerpc/oprofile/common.c.  That is where the counter
array is exposd.

The user to kernel api is not fill out an array of counter
event names and sampling intervals.

The user to kernel interface is a file system that contains a
heirachy of files.  Each file consists of the hex ascii
represntation of a unsigned long.   The filesystem interfaces
to the kernel by provding an API to create directorys and files,
specifing the name of the directory or file.  Theere are helper
routines to connect a file to a ulong and read access to an atomic_t.
The common driver (in drivers/oprofile) creates the file system
and some common files that implement the control interface, which
interfaces to the architecture specific driver through the ops
array.

The powerpc common driver creates a heiarchy exposing the
values to be placed in the performance monitor registers,
and directory of counters with the event selection.

Since this architeture code does not seem to match the
capabilitys of the hardware, it would seem that this is
the area to change.   This driver does not seem to use
the actual PMU interrput or sprs.   Lets make it its
own directory with its own controls.

I don't see how exposing the sample collection to
rate and the computation of the LFSR create a userspace
api change;  I think its totally within the framework.

> Please define "drastic" in this context.  Do you mean make the table
> bigger i.e. more then 256 precomputed elements?  I did 256 since Arnd
> seemed to think that would be a reasonable size. Based on his example
> How much kernel space are we willing to use to same some computation?
> Keep in mind only one of the entries in the table will ever be used.
>
> I think if we found the LFSR that was with in 2^10 of the desired value
> that would be good enough. It would be within 1% of the minimum N the
> user can specify.  That would require a table with 2^14 entries.  That
> seems unreasonably large.

Why does the table have to be linear?  Compute samples at various
logrimathic staring points.   Determine how many significant bits
you want to keep, and have an array for each sample length.

ie for 3 bits of significance, you could have
F00000, E00000, ... 800000,   0F0000, ......
this would take (24-n) * 2^(n-1) slots, while maintaing user
control of the range.

>
> Anyway, the user controls how often sampling is done by setting N.

When calling a user space program.   The events are converted to
a series of control register values that are communicated to the
kernel by writing to files in the file system.   The writes are
interpreted (converted from ascii hex to binary longs) and stored
until the control files are written, at which point callbacks
copy and interpret the controls and start the hardware collection.

Frankly, I would expect a lot more resistance to the event data
stream generation changes and duplicaton.


milton


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-09  2:46           ` Milton Miller
  0 siblings, 0 replies; 66+ messages in thread
From: Milton Miller @ 2007-02-09  2:46 UTC (permalink / raw)
  To: Carl Love; +Cc: linuxppc-dev, oprofile-list, cbe-oss-dev, Arnd Bergmann, LKML


On Feb 8, 2007, at 4:51 PM, Carl Love wrote:

> On Thu, 2007-02-08 at 18:21 +0100, Arnd Bergmann wrote:
>> On Thursday 08 February 2007 15:18, Milton Miller wrote:
>>
>>> 1) sample rate setup
>>>
>>>     In the current patch, the user specifies a sample rate as a time
>>> interval.
>>>     The kernel is (a) calling cpufreq to get the current cpu 
>>> frequency,
>>> (b)
>>>     converting the rate to a cycle count, (c) converting this to a 
>>> 24 bit
>>>     LFSR count, an iterative algorithm (in this patch, starting from
>>>     one of 256 values so a max of 2^16 or 64k iterations), (d)
>>> calculating
>>>     an trace unload interval.   In addition, a cpufreq notifier is
>>> registered
>>>     to recalculate on frequency changes.
>
> No.  The user issues the command opcontrol --event:N  where N is the
> number of events (cycles, l2 cache misses, instructions retired etc)
> that are to elapse between collecting the samples.

So you are saying that b and c are primary, and a is used to calculate
a safe value for d.   All of the above work is dont, just from a
different starting point?


> The OProfile passes
> the value N to the kernel via the variable ctr[i].count.  Where i is 
> the
> performance counter entry for that event.

Ok I haven't looked a the api closely.

> Specifically with SPU
> profiling, we do not use performance counters because the CELL HW does
> not allow the normal the PPU to read the SPU PC when a performance
> counter interrupt occurs.  We are using some additional hw support in
> the chip that allows us to periodically capture the SPU PC.  There is 
> an
> LFSR hardware counter that can be started at an arbitrary LFSR value.
> When the last LFSR value in the sequence is reached, a sample is taken
> and stored in the trace buffer.  Hence, the value of N specified by the
> user must get converted to the LFSR value that is N from the end of the
> sequence.

Ok so its arbitray load count to max vs count and compare.   A critical
detail when computing the value to load, but the net result is the
same; the value for the count it hard to determine.

> The same clock that the processor is running at is used to
> control the LFSR count.  Hence the LFSR counter increments once per CPU
> clock cycle regardless of the CPU frequency or changes in the 
> frequency.
> There is no calculation for the LFSR value that is a function of the
> processor frequency.  There is no need to adjust the LFSR when the
> processor frequency changes.



Oh, so the lfsr doesn't have to be recomputed, only the time
between unloads.

>
> Milton had a comment about the code
>
>  if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
>> +             spu_cycle_reset = ctr[0].count;
>> +             return;
>> +     }
>
> Well, given the above description, it is clear that if you are doing 
> SPU
> event profiling, the value N is put into the cntr[0].cnt entry since
> there is only one event.  Thus in cell_global_start_spu() you use
> spu_cycle_reset as the argument to the lfsr calculation routine to get
> the LFSR value that is N from the end of the sequence.

I was looking at the patch and the context was not very good.   You
might consider adding -p to your diff command, it provides the function
name after the @@.

However, in this case, I think I just need to see the final result.

>
>>>
>>>     The obvious problem is step (c), running a loop potentially 64
>>> thousand
>>>     times in kernel space will have a noticeable impact on other 
>>> threads.
>>>
>>>     I propose instead that user space perform the above 4 steps, and
>>> provide
>>>     the kernel with two inputs: (1) the value to load in the LFSR 
>>> and (2)
>>>     the periodic frequency / time interval at which to empty the 
>>> hardware
>>>     trace buffer, perform sample analysis, and send the data to the
>>> oprofile
>>>     subsystem.
>>>
>>>     There should be no security issues with this approach.   If the 
>>> LFSR
>>> value
>>>     is calculated incorrectly, either it will be too short, causing 
>>> the
>>> trace
>>>     array to overfill and data to be dropped, or it will be too 
>>> long, and
>>>     there will be fewer samples.   Likewise, the kernel periodic poll
>>> can be
>>>     too long, again causing overflow, or too frequent, causing only 
>>> timer
>>>     execution overhead.
>>>
>>>     Various data is collected by the kernel while processing the
>>> periodic timer,
>>>     this approach would also allow the profiling tools to control the
>>>     frequency of this collection.   More frequent collection results 
>>> in
>>> more
>>>     accurate sample data, with the linear cost of poll execution
>>> overhead.
>>>
>>>     Frequency changes can be handled either by the profile code 
>>> setting
>>>     collection at a higher than necessary rate, or by interacting 
>>> with
>>> the
>>>     governor to limit the speeds.
>>>
>>>     Optionally, the kernel can add a record indicating that some 
>>> data was
>>>     likely dropped if it is able to read all 256 entries without
>>> underflowing
>>>     the array.  This can be used as hint to user space that the 
>>> kernel
>>> time
>>>     was too long for the collection rate.
>>
>> Moving the sample rate computation to user space sounds like the right
>> idea, but why not have a more drastic version of it:
>
> No, I do not agree.  The user/kernel API pass N where N is the number 
> of
> events between samples.  We are not at liberty to just change the API.
> We we did do this, we fully expect that John Levon will push back 
> saying
> why make an architecture specific API change when it isn't necessary.

[So you have not asked.]

<Kludge> If you want to overlaod the existing array, one
event could be the lfsr sample rate, and another event be
the collection time.  That would stay within the framework.
But that is a kludge. </Kludge>

[Me goes and reads kernel profile driver and skims powerpc code].

You are confusing the user interface (what the user specifies on the
command line) with the kernel API.

It is somewhat hidden by the PowerPC specific common code in
arch/powerpc/oprofile/common.c.  That is where the counter
array is exposd.

The user to kernel api is not fill out an array of counter
event names and sampling intervals.

The user to kernel interface is a file system that contains a
heirachy of files.  Each file consists of the hex ascii
represntation of a unsigned long.   The filesystem interfaces
to the kernel by provding an API to create directorys and files,
specifing the name of the directory or file.  Theere are helper
routines to connect a file to a ulong and read access to an atomic_t.
The common driver (in drivers/oprofile) creates the file system
and some common files that implement the control interface, which
interfaces to the architecture specific driver through the ops
array.

The powerpc common driver creates a heiarchy exposing the
values to be placed in the performance monitor registers,
and directory of counters with the event selection.

Since this architeture code does not seem to match the
capabilitys of the hardware, it would seem that this is
the area to change.   This driver does not seem to use
the actual PMU interrput or sprs.   Lets make it its
own directory with its own controls.

I don't see how exposing the sample collection to
rate and the computation of the LFSR create a userspace
api change;  I think its totally within the framework.

> Please define "drastic" in this context.  Do you mean make the table
> bigger i.e. more then 256 precomputed elements?  I did 256 since Arnd
> seemed to think that would be a reasonable size. Based on his example
> How much kernel space are we willing to use to same some computation?
> Keep in mind only one of the entries in the table will ever be used.
>
> I think if we found the LFSR that was with in 2^10 of the desired value
> that would be good enough. It would be within 1% of the minimum N the
> user can specify.  That would require a table with 2^14 entries.  That
> seems unreasonably large.

Why does the table have to be linear?  Compute samples at various
logrimathic staring points.   Determine how many significant bits
you want to keep, and have an array for each sample length.

ie for 3 bits of significance, you could have
F00000, E00000, ... 800000,   0F0000, ......
this would take (24-n) * 2^(n-1) slots, while maintaing user
control of the range.

>
> Anyway, the user controls how often sampling is done by setting N.

When calling a user space program.   The events are converted to
a series of control register values that are communicated to the
kernel by writing to files in the file system.   The writes are
interpreted (converted from ascii hex to binary longs) and stored
until the control files are written, at which point callbacks
copy and interpret the controls and start the hardware collection.

Frankly, I would expect a lot more resistance to the event data
stream generation changes and duplicaton.


milton

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-08 14:18     ` Milton Miller
@ 2007-02-08 23:59       ` Maynard Johnson
  -1 siblings, 0 replies; 66+ messages in thread
From: Maynard Johnson @ 2007-02-08 23:59 UTC (permalink / raw)
  To: Milton Miller; +Cc: Carl Love, linuxppc-dev, LKML, oprofile-list, cbe-oss-dev

Milton,
Thank you for your comments.  Carl will reply to certain parts of your 
posting where he's more knowledgeable than I.  See my replies below.

-Maynard

Milton Miller wrote:

>On Feb 6, 2007, at 5:02 PM, Carl Love wrote:
>
>  
>
>>This is the first update to the patch previously posted by Maynard
>>Johnson as "PATCH 4/4. Add support to OProfile for profiling CELL".
>>
>>
>>    
>>
>  
>
[snip]

>
>Data collected
>
>
>The current patch starts tackling these translation issues for the
>presently common case of a static self contained binary from a single
>file, either single separate source file or embedded in the data of
>the host application.   When creating the trace entry for a SPU
>context switch, it records the application owner, pid, tid, and
>dcookie of the main executable.   It addition, it looks up the
>object-id as a virtual address and records the offset if it is non-zero,
>or the dcookie of the object if it is zero.   The code then creates
>a data structure by reading the elf headers from the user process
>(at the address given by the object-id) and building a list of
>SPU address to elf object offsets, as specified by the ELF loader
>headers.   In addition to the elf loader section, it processes the
>overlay headers and records the address, size, and magic number of
>the overlay.
>
>When the hardware trace entries are processed, each address is
>looked up this structure and translated to the elf offset.  If
>it is an overlay region, the overlay identify word is read and
>the list is searched for the matching overlay.  The resulting
>offset is sent to the oprofile system.
>
>The current patch specifically identifies that only single
>elf objects are handled.  There is no code to handle dynamic
>linked libraries or overlays.   Nor is there any method to
>  
>
Yes, we do handle overlays.  (Note: I'm looking into a bug right now in 
our overlay support.)

>present samples that may have been collected during context
>switch processing, they must be discarded.
>
>
>My proposal is to change what is presented to user space.  Instead
>of trying to translate the SPU address to the backing file
>as the samples are recorded, store the samples as the SPU
>context and address.  The context switch would record tid,
>pid, object id as it does now.   In addition, if this is a
>new object-id, the kernel would read elf headers as it does
>today.  However, it would then proceed to provide accurate
>dcookie information for each loader region and overlay.  To
>identify which overlays are active, (instead of the present
>read on use and search the list to translate approach) the
>kernel would record the location of the overlay identifiers
>as it parsed the kernel, but would then read the identification
>word and would record the present value as an sample from
>a separate but related stream.   The kernel could maintain
>the last value for each overlay and only send profile events
>for the deltas.
>  
>
Discussions on this topic in the past have resulted in the current 
implementation precisely because we're able to record the samples as 
fileoffsets, just as the userspace tools expect.   I haven't had time to 
check out how much this would impact the userspace tools, but my gut 
feel is that it would be quite significant.  If we were developing this 
module with a matching newly-created userspace tool, I would be more 
inclined to agree that this makes sense.  But you give no rationale for 
your proposal that justifies the change.  The current implementation 
works, it has no impact on normal, non-profiling behavior,  and the 
overhead during profiling is not noticeable.

>This approach trades translation lookup overhead for each
>recorded sample for a burst of data on new context activation.
>In addition it exposes the sample point of the overlay identifier
>vs the address collection.  This allows the ambiguity to be
>exposed to user space.   In addition, with the above proposed
>kernel timer vs sample collection, user space could limit the
>elapsed time between the address collection and the overlay
>id check.
>  
>
Yes, there is a window here where an overlay could occur before we 
finish processing a group of samples that were actually taken from a 
different overlay.  The obvious way to prevent that is for the kernel 
(or SPUFS) to be notified of the overlay and let OProfile know that we 
need to drain (perhaps discard would be best) our sample trace buffer.  
As you indicate above, your proposal faces the same issue, but would 
just decrease the number of bogus samples.  I contend that the relative 
number of bogus samples will be quite low in either case.  Ideally, we 
should have a mechanism to eliminate them completely so as to avoid 
confusion the user's part when they're looking at a report.  Even a few 
bogus samples in the wrong place can be troubling.  Such a mechanism 
will be a good future enhancement.

[snip]

>milton
>--
>miltonm@bga.com   Milton Miller
>Speaking for myself only.
>
>_______________________________________________
>Linuxppc-dev mailing list
>Linuxppc-dev@ozlabs.org
>https://ozlabs.org/mailman/listinfo/linuxppc-dev
>  
>



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-08 23:59       ` Maynard Johnson
  0 siblings, 0 replies; 66+ messages in thread
From: Maynard Johnson @ 2007-02-08 23:59 UTC (permalink / raw)
  To: Milton Miller; +Cc: linuxppc-dev, cbe-oss-dev, LKML, oprofile-list, Carl Love

Milton,
Thank you for your comments.  Carl will reply to certain parts of your 
posting where he's more knowledgeable than I.  See my replies below.

-Maynard

Milton Miller wrote:

>On Feb 6, 2007, at 5:02 PM, Carl Love wrote:
>
>  
>
>>This is the first update to the patch previously posted by Maynard
>>Johnson as "PATCH 4/4. Add support to OProfile for profiling CELL".
>>
>>
>>    
>>
>  
>
[snip]

>
>Data collected
>
>
>The current patch starts tackling these translation issues for the
>presently common case of a static self contained binary from a single
>file, either single separate source file or embedded in the data of
>the host application.   When creating the trace entry for a SPU
>context switch, it records the application owner, pid, tid, and
>dcookie of the main executable.   It addition, it looks up the
>object-id as a virtual address and records the offset if it is non-zero,
>or the dcookie of the object if it is zero.   The code then creates
>a data structure by reading the elf headers from the user process
>(at the address given by the object-id) and building a list of
>SPU address to elf object offsets, as specified by the ELF loader
>headers.   In addition to the elf loader section, it processes the
>overlay headers and records the address, size, and magic number of
>the overlay.
>
>When the hardware trace entries are processed, each address is
>looked up this structure and translated to the elf offset.  If
>it is an overlay region, the overlay identify word is read and
>the list is searched for the matching overlay.  The resulting
>offset is sent to the oprofile system.
>
>The current patch specifically identifies that only single
>elf objects are handled.  There is no code to handle dynamic
>linked libraries or overlays.   Nor is there any method to
>  
>
Yes, we do handle overlays.  (Note: I'm looking into a bug right now in 
our overlay support.)

>present samples that may have been collected during context
>switch processing, they must be discarded.
>
>
>My proposal is to change what is presented to user space.  Instead
>of trying to translate the SPU address to the backing file
>as the samples are recorded, store the samples as the SPU
>context and address.  The context switch would record tid,
>pid, object id as it does now.   In addition, if this is a
>new object-id, the kernel would read elf headers as it does
>today.  However, it would then proceed to provide accurate
>dcookie information for each loader region and overlay.  To
>identify which overlays are active, (instead of the present
>read on use and search the list to translate approach) the
>kernel would record the location of the overlay identifiers
>as it parsed the kernel, but would then read the identification
>word and would record the present value as an sample from
>a separate but related stream.   The kernel could maintain
>the last value for each overlay and only send profile events
>for the deltas.
>  
>
Discussions on this topic in the past have resulted in the current 
implementation precisely because we're able to record the samples as 
fileoffsets, just as the userspace tools expect.   I haven't had time to 
check out how much this would impact the userspace tools, but my gut 
feel is that it would be quite significant.  If we were developing this 
module with a matching newly-created userspace tool, I would be more 
inclined to agree that this makes sense.  But you give no rationale for 
your proposal that justifies the change.  The current implementation 
works, it has no impact on normal, non-profiling behavior,  and the 
overhead during profiling is not noticeable.

>This approach trades translation lookup overhead for each
>recorded sample for a burst of data on new context activation.
>In addition it exposes the sample point of the overlay identifier
>vs the address collection.  This allows the ambiguity to be
>exposed to user space.   In addition, with the above proposed
>kernel timer vs sample collection, user space could limit the
>elapsed time between the address collection and the overlay
>id check.
>  
>
Yes, there is a window here where an overlay could occur before we 
finish processing a group of samples that were actually taken from a 
different overlay.  The obvious way to prevent that is for the kernel 
(or SPUFS) to be notified of the overlay and let OProfile know that we 
need to drain (perhaps discard would be best) our sample trace buffer.  
As you indicate above, your proposal faces the same issue, but would 
just decrease the number of bogus samples.  I contend that the relative 
number of bogus samples will be quite low in either case.  Ideally, we 
should have a mechanism to eliminate them completely so as to avoid 
confusion the user's part when they're looking at a report.  Even a few 
bogus samples in the wrong place can be troubling.  Such a mechanism 
will be a good future enhancement.

[snip]

>milton
>--
>miltonm@bga.com   Milton Miller
>Speaking for myself only.
>
>_______________________________________________
>Linuxppc-dev mailing list
>Linuxppc-dev@ozlabs.org
>https://ozlabs.org/mailman/listinfo/linuxppc-dev
>  
>

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-08 17:21       ` Arnd Bergmann
@ 2007-02-08 22:51         ` Carl Love
  -1 siblings, 0 replies; 66+ messages in thread
From: Carl Love @ 2007-02-08 22:51 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: cbe-oss-dev, Milton Miller, linuxppc-dev, LKML, oprofile-list

On Thu, 2007-02-08 at 18:21 +0100, Arnd Bergmann wrote:
> On Thursday 08 February 2007 15:18, Milton Miller wrote:
>  
> > 1) sample rate setup
> > 
> >     In the current patch, the user specifies a sample rate as a time 
> > interval.
> >     The kernel is (a) calling cpufreq to get the current cpu frequency, 
> > (b)
> >     converting the rate to a cycle count, (c) converting this to a 24 bit
> >     LFSR count, an iterative algorithm (in this patch, starting from
> >     one of 256 values so a max of 2^16 or 64k iterations), (d) 
> > calculating
> >     an trace unload interval.   In addition, a cpufreq notifier is 
> > registered
> >     to recalculate on frequency changes.

No.  The user issues the command opcontrol --event:N  where N is the
number of events (cycles, l2 cache misses, instructions retired etc)
that are to elapse between collecting the samples.  The OProfile passes
the value N to the kernel via the variable ctr[i].count.  Where i is the
performance counter entry for that event.  Specifically with SPU
profiling, we do not use performance counters because the CELL HW does
not allow the normal the PPU to read the SPU PC when a performance
counter interrupt occurs.  We are using some additional hw support in
the chip that allows us to periodically capture the SPU PC.  There is an
LFSR hardware counter that can be started at an arbitrary LFSR value.
When the last LFSR value in the sequence is reached, a sample is taken
and stored in the trace buffer.  Hence, the value of N specified by the
user must get converted to the LFSR value that is N from the end of the
sequence.  The same clock that the processor is running at is used to
control the LFSR count.  Hence the LFSR counter increments once per CPU
clock cycle regardless of the CPU frequency or changes in the frequency.
There is no calculation for the LFSR value that is a function of the
processor frequency.  There is no need to adjust the LFSR when the
processor frequency changes.  

Milton had a comment about the code 

 if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
> +             spu_cycle_reset = ctr[0].count;
> +             return;
> +     }

Well, given the above description, it is clear that if you are doing SPU
event profiling, the value N is put into the cntr[0].cnt entry since
there is only one event.  Thus in cell_global_start_spu() you use
spu_cycle_reset as the argument to the lfsr calculation routine to get
the LFSR value that is N from the end of the sequence.

> > 
> >     The obvious problem is step (c), running a loop potentially 64 
> > thousand
> >     times in kernel space will have a noticeable impact on other threads.
> > 
> >     I propose instead that user space perform the above 4 steps, and 
> > provide
> >     the kernel with two inputs: (1) the value to load in the LFSR and (2)
> >     the periodic frequency / time interval at which to empty the hardware
> >     trace buffer, perform sample analysis, and send the data to the 
> > oprofile
> >     subsystem.
> > 
> >     There should be no security issues with this approach.   If the LFSR 
> > value
> >     is calculated incorrectly, either it will be too short, causing the 
> > trace
> >     array to overfill and data to be dropped, or it will be too long, and
> >     there will be fewer samples.   Likewise, the kernel periodic poll 
> > can be
> >     too long, again causing overflow, or too frequent, causing only timer
> >     execution overhead.
> > 
> >     Various data is collected by the kernel while processing the 
> > periodic timer,
> >     this approach would also allow the profiling tools to control the
> >     frequency of this collection.   More frequent collection results in 
> > more
> >     accurate sample data, with the linear cost of poll execution 
> > overhead.
> > 
> >     Frequency changes can be handled either by the profile code setting
> >     collection at a higher than necessary rate, or by interacting with 
> > the
> >     governor to limit the speeds.
> > 
> >     Optionally, the kernel can add a record indicating that some data was
> >     likely dropped if it is able to read all 256 entries without 
> > underflowing
> >     the array.  This can be used as hint to user space that the kernel 
> > time
> >     was too long for the collection rate.
>  
> Moving the sample rate computation to user space sounds like the right
> idea, but why not have a more drastic version of it:

No, I do not agree.  The user/kernel API pass N where N is the number of
events between samples.  We are not at liberty to just change the API.
We we did do this, we fully expect that John Levon will push back saying
why make an architecture specific API change when it isn't necessary.  

Please define "drastic" in this context.  Do you mean make the table
bigger i.e. more then 256 precomputed elements?  I did 256 since Arnd
seemed to think that would be a reasonable size. Based on his example
How much kernel space are we willing to use to same some computation?
Keep in mind only one of the entries in the table will ever be used.

I think if we found the LFSR that was with in 2^10 of the desired value
that would be good enough. It would be within 1% of the minimum N the
user can specify.  That would require a table with 2^14 entries.  That
seems unreasonably large.

Anyway, the user controls how often sampling is done by setting N. 

>  
                Carl Love
> [cut]


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-08 22:51         ` Carl Love
  0 siblings, 0 replies; 66+ messages in thread
From: Carl Love @ 2007-02-08 22:51 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: linuxppc-dev, oprofile-list, cbe-oss-dev, Milton Miller, LKML

On Thu, 2007-02-08 at 18:21 +0100, Arnd Bergmann wrote:
> On Thursday 08 February 2007 15:18, Milton Miller wrote:
>  
> > 1) sample rate setup
> > 
> >     In the current patch, the user specifies a sample rate as a time 
> > interval.
> >     The kernel is (a) calling cpufreq to get the current cpu frequency, 
> > (b)
> >     converting the rate to a cycle count, (c) converting this to a 24 bit
> >     LFSR count, an iterative algorithm (in this patch, starting from
> >     one of 256 values so a max of 2^16 or 64k iterations), (d) 
> > calculating
> >     an trace unload interval.   In addition, a cpufreq notifier is 
> > registered
> >     to recalculate on frequency changes.

No.  The user issues the command opcontrol --event:N  where N is the
number of events (cycles, l2 cache misses, instructions retired etc)
that are to elapse between collecting the samples.  The OProfile passes
the value N to the kernel via the variable ctr[i].count.  Where i is the
performance counter entry for that event.  Specifically with SPU
profiling, we do not use performance counters because the CELL HW does
not allow the normal the PPU to read the SPU PC when a performance
counter interrupt occurs.  We are using some additional hw support in
the chip that allows us to periodically capture the SPU PC.  There is an
LFSR hardware counter that can be started at an arbitrary LFSR value.
When the last LFSR value in the sequence is reached, a sample is taken
and stored in the trace buffer.  Hence, the value of N specified by the
user must get converted to the LFSR value that is N from the end of the
sequence.  The same clock that the processor is running at is used to
control the LFSR count.  Hence the LFSR counter increments once per CPU
clock cycle regardless of the CPU frequency or changes in the frequency.
There is no calculation for the LFSR value that is a function of the
processor frequency.  There is no need to adjust the LFSR when the
processor frequency changes.  

Milton had a comment about the code 

 if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
> +             spu_cycle_reset = ctr[0].count;
> +             return;
> +     }

Well, given the above description, it is clear that if you are doing SPU
event profiling, the value N is put into the cntr[0].cnt entry since
there is only one event.  Thus in cell_global_start_spu() you use
spu_cycle_reset as the argument to the lfsr calculation routine to get
the LFSR value that is N from the end of the sequence.

> > 
> >     The obvious problem is step (c), running a loop potentially 64 
> > thousand
> >     times in kernel space will have a noticeable impact on other threads.
> > 
> >     I propose instead that user space perform the above 4 steps, and 
> > provide
> >     the kernel with two inputs: (1) the value to load in the LFSR and (2)
> >     the periodic frequency / time interval at which to empty the hardware
> >     trace buffer, perform sample analysis, and send the data to the 
> > oprofile
> >     subsystem.
> > 
> >     There should be no security issues with this approach.   If the LFSR 
> > value
> >     is calculated incorrectly, either it will be too short, causing the 
> > trace
> >     array to overfill and data to be dropped, or it will be too long, and
> >     there will be fewer samples.   Likewise, the kernel periodic poll 
> > can be
> >     too long, again causing overflow, or too frequent, causing only timer
> >     execution overhead.
> > 
> >     Various data is collected by the kernel while processing the 
> > periodic timer,
> >     this approach would also allow the profiling tools to control the
> >     frequency of this collection.   More frequent collection results in 
> > more
> >     accurate sample data, with the linear cost of poll execution 
> > overhead.
> > 
> >     Frequency changes can be handled either by the profile code setting
> >     collection at a higher than necessary rate, or by interacting with 
> > the
> >     governor to limit the speeds.
> > 
> >     Optionally, the kernel can add a record indicating that some data was
> >     likely dropped if it is able to read all 256 entries without 
> > underflowing
> >     the array.  This can be used as hint to user space that the kernel 
> > time
> >     was too long for the collection rate.
>  
> Moving the sample rate computation to user space sounds like the right
> idea, but why not have a more drastic version of it:

No, I do not agree.  The user/kernel API pass N where N is the number of
events between samples.  We are not at liberty to just change the API.
We we did do this, we fully expect that John Levon will push back saying
why make an architecture specific API change when it isn't necessary.  

Please define "drastic" in this context.  Do you mean make the table
bigger i.e. more then 256 precomputed elements?  I did 256 since Arnd
seemed to think that would be a reasonable size. Based on his example
How much kernel space are we willing to use to same some computation?
Keep in mind only one of the entries in the table will ever be used.

I think if we found the LFSR that was with in 2^10 of the desired value
that would be good enough. It would be within 1% of the minimum N the
user can specify.  That would require a table with 2^14 entries.  That
seems unreasonably large.

Anyway, the user controls how often sampling is done by setting N. 

>  
                Carl Love
> [cut]

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-08 17:21       ` Arnd Bergmann
@ 2007-02-08 18:01         ` Adrian Reber
  -1 siblings, 0 replies; 66+ messages in thread
From: Adrian Reber @ 2007-02-08 18:01 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: cbe-oss-dev, linuxppc-dev, oprofile-list, LKML, Milton Miller, Carl Love

On Thu, Feb 08, 2007 at 06:21:56PM +0100, Arnd Bergmann wrote:
[...]
> Moving the sample rate computation to user space sounds like the right
> idea, but why not have a more drastic version of it:
> 
> Right now, all products that support this feature run at the same clock
> rate (3.2 Ghz), with cpufreq, we can reduce this to 1.6 Ghz. If I understand
> this correctly, the value depends only on the frequency, so we could simply
> hardcode this in the kernel, and print out a warning message if we ever
> encounter a different frequency, right?

Just for the record... CAB is running with 2.8 GHz. At least all the boards
I have seen.

		Adrian

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-08 18:01         ` Adrian Reber
  0 siblings, 0 replies; 66+ messages in thread
From: Adrian Reber @ 2007-02-08 18:01 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: LKML, Milton Miller, linuxppc-dev, oprofile-list, cbe-oss-dev, Carl Love

On Thu, Feb 08, 2007 at 06:21:56PM +0100, Arnd Bergmann wrote:
[...]
> Moving the sample rate computation to user space sounds like the right
> idea, but why not have a more drastic version of it:
> 
> Right now, all products that support this feature run at the same clock
> rate (3.2 Ghz), with cpufreq, we can reduce this to 1.6 Ghz. If I understand
> this correctly, the value depends only on the frequency, so we could simply
> hardcode this in the kernel, and print out a warning message if we ever
> encounter a different frequency, right?

Just for the record... CAB is running with 2.8 GHz. At least all the boards
I have seen.

		Adrian

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-08 14:18     ` Milton Miller
@ 2007-02-08 17:21       ` Arnd Bergmann
  -1 siblings, 0 replies; 66+ messages in thread
From: Arnd Bergmann @ 2007-02-08 17:21 UTC (permalink / raw)
  To: cbe-oss-dev; +Cc: Milton Miller, Carl Love, linuxppc-dev, LKML, oprofile-list

On Thursday 08 February 2007 15:18, Milton Miller wrote:

> 1) sample rate setup
> 
>     In the current patch, the user specifies a sample rate as a time 
> interval.
>     The kernel is (a) calling cpufreq to get the current cpu frequency, 
> (b)
>     converting the rate to a cycle count, (c) converting this to a 24 bit
>     LFSR count, an iterative algorithm (in this patch, starting from
>     one of 256 values so a max of 2^16 or 64k iterations), (d) 
> calculating
>     an trace unload interval.   In addition, a cpufreq notifier is 
> registered
>     to recalculate on frequency changes.
> 
>     The obvious problem is step (c), running a loop potentially 64 
> thousand
>     times in kernel space will have a noticeable impact on other threads.
> 
>     I propose instead that user space perform the above 4 steps, and 
> provide
>     the kernel with two inputs: (1) the value to load in the LFSR and (2)
>     the periodic frequency / time interval at which to empty the hardware
>     trace buffer, perform sample analysis, and send the data to the 
> oprofile
>     subsystem.
> 
>     There should be no security issues with this approach.   If the LFSR 
> value
>     is calculated incorrectly, either it will be too short, causing the 
> trace
>     array to overfill and data to be dropped, or it will be too long, and
>     there will be fewer samples.   Likewise, the kernel periodic poll 
> can be
>     too long, again causing overflow, or too frequent, causing only timer
>     execution overhead.
> 
>     Various data is collected by the kernel while processing the 
> periodic timer,
>     this approach would also allow the profiling tools to control the
>     frequency of this collection.   More frequent collection results in 
> more
>     accurate sample data, with the linear cost of poll execution 
> overhead.
> 
>     Frequency changes can be handled either by the profile code setting
>     collection at a higher than necessary rate, or by interacting with 
> the
>     governor to limit the speeds.
> 
>     Optionally, the kernel can add a record indicating that some data was
>     likely dropped if it is able to read all 256 entries without 
> underflowing
>     the array.  This can be used as hint to user space that the kernel 
> time
>     was too long for the collection rate.

Moving the sample rate computation to user space sounds like the right
idea, but why not have a more drastic version of it:

Right now, all products that support this feature run at the same clock
rate (3.2 Ghz), with cpufreq, we can reduce this to 1.6 Ghz. If I understand
this correctly, the value depends only on the frequency, so we could simply
hardcode this in the kernel, and print out a warning message if we ever
encounter a different frequency, right?

> The current patch specifically identifies that only single
> elf objects are handled.  There is no code to handle dynamic
> linked libraries or overlays.   Nor is there any method to
> present samples that may have been collected during context
> switch processing, they must be discarded.

I thought it already did handle overlays, what did I miss here?

> My proposal is to change what is presented to user space.  Instead
> of trying to translate the SPU address to the backing file
> as the samples are recorded, store the samples as the SPU
> context and address.  The context switch would record tid,
> pid, object id as it does now.   In addition, if this is a
> new object-id, the kernel would read elf headers as it does
> today.  However, it would then proceed to provide accurate
> dcookie information for each loader region and overlay.

Doing the translation in two stages in user space, as you
suggest here, definitely makes sense to me. I think it
can be done a little simpler though:

Why would you need the accurate dcookie information to be
provided by the kernel? The ELF loader is done in user
space, and the kernel only reproduces what it thinks that
came up with. If the kernel only gives the dcookie information
about the SPU ELF binary to the oprofile user space, then
that can easily recreate the same mapping.

The kernel still needs to provide the overlay identifiers
though.

> To identify which overlays are active, (instead of the present
> read on use and search the list to translate approach) the
> kernel would record the location of the overlay identifiers
> as it parsed the kernel, but would then read the identification
> word and would record the present value as an sample from
> a separate but related stream.   The kernel could maintain
> the last value for each overlay and only send profile events
> for the deltas.

right.

> This approach trades translation lookup overhead for each
> recorded sample for a burst of data on new context activation.
> In addition it exposes the sample point of the overlay identifier
> vs the address collection.  This allows the ambiguity to be
> exposed to user space.   In addition, with the above proposed
> kernel timer vs sample collection, user space could limit the
> elapsed time between the address collection and the overlay
> id check.

yes, this sounds nice. But tt does not at all help accuracy,
only performance, right?
 
> This approach allows multiple objects by its nature.  A new
> elf header could be constructed in memory that contained
> the union of the elf objects load segments, and the tools
> will magically work.   Alternatively the object id could
> point to a new structure, identified via a new header, that
> it points to other elf headers (easily differentiated by the
> elf magic headers).   Other binary formats, including several
> objects in a ar archive, could be supported.

Yes, that would be a new feature if the kernel passed dcookie
information for every section, but I doubt that it is worth
it. I have not seen any program that allows loading code
from more than one ELF file. In particular, the ELF format
on the SPU is currently lacking the relocation mechanisms
that you would need for resolving spu-side symbols at load
time.
 
> If better overlay identification is required, in theory the
> overlay switch code could be augmented to record the switches
> (DMA reference time from the PowerPC memory and record a
> relative decrementer in the SPU), this is obviously a future
> item.  But it is facilitated by having user space resolve the
> SPU to source file translation.

This seems to incur a run-time overhead on the SPU even if not
profiling, I would consider that not acceptable.

	Arnd <><

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-08 17:21       ` Arnd Bergmann
  0 siblings, 0 replies; 66+ messages in thread
From: Arnd Bergmann @ 2007-02-08 17:21 UTC (permalink / raw)
  To: cbe-oss-dev; +Cc: linuxppc-dev, oprofile-list, LKML, Milton Miller, Carl Love

On Thursday 08 February 2007 15:18, Milton Miller wrote:

> 1) sample rate setup
> 
>     In the current patch, the user specifies a sample rate as a time 
> interval.
>     The kernel is (a) calling cpufreq to get the current cpu frequency, 
> (b)
>     converting the rate to a cycle count, (c) converting this to a 24 bit
>     LFSR count, an iterative algorithm (in this patch, starting from
>     one of 256 values so a max of 2^16 or 64k iterations), (d) 
> calculating
>     an trace unload interval.   In addition, a cpufreq notifier is 
> registered
>     to recalculate on frequency changes.
> 
>     The obvious problem is step (c), running a loop potentially 64 
> thousand
>     times in kernel space will have a noticeable impact on other threads.
> 
>     I propose instead that user space perform the above 4 steps, and 
> provide
>     the kernel with two inputs: (1) the value to load in the LFSR and (2)
>     the periodic frequency / time interval at which to empty the hardware
>     trace buffer, perform sample analysis, and send the data to the 
> oprofile
>     subsystem.
> 
>     There should be no security issues with this approach.   If the LFSR 
> value
>     is calculated incorrectly, either it will be too short, causing the 
> trace
>     array to overfill and data to be dropped, or it will be too long, and
>     there will be fewer samples.   Likewise, the kernel periodic poll 
> can be
>     too long, again causing overflow, or too frequent, causing only timer
>     execution overhead.
> 
>     Various data is collected by the kernel while processing the 
> periodic timer,
>     this approach would also allow the profiling tools to control the
>     frequency of this collection.   More frequent collection results in 
> more
>     accurate sample data, with the linear cost of poll execution 
> overhead.
> 
>     Frequency changes can be handled either by the profile code setting
>     collection at a higher than necessary rate, or by interacting with 
> the
>     governor to limit the speeds.
> 
>     Optionally, the kernel can add a record indicating that some data was
>     likely dropped if it is able to read all 256 entries without 
> underflowing
>     the array.  This can be used as hint to user space that the kernel 
> time
>     was too long for the collection rate.

Moving the sample rate computation to user space sounds like the right
idea, but why not have a more drastic version of it:

Right now, all products that support this feature run at the same clock
rate (3.2 Ghz), with cpufreq, we can reduce this to 1.6 Ghz. If I understand
this correctly, the value depends only on the frequency, so we could simply
hardcode this in the kernel, and print out a warning message if we ever
encounter a different frequency, right?

> The current patch specifically identifies that only single
> elf objects are handled.  There is no code to handle dynamic
> linked libraries or overlays.   Nor is there any method to
> present samples that may have been collected during context
> switch processing, they must be discarded.

I thought it already did handle overlays, what did I miss here?

> My proposal is to change what is presented to user space.  Instead
> of trying to translate the SPU address to the backing file
> as the samples are recorded, store the samples as the SPU
> context and address.  The context switch would record tid,
> pid, object id as it does now.   In addition, if this is a
> new object-id, the kernel would read elf headers as it does
> today.  However, it would then proceed to provide accurate
> dcookie information for each loader region and overlay.

Doing the translation in two stages in user space, as you
suggest here, definitely makes sense to me. I think it
can be done a little simpler though:

Why would you need the accurate dcookie information to be
provided by the kernel? The ELF loader is done in user
space, and the kernel only reproduces what it thinks that
came up with. If the kernel only gives the dcookie information
about the SPU ELF binary to the oprofile user space, then
that can easily recreate the same mapping.

The kernel still needs to provide the overlay identifiers
though.

> To identify which overlays are active, (instead of the present
> read on use and search the list to translate approach) the
> kernel would record the location of the overlay identifiers
> as it parsed the kernel, but would then read the identification
> word and would record the present value as an sample from
> a separate but related stream.   The kernel could maintain
> the last value for each overlay and only send profile events
> for the deltas.

right.

> This approach trades translation lookup overhead for each
> recorded sample for a burst of data on new context activation.
> In addition it exposes the sample point of the overlay identifier
> vs the address collection.  This allows the ambiguity to be
> exposed to user space.   In addition, with the above proposed
> kernel timer vs sample collection, user space could limit the
> elapsed time between the address collection and the overlay
> id check.

yes, this sounds nice. But tt does not at all help accuracy,
only performance, right?
 
> This approach allows multiple objects by its nature.  A new
> elf header could be constructed in memory that contained
> the union of the elf objects load segments, and the tools
> will magically work.   Alternatively the object id could
> point to a new structure, identified via a new header, that
> it points to other elf headers (easily differentiated by the
> elf magic headers).   Other binary formats, including several
> objects in a ar archive, could be supported.

Yes, that would be a new feature if the kernel passed dcookie
information for every section, but I doubt that it is worth
it. I have not seen any program that allows loading code
from more than one ELF file. In particular, the ELF format
on the SPU is currently lacking the relocation mechanisms
that you would need for resolving spu-side symbols at load
time.
 
> If better overlay identification is required, in theory the
> overlay switch code could be augmented to record the switches
> (DMA reference time from the PowerPC memory and record a
> relative decrementer in the SPU), this is obviously a future
> item.  But it is facilitated by having user space resolve the
> SPU to source file translation.

This seems to incur a run-time overhead on the SPU even if not
profiling, I would consider that not acceptable.

	Arnd <><

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-07 22:48       ` Michael Ellerman
@ 2007-02-08 15:03         ` Maynard Johnson
  -1 siblings, 0 replies; 66+ messages in thread
From: Maynard Johnson @ 2007-02-08 15:03 UTC (permalink / raw)
  To: michael; +Cc: Carl Love, linuxppc-dev, linux-kernel, cbe-oss-dev

Michael,
Thanks very much for the advice.  Both issues have been solved now, with 
your help.

-Maynard

Michael Ellerman wrote:

>On Wed, 2007-02-07 at 09:41 -0600, Maynard Johnson wrote:
>  
>
>>Carl Love wrote:
>>
>>    
>>
>>>Subject: Add support to OProfile for profiling Cell BE SPUs
>>>
>>>From: Maynard Johnson <maynardj@us.ibm.com>
>>>
>>>This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
>>>to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
>>>was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
>>>code.
>>>
>>>Signed-off-by: Carl Love <carll@us.ibm.com>
>>>Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com>
>>>
>>>Index: linux-2.6.20-rc1/arch/powerpc/configs/cell_defconfig
>>> 
>>>
>>>      
>>>
>>I've discovered more problems with the kref handling for the cached_info 
>>object that we store in the spu_context.  :-(
>>
>>When the OProfile module initially determines that no cached_info yet 
>>exists for a given spu_context, it creates the cached_info, inits the 
>>cached_info's kref (which increments the refcount) and does a kref_get 
>>(for SPUFS' ref) before passing the cached_info reference off to SUPFS 
>>to store into the spu_context.  When OProfile shuts down or the SPU job 
>>ends, OProfile gives up its ref to the cached_info with kref_put.  Then 
>>when SPUFS destroys the spu_context, it also gives up its ref.  HOWEVER 
>>. . . . If OProfile shuts down while the SPU job is still active _and_ 
>>_then_ is restarted while the job is still active, OProfile will find 
>>that the cached_info exists for the given spu_context, so it won't go 
>>through the process of creating it and doing kref_init on the kref.  
>>Under this scenario, OProfile does not own a ref of its own to the 
>>cached_info, and should not be doing a kref_put when done using the 
>>cached_info -- but it does, and so does SPUFS when the spu_context is 
>>destroyed.  The end result (with the code as currently written) is that 
>>an extra kref_put is done when the refcount is already down to zero.  To 
>>fix this, OProfile needs to detect when it finds an existing cached_info 
>>already stored in the spu_context.  Then, instead of creating a new one, 
>>it sets a reminder flag to be used later when it's done using the cached 
>>info to indicate whether or not it needs to call kref_put.
>>    
>>
>
>I think all you want to do is when oprofile finds the cached_info
>already existing, it does a kref_get(). After all it doesn't have a
>reference to it, so before it starts using it it must inc the ref count.
>
>  
>
>>Unfortunately, there's another problem (one that should have been 
>>obvious to me).  The cached_info's kref "release" function is 
>>destroy_cached_info(), defined in the OProfile module.  If the OProfile 
>>module is unloaded when SPUFS destroys the spu_context and calls 
>>kref_put on the cached_info's kref -- KABOOM!  The destroy_cached_info 
>>function (the second arg to kref_put) is not in memory, so we get a 
>>paging fault.  I see a couple options to solve this:
>>    1) Don't store the cached_info in the spu_context.  Basically, go 
>>back to the simplistic model of creating/deleting the cached_info on 
>>every SPU task activation/deactivation.
>>    2)  If there's some way to do this, force the OProfile module to 
>>stay loaded until SPUFS has destroyed its last spu_context that holds a 
>>cached_info object.
>>    
>>
>
>There is a mechanism for that, you just have each cached_info inc the
>module's refcount.
>
>Another option would be to have a mapping, in the oprofile code, from
>spu_contexts to cached_infos, ie. a hash table or something.
>
>cheers
>
>  
>



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-08 15:03         ` Maynard Johnson
  0 siblings, 0 replies; 66+ messages in thread
From: Maynard Johnson @ 2007-02-08 15:03 UTC (permalink / raw)
  To: michael; +Cc: linuxppc-dev, linux-kernel, cbe-oss-dev, Carl Love

Michael,
Thanks very much for the advice.  Both issues have been solved now, with 
your help.

-Maynard

Michael Ellerman wrote:

>On Wed, 2007-02-07 at 09:41 -0600, Maynard Johnson wrote:
>  
>
>>Carl Love wrote:
>>
>>    
>>
>>>Subject: Add support to OProfile for profiling Cell BE SPUs
>>>
>>>From: Maynard Johnson <maynardj@us.ibm.com>
>>>
>>>This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
>>>to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
>>>was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
>>>code.
>>>
>>>Signed-off-by: Carl Love <carll@us.ibm.com>
>>>Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com>
>>>
>>>Index: linux-2.6.20-rc1/arch/powerpc/configs/cell_defconfig
>>> 
>>>
>>>      
>>>
>>I've discovered more problems with the kref handling for the cached_info 
>>object that we store in the spu_context.  :-(
>>
>>When the OProfile module initially determines that no cached_info yet 
>>exists for a given spu_context, it creates the cached_info, inits the 
>>cached_info's kref (which increments the refcount) and does a kref_get 
>>(for SPUFS' ref) before passing the cached_info reference off to SUPFS 
>>to store into the spu_context.  When OProfile shuts down or the SPU job 
>>ends, OProfile gives up its ref to the cached_info with kref_put.  Then 
>>when SPUFS destroys the spu_context, it also gives up its ref.  HOWEVER 
>>. . . . If OProfile shuts down while the SPU job is still active _and_ 
>>_then_ is restarted while the job is still active, OProfile will find 
>>that the cached_info exists for the given spu_context, so it won't go 
>>through the process of creating it and doing kref_init on the kref.  
>>Under this scenario, OProfile does not own a ref of its own to the 
>>cached_info, and should not be doing a kref_put when done using the 
>>cached_info -- but it does, and so does SPUFS when the spu_context is 
>>destroyed.  The end result (with the code as currently written) is that 
>>an extra kref_put is done when the refcount is already down to zero.  To 
>>fix this, OProfile needs to detect when it finds an existing cached_info 
>>already stored in the spu_context.  Then, instead of creating a new one, 
>>it sets a reminder flag to be used later when it's done using the cached 
>>info to indicate whether or not it needs to call kref_put.
>>    
>>
>
>I think all you want to do is when oprofile finds the cached_info
>already existing, it does a kref_get(). After all it doesn't have a
>reference to it, so before it starts using it it must inc the ref count.
>
>  
>
>>Unfortunately, there's another problem (one that should have been 
>>obvious to me).  The cached_info's kref "release" function is 
>>destroy_cached_info(), defined in the OProfile module.  If the OProfile 
>>module is unloaded when SPUFS destroys the spu_context and calls 
>>kref_put on the cached_info's kref -- KABOOM!  The destroy_cached_info 
>>function (the second arg to kref_put) is not in memory, so we get a 
>>paging fault.  I see a couple options to solve this:
>>    1) Don't store the cached_info in the spu_context.  Basically, go 
>>back to the simplistic model of creating/deleting the cached_info on 
>>every SPU task activation/deactivation.
>>    2)  If there's some way to do this, force the OProfile module to 
>>stay loaded until SPUFS has destroyed its last spu_context that holds a 
>>cached_info object.
>>    
>>
>
>There is a mechanism for that, you just have each cached_info inc the
>module's refcount.
>
>Another option would be to have a mapping, in the oprofile code, from
>spu_contexts to cached_infos, ie. a hash table or something.
>
>cheers
>
>  
>

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-06 23:02   ` Carl Love
@ 2007-02-08 14:18     ` Milton Miller
  -1 siblings, 0 replies; 66+ messages in thread
From: Milton Miller @ 2007-02-08 14:18 UTC (permalink / raw)
  To: Carl Love; +Cc: LKML, linuxppc-dev, cbe-oss-dev, oprofile-list


On Feb 6, 2007, at 5:02 PM, Carl Love wrote:

> This is the first update to the patch previously posted by Maynard
> Johnson as "PATCH 4/4. Add support to OProfile for profiling CELL".
>
> This repost fixes the line wrap issue that Ben mentioned.  Also the 
> kref
> handling for the cached info has been fixed and simplified.
>
> There are still a few items from the comments being discussed
> specifically how to profile the dynamic code for the SPFS context 
> switch
> code and how to deal with dynamic code stubs for library support.  Our
> proposal is to assign the samples from the SPFS and dynamic library 
> code
> to an anonymous sample bucket.  The support for properly handling the
> symbol extraction in these cases would be deferred to a later SDK.
>
> There is also a bug in profiling overlay code that we are 
> investigating.

I'd like to talk about about both some of the concepts, including what
is logged and what the kernel API is, and provide some directed comments
on the code in the patch as it stands.

[Ok, the background and concepts portion is big enough, I'll send
this to the several lists for comment, and the code comments to
just cbe-oss-dev]

First, I'll give some background and my understanding of both the
hardware, and some of the linux interface.  I'm not consulting any
manuals, so I could be mistaken.   I'm basing this on a combination
of past reading of the kernel, this patch, a discussion with Arnd,
and my knowledge of the PowerPC processor and other IBM processors.
Hopefully this will be of use to other reviewers.

Background:

The cell broadband architecture consists of PowerPC processors
and synergistic processing units, or SPUs.  The current chip has
a multi-threaded PowerPC core and 8 SPUs.   Each SPU has an
execution core (running its own instruction set), a 256kB
private memory (local store), and DMA engine that can access
the coherent memory domain of the PowerPC processor(s).  Multiple
chips can be connected together in a single coherent SMP system.
The addresses provided to the DMA engine are effective, translated
through virtual to real address spaces like the PowerPC MMU.
The SPUs are intended to run application threads, and require
the PowerPC to perform exception handling and other operating
system tasks.

Because of the limited address space of the SPU (18
bits), the compilers (gcc) have support for code
overlays.  The ELF format defines these overlays
including file offsets, sizes, load address, and
a unique word to identify the overlay.  The toolchain
has support to check that an overlay is present and
setup the DMA engine to load it if it is not present.

In Linux, the SPUs are controlled through spufs.  Once
a SPU context is created, the local store and registers can be
modified through the file system.  A linux thread makes
a syscall requesting the context to run.  This call is
blocking to that thread; it waits for an event from the
SPU (either an exception or a reschedule). In this regard
the syscall can be thought of as a cross instruction set
function call.  The contents of the local store are
initialized and modified via read/write or mmap and memcopy
of a spufs file. Specifically, pages are not mapped into the
local store, they are copied into it.  The SPU context is
tied to the creating mm; this provides a clear context for
the DMA engine (controlled by the program in the SPU).

To assist with the use of the SPUs, and to provide some
portability to other environments, a library, libpse, is
available and widely used.   Among other items, it provides
an elf loader for SPU elf objects.  Applications may mmap
external objects or embed the elf objects into their
executable (as data).  The contained objects copied to the
SPU local store as dictated by the elf header.   To assist
other tools, spufs provides an object-id file.   Before this
patch, it has been treated as an opaque number. Although not
present in the first release of libpse, current versions write
the virtual address of the elf header to this file
when the loader is called.

This patch is trying to enable oprofile to collect and
analyze SPU execution, using the trace collection
hardware of the processor.   The trace collection hardware
consists of a LFSR (linear-feedback shift register) counter
and an array 256 bits wide and 256 entries deep.  When the
counter matches the programed value, it causes an entry
to be written to the trace array and the counter to be
reloaded.  On chip logic tracks how many entries have
been written and not yet read.  When programed to trace
the SPUs, the trace entry consists of the upper 16 bits of
the 18 bit program counter for each SPU; the 8 SPUs split
over the two words.  By their nature, LFSRs require a minimum
of hardware (typically 3 exclusive-or gates and a shift
register), but the count sequence is pseudo-random.

The oprofile system is designed to collect a stream of trace
data and context information to and provide it to user space
for post processing and analysis.   While at the lowest level
the data is a stream of words, the kernel and user space tools
agree on a protocol that provides meaning to the words by
prefixing stream elements with escape sequences to pass
infrequent context to interpret the trace data.

For most streams today, the kernel translates a hardware
collected address through the mm struct and vma list to
the backing file and offset in that file.  A given file
is hashed to a word by providing a "dcookie" that maps
through the dcache to a given file and path.  The user
space tool that collects the data from the kernel trace
buffer queries the kernel and translates the stream back to
files in the file system and offsets into those files.   The
user space analysis tools can then take that information
and display the file name, instructions, and symbolic
debug information that may be contained in the typical
elf executable or other binary format.


1) sample rate setup

    In the current patch, the user specifies a sample rate as a time 
interval.
    The kernel is (a) calling cpufreq to get the current cpu frequency, 
(b)
    converting the rate to a cycle count, (c) converting this to a 24 bit
    LFSR count, an iterative algorithm (in this patch, starting from
    one of 256 values so a max of 2^16 or 64k iterations), (d) 
calculating
    an trace unload interval.   In addition, a cpufreq notifier is 
registered
    to recalculate on frequency changes.

    The obvious problem is step (c), running a loop potentially 64 
thousand
    times in kernel space will have a noticeable impact on other threads.

    I propose instead that user space perform the above 4 steps, and 
provide
    the kernel with two inputs: (1) the value to load in the LFSR and (2)
    the periodic frequency / time interval at which to empty the hardware
    trace buffer, perform sample analysis, and send the data to the 
oprofile
    subsystem.

    There should be no security issues with this approach.   If the LFSR 
value
    is calculated incorrectly, either it will be too short, causing the 
trace
    array to overfill and data to be dropped, or it will be too long, and
    there will be fewer samples.   Likewise, the kernel periodic poll 
can be
    too long, again causing overflow, or too frequent, causing only timer
    execution overhead.

    Various data is collected by the kernel while processing the 
periodic timer,
    this approach would also allow the profiling tools to control the
    frequency of this collection.   More frequent collection results in 
more
    accurate sample data, with the linear cost of poll execution 
overhead.

    Frequency changes can be handled either by the profile code setting
    collection at a higher than necessary rate, or by interacting with 
the
    governor to limit the speeds.

    Optionally, the kernel can add a record indicating that some data was
    likely dropped if it is able to read all 256 entries without 
underflowing
    the array.  This can be used as hint to user space that the kernel 
time
    was too long for the collection rate.


Data collected

The trace hardware provides the execution address of the SPUs in their
local store at some time in the past.   The samples are read most
efficiently in batch.

To be of use to oprofile, the raw address must be translated to the
execution context, and ideally to the persistent file system object
where the code is stored.   In addition, for the case where the elf
image is embedded in another file as data, the start of the elf
image is needed.

By draining the trace array on context switch, the kernel can map the
SPU number to the SPU context and linux thread id (and hence the mm
context of the process).  However, the SPU address recorded has an
arbitrary mapping to source address in that thread's context.   Because
the SPU is executing a copy of object mapped into its private, always
writable store, the normal oprofile mm lookup is ineffective.  In 
addition,
because of the active use of overlays, the mapping of SPU address to
source vm address changes over time.   The oprofile driver is 
necessarily
reading the sample later in time.

The current patch starts tackling these translation issues for the
presently common case of a static self contained binary from a single
file, either single separate source file or embedded in the data of
the host application.   When creating the trace entry for a SPU
context switch, it records the application owner, pid, tid, and
dcookie of the main executable.   It addition, it looks up the
object-id as a virtual address and records the offset if it is non-zero,
or the dcookie of the object if it is zero.   The code then creates
a data structure by reading the elf headers from the user process
(at the address given by the object-id) and building a list of
SPU address to elf object offsets, as specified by the ELF loader
headers.   In addition to the elf loader section, it processes the
overlay headers and records the address, size, and magic number of
the overlay.

When the hardware trace entries are processed, each address is
looked up this structure and translated to the elf offset.  If
it is an overlay region, the overlay identify word is read and
the list is searched for the matching overlay.  The resulting
offset is sent to the oprofile system.

The current patch specifically identifies that only single
elf objects are handled.  There is no code to handle dynamic
linked libraries or overlays.   Nor is there any method to
present samples that may have been collected during context
switch processing, they must be discarded.


My proposal is to change what is presented to user space.  Instead
of trying to translate the SPU address to the backing file
as the samples are recorded, store the samples as the SPU
context and address.  The context switch would record tid,
pid, object id as it does now.   In addition, if this is a
new object-id, the kernel would read elf headers as it does
today.  However, it would then proceed to provide accurate
dcookie information for each loader region and overlay.  To
identify which overlays are active, (instead of the present
read on use and search the list to translate approach) the
kernel would record the location of the overlay identifiers
as it parsed the kernel, but would then read the identification
word and would record the present value as an sample from
a separate but related stream.   The kernel could maintain
the last value for each overlay and only send profile events
for the deltas.

This approach trades translation lookup overhead for each
recorded sample for a burst of data on new context activation.
In addition it exposes the sample point of the overlay identifier
vs the address collection.  This allows the ambiguity to be
exposed to user space.   In addition, with the above proposed
kernel timer vs sample collection, user space could limit the
elapsed time between the address collection and the overlay
id check.

This approach allows multiple objects by its nature.  A new
elf header could be constructed in memory that contained
the union of the elf objects load segments, and the tools
will magically work.   Alternatively the object id could
point to a new structure, identified via a new header, that
it points to other elf headers (easily differentiated by the
elf magic headers).   Other binary formats, including several
objects in a ar archive, could be supported.

If better overlay identification is required, in theory the
overlay switch code could be augmented to record the switches
(DMA reference time from the PowerPC memory and record a
relative decrementer in the SPU), this is obviously a future
item.  But it is facilitated by having user space resolve the
SPU to source file translation.

milton
--
miltonm@bga.com   Milton Miller
Speaking for myself only.


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-08 14:18     ` Milton Miller
  0 siblings, 0 replies; 66+ messages in thread
From: Milton Miller @ 2007-02-08 14:18 UTC (permalink / raw)
  To: Carl Love; +Cc: linuxppc-dev, LKML, oprofile-list, cbe-oss-dev


On Feb 6, 2007, at 5:02 PM, Carl Love wrote:

> This is the first update to the patch previously posted by Maynard
> Johnson as "PATCH 4/4. Add support to OProfile for profiling CELL".
>
> This repost fixes the line wrap issue that Ben mentioned.  Also the 
> kref
> handling for the cached info has been fixed and simplified.
>
> There are still a few items from the comments being discussed
> specifically how to profile the dynamic code for the SPFS context 
> switch
> code and how to deal with dynamic code stubs for library support.  Our
> proposal is to assign the samples from the SPFS and dynamic library 
> code
> to an anonymous sample bucket.  The support for properly handling the
> symbol extraction in these cases would be deferred to a later SDK.
>
> There is also a bug in profiling overlay code that we are 
> investigating.

I'd like to talk about about both some of the concepts, including what
is logged and what the kernel API is, and provide some directed comments
on the code in the patch as it stands.

[Ok, the background and concepts portion is big enough, I'll send
this to the several lists for comment, and the code comments to
just cbe-oss-dev]

First, I'll give some background and my understanding of both the
hardware, and some of the linux interface.  I'm not consulting any
manuals, so I could be mistaken.   I'm basing this on a combination
of past reading of the kernel, this patch, a discussion with Arnd,
and my knowledge of the PowerPC processor and other IBM processors.
Hopefully this will be of use to other reviewers.

Background:

The cell broadband architecture consists of PowerPC processors
and synergistic processing units, or SPUs.  The current chip has
a multi-threaded PowerPC core and 8 SPUs.   Each SPU has an
execution core (running its own instruction set), a 256kB
private memory (local store), and DMA engine that can access
the coherent memory domain of the PowerPC processor(s).  Multiple
chips can be connected together in a single coherent SMP system.
The addresses provided to the DMA engine are effective, translated
through virtual to real address spaces like the PowerPC MMU.
The SPUs are intended to run application threads, and require
the PowerPC to perform exception handling and other operating
system tasks.

Because of the limited address space of the SPU (18
bits), the compilers (gcc) have support for code
overlays.  The ELF format defines these overlays
including file offsets, sizes, load address, and
a unique word to identify the overlay.  The toolchain
has support to check that an overlay is present and
setup the DMA engine to load it if it is not present.

In Linux, the SPUs are controlled through spufs.  Once
a SPU context is created, the local store and registers can be
modified through the file system.  A linux thread makes
a syscall requesting the context to run.  This call is
blocking to that thread; it waits for an event from the
SPU (either an exception or a reschedule). In this regard
the syscall can be thought of as a cross instruction set
function call.  The contents of the local store are
initialized and modified via read/write or mmap and memcopy
of a spufs file. Specifically, pages are not mapped into the
local store, they are copied into it.  The SPU context is
tied to the creating mm; this provides a clear context for
the DMA engine (controlled by the program in the SPU).

To assist with the use of the SPUs, and to provide some
portability to other environments, a library, libpse, is
available and widely used.   Among other items, it provides
an elf loader for SPU elf objects.  Applications may mmap
external objects or embed the elf objects into their
executable (as data).  The contained objects copied to the
SPU local store as dictated by the elf header.   To assist
other tools, spufs provides an object-id file.   Before this
patch, it has been treated as an opaque number. Although not
present in the first release of libpse, current versions write
the virtual address of the elf header to this file
when the loader is called.

This patch is trying to enable oprofile to collect and
analyze SPU execution, using the trace collection
hardware of the processor.   The trace collection hardware
consists of a LFSR (linear-feedback shift register) counter
and an array 256 bits wide and 256 entries deep.  When the
counter matches the programed value, it causes an entry
to be written to the trace array and the counter to be
reloaded.  On chip logic tracks how many entries have
been written and not yet read.  When programed to trace
the SPUs, the trace entry consists of the upper 16 bits of
the 18 bit program counter for each SPU; the 8 SPUs split
over the two words.  By their nature, LFSRs require a minimum
of hardware (typically 3 exclusive-or gates and a shift
register), but the count sequence is pseudo-random.

The oprofile system is designed to collect a stream of trace
data and context information to and provide it to user space
for post processing and analysis.   While at the lowest level
the data is a stream of words, the kernel and user space tools
agree on a protocol that provides meaning to the words by
prefixing stream elements with escape sequences to pass
infrequent context to interpret the trace data.

For most streams today, the kernel translates a hardware
collected address through the mm struct and vma list to
the backing file and offset in that file.  A given file
is hashed to a word by providing a "dcookie" that maps
through the dcache to a given file and path.  The user
space tool that collects the data from the kernel trace
buffer queries the kernel and translates the stream back to
files in the file system and offsets into those files.   The
user space analysis tools can then take that information
and display the file name, instructions, and symbolic
debug information that may be contained in the typical
elf executable or other binary format.


1) sample rate setup

    In the current patch, the user specifies a sample rate as a time 
interval.
    The kernel is (a) calling cpufreq to get the current cpu frequency, 
(b)
    converting the rate to a cycle count, (c) converting this to a 24 bit
    LFSR count, an iterative algorithm (in this patch, starting from
    one of 256 values so a max of 2^16 or 64k iterations), (d) 
calculating
    an trace unload interval.   In addition, a cpufreq notifier is 
registered
    to recalculate on frequency changes.

    The obvious problem is step (c), running a loop potentially 64 
thousand
    times in kernel space will have a noticeable impact on other threads.

    I propose instead that user space perform the above 4 steps, and 
provide
    the kernel with two inputs: (1) the value to load in the LFSR and (2)
    the periodic frequency / time interval at which to empty the hardware
    trace buffer, perform sample analysis, and send the data to the 
oprofile
    subsystem.

    There should be no security issues with this approach.   If the LFSR 
value
    is calculated incorrectly, either it will be too short, causing the 
trace
    array to overfill and data to be dropped, or it will be too long, and
    there will be fewer samples.   Likewise, the kernel periodic poll 
can be
    too long, again causing overflow, or too frequent, causing only timer
    execution overhead.

    Various data is collected by the kernel while processing the 
periodic timer,
    this approach would also allow the profiling tools to control the
    frequency of this collection.   More frequent collection results in 
more
    accurate sample data, with the linear cost of poll execution 
overhead.

    Frequency changes can be handled either by the profile code setting
    collection at a higher than necessary rate, or by interacting with 
the
    governor to limit the speeds.

    Optionally, the kernel can add a record indicating that some data was
    likely dropped if it is able to read all 256 entries without 
underflowing
    the array.  This can be used as hint to user space that the kernel 
time
    was too long for the collection rate.


Data collected

The trace hardware provides the execution address of the SPUs in their
local store at some time in the past.   The samples are read most
efficiently in batch.

To be of use to oprofile, the raw address must be translated to the
execution context, and ideally to the persistent file system object
where the code is stored.   In addition, for the case where the elf
image is embedded in another file as data, the start of the elf
image is needed.

By draining the trace array on context switch, the kernel can map the
SPU number to the SPU context and linux thread id (and hence the mm
context of the process).  However, the SPU address recorded has an
arbitrary mapping to source address in that thread's context.   Because
the SPU is executing a copy of object mapped into its private, always
writable store, the normal oprofile mm lookup is ineffective.  In 
addition,
because of the active use of overlays, the mapping of SPU address to
source vm address changes over time.   The oprofile driver is 
necessarily
reading the sample later in time.

The current patch starts tackling these translation issues for the
presently common case of a static self contained binary from a single
file, either single separate source file or embedded in the data of
the host application.   When creating the trace entry for a SPU
context switch, it records the application owner, pid, tid, and
dcookie of the main executable.   It addition, it looks up the
object-id as a virtual address and records the offset if it is non-zero,
or the dcookie of the object if it is zero.   The code then creates
a data structure by reading the elf headers from the user process
(at the address given by the object-id) and building a list of
SPU address to elf object offsets, as specified by the ELF loader
headers.   In addition to the elf loader section, it processes the
overlay headers and records the address, size, and magic number of
the overlay.

When the hardware trace entries are processed, each address is
looked up this structure and translated to the elf offset.  If
it is an overlay region, the overlay identify word is read and
the list is searched for the matching overlay.  The resulting
offset is sent to the oprofile system.

The current patch specifically identifies that only single
elf objects are handled.  There is no code to handle dynamic
linked libraries or overlays.   Nor is there any method to
present samples that may have been collected during context
switch processing, they must be discarded.


My proposal is to change what is presented to user space.  Instead
of trying to translate the SPU address to the backing file
as the samples are recorded, store the samples as the SPU
context and address.  The context switch would record tid,
pid, object id as it does now.   In addition, if this is a
new object-id, the kernel would read elf headers as it does
today.  However, it would then proceed to provide accurate
dcookie information for each loader region and overlay.  To
identify which overlays are active, (instead of the present
read on use and search the list to translate approach) the
kernel would record the location of the overlay identifiers
as it parsed the kernel, but would then read the identification
word and would record the present value as an sample from
a separate but related stream.   The kernel could maintain
the last value for each overlay and only send profile events
for the deltas.

This approach trades translation lookup overhead for each
recorded sample for a burst of data on new context activation.
In addition it exposes the sample point of the overlay identifier
vs the address collection.  This allows the ambiguity to be
exposed to user space.   In addition, with the above proposed
kernel timer vs sample collection, user space could limit the
elapsed time between the address collection and the overlay
id check.

This approach allows multiple objects by its nature.  A new
elf header could be constructed in memory that contained
the union of the elf objects load segments, and the tools
will magically work.   Alternatively the object id could
point to a new structure, identified via a new header, that
it points to other elf headers (easily differentiated by the
elf magic headers).   Other binary formats, including several
objects in a ar archive, could be supported.

If better overlay identification is required, in theory the
overlay switch code could be augmented to record the switches
(DMA reference time from the PowerPC memory and record a
relative decrementer in the SPU), this is obviously a future
item.  But it is facilitated by having user space resolve the
SPU to source file translation.

milton
--
miltonm@bga.com   Milton Miller
Speaking for myself only.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-07 15:41     ` Maynard Johnson
@ 2007-02-07 22:48       ` Michael Ellerman
  -1 siblings, 0 replies; 66+ messages in thread
From: Michael Ellerman @ 2007-02-07 22:48 UTC (permalink / raw)
  To: Maynard Johnson; +Cc: Carl Love, linuxppc-dev, linux-kernel, cbe-oss-dev

[-- Attachment #1: Type: text/plain, Size: 3804 bytes --]

On Wed, 2007-02-07 at 09:41 -0600, Maynard Johnson wrote:
> Carl Love wrote:
> 
> >
> >Subject: Add support to OProfile for profiling Cell BE SPUs
> >
> >From: Maynard Johnson <maynardj@us.ibm.com>
> >
> >This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
> >to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
> >was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
> >code.
> >
> >Signed-off-by: Carl Love <carll@us.ibm.com>
> >Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com>
> >
> >Index: linux-2.6.20-rc1/arch/powerpc/configs/cell_defconfig
> >  
> >
> I've discovered more problems with the kref handling for the cached_info 
> object that we store in the spu_context.  :-(
> 
> When the OProfile module initially determines that no cached_info yet 
> exists for a given spu_context, it creates the cached_info, inits the 
> cached_info's kref (which increments the refcount) and does a kref_get 
> (for SPUFS' ref) before passing the cached_info reference off to SUPFS 
> to store into the spu_context.  When OProfile shuts down or the SPU job 
> ends, OProfile gives up its ref to the cached_info with kref_put.  Then 
> when SPUFS destroys the spu_context, it also gives up its ref.  HOWEVER 
> . . . . If OProfile shuts down while the SPU job is still active _and_ 
> _then_ is restarted while the job is still active, OProfile will find 
> that the cached_info exists for the given spu_context, so it won't go 
> through the process of creating it and doing kref_init on the kref.  
> Under this scenario, OProfile does not own a ref of its own to the 
> cached_info, and should not be doing a kref_put when done using the 
> cached_info -- but it does, and so does SPUFS when the spu_context is 
> destroyed.  The end result (with the code as currently written) is that 
> an extra kref_put is done when the refcount is already down to zero.  To 
> fix this, OProfile needs to detect when it finds an existing cached_info 
> already stored in the spu_context.  Then, instead of creating a new one, 
> it sets a reminder flag to be used later when it's done using the cached 
> info to indicate whether or not it needs to call kref_put.

I think all you want to do is when oprofile finds the cached_info
already existing, it does a kref_get(). After all it doesn't have a
reference to it, so before it starts using it it must inc the ref count.

> Unfortunately, there's another problem (one that should have been 
> obvious to me).  The cached_info's kref "release" function is 
> destroy_cached_info(), defined in the OProfile module.  If the OProfile 
> module is unloaded when SPUFS destroys the spu_context and calls 
> kref_put on the cached_info's kref -- KABOOM!  The destroy_cached_info 
> function (the second arg to kref_put) is not in memory, so we get a 
> paging fault.  I see a couple options to solve this:
>     1) Don't store the cached_info in the spu_context.  Basically, go 
> back to the simplistic model of creating/deleting the cached_info on 
> every SPU task activation/deactivation.
>     2)  If there's some way to do this, force the OProfile module to 
> stay loaded until SPUFS has destroyed its last spu_context that holds a 
> cached_info object.

There is a mechanism for that, you just have each cached_info inc the
module's refcount.

Another option would be to have a mapping, in the oprofile code, from
spu_contexts to cached_infos, ie. a hash table or something.

cheers

-- 
Michael Ellerman
OzLabs, IBM Australia Development Lab

wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)

We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-07 22:48       ` Michael Ellerman
  0 siblings, 0 replies; 66+ messages in thread
From: Michael Ellerman @ 2007-02-07 22:48 UTC (permalink / raw)
  To: Maynard Johnson; +Cc: linuxppc-dev, linux-kernel, cbe-oss-dev, Carl Love

[-- Attachment #1: Type: text/plain, Size: 3804 bytes --]

On Wed, 2007-02-07 at 09:41 -0600, Maynard Johnson wrote:
> Carl Love wrote:
> 
> >
> >Subject: Add support to OProfile for profiling Cell BE SPUs
> >
> >From: Maynard Johnson <maynardj@us.ibm.com>
> >
> >This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
> >to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
> >was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
> >code.
> >
> >Signed-off-by: Carl Love <carll@us.ibm.com>
> >Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com>
> >
> >Index: linux-2.6.20-rc1/arch/powerpc/configs/cell_defconfig
> >  
> >
> I've discovered more problems with the kref handling for the cached_info 
> object that we store in the spu_context.  :-(
> 
> When the OProfile module initially determines that no cached_info yet 
> exists for a given spu_context, it creates the cached_info, inits the 
> cached_info's kref (which increments the refcount) and does a kref_get 
> (for SPUFS' ref) before passing the cached_info reference off to SUPFS 
> to store into the spu_context.  When OProfile shuts down or the SPU job 
> ends, OProfile gives up its ref to the cached_info with kref_put.  Then 
> when SPUFS destroys the spu_context, it also gives up its ref.  HOWEVER 
> . . . . If OProfile shuts down while the SPU job is still active _and_ 
> _then_ is restarted while the job is still active, OProfile will find 
> that the cached_info exists for the given spu_context, so it won't go 
> through the process of creating it and doing kref_init on the kref.  
> Under this scenario, OProfile does not own a ref of its own to the 
> cached_info, and should not be doing a kref_put when done using the 
> cached_info -- but it does, and so does SPUFS when the spu_context is 
> destroyed.  The end result (with the code as currently written) is that 
> an extra kref_put is done when the refcount is already down to zero.  To 
> fix this, OProfile needs to detect when it finds an existing cached_info 
> already stored in the spu_context.  Then, instead of creating a new one, 
> it sets a reminder flag to be used later when it's done using the cached 
> info to indicate whether or not it needs to call kref_put.

I think all you want to do is when oprofile finds the cached_info
already existing, it does a kref_get(). After all it doesn't have a
reference to it, so before it starts using it it must inc the ref count.

> Unfortunately, there's another problem (one that should have been 
> obvious to me).  The cached_info's kref "release" function is 
> destroy_cached_info(), defined in the OProfile module.  If the OProfile 
> module is unloaded when SPUFS destroys the spu_context and calls 
> kref_put on the cached_info's kref -- KABOOM!  The destroy_cached_info 
> function (the second arg to kref_put) is not in memory, so we get a 
> paging fault.  I see a couple options to solve this:
>     1) Don't store the cached_info in the spu_context.  Basically, go 
> back to the simplistic model of creating/deleting the cached_info on 
> every SPU task activation/deactivation.
>     2)  If there's some way to do this, force the OProfile module to 
> stay loaded until SPUFS has destroyed its last spu_context that holds a 
> cached_info object.

There is a mechanism for that, you just have each cached_info inc the
module's refcount.

Another option would be to have a mapping, in the oprofile code, from
spu_contexts to cached_infos, ie. a hash table or something.

cheers

-- 
Michael Ellerman
OzLabs, IBM Australia Development Lab

wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)

We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-06 23:02   ` Carl Love
@ 2007-02-07 15:41     ` Maynard Johnson
  -1 siblings, 0 replies; 66+ messages in thread
From: Maynard Johnson @ 2007-02-07 15:41 UTC (permalink / raw)
  To: Carl Love; +Cc: linux-kernel, linuxppc-dev, cbe-oss-dev

Carl Love wrote:

>
>Subject: Add support to OProfile for profiling Cell BE SPUs
>
>From: Maynard Johnson <maynardj@us.ibm.com>
>
>This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
>to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
>was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
>code.
>
>Signed-off-by: Carl Love <carll@us.ibm.com>
>Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com>
>
>Index: linux-2.6.20-rc1/arch/powerpc/configs/cell_defconfig
>  
>
I've discovered more problems with the kref handling for the cached_info 
object that we store in the spu_context.  :-(

When the OProfile module initially determines that no cached_info yet 
exists for a given spu_context, it creates the cached_info, inits the 
cached_info's kref (which increments the refcount) and does a kref_get 
(for SPUFS' ref) before passing the cached_info reference off to SUPFS 
to store into the spu_context.  When OProfile shuts down or the SPU job 
ends, OProfile gives up its ref to the cached_info with kref_put.  Then 
when SPUFS destroys the spu_context, it also gives up its ref.  HOWEVER 
. . . . If OProfile shuts down while the SPU job is still active _and_ 
_then_ is restarted while the job is still active, OProfile will find 
that the cached_info exists for the given spu_context, so it won't go 
through the process of creating it and doing kref_init on the kref.  
Under this scenario, OProfile does not own a ref of its own to the 
cached_info, and should not be doing a kref_put when done using the 
cached_info -- but it does, and so does SPUFS when the spu_context is 
destroyed.  The end result (with the code as currently written) is that 
an extra kref_put is done when the refcount is already down to zero.  To 
fix this, OProfile needs to detect when it finds an existing cached_info 
already stored in the spu_context.  Then, instead of creating a new one, 
it sets a reminder flag to be used later when it's done using the cached 
info to indicate whether or not it needs to call kref_put.

Unfortunately, there's another problem (one that should have been 
obvious to me).  The cached_info's kref "release" function is 
destroy_cached_info(), defined in the OProfile module.  If the OProfile 
module is unloaded when SPUFS destroys the spu_context and calls 
kref_put on the cached_info's kref -- KABOOM!  The destroy_cached_info 
function (the second arg to kref_put) is not in memory, so we get a 
paging fault.  I see a couple options to solve this:
    1) Don't store the cached_info in the spu_context.  Basically, go 
back to the simplistic model of creating/deleting the cached_info on 
every SPU task activation/deactivation.
    2)  If there's some way to do this, force the OProfile module to 
stay loaded until SPUFS has destroyed its last spu_context that holds a 
cached_info object.

I thought about putting the cached_info's kref "release" function in 
SPUFS, but this just won't work. This implies that SPUFS needs to know 
about the structure of the cached_info, e.g., that it contains the 
vma_map member that needs to be freed.  But even with that information, 
it's not enough, since the vma_map member consists of list of vma_maps, 
which is why we have the vma_map_free() function.  So SPUFS would still 
need access to vma_map_free() from the OProfile module.

Opinions from others would be appreciated.

Thanks.
-Maynard

>+/* Container for caching information about an active SPU task.
>+ * 
>+ */
>+struct cached_info {
>+	struct vma_to_fileoffset_map * map;
>+	struct spu * the_spu;   /* needed to access pointer to local_store */
>+	struct kref cache_ref;
>+};
>+
>+static struct cached_info * spu_info[MAX_NUMNODES * 8];
>+
>+static void destroy_cached_info(struct kref * kref)
>+{
>+	struct cached_info * info;
>+	info = container_of(kref, struct cached_info, cache_ref);
>+	vma_map_free(info->map);
>+	kfree(info);
>+}
>+
>+/* Return the cached_info for the passed SPU number.
>+ * 
>+ */
>+static struct cached_info * get_cached_info(struct spu * the_spu, int spu_num)
>+{
>+	struct cached_info * ret_info = NULL;
>+	unsigned long flags = 0;
>+	if (spu_num >= num_spu_nodes) {
>+		printk(KERN_ERR "SPU_PROF: " 
>+		       "%s, line %d: Invalid index %d into spu info cache\n",
>+		       __FUNCTION__, __LINE__, spu_num); 
>+		goto out;
>+	}
>+	spin_lock_irqsave(&cache_lock, flags);
>+	if (!spu_info[spu_num] && the_spu)
>+		spu_info[spu_num] = (struct cached_info *)
>+			spu_get_profile_private(the_spu->ctx);
>+
>+	ret_info = spu_info[spu_num];
>+	spin_unlock_irqrestore(&cache_lock, flags);
>+ out:
>+	return ret_info;
>+}
>+
>+
>+/* Looks for cached info for the passed spu.  If not found, the
>+ * cached info is created for the passed spu.
>+ * Returns 0 for success; otherwise, -1 for error.  
>+ */ 
>+static int
>+prepare_cached_spu_info(struct spu * spu, unsigned int objectId)
>+{
>+	unsigned long flags = 0;
>+	struct vma_to_fileoffset_map * new_map;
>+	int retval = 0;
>+	struct cached_info * info = get_cached_info(spu, spu->number);
>+
>+	if (info) {
>+		pr_debug("Found cached SPU info.\n");
>+		goto out;
>+	}
>+
>+	/* Create cached_info and set spu_info[spu->number] to point to it.
>+	 * spu->number is a system-wide value, not a per-node value.
>+	 */
>+	info = kzalloc(sizeof(struct cached_info), GFP_KERNEL);
>+	if (!info) {
>+		printk(KERN_ERR "SPU_PROF: "
>+		       "%s, line %d: create vma_map failed\n",
>+		       __FUNCTION__, __LINE__);
>+		goto err_alloc;
>+	}
>+	new_map = create_vma_map(spu, objectId);
>+	if (!new_map) {
>+		printk(KERN_ERR "SPU_PROF: "
>+		       "%s, line %d: create vma_map failed\n",
>+		       __FUNCTION__, __LINE__);
>+		goto err_alloc;
>+	}
>+
>+	pr_debug("Created vma_map\n");
>+	info->map = new_map;
>+	info->the_spu = spu;
>+	kref_init(&info->cache_ref);
>+	spin_lock_irqsave(&cache_lock, flags);
>+	spu_info[spu->number] = info;
>+	spin_unlock_irqrestore(&cache_lock, flags);
>+	/* Increment count before passing off ref to SPUFS. */
>+	kref_get(&info->cache_ref);
>+	spu_set_profile_private(spu->ctx, info, &info->cache_ref,
>+				destroy_cached_info);
>+	goto out;
>+	
>+err_alloc:
>+	retval = -1;
>+out:
>+	return retval;
>+}
>+
>+/*
>+ * NOTE:  The caller is responsible for locking the
>+ *	  cache_lock prior to calling this function.
>+ */
>+static int release_cached_info(int spu_index)
>+{
>+	int index, end;
>+	if (spu_index == RELEASE_ALL) {
>+		end = num_spu_nodes;
>+		index = 0;
>+	} else {
>+	        if (spu_index >= num_spu_nodes) {
>+        	        printk(KERN_ERR "SPU_PROF: "
>+			       "%s, line %d: Invalid index %d into spu info cache\n",
>+               	               __FUNCTION__, __LINE__, spu_index);
>+	                goto out;
>+	        }
>+		end = spu_index +1;
>+		index = spu_index;
>+	}
>+	for (; index < end; index++) {
>+		if (spu_info[index]) {
>+			kref_put(&spu_info[index]->cache_ref, destroy_cached_info);
>+			spu_info[index] = NULL;
>+		}
>+	}
>+
>+out:
>+	return 0;
>+}
>+
>Index: linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/context.c
>===================================================================
>--- linux-2.6.20-rc1.orig/arch/powerpc/platforms/cell/spufs/context.c	2007-02-05 14:42:04.359859432 -0600
>+++ linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/context.c	2007-02-06 16:44:05.983965096 -0600
>@@ -22,6 +22,7 @@
>
> #include <linux/fs.h>
> #include <linux/mm.h>
>+#include <linux/module.h>
> #include <linux/slab.h>
> #include <asm/spu.h>
> #include <asm/spu_csa.h>
>@@ -71,6 +72,8 @@
> 	spu_fini_csa(&ctx->csa);
> 	if (ctx->gang)
> 		spu_gang_remove_ctx(ctx->gang, ctx);
>+	if (ctx->prof_priv_kref)
>+		kref_put(ctx->prof_priv_kref, ctx->prof_priv_release);
> 	kfree(ctx);
> }
>
>@@ -200,3 +203,29 @@
>
> 	downgrade_write(&ctx->state_sema);
> }
>+
>+/* This interface allows a profiler (e.g., OProfile) to store
>+ * spu_context information needed for profiling, allowing it to
>+ * be saved across context save/restore operation.
>+ *
>+ * Assumes the caller has already incremented the ref count to
>+ * profile_info; then spu_context_destroy must call kref_put
>+ * on prof_info_kref.
>+ */
>+void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
>+			     struct kref * prof_info_kref,
>+			     void (* prof_info_release) (struct kref * kref))
>+{
>+	ctx->profile_private = profile_info;
>+	ctx->prof_priv_kref = prof_info_kref;
>+	ctx->prof_priv_release = prof_info_release;
>+}
>+EXPORT_SYMBOL_GPL(spu_set_profile_private);
>+
>+void * spu_get_profile_private(struct spu_context * ctx)
>+{
>+	return ctx->profile_private;
>+}
>+EXPORT_SYMBOL_GPL(spu_get_profile_private);
>+
>+
>
>
>  
>



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-07 15:41     ` Maynard Johnson
  0 siblings, 0 replies; 66+ messages in thread
From: Maynard Johnson @ 2007-02-07 15:41 UTC (permalink / raw)
  To: Carl Love; +Cc: linuxppc-dev, linux-kernel, cbe-oss-dev

Carl Love wrote:

>
>Subject: Add support to OProfile for profiling Cell BE SPUs
>
>From: Maynard Johnson <maynardj@us.ibm.com>
>
>This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
>to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
>was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
>code.
>
>Signed-off-by: Carl Love <carll@us.ibm.com>
>Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com>
>
>Index: linux-2.6.20-rc1/arch/powerpc/configs/cell_defconfig
>  
>
I've discovered more problems with the kref handling for the cached_info 
object that we store in the spu_context.  :-(

When the OProfile module initially determines that no cached_info yet 
exists for a given spu_context, it creates the cached_info, inits the 
cached_info's kref (which increments the refcount) and does a kref_get 
(for SPUFS' ref) before passing the cached_info reference off to SUPFS 
to store into the spu_context.  When OProfile shuts down or the SPU job 
ends, OProfile gives up its ref to the cached_info with kref_put.  Then 
when SPUFS destroys the spu_context, it also gives up its ref.  HOWEVER 
. . . . If OProfile shuts down while the SPU job is still active _and_ 
_then_ is restarted while the job is still active, OProfile will find 
that the cached_info exists for the given spu_context, so it won't go 
through the process of creating it and doing kref_init on the kref.  
Under this scenario, OProfile does not own a ref of its own to the 
cached_info, and should not be doing a kref_put when done using the 
cached_info -- but it does, and so does SPUFS when the spu_context is 
destroyed.  The end result (with the code as currently written) is that 
an extra kref_put is done when the refcount is already down to zero.  To 
fix this, OProfile needs to detect when it finds an existing cached_info 
already stored in the spu_context.  Then, instead of creating a new one, 
it sets a reminder flag to be used later when it's done using the cached 
info to indicate whether or not it needs to call kref_put.

Unfortunately, there's another problem (one that should have been 
obvious to me).  The cached_info's kref "release" function is 
destroy_cached_info(), defined in the OProfile module.  If the OProfile 
module is unloaded when SPUFS destroys the spu_context and calls 
kref_put on the cached_info's kref -- KABOOM!  The destroy_cached_info 
function (the second arg to kref_put) is not in memory, so we get a 
paging fault.  I see a couple options to solve this:
    1) Don't store the cached_info in the spu_context.  Basically, go 
back to the simplistic model of creating/deleting the cached_info on 
every SPU task activation/deactivation.
    2)  If there's some way to do this, force the OProfile module to 
stay loaded until SPUFS has destroyed its last spu_context that holds a 
cached_info object.

I thought about putting the cached_info's kref "release" function in 
SPUFS, but this just won't work. This implies that SPUFS needs to know 
about the structure of the cached_info, e.g., that it contains the 
vma_map member that needs to be freed.  But even with that information, 
it's not enough, since the vma_map member consists of list of vma_maps, 
which is why we have the vma_map_free() function.  So SPUFS would still 
need access to vma_map_free() from the OProfile module.

Opinions from others would be appreciated.

Thanks.
-Maynard

>+/* Container for caching information about an active SPU task.
>+ * 
>+ */
>+struct cached_info {
>+	struct vma_to_fileoffset_map * map;
>+	struct spu * the_spu;   /* needed to access pointer to local_store */
>+	struct kref cache_ref;
>+};
>+
>+static struct cached_info * spu_info[MAX_NUMNODES * 8];
>+
>+static void destroy_cached_info(struct kref * kref)
>+{
>+	struct cached_info * info;
>+	info = container_of(kref, struct cached_info, cache_ref);
>+	vma_map_free(info->map);
>+	kfree(info);
>+}
>+
>+/* Return the cached_info for the passed SPU number.
>+ * 
>+ */
>+static struct cached_info * get_cached_info(struct spu * the_spu, int spu_num)
>+{
>+	struct cached_info * ret_info = NULL;
>+	unsigned long flags = 0;
>+	if (spu_num >= num_spu_nodes) {
>+		printk(KERN_ERR "SPU_PROF: " 
>+		       "%s, line %d: Invalid index %d into spu info cache\n",
>+		       __FUNCTION__, __LINE__, spu_num); 
>+		goto out;
>+	}
>+	spin_lock_irqsave(&cache_lock, flags);
>+	if (!spu_info[spu_num] && the_spu)
>+		spu_info[spu_num] = (struct cached_info *)
>+			spu_get_profile_private(the_spu->ctx);
>+
>+	ret_info = spu_info[spu_num];
>+	spin_unlock_irqrestore(&cache_lock, flags);
>+ out:
>+	return ret_info;
>+}
>+
>+
>+/* Looks for cached info for the passed spu.  If not found, the
>+ * cached info is created for the passed spu.
>+ * Returns 0 for success; otherwise, -1 for error.  
>+ */ 
>+static int
>+prepare_cached_spu_info(struct spu * spu, unsigned int objectId)
>+{
>+	unsigned long flags = 0;
>+	struct vma_to_fileoffset_map * new_map;
>+	int retval = 0;
>+	struct cached_info * info = get_cached_info(spu, spu->number);
>+
>+	if (info) {
>+		pr_debug("Found cached SPU info.\n");
>+		goto out;
>+	}
>+
>+	/* Create cached_info and set spu_info[spu->number] to point to it.
>+	 * spu->number is a system-wide value, not a per-node value.
>+	 */
>+	info = kzalloc(sizeof(struct cached_info), GFP_KERNEL);
>+	if (!info) {
>+		printk(KERN_ERR "SPU_PROF: "
>+		       "%s, line %d: create vma_map failed\n",
>+		       __FUNCTION__, __LINE__);
>+		goto err_alloc;
>+	}
>+	new_map = create_vma_map(spu, objectId);
>+	if (!new_map) {
>+		printk(KERN_ERR "SPU_PROF: "
>+		       "%s, line %d: create vma_map failed\n",
>+		       __FUNCTION__, __LINE__);
>+		goto err_alloc;
>+	}
>+
>+	pr_debug("Created vma_map\n");
>+	info->map = new_map;
>+	info->the_spu = spu;
>+	kref_init(&info->cache_ref);
>+	spin_lock_irqsave(&cache_lock, flags);
>+	spu_info[spu->number] = info;
>+	spin_unlock_irqrestore(&cache_lock, flags);
>+	/* Increment count before passing off ref to SPUFS. */
>+	kref_get(&info->cache_ref);
>+	spu_set_profile_private(spu->ctx, info, &info->cache_ref,
>+				destroy_cached_info);
>+	goto out;
>+	
>+err_alloc:
>+	retval = -1;
>+out:
>+	return retval;
>+}
>+
>+/*
>+ * NOTE:  The caller is responsible for locking the
>+ *	  cache_lock prior to calling this function.
>+ */
>+static int release_cached_info(int spu_index)
>+{
>+	int index, end;
>+	if (spu_index == RELEASE_ALL) {
>+		end = num_spu_nodes;
>+		index = 0;
>+	} else {
>+	        if (spu_index >= num_spu_nodes) {
>+        	        printk(KERN_ERR "SPU_PROF: "
>+			       "%s, line %d: Invalid index %d into spu info cache\n",
>+               	               __FUNCTION__, __LINE__, spu_index);
>+	                goto out;
>+	        }
>+		end = spu_index +1;
>+		index = spu_index;
>+	}
>+	for (; index < end; index++) {
>+		if (spu_info[index]) {
>+			kref_put(&spu_info[index]->cache_ref, destroy_cached_info);
>+			spu_info[index] = NULL;
>+		}
>+	}
>+
>+out:
>+	return 0;
>+}
>+
>Index: linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/context.c
>===================================================================
>--- linux-2.6.20-rc1.orig/arch/powerpc/platforms/cell/spufs/context.c	2007-02-05 14:42:04.359859432 -0600
>+++ linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/context.c	2007-02-06 16:44:05.983965096 -0600
>@@ -22,6 +22,7 @@
>
> #include <linux/fs.h>
> #include <linux/mm.h>
>+#include <linux/module.h>
> #include <linux/slab.h>
> #include <asm/spu.h>
> #include <asm/spu_csa.h>
>@@ -71,6 +72,8 @@
> 	spu_fini_csa(&ctx->csa);
> 	if (ctx->gang)
> 		spu_gang_remove_ctx(ctx->gang, ctx);
>+	if (ctx->prof_priv_kref)
>+		kref_put(ctx->prof_priv_kref, ctx->prof_priv_release);
> 	kfree(ctx);
> }
>
>@@ -200,3 +203,29 @@
>
> 	downgrade_write(&ctx->state_sema);
> }
>+
>+/* This interface allows a profiler (e.g., OProfile) to store
>+ * spu_context information needed for profiling, allowing it to
>+ * be saved across context save/restore operation.
>+ *
>+ * Assumes the caller has already incremented the ref count to
>+ * profile_info; then spu_context_destroy must call kref_put
>+ * on prof_info_kref.
>+ */
>+void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
>+			     struct kref * prof_info_kref,
>+			     void (* prof_info_release) (struct kref * kref))
>+{
>+	ctx->profile_private = profile_info;
>+	ctx->prof_priv_kref = prof_info_kref;
>+	ctx->prof_priv_release = prof_info_release;
>+}
>+EXPORT_SYMBOL_GPL(spu_set_profile_private);
>+
>+void * spu_get_profile_private(struct spu_context * ctx)
>+{
>+	return ctx->profile_private;
>+}
>+EXPORT_SYMBOL_GPL(spu_get_profile_private);
>+
>+
>
>
>  
>

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-06  0:28 [RFC,PATCH] CELL PPU " Carl Love
@ 2007-02-06 23:02   ` Carl Love
  0 siblings, 0 replies; 66+ messages in thread
From: Carl Love @ 2007-02-06 23:02 UTC (permalink / raw)
  To: linux-kernel; +Cc: cbe-oss-dev, linuxppc-dev

This is the first update to the patch previously posted by Maynard
Johnson as "PATCH 4/4. Add support to OProfile for profiling CELL".  

This repost fixes the line wrap issue that Ben mentioned.  Also the kref
handling for the cached info has been fixed and simplified.

There are still a few items from the comments being discussed
specifically how to profile the dynamic code for the SPFS context switch
code and how to deal with dynamic code stubs for library support.  Our
proposal is to assign the samples from the SPFS and dynamic library code
to an anonymous sample bucket.  The support for properly handling the
symbol extraction in these cases would be deferred to a later SDK.    

There is also a bug in profiling overlay code that we are investigating.


Subject: Add support to OProfile for profiling Cell BE SPUs

From: Maynard Johnson <maynardj@us.ibm.com>

This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
code.

Signed-off-by: Carl Love <carll@us.ibm.com>
Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com>

Index: linux-2.6.20-rc1/arch/powerpc/configs/cell_defconfig
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/configs/cell_defconfig	2007-01-18 16:43:14.230540320 -0600
+++ linux-2.6.20-rc1/arch/powerpc/configs/cell_defconfig	2007-02-01 17:21:46.928875304 -0600
@@ -1403,7 +1403,7 @@
 # Instrumentation Support
 #
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
 # CONFIG_KPROBES is not set
 
 #
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/pr_util.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/pr_util.h	2007-02-03 15:56:01.094856152 -0600
@@ -0,0 +1,78 @@
+ /*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef PR_UTIL_H
+#define PR_UTIL_H
+
+#include <linux/cpumask.h>
+#include <linux/oprofile.h>
+#include <asm/cell-pmu.h>
+#include <asm/spu.h>
+
+static inline int number_of_online_nodes(void) 
+{
+	u32 cpu; u32 tmp;
+	int nodes = 0;
+	for_each_online_cpu(cpu) {
+		tmp = cbe_cpu_to_node(cpu) + 1;
+		if (tmp > nodes)
+			nodes++;
+	}
+	return nodes;
+}
+
+/* Defines used for sync_start */
+#define SKIP_GENERIC_SYNC 0
+#define SYNC_START_ERROR -1
+#define DO_GENERIC_SYNC 1
+
+struct vma_to_fileoffset_map
+{
+	struct vma_to_fileoffset_map *next;
+	unsigned int vma;
+	unsigned int size;
+	unsigned int offset;
+	unsigned int guard_ptr;
+	unsigned int guard_val;
+};
+
+/* The three functions below are for maintaining and accessing
+ * the vma-to-fileoffset map.
+ */
+struct vma_to_fileoffset_map * create_vma_map(const struct spu * spu, u64 objectid);
+unsigned int vma_map_lookup(struct vma_to_fileoffset_map *map, unsigned int vma,
+			    const struct spu * aSpu);
+void vma_map_free(struct vma_to_fileoffset_map *map);
+
+/*
+ * Entry point for SPU profiling.
+ * cycles_reset is the SPU_CYCLES count value specified by the user.
+ */
+void start_spu_profiling(unsigned int cycles_reset);
+
+void stop_spu_profiling(void);
+
+ 
+/* add the necessary profiling hooks */
+int spu_sync_start(void);
+
+/* remove the hooks */
+int spu_sync_stop(void);
+ 
+/* Record SPU program counter samples to the oprofile event buffer. */
+void spu_sync_buffer(int spu_num, unsigned int * samples, 
+		     int num_samples);
+
+void set_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset);
+
+#endif    // PR_UTIL_H 
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_profiler.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_profiler.c	2007-02-05 09:32:25.708937424 -0600
@@ -0,0 +1,203 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Authors: Maynard Johnson <maynardj@us.ibm.com>
+ *          Carl Love <carll@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/hrtimer.h>
+#include <linux/smp.h>
+#include <linux/slab.h>
+#include <asm/cell-pmu.h>
+#include <asm/time.h>
+#include "pr_util.h"
+
+#define TRACE_ARRAY_SIZE 1024
+#define SCALE_SHIFT 14 
+
+static u32 * samples;
+
+static int spu_prof_running = 0;
+static unsigned int profiling_interval = 0;
+
+extern int spu_prof_num_nodes;
+
+
+#define NUM_SPU_BITS_TRBUF 16
+#define SPUS_PER_TB_ENTRY   4
+#define SPUS_PER_NODE       8
+
+void set_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset)
+{
+	unsigned long nsPerCyc;
+	if (!freq_khz)
+		freq_khz = ppc_proc_freq/1000;
+
+        /* To calculate a timeout in nanoseconds, the basic
+	 * formula is ns = cycles_reset * (NSEC_PER_SEC / cpu frequency).
+	 * To avoid floating point math, we use the scale math
+	 * technique as described in linux/jiffies.h.  We use
+	 * a scale factor of SCALE_SHIFT,which provides 4 decimal places
+	 * of precision, which is close enough for the purpose at hand.
+	 */
+
+	nsPerCyc = (USEC_PER_SEC << SCALE_SHIFT)/freq_khz;
+	profiling_interval = (nsPerCyc * cycles_reset) >> SCALE_SHIFT;
+
+}
+
+/*
+ * Extract SPU PC from trace buffer entry
+ */
+static void spu_pc_extract(int cpu, int entry)
+{
+        /* the trace buffer is 128 bits */
+	u64 trace_buffer[2];
+	u64 spu_pc_lower;  
+	u64 spu_pc_upper;
+	u64 spu_mask;
+	int spu;
+	int node_factor;
+	
+	spu_mask = 0xFFFF;
+	node_factor = cbe_cpu_to_node(cpu) * SPUS_PER_NODE;
+	
+	/* Each SPU PC is 16 bits; hence, four spus in each of 
+	 * the two 64-bit buffer entries that make up the
+	 * 128-bit trace_buffer entry.  Process the upper and
+	 * lower 64-bit values simultaneously.
+	 * trace[0] SPU PC contents are: 0 1 2 3
+	 * trace[1] SPU PC contents are: 4 5 6 7
+	 */
+
+	cbe_read_trace_buffer(cpu, trace_buffer);  
+
+	for (spu = SPUS_PER_TB_ENTRY-1; spu >= 0; spu--) {
+		spu_pc_lower = spu_mask & trace_buffer[0];
+		trace_buffer[0] = trace_buffer[0] >> NUM_SPU_BITS_TRBUF;
+
+		spu_pc_upper = spu_mask & trace_buffer[1];
+		trace_buffer[1] = trace_buffer[1] >> NUM_SPU_BITS_TRBUF;
+		
+		/* spu PC trace entry is upper 16 bits of the
+		 * 18 bit SPU program counter 
+		 */
+		spu_pc_lower = spu_pc_lower << 2;
+		spu_pc_upper = spu_pc_upper << 2;
+		
+		samples[((node_factor + spu) * TRACE_ARRAY_SIZE) + entry]
+			= (u32) spu_pc_lower;
+		samples[((node_factor + spu + SPUS_PER_TB_ENTRY) 
+			 * TRACE_ARRAY_SIZE) + entry] = (u32) spu_pc_upper;
+	}
+}
+
+static int cell_spu_pc_collection(int cpu)
+{
+	u32 trace_addr;
+	int entry;
+
+	/* process the collected SPU PC for the node */
+
+	entry = 0;
+
+	trace_addr = cbe_read_pm(cpu, trace_address);
+	while ((trace_addr & CBE_PM_TRACE_BUF_EMPTY) != 0x400)
+	{
+		/* there is data in the trace buffer to process */
+		spu_pc_extract(cpu, entry);
+
+		entry++;
+
+		if (entry >= TRACE_ARRAY_SIZE) 
+			/* spu_samples is full */
+			break;
+
+		trace_addr = cbe_read_pm(cpu, trace_address);
+	}
+	return(entry);
+}
+
+
+static int profile_spus(struct hrtimer * timer)
+{
+	ktime_t kt;
+	int cpu, node, k, num_samples, spu_num;
+	
+	if (!spu_prof_running)
+		goto stop;
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+
+		node = cbe_cpu_to_node(cpu);
+
+		num_samples = cell_spu_pc_collection(cpu);
+
+		if (num_samples == 0)
+			continue;
+
+		for (k = 0; k < SPUS_PER_NODE; k++) {
+			spu_num = k + (node * SPUS_PER_NODE);
+			spu_sync_buffer(spu_num, 
+					samples + (spu_num * TRACE_ARRAY_SIZE),
+					num_samples);
+		}
+	}
+	smp_wmb();
+
+	kt = ktime_set(0, profiling_interval);
+	if (!spu_prof_running)
+		goto stop;
+	hrtimer_forward(timer, timer->base->get_time(), kt);
+	return HRTIMER_RESTART;
+
+ stop:
+	printk(KERN_INFO "SPU_PROF: spu-prof timer ending\n");
+	return HRTIMER_NORESTART;
+}
+
+static struct hrtimer timer;
+/*
+ * Entry point for SPU profiling.
+ * NOTE:  SPU profiling is done system-wide, not per-CPU.
+ *
+ * cycles_reset is the count value specified by the user when
+ * setting up OProfile to count SPU_CYCLES.
+ */
+void start_spu_profiling(unsigned int cycles_reset) {
+
+	ktime_t kt;
+	
+	pr_debug("timer resolution: %lu\n", 
+		 TICK_NSEC);
+	kt = ktime_set(0, profiling_interval);
+	hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_REL);
+	timer.expires = kt;
+	timer.function = profile_spus;
+
+        /* Allocate arrays for collecting SPU PC samples */
+	samples = (u32 *) kzalloc(spu_prof_num_nodes * SPUS_PER_NODE * 
+				  TRACE_ARRAY_SIZE * sizeof(u32), GFP_KERNEL);
+
+	spu_prof_running = 1;
+	hrtimer_start(&timer, kt, HRTIMER_REL);
+}
+
+void stop_spu_profiling(void) 
+{
+	spu_prof_running = 0;
+	hrtimer_cancel(&timer);
+	kfree(samples);
+	pr_debug("SPU_PROF: stop_spu_profiling issued\n");
+}
+
+
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_task_sync.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_task_sync.c	2007-02-06 16:43:27.832908640 -0600
@@ -0,0 +1,425 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* The purpose of this file is to handle SPU event task switching
+ * and to record SPU context information into the OProfile
+ * event buffer. 
+ *
+ * Additionally, the spu_sync_buffer function is provided as a helper
+ * for recoding actual SPU program counter samples to the event buffer.
+ */
+
+#include <linux/notifier.h>
+#include <linux/list.h>
+#include <linux/numa.h>
+#include <linux/mm.h>
+#include <linux/dcookies.h>
+#include <linux/spinlock.h>
+#include <linux/kref.h>
+#include <linux/oprofile.h>
+#include "pr_util.h"
+
+#define RELEASE_ALL 9999
+
+static spinlock_t buffer_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t cache_lock = SPIN_LOCK_UNLOCKED;
+static int num_spu_nodes;
+int spu_prof_num_nodes;
+
+/* Container for caching information about an active SPU task.
+ * 
+ */
+struct cached_info {
+	struct vma_to_fileoffset_map * map;
+	struct spu * the_spu;   /* needed to access pointer to local_store */
+	struct kref cache_ref;
+};
+
+static struct cached_info * spu_info[MAX_NUMNODES * 8];
+
+static void destroy_cached_info(struct kref * kref)
+{
+	struct cached_info * info;
+	info = container_of(kref, struct cached_info, cache_ref);
+	vma_map_free(info->map);
+	kfree(info);
+}
+
+/* Return the cached_info for the passed SPU number.
+ * 
+ */
+static struct cached_info * get_cached_info(struct spu * the_spu, int spu_num)
+{
+	struct cached_info * ret_info = NULL;
+	unsigned long flags = 0;
+	if (spu_num >= num_spu_nodes) {
+		printk(KERN_ERR "SPU_PROF: " 
+		       "%s, line %d: Invalid index %d into spu info cache\n",
+		       __FUNCTION__, __LINE__, spu_num); 
+		goto out;
+	}
+	spin_lock_irqsave(&cache_lock, flags);
+	if (!spu_info[spu_num] && the_spu)
+		spu_info[spu_num] = (struct cached_info *)
+			spu_get_profile_private(the_spu->ctx);
+
+	ret_info = spu_info[spu_num];
+	spin_unlock_irqrestore(&cache_lock, flags);
+ out:
+	return ret_info;
+}
+
+
+/* Looks for cached info for the passed spu.  If not found, the
+ * cached info is created for the passed spu.
+ * Returns 0 for success; otherwise, -1 for error.  
+ */ 
+static int
+prepare_cached_spu_info(struct spu * spu, unsigned int objectId)
+{
+	unsigned long flags = 0;
+	struct vma_to_fileoffset_map * new_map;
+	int retval = 0;
+	struct cached_info * info = get_cached_info(spu, spu->number);
+
+	if (info) {
+		pr_debug("Found cached SPU info.\n");
+		goto out;
+	}
+
+	/* Create cached_info and set spu_info[spu->number] to point to it.
+	 * spu->number is a system-wide value, not a per-node value.
+	 */
+	info = kzalloc(sizeof(struct cached_info), GFP_KERNEL);
+	if (!info) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: create vma_map failed\n",
+		       __FUNCTION__, __LINE__);
+		goto err_alloc;
+	}
+	new_map = create_vma_map(spu, objectId);
+	if (!new_map) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: create vma_map failed\n",
+		       __FUNCTION__, __LINE__);
+		goto err_alloc;
+	}
+
+	pr_debug("Created vma_map\n");
+	info->map = new_map;
+	info->the_spu = spu;
+	kref_init(&info->cache_ref);
+	spin_lock_irqsave(&cache_lock, flags);
+	spu_info[spu->number] = info;
+	spin_unlock_irqrestore(&cache_lock, flags);
+	/* Increment count before passing off ref to SPUFS. */
+	kref_get(&info->cache_ref);
+	spu_set_profile_private(spu->ctx, info, &info->cache_ref,
+				destroy_cached_info);
+	goto out;
+	
+err_alloc:
+	retval = -1;
+out:
+	return retval;
+}
+
+/*
+ * NOTE:  The caller is responsible for locking the
+ *	  cache_lock prior to calling this function.
+ */
+static int release_cached_info(int spu_index)
+{
+	int index, end;
+	if (spu_index == RELEASE_ALL) {
+		end = num_spu_nodes;
+		index = 0;
+	} else {
+	        if (spu_index >= num_spu_nodes) {
+        	        printk(KERN_ERR "SPU_PROF: "
+			       "%s, line %d: Invalid index %d into spu info cache\n",
+               	               __FUNCTION__, __LINE__, spu_index);
+	                goto out;
+	        }
+		end = spu_index +1;
+		index = spu_index;
+	}
+	for (; index < end; index++) {
+		if (spu_info[index]) {
+			kref_put(&spu_info[index]->cache_ref, destroy_cached_info);
+			spu_info[index] = NULL;
+		}
+	}
+
+out:
+	return 0;
+}
+
+/* The source code for fast_get_dcookie was "borrowed"
+ * from drivers/oprofile/buffer_sync.c.
+ */
+
+/* Optimisation. We can manage without taking the dcookie sem
+ * because we cannot reach this code without at least one
+ * dcookie user still being registered (namely, the reader
+ * of the event buffer).
+ */
+static inline unsigned long fast_get_dcookie(struct dentry * dentry,
+					     struct vfsmount * vfsmnt)
+{
+	unsigned long cookie;
+
+	if (dentry->d_cookie)
+		return (unsigned long)dentry;
+	get_dcookie(dentry, vfsmnt, &cookie);
+	return cookie;
+}
+
+/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
+ * which corresponds loosely to "application name". Also, determine
+ * the offset for the SPU ELF object.  If computed offset is 
+ * non-zero, it implies an embedded SPU object; otherwise, it's a
+ * separate SPU binary, in which case we retrieve it's dcookie.
+ */
+static unsigned long 
+get_exec_dcookie_and_offset(
+	struct spu * spu, unsigned int * offsetp,
+	unsigned long * spu_bin_dcookie,
+	unsigned int spu_ref)
+{
+	unsigned long cookie = 0;
+	unsigned int my_offset = 0;
+	struct vm_area_struct * vma;
+	struct mm_struct * mm = spu->mm;
+
+	if (!mm)
+		goto out;
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (!vma->vm_file)
+			continue;
+		if (!(vma->vm_flags & VM_EXECUTABLE))
+			continue;
+		cookie = fast_get_dcookie(vma->vm_file->f_dentry,
+					  vma->vm_file->f_vfsmnt);
+		pr_debug("got dcookie for %s\n",
+			 vma->vm_file->f_dentry->d_name.name);
+		break;
+	}
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->vm_start > spu_ref || vma->vm_end < spu_ref)
+			continue;
+		my_offset = spu_ref - vma->vm_start;
+		pr_debug("Found spu ELF at "
+			 " %X for file %s\n", my_offset,
+			 vma->vm_file->f_dentry->d_name.name);
+		*offsetp = my_offset;
+		if (my_offset == 0) {
+			if (!vma->vm_file) {
+				goto fail_no_spu_cookie;
+			}
+			*spu_bin_dcookie = fast_get_dcookie(
+				vma->vm_file->f_dentry,
+				vma->vm_file->f_vfsmnt);
+			pr_debug("got dcookie for %s\n",
+				 vma->vm_file->f_dentry->d_name.name);
+		}
+		break;			
+	}
+	
+ out:
+	return cookie;
+
+ fail_no_spu_cookie:
+	printk(KERN_ERR "SPU_PROF: "
+	       "%s, line %d: Cannot find dcookie for SPU binary\n",
+	       __FUNCTION__, __LINE__);
+	goto out;
+}
+
+
+
+/* This function finds or creates cached context information for the
+ * passed SPU and records SPU context information into the OProfile
+ * event buffer.
+ */
+static int process_context_switch(struct spu * spu, unsigned int objectId)
+{
+	unsigned long flags;
+	int retval = 0;
+	unsigned int offset = 0;
+	unsigned long spu_cookie = 0, app_dcookie = 0;
+	retval = prepare_cached_spu_info(spu, objectId);
+	if (retval == -1) {
+		goto out;
+	}
+        /* Get dcookie first because a mutex_lock is taken in that
+	 * code path, so interrupts must not be disabled.
+	 */
+	app_dcookie = get_exec_dcookie_and_offset(spu, &offset,
+						  &spu_cookie, objectId);
+
+        /* Record context info in event buffer */
+	spin_lock_irqsave(&buffer_lock, flags);
+	add_event_entry(ESCAPE_CODE);
+	add_event_entry(SPU_CTX_SWITCH_CODE);
+	add_event_entry(spu->number);
+	add_event_entry(spu->pid);
+	add_event_entry(spu->tgid);
+	add_event_entry(app_dcookie);
+
+	add_event_entry(ESCAPE_CODE);
+	if (offset) {
+	  /* When offset is non-zero,  this means the SPU ELF was embedded;
+	   * otherwise, it was loaded from a separate binary file.  For the
+	   * embedded case, we record the offset of the SPU ELF into the PPU
+	   * executable; for the non-embedded case, we record a dcookie that
+	   * points to the location of the SPU binary that was loaded.
+	   */
+		add_event_entry(SPU_OFFSET_CODE);
+		add_event_entry(offset);
+	} else {
+		add_event_entry(SPU_COOKIE_CODE);
+		add_event_entry(spu_cookie);
+	}
+	spin_unlock_irqrestore(&buffer_lock, flags);
+	smp_wmb();
+out:
+	return retval;
+}
+
+/* 
+ * This function is invoked on either a bind_context or unbind_context.  
+ * If called for an unbind_context, the val arg is 0; otherwise, 
+ * it is the object-id value for the spu context.
+ * The data arg is of type 'struct spu *'.
+ */
+static int spu_active_notify(struct notifier_block * self, unsigned long val,
+			     void * data)
+{
+	int retval;
+	unsigned long flags = 0;
+	struct spu * the_spu = data;
+	pr_debug("SPU event notification arrived\n");
+	if (!val){
+		spin_lock_irqsave(&cache_lock, flags);
+		retval = release_cached_info(the_spu->number);
+		spin_unlock_irqrestore(&cache_lock, flags);
+	} else {
+		retval = process_context_switch(the_spu, val);
+	}
+	return retval;
+}
+
+static struct notifier_block spu_active = {
+	.notifier_call = spu_active_notify,
+};
+
+/* The main purpose of this function is to synchronize
+ * OProfile with SPUFS by registering to be notified of
+ * SPU task switches.
+ *
+ * NOTE: When profiling SPUs, we must ensure that only
+ * spu_sync_start is invoked and not the generic sync_start
+ * in drivers/oprofile/oprof.c.  A return value of
+ * SKIP_GENERIC_SYNC or SYNC_START_ERROR will
+ * accomplish this.
+ */
+int spu_sync_start(void) {
+	int ret = SKIP_GENERIC_SYNC;
+	int register_ret;
+	unsigned long flags = 0;
+	spu_prof_num_nodes = number_of_online_nodes();
+	num_spu_nodes = spu_prof_num_nodes * 8;
+
+	spin_lock_irqsave(&buffer_lock, flags);
+	add_event_entry(ESCAPE_CODE);
+	add_event_entry(SPU_PROFILING_CODE);
+	add_event_entry(num_spu_nodes);
+	spin_unlock_irqrestore(&buffer_lock, flags);
+
+        /* Register for SPU events  */
+	register_ret = spu_switch_event_register(&spu_active);
+	if (register_ret) {
+		ret = SYNC_START_ERROR;
+		goto out;
+	}
+
+	pr_debug("spu_sync_start -- running.\n");
+out:
+	return ret;	
+}
+
+/* Record SPU program counter samples to the oprofile event buffer. */
+void spu_sync_buffer(int spu_num, unsigned int * samples, 
+		     int num_samples)
+{
+	unsigned long flags = 0;
+	int i;
+	struct vma_to_fileoffset_map * map;
+	struct spu * the_spu;
+	unsigned long long spu_num_ll = spu_num;
+	unsigned long long spu_num_shifted = spu_num_ll << 32;
+	struct cached_info * c_info = get_cached_info(NULL, spu_num);
+	if (c_info == NULL) {
+        /* This legitimately happens when the SPU task ends before all 
+	 * samples are recorded.  No big deal -- so we just drop a few samples.
+	 */
+		pr_debug("SPU_PROF: No cached SPU contex "
+			  "for SPU #%d. Dropping samples.\n", spu_num);
+		return ;
+	}
+
+	map = c_info->map;
+	the_spu = c_info->the_spu;
+	spin_lock_irqsave(&buffer_lock, flags);
+	for (i = 0; i < num_samples; i++) {
+		unsigned long long file_offset;
+		unsigned int sample = *(samples+i);
+		if (sample == 0)
+			continue;
+		file_offset = vma_map_lookup(
+			map, sample, the_spu);
+		/* For now, we'll drop samples that can't be mapped.
+		 * This can happen for generated stubs executed from
+		 * the SPU stack.  Do we need to record these somehow?
+		 */
+		if (unlikely(file_offset == -1))
+			continue;
+		add_event_entry(file_offset | spu_num_shifted);
+	}
+	spin_unlock_irqrestore(&buffer_lock, flags);
+}
+
+
+int spu_sync_stop(void)
+{
+	unsigned long flags = 0;
+	int ret = spu_switch_event_unregister(&spu_active);
+	if (ret) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: spu_switch_event_unregister returned %d\n",
+		       __FUNCTION__, __LINE__, ret);
+		goto out;
+	} 
+
+	spin_lock_irqsave(&cache_lock, flags);
+	ret = release_cached_info(RELEASE_ALL);
+	spin_unlock_irqrestore(&cache_lock, flags);
+out:
+	pr_debug("spu_sync_stop -- done.\n");
+	return ret;
+}
+
+
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/vma_map.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/vma_map.c	2007-02-01 17:21:46.944872872 -0600
@@ -0,0 +1,229 @@
+ /*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* The code in this source file is responsible for generating
+ * vma-to-fileOffset maps for both overlay and non-overlay SPU
+ * applications.
+ */
+
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/elf.h>
+#include "pr_util.h"
+
+
+void vma_map_free(struct vma_to_fileoffset_map *map)
+{
+	while (map) {
+		struct vma_to_fileoffset_map *next = map->next;
+		kfree(map);
+		map = next;
+	}
+}
+
+unsigned int
+vma_map_lookup(struct vma_to_fileoffset_map *map, unsigned int vma,
+	       const struct spu * aSpu)
+{
+	u32 offset = -1;
+	u32 ovly_grd;
+	for (; map; map = map->next) {
+		if (vma < map->vma || vma >= map->vma + map->size)
+			continue;
+
+		if (map->guard_ptr) {
+			ovly_grd = *(u32 *)(aSpu->local_store + map->guard_ptr);
+			if (ovly_grd != map->guard_val)
+				continue;
+		}
+		break;
+	}
+
+	if (likely(map != NULL)) {
+		offset = vma - map->vma + map->offset;
+	}
+	return offset;
+}
+
+static struct vma_to_fileoffset_map *
+vma_map_add(struct vma_to_fileoffset_map *map, unsigned int vma,
+	    unsigned int size, unsigned int offset, unsigned int guard_ptr,
+	    unsigned int guard_val)
+{
+	struct vma_to_fileoffset_map *new = kzalloc(sizeof(struct vma_to_fileoffset_map), GFP_KERNEL);
+	if (!new) {
+		printk(KERN_ERR "SPU_PROF: %s, line %d: malloc failed\n",
+		       __FUNCTION__, __LINE__);
+		vma_map_free(map);
+		return NULL;
+	}
+
+	new->next = map;
+	new->vma = vma;
+	new->size = size;
+	new->offset = offset;
+	new->guard_ptr = guard_ptr;
+	new->guard_val = guard_val;
+
+	return new;
+}
+
+
+/* Parse SPE ELF header and generate a list of vma_maps.
+ * A pointer to the first vma_map in the generated list
+ * of vma_maps is returned.  */
+struct vma_to_fileoffset_map * create_vma_map(const struct spu * aSpu, 
+					      unsigned long spu_elf_start)
+{
+	static const unsigned char expected[EI_PAD] = {
+		[EI_MAG0] = ELFMAG0,
+		[EI_MAG1] = ELFMAG1,
+		[EI_MAG2] = ELFMAG2,
+		[EI_MAG3] = ELFMAG3,
+		[EI_CLASS] = ELFCLASS32,
+		[EI_DATA] = ELFDATA2MSB,
+		[EI_VERSION] = EV_CURRENT,
+		[EI_OSABI] = ELFOSABI_NONE
+	};
+
+	struct vma_to_fileoffset_map *map = NULL;
+	unsigned int overlay_tbl_offset = -1;
+	unsigned long phdr_start, shdr_start;
+	Elf32_Ehdr ehdr;
+	Elf32_Phdr phdr;
+	Elf32_Shdr shdr, shdr_str;
+	Elf32_Sym sym;
+	int i, j;
+	char name[32];
+
+	unsigned int ovly_table_sym = 0;
+	unsigned int ovly_buf_table_sym = 0;
+	unsigned int ovly_table_end_sym = 0;
+	unsigned int ovly_buf_table_end_sym = 0;
+	unsigned long ovly_table;
+	unsigned int n_ovlys;
+
+	struct {
+		unsigned int vma;
+		unsigned int size;
+		unsigned int offset;
+		unsigned int buf;
+	} ovly;
+
+	/* Get and validate ELF header.  */
+
+	copy_from_user(&ehdr, (void *) spu_elf_start, sizeof (ehdr));
+	if (memcmp(ehdr.e_ident, expected, EI_PAD) != 0) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected value parsing SPU ELF\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	if (ehdr.e_machine != 23) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected value parsing SPU ELF\n",
+		       __FUNCTION__,  __LINE__);
+
+		return NULL;
+	}
+	if (ehdr.e_type != ET_EXEC) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected value parsing SPU ELF\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	phdr_start = spu_elf_start + ehdr.e_phoff;
+	shdr_start = spu_elf_start + ehdr.e_shoff;
+
+	/* Traverse program headers.  */
+	for (i = 0; i < ehdr.e_phnum; i++) {
+		copy_from_user(&phdr, (void *) (phdr_start + i * sizeof(phdr)), 
+			       sizeof(phdr));
+		if (phdr.p_type != PT_LOAD)
+			continue;
+		if (phdr.p_flags & (1 << 27))
+			continue;
+
+		map = vma_map_add(map, phdr.p_vaddr, phdr.p_memsz, 
+				  phdr.p_offset, 0, 0);
+		if (!map)
+			return NULL;
+	}
+
+	pr_debug("SPU_PROF: Created non-overlay maps\n");	
+	/* Traverse section table and search for overlay-related symbols.  */
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		copy_from_user(&shdr, (void *) (shdr_start + i * sizeof(shdr)), 
+			       sizeof(shdr));
+		if (shdr.sh_type != SHT_SYMTAB)
+			continue;
+		if (shdr.sh_entsize != sizeof (sym))
+			continue;
+
+		copy_from_user(&shdr_str, 
+			       (void *) (shdr_start + shdr.sh_link * sizeof(shdr)),
+			       sizeof(shdr));
+		if (shdr_str.sh_type != SHT_STRTAB)
+			return NULL;
+
+		for (j = 0; j < shdr.sh_size / sizeof (sym); j++) {
+			copy_from_user(&sym, (void *) (spu_elf_start +
+						       shdr.sh_offset + j * sizeof (sym)),
+				       sizeof (sym));
+			copy_from_user(name, (void *) (spu_elf_start + shdr_str.sh_offset + 
+						       sym.st_name),
+				       20);
+			if (memcmp(name, "_ovly_table", 12) == 0)
+				ovly_table_sym = sym.st_value;
+			if (memcmp(name, "_ovly_buf_table", 16) == 0)
+				ovly_buf_table_sym = sym.st_value;
+			if (memcmp(name, "_ovly_table_end", 16) == 0)
+				ovly_table_end_sym = sym.st_value;
+			if (memcmp(name, "_ovly_buf_table_end", 20) == 0)
+				ovly_buf_table_end_sym = sym.st_value;
+		}
+	}
+
+	/* If we don't have overlays, we're done.  */
+	if (ovly_table_sym == 0 || ovly_buf_table_sym == 0
+	    || ovly_table_end_sym == 0 || ovly_buf_table_end_sym == 0) {
+		pr_debug("SPU_PROF: No overlay table found\n");
+		return map;
+	}
+	else {
+		pr_debug("SPU_PROF: Overlay table found\n");
+	}
+
+	overlay_tbl_offset = vma_map_lookup(map, ovly_table_sym, aSpu);
+	if (overlay_tbl_offset < 0) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Error finding SPU overlay table\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	ovly_table = spu_elf_start + overlay_tbl_offset;
+	n_ovlys = (ovly_table_end_sym - ovly_table_sym) / sizeof (ovly);
+
+	/* Traverse overlay table.  */
+	for (i = 0; i < n_ovlys; i++) {
+		copy_from_user(&ovly, (void *) (ovly_table + i * sizeof (ovly)),
+			       sizeof (ovly));
+		map = vma_map_add(map, ovly.vma, ovly.size, ovly.offset,
+				   ovly_buf_table_sym + (ovly.buf - 1) * 4, i + 1);
+		if (!map)
+			return NULL;
+	}
+	
+	return map;
+}
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/common.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/common.c	2007-01-18 16:43:14.429510072 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/common.c	2007-02-01 17:21:46.946872568 -0600
@@ -150,6 +150,8 @@
 #ifdef CONFIG_PPC_CELL_NATIVE
 		case PPC_OPROFILE_CELL:
 			model = &op_model_cell;
+			ops->sync_start = model->sync_start;
+			ops->sync_stop = model->sync_stop;
 			break;
 #endif
 		case PPC_OPROFILE_RS64:
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Kconfig	2007-01-18 16:43:14.426510528 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig	2007-02-03 17:05:51.967892936 -0600
@@ -7,7 +7,8 @@
 
 config OPROFILE
 	tristate "OProfile system profiling (EXPERIMENTAL)"
-	depends on PROFILING
+	default m
+	depends on SPU_FS && PROFILING
 	help
 	  OProfile is a profiling system capable of profiling the
 	  whole system, include the kernel, kernel modules, libraries,
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/Makefile
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Makefile	2007-01-18 16:43:14.429510072 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/Makefile	2007-02-01 17:21:46.948872264 -0600
@@ -11,7 +11,8 @@
 		timer_int.o )
 
 oprofile-y := $(DRIVER_OBJS) common.o backtrace.o
-oprofile-$(CONFIG_PPC_CELL_NATIVE) += op_model_cell.o
+oprofile-$(CONFIG_PPC_CELL_NATIVE) += op_model_cell.o \
+					cell/spu_profiler.o cell/vma_map.o cell/spu_task_sync.o
 oprofile-$(CONFIG_PPC64) += op_model_rs64.o op_model_power4.o
 oprofile-$(CONFIG_FSL_BOOKE) += op_model_fsl_booke.o
 oprofile-$(CONFIG_6xx) += op_model_7450.o
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/op_model_cell.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/op_model_cell.c	2007-02-01 17:21:38.388840624 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/op_model_cell.c	2007-02-03 15:59:38.555810464 -0600
@@ -37,6 +37,16 @@
 #include <asm/system.h>
 
 #include "../platforms/cell/interrupt.h"
+#include "cell/pr_util.h"
+
+/* spu_cycle_reset is the number of cycles between samples.
+ * This variable is used for SPU profiling and should ONLY be set
+ * at the beginning of cell_reg_setup; otherwise, it's read-only.
+ */
+static unsigned int spu_cycle_reset = 0;
+
+#define NUM_SPUS_PER_NODE    8
+#define SPU_CYCLES_EVENT_NUM 2        /*  event number for SPU_CYCLES */
 
 #define PPU_CYCLES_EVENT_NUM 1	/*  event number for CYCLES */
 #define PPU_CYCLES_GRP_NUM   1  /* special group number for identifying
@@ -50,7 +60,6 @@
 #define NUM_TRACE_BUS_WORDS 4
 #define NUM_INPUT_BUS_WORDS 2
 
-
 struct pmc_cntrl_data {
 	unsigned long vcntr;
 	unsigned long evnts;
@@ -140,12 +149,21 @@
 /*
  * Firmware interface functions
  */
+
 static int
 rtas_ibm_cbe_perftools(int subfunc, int passthru,
 		       void *address, unsigned long length)
 {
 	u64 paddr = __pa(address);
 
+	pm_rtas_token = rtas_token("ibm,cbe-perftools");  
+
+	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
+	  printk(KERN_ERR
+		 "%s: rtas token ibm,cbe-perftools unknown\n",
+		 __FUNCTION__);
+	}
+
 	return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru,
 			 paddr >> 32, paddr & 0xffffffff, length);
 }
@@ -486,7 +504,12 @@
 	       struct op_system_config *sys, int num_ctrs)
 {
 	int i, j, cpu;
+	spu_cycle_reset = 0;
 
+	if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
+		spu_cycle_reset = ctr[0].count;
+		return;
+	}
 	pm_rtas_token = rtas_token("ibm,cbe-perftools");
 	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
 		printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE\n",
@@ -572,6 +595,8 @@
 	;
 }
 
+
+
 /* This function is called once for each cpu */
 static void cell_cpu_setup(struct op_counter_config *cntr)
 {
@@ -579,6 +604,9 @@
 	u32 num_enabled = 0;
 	int i;
 
+	if (spu_cycle_reset)
+		return;
+
 	/* There is one performance monitor per processor chip (i.e. node),
 	 * so we only need to perform this function once per node.
 	 */
@@ -613,11 +641,216 @@
 	;
 }
 
-static void cell_global_start(struct op_counter_config *ctr)
+#define size 24
+#define ENTRIES  (0x1<<8) /* 256 */
+#define MAXLFSR  0xFFFFFF
+
+int initial_lfsr[] =
+{16777215, 3797240, 13519805, 11602690, 6497030, 7614675, 2328937, 2889445,
+ 12364575, 8723156, 2450594, 16280864, 14742496, 10904589, 6434212, 4996256,
+ 5814270, 13014041, 9825245, 410260, 904096, 15151047, 15487695, 3061843,
+ 16482682, 7938572, 4893279, 9390321, 4320879, 5686402, 1711063, 10176714,
+ 4512270, 1057359, 16700434, 5731602, 2070114, 16030890, 1208230, 15603106,
+ 11857845, 6470172, 1362790, 7316876, 8534496, 1629197, 10003072, 1714539,
+ 1814669, 7106700, 5427154, 3395151, 3683327, 12950450, 16620273, 12122372,
+ 7194999, 9952750, 3608260, 13604295, 2266835, 14943567, 7079230, 777380,
+ 4516801, 1737661, 8730333, 13796927, 3247181, 9950017, 3481896, 16527555,
+ 13116123, 14505033, 9781119, 4860212, 7403253, 13264219, 12269980, 100120,
+ 664506, 607795, 8274553, 13133688, 6215305, 13208866, 16439693, 3320753,
+ 8773582, 13874619, 1784784, 4513501, 11002978, 9318515, 3038856, 14254582,
+ 15484958, 15967857, 13504461, 13657322, 14724513, 13955736, 5695315, 7330509,
+ 12630101, 6826854, 439712, 4609055, 13288878, 1309632, 4996398, 11392266,
+ 793740, 7653789, 2472670, 14641200, 5164364, 5482529, 10415855, 1629108,
+ 2012376, 13661123, 14655718, 9534083, 16637925, 2537745, 9787923, 12750103,
+ 4660370, 3283461, 14862772, 7034955, 6679872, 8918232, 6506913, 103649,
+ 6085577, 13324033, 14251613, 11058220, 11998181, 3100233, 468898, 7104918,
+ 12498413, 14408165, 1208514, 15712321, 3088687, 14778333, 3632503, 11151952,
+ 98896, 9159367, 8866146, 4780737, 4925758, 12362320, 4122783, 8543358,
+ 7056879, 10876914, 6282881, 1686625, 5100373, 4573666, 9265515, 13593840,
+ 5853060, 1188880, 4237111, 15765555, 14344137, 4608332, 6590210, 13745050,
+ 10916568, 12340402, 7145275, 4417153, 2300360, 12079643, 7608534, 15238251,
+ 4947424, 7014722, 3984546, 7168073, 10759589, 16293080, 3757181, 4577717,
+ 5163790, 2488841, 4650617, 3650022, 5440654, 1814617, 6939232, 15540909,
+ 501788, 1060986, 5058235, 5078222, 3734500, 10762065, 390862, 5172712,
+ 1070780, 7904429, 1669757, 3439997, 2956788, 14944927, 12496638, 994152,
+ 8901173, 11827497, 4268056, 15725859, 1694506, 5451950, 2892428, 1434298,
+ 9048323, 13558747, 15083840, 8154495, 15830901, 391127, 14970070, 2451434,
+ 2080347, 10775644, 14599429, 12540753, 4813943, 16140655, 2421772, 12724304,
+ 12935733, 7206473, 5697333, 10328104, 2418008, 13547986, 284246, 1732363,
+ 16375319, 8109554, 16372365, 14346072, 1835890, 13059499, 2442500, 4110674};
+
+/*
+ * The hardware uses an LFSR counting sequence to determine when to capture
+ * the SPU PCs.  The SPU PC capture is done when the LFSR sequence reaches the
+ * last value in the sequence.  An LFSR sequence is like a puesdo random 
+ * number sequence where each number occurs once in the sequence but the 
+ * sequence is not in numerical order.  To reduce the calculation time, a 
+ * sequence of 256 precomputed values in the LFSR sequence are stored in a
+ * table.  The nearest precomputed value is used as the initial point from
+ * which to caculate the desired LFSR value that is n from the end of the 
+ * sequence.  The lookup table reduces the maximum number of iterations in 
+ * the loop from 2^24 to 2^16.
+ */
+static int calculate_lfsr(int n)
 {
-	u32 cpu;
+  int i;
+
+  int start_lfsr_index;
+  unsigned int newlfsr0;
+  unsigned int lfsr = MAXLFSR;
+  unsigned int binsize = (MAXLFSR+1)/ENTRIES;
+  unsigned int howmany; 
+
+  start_lfsr_index = (MAXLFSR - n) / binsize;
+  lfsr = initial_lfsr[start_lfsr_index];
+  howmany = (MAXLFSR - n) - (start_lfsr_index * (binsize));
+
+  for (i = 2; i < howmany+2; i++) {
+    newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^
+		((lfsr >> (size - 1 - 1)) & 1) ^
+		(((lfsr >> (size - 1 - 6)) & 1) ^
+		 ((lfsr >> (size - 1 - 23)) & 1)));
+
+    lfsr >>= 1;
+    lfsr = lfsr | (newlfsr0 << (size - 1)); 
+  }
+  return lfsr;
+}
+
+static void pm_rtas_activate_spu_profiling(u32 node)
+{
+	int ret, i;
+	struct pm_signal pm_signal_local[NR_PHYS_CTRS];
+
+	/* Set up the rtas call to configure the debug bus to 
+	 * route the SPU PCs.  Setup the pm_signal for each SPU */
+	for (i = 0; i < NUM_SPUS_PER_NODE; i++) {
+		pm_signal_local[i].cpu = node;
+		pm_signal_local[i].signal_group = 41;
+		pm_signal_local[i].bus_word = 1 << i / 2; /* spu i on 
+							   * word (i/2) 
+							   */
+		pm_signal_local[i].sub_unit = i;	/* spu i */
+		pm_signal_local[i].bit = 63;
+	}
+
+	pm_rtas_token = rtas_token("ibm,cbe-perftools");
+	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
+		printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE \n",
+		       __FUNCTION__);
+	}
+
+	ret = rtas_ibm_cbe_perftools(SUBFUNC_ACTIVATE, PASSTHRU_ENABLE,
+				     pm_signal_local,
+				     8 * sizeof(struct pm_signal)); //FIXME 8 to #define
+
+	if (ret)
+		printk(KERN_WARNING "%s: rtas returned: %d\n",
+		       __FUNCTION__, ret);
+
+}
+
+#ifdef CONFIG_CPU_FREQ
+static int
+oprof_cpufreq_notify(struct notifier_block *nb, unsigned long val, void *data)
+{
+	int ret = 0;
+	struct cpufreq_freqs * frq = data;
+	if ((val == CPUFREQ_PRECHANGE && frq->old < frq->new) ||
+	    (val == CPUFREQ_POSTCHANGE && frq->old > frq->new) ||
+	    (val == CPUFREQ_RESUMECHANGE || val == CPUFREQ_SUSPENDCHANGE))
+		set_profiling_frequency(frq->new, spu_cycle_reset);
+	return ret;
+}
+
+static struct notifier_block cpu_freq_notifier_block = {
+	.notifier_call	= oprof_cpufreq_notify
+};
+#endif
+
+static void cell_global_start_spu(struct op_counter_config *ctr)
+{
+	int subfunc, rtn_value;
+	unsigned int lfsr_value;
+	int cpu;
+	int ret = 0;
+	unsigned int cpu_khzfreq = 0;
+
+	/* The SPU profiling uses time-based profiling based on
+	 * cpu frequency, so if configured with the CPU_FREQ
+	 * option, we should detect frequency changes and react
+	 * accordingly.
+	 */
+#ifdef CONFIG_CPU_FREQ
+	ret = cpufreq_register_notifier(&cpu_freq_notifier_block,
+					CPUFREQ_TRANSITION_NOTIFIER);
+	if (ret < 0)
+		printk(KERN_ERR "CPU freq change registration failed: %d\n",
+		       ret);
+	else
+		cpu_khzfreq = cpufreq_quick_get(smp_processor_id());
+#endif
+
+	set_profiling_frequency(cpu_khzfreq, spu_cycle_reset);
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+		/* Setup SPU cycle-based profiling.
+		 * Set perf_mon_control bit 0 to a zero before
+		 * enabling spu collection hardware.
+		 */
+		cbe_write_pm(cpu, pm_control, 0);
+
+		pm_rtas_activate_spu_profiling(cbe_cpu_to_node(cpu));
+
+		if (spu_cycle_reset > 0xFFFFFE) 
+				lfsr_value = calculate_lfsr(1);  /* use largest possible 
+								  *  value 
+								  */
+		else 
+		    lfsr_value = calculate_lfsr(spu_cycle_reset);
+
+		if (lfsr_value == 0) {  /* must use a non zero value.  Zero
+					 * disables data collection.
+					 */
+				lfsr_value = calculate_lfsr(1);  /* use largest possible 
+								  * value 
+								 */
+		}
+
+		lfsr_value = lfsr_value << 8; /* shift lfsr to correct 
+					       * register location
+					       */
+		
+		pm_rtas_token = rtas_token("ibm,cbe-spu-perftools");  
+
+		if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
+			printk(KERN_ERR
+			       "%s: rtas token ibm,cbe-spu-perftools unknown\n",
+			       __FUNCTION__);
+		}
+
+		subfunc = 2;	// 2 - activate SPU tracing, 3 - deactivate
+
+		rtn_value = rtas_call(pm_rtas_token, 3, 1, NULL, subfunc,
+			  cbe_cpu_to_node(cpu), lfsr_value);
+
+		if (rtn_value != 0)
+			printk(KERN_ERR
+			       "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n",
+			       __FUNCTION__, rtn_value);
+	}
+
+	start_spu_profiling(spu_cycle_reset);
+
+	oprofile_running = 1;
+}
+
+static void cell_global_start_ppu(struct op_counter_config *ctr)
+{
+	u32 cpu, i;
 	u32 interrupt_mask = 0;
-	u32 i;
 
 	/* This routine gets called once for the system.
 	 * There is one performance monitor per node, so we
@@ -658,7 +891,61 @@
 	start_virt_cntrs();
 }
 
-static void cell_global_stop(void)
+
+static void cell_global_start(struct op_counter_config *ctr)
+{
+	if (spu_cycle_reset) {
+		cell_global_start_spu(ctr);
+	} else {
+		cell_global_start_ppu(ctr);
+	}
+}
+
+static void cell_global_stop_spu(void)
+{
+	int subfunc, rtn_value;
+	unsigned int lfsr_value;
+	int cpu;
+
+	oprofile_running = 0;
+
+#ifdef CONFIG_CPU_FREQ
+	cpufreq_unregister_notifier(&cpu_freq_notifier_block,
+				    CPUFREQ_TRANSITION_NOTIFIER);
+#endif
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+
+		pm_rtas_token = rtas_token("ibm,cbe-spu-perftools");  
+
+		if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
+			printk(KERN_ERR
+			       "%s: rtas token ibm,cbe-spu-perftools unknown\n",
+			       __FUNCTION__);
+		}
+
+		subfunc = 3;	// 2 - activate SPU tracing, 3 - deactivate
+		lfsr_value = 0x8f100000;
+
+		rtn_value =
+		    rtas_call(pm_rtas_token, 3, 1, NULL, subfunc,
+			      cbe_cpu_to_node(cpu), lfsr_value);
+
+		if (rtn_value != 0)
+			printk
+			    ("ERROR, rtas call ibm,cbe-spu-perftools failed, return = %d\n",
+			     rtn_value);
+
+		/* Deactivate the signals */
+		pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
+	}
+
+	stop_spu_profiling();
+}
+
+static void cell_global_stop_ppu(void)
 {
 	int cpu;
 
@@ -686,6 +973,16 @@
 	}
 }
 
+static void cell_global_stop(void)
+{
+	if (spu_cycle_reset) {
+		cell_global_stop_spu();
+	} else {
+		cell_global_stop_ppu();
+	}
+
+}
+
 static void
 cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
 {
@@ -754,10 +1051,35 @@
 	spin_unlock_irqrestore(&virt_cntr_lock, flags);
 }
 
+/* This function is called from the generic OProfile
+ * driver.  When profiling PPUs, we need to do the
+ * generic sync start; otherwise, do spu_sync_start.
+ */
+static int cell_sync_start(void)
+{
+	if (spu_cycle_reset)
+		return spu_sync_start();
+	else
+		return DO_GENERIC_SYNC;
+}
+
+static int cell_sync_stop(void)
+{
+	if (spu_cycle_reset)
+		return spu_sync_stop();
+	else
+		return 1;
+}
+
+
 struct op_powerpc_model op_model_cell = {
 	.reg_setup = cell_reg_setup,
 	.cpu_setup = cell_cpu_setup,
 	.global_start = cell_global_start,
 	.global_stop = cell_global_stop,
+	.sync_start = cell_sync_start,
+	.sync_stop = cell_sync_stop,
 	.handle_interrupt = cell_handle_interrupt,
 };
+
+
Index: linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/sched.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/platforms/cell/spufs/sched.c	2007-02-01 17:21:41.943834416 -0600
+++ linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/sched.c	2007-02-01 17:21:46.957870896 -0600
@@ -129,6 +129,7 @@
 	ctx->spu = spu;
 	ctx->ops = &spu_hw_ops;
 	spu->pid = current->pid;
+	spu->tgid = current->tgid;
 	spu->prio = current->prio;
 	spu->mm = ctx->owner;
 	mm_needs_global_tlbie(spu->mm);
@@ -161,6 +162,7 @@
 	spu->dma_callback = NULL;
 	spu->mm = NULL;
 	spu->pid = 0;
+	spu->tgid = 0;
 	spu->prio = MAX_PRIO;
 	ctx->ops = &spu_backing_ops;
 	ctx->spu = NULL;
Index: linux-2.6.20-rc1/drivers/oprofile/buffer_sync.c
===================================================================
--- linux-2.6.20-rc1.orig/drivers/oprofile/buffer_sync.c	2007-01-18 16:43:11.675529376 -0600
+++ linux-2.6.20-rc1/drivers/oprofile/buffer_sync.c	2007-02-01 17:21:46.960870440 -0600
@@ -26,6 +26,7 @@
 #include <linux/profile.h>
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/oprofile.h>
  
 #include "oprofile_stats.h"
 #include "event_buffer.h"
Index: linux-2.6.20-rc1/drivers/oprofile/event_buffer.h
===================================================================
--- linux-2.6.20-rc1.orig/drivers/oprofile/event_buffer.h	2007-01-18 16:43:11.673529680 -0600
+++ linux-2.6.20-rc1/drivers/oprofile/event_buffer.h	2007-02-01 17:21:46.962870136 -0600
@@ -19,28 +19,10 @@
  
 /* wake up the process sleeping on the event file */
 void wake_up_buffer_waiter(void);
- 
-/* Each escaped entry is prefixed by ESCAPE_CODE
- * then one of the following codes, then the
- * relevant data.
- */
-#define ESCAPE_CODE			~0UL
-#define CTX_SWITCH_CODE 		1
-#define CPU_SWITCH_CODE 		2
-#define COOKIE_SWITCH_CODE 		3
-#define KERNEL_ENTER_SWITCH_CODE	4
-#define KERNEL_EXIT_SWITCH_CODE		5
-#define MODULE_LOADED_CODE		6
-#define CTX_TGID_CODE			7
-#define TRACE_BEGIN_CODE		8
-#define TRACE_END_CODE			9
- 
+  
 #define INVALID_COOKIE ~0UL
 #define NO_COOKIE 0UL
 
-/* add data to the event buffer */
-void add_event_entry(unsigned long data);
- 
 extern struct file_operations event_buffer_fops;
  
 /* mutex between sync_cpu_buffers() and the
Index: linux-2.6.20-rc1/drivers/oprofile/oprof.c
===================================================================
--- linux-2.6.20-rc1.orig/drivers/oprofile/oprof.c	2007-01-18 16:43:11.675529376 -0600
+++ linux-2.6.20-rc1/drivers/oprofile/oprof.c	2007-02-01 17:21:46.964869832 -0600
@@ -53,9 +53,23 @@
 	 * us missing task deaths and eventually oopsing
 	 * when trying to process the event buffer.
 	 */
+	if (oprofile_ops.sync_start) {
+		int sync_ret = oprofile_ops.sync_start();
+		switch (sync_ret) {
+			case 0: goto post_sync;
+				break;
+			case 1: goto do_generic;
+				break;
+			case -1: goto out3;
+				break;
+			default: goto out3;
+		}
+	}
+do_generic:
 	if ((err = sync_start()))
 		goto out3;
 
+post_sync:
 	is_setup = 1;
 	mutex_unlock(&start_mutex);
 	return 0;
@@ -118,7 +132,19 @@
 void oprofile_shutdown(void)
 {
 	mutex_lock(&start_mutex);
+        if (oprofile_ops.sync_stop) {
+                int sync_ret = oprofile_ops.sync_stop();
+                switch (sync_ret) {
+                        case 0: goto post_sync;
+                                break;
+                        case 1: goto do_generic;
+                                break;
+			default: goto post_sync;
+                }
+        }
+do_generic:
 	sync_stop();
+post_sync:
 	if (oprofile_ops.shutdown)
 		oprofile_ops.shutdown();
 	is_setup = 0;
Index: linux-2.6.20-rc1/include/asm-powerpc/oprofile_impl.h
===================================================================
--- linux-2.6.20-rc1.orig/include/asm-powerpc/oprofile_impl.h	2007-01-18 16:43:19.315566704 -0600
+++ linux-2.6.20-rc1/include/asm-powerpc/oprofile_impl.h	2007-02-01 17:21:46.966869528 -0600
@@ -47,6 +47,8 @@
         void (*global_start) (struct op_counter_config *);
 	void (*stop) (void);
 	void (*global_stop) (void);
+	int (*sync_start)(void);
+	int (*sync_stop)(void);
 	void (*handle_interrupt) (struct pt_regs *,
 				  struct op_counter_config *);
 	int num_counters;
Index: linux-2.6.20-rc1/include/asm-powerpc/spu.h
===================================================================
--- linux-2.6.20-rc1.orig/include/asm-powerpc/spu.h	2007-02-01 17:21:41.950833352 -0600
+++ linux-2.6.20-rc1/include/asm-powerpc/spu.h	2007-02-05 08:34:38.498856800 -0600
@@ -128,6 +128,7 @@
 	struct spu_runqueue *rq;
 	unsigned long long timestamp;
 	pid_t pid;
+	pid_t tgid;
 	int prio;
 	int class_0_pending;
 	spinlock_t register_lock;
@@ -153,6 +154,11 @@
 int spu_irq_class_0_bottom(struct spu *spu);
 int spu_irq_class_1_bottom(struct spu *spu);
 void spu_irq_setaffinity(struct spu *spu, int cpu);
+void * spu_get_profile_private(struct spu_context * ctx);
+void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
+			     struct kref * prof_info_kref, 
+			     void (* prof_info_release) (struct kref * kref));
+
 
 /* system callbacks from the SPU */
 struct spu_syscall_block {
Index: linux-2.6.20-rc1/include/linux/oprofile.h
===================================================================
--- linux-2.6.20-rc1.orig/include/linux/oprofile.h	2007-01-18 16:43:18.379575976 -0600
+++ linux-2.6.20-rc1/include/linux/oprofile.h	2007-02-01 17:21:46.970868920 -0600
@@ -17,6 +17,28 @@
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
  
+/* Each escaped entry is prefixed by ESCAPE_CODE
+ * then one of the following codes, then the
+ * relevant data.
+ * These #defines live in this file so that arch-specific
+ * buffer sync'ing code can access them.  
+ */
+#define ESCAPE_CODE                     ~0UL
+#define CTX_SWITCH_CODE                 1
+#define CPU_SWITCH_CODE                 2
+#define COOKIE_SWITCH_CODE              3
+#define KERNEL_ENTER_SWITCH_CODE        4
+#define KERNEL_EXIT_SWITCH_CODE         5
+#define MODULE_LOADED_CODE              6
+#define CTX_TGID_CODE                   7
+#define TRACE_BEGIN_CODE                8
+#define TRACE_END_CODE                  9
+#define XEN_ENTER_SWITCH_CODE          10
+#define SPU_PROFILING_CODE             11
+#define SPU_CTX_SWITCH_CODE            12
+#define SPU_OFFSET_CODE                13
+#define SPU_COOKIE_CODE                14
+
 struct super_block;
 struct dentry;
 struct file_operations;
@@ -35,6 +57,14 @@
 	int (*start)(void);
 	/* Stop delivering interrupts. */
 	void (*stop)(void);
+	/* Arch-specific buffer sync functions.
+	 * Return value = 0:  Success
+	 * Return value = -1: Failure
+	 * Return value = 1:  Run generic sync function
+	 */
+	int (*sync_start)(void);
+	int (*sync_stop)(void);
+
 	/* Initiate a stack backtrace. Optional. */
 	void (*backtrace)(struct pt_regs * const regs, unsigned int depth);
 	/* CPU identification string. */
@@ -56,6 +86,13 @@
 void oprofile_arch_exit(void);
 
 /**
+ * Add data to the event buffer.
+ * The data passed is free-form, but typically consists of
+ * file offsets, dcookies, context information, and ESCAPE codes.
+ */
+void add_event_entry(unsigned long data);
+ 
+/**
  * Add a sample. This may be called from any context. Pass
  * smp_processor_id() as cpu.
  */
Index: linux-2.6.20-rc1/kernel/hrtimer.c
===================================================================
--- linux-2.6.20-rc1.orig/kernel/hrtimer.c	2007-01-18 16:43:05.808489704 -0600
+++ linux-2.6.20-rc1/kernel/hrtimer.c	2007-02-01 17:21:46.973868464 -0600
@@ -335,6 +335,7 @@
 
 	return orun;
 }
+EXPORT_SYMBOL_GPL(hrtimer_forward);
 
 /*
  * enqueue_hrtimer - internal function to (re)start a timer
Index: linux-2.6.20-rc1/arch/powerpc/kernel/time.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/kernel/time.c	2007-02-02 15:47:08.624906680 -0600
+++ linux-2.6.20-rc1/arch/powerpc/kernel/time.c	2007-02-02 17:06:28.183894912 -0600
@@ -122,6 +122,7 @@
 static long timezone_offset;
 
 unsigned long ppc_proc_freq;
+EXPORT_SYMBOL(ppc_proc_freq);
 unsigned long ppc_tb_freq;
 
 static u64 tb_last_jiffy __cacheline_aligned_in_smp;
Index: linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/spufs.h
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/platforms/cell/spufs/spufs.h	2007-02-01 17:21:41.945834112 -0600
+++ linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/spufs.h	2007-02-05 08:06:01.793907392 -0600
@@ -75,6 +75,9 @@
 
 	struct list_head gang_list;
 	struct spu_gang *gang;
+	void * profile_private;		/* To be used only by profiler */
+	struct kref * prof_priv_kref;
+	void (* prof_priv_release) (struct kref *kref);
 };
 
 struct spu_gang {
Index: linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/context.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/platforms/cell/spufs/context.c	2007-02-05 14:42:04.359859432 -0600
+++ linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/context.c	2007-02-06 16:44:05.983965096 -0600
@@ -22,6 +22,7 @@
 
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 #include <linux/slab.h>
 #include <asm/spu.h>
 #include <asm/spu_csa.h>
@@ -71,6 +72,8 @@
 	spu_fini_csa(&ctx->csa);
 	if (ctx->gang)
 		spu_gang_remove_ctx(ctx->gang, ctx);
+	if (ctx->prof_priv_kref)
+		kref_put(ctx->prof_priv_kref, ctx->prof_priv_release);
 	kfree(ctx);
 }
 
@@ -200,3 +203,29 @@
 
 	downgrade_write(&ctx->state_sema);
 }
+
+/* This interface allows a profiler (e.g., OProfile) to store
+ * spu_context information needed for profiling, allowing it to
+ * be saved across context save/restore operation.
+ *
+ * Assumes the caller has already incremented the ref count to
+ * profile_info; then spu_context_destroy must call kref_put
+ * on prof_info_kref.
+ */
+void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
+			     struct kref * prof_info_kref,
+			     void (* prof_info_release) (struct kref * kref))
+{
+	ctx->profile_private = profile_info;
+	ctx->prof_priv_kref = prof_info_kref;
+	ctx->prof_priv_release = prof_info_release;
+}
+EXPORT_SYMBOL_GPL(spu_set_profile_private);
+
+void * spu_get_profile_private(struct spu_context * ctx)
+{
+	return ctx->profile_private;
+}
+EXPORT_SYMBOL_GPL(spu_get_profile_private);
+
+



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
@ 2007-02-06 23:02   ` Carl Love
  0 siblings, 0 replies; 66+ messages in thread
From: Carl Love @ 2007-02-06 23:02 UTC (permalink / raw)
  To: linux-kernel; +Cc: linuxppc-dev, cbe-oss-dev

This is the first update to the patch previously posted by Maynard
Johnson as "PATCH 4/4. Add support to OProfile for profiling CELL".  

This repost fixes the line wrap issue that Ben mentioned.  Also the kref
handling for the cached info has been fixed and simplified.

There are still a few items from the comments being discussed
specifically how to profile the dynamic code for the SPFS context switch
code and how to deal with dynamic code stubs for library support.  Our
proposal is to assign the samples from the SPFS and dynamic library code
to an anonymous sample bucket.  The support for properly handling the
symbol extraction in these cases would be deferred to a later SDK.    

There is also a bug in profiling overlay code that we are investigating.


Subject: Add support to OProfile for profiling Cell BE SPUs

From: Maynard Johnson <maynardj@us.ibm.com>

This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
code.

Signed-off-by: Carl Love <carll@us.ibm.com>
Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com>

Index: linux-2.6.20-rc1/arch/powerpc/configs/cell_defconfig
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/configs/cell_defconfig	2007-01-18 16:43:14.230540320 -0600
+++ linux-2.6.20-rc1/arch/powerpc/configs/cell_defconfig	2007-02-01 17:21:46.928875304 -0600
@@ -1403,7 +1403,7 @@
 # Instrumentation Support
 #
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
 # CONFIG_KPROBES is not set
 
 #
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/pr_util.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/pr_util.h	2007-02-03 15:56:01.094856152 -0600
@@ -0,0 +1,78 @@
+ /*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef PR_UTIL_H
+#define PR_UTIL_H
+
+#include <linux/cpumask.h>
+#include <linux/oprofile.h>
+#include <asm/cell-pmu.h>
+#include <asm/spu.h>
+
+static inline int number_of_online_nodes(void) 
+{
+	u32 cpu; u32 tmp;
+	int nodes = 0;
+	for_each_online_cpu(cpu) {
+		tmp = cbe_cpu_to_node(cpu) + 1;
+		if (tmp > nodes)
+			nodes++;
+	}
+	return nodes;
+}
+
+/* Defines used for sync_start */
+#define SKIP_GENERIC_SYNC 0
+#define SYNC_START_ERROR -1
+#define DO_GENERIC_SYNC 1
+
+struct vma_to_fileoffset_map
+{
+	struct vma_to_fileoffset_map *next;
+	unsigned int vma;
+	unsigned int size;
+	unsigned int offset;
+	unsigned int guard_ptr;
+	unsigned int guard_val;
+};
+
+/* The three functions below are for maintaining and accessing
+ * the vma-to-fileoffset map.
+ */
+struct vma_to_fileoffset_map * create_vma_map(const struct spu * spu, u64 objectid);
+unsigned int vma_map_lookup(struct vma_to_fileoffset_map *map, unsigned int vma,
+			    const struct spu * aSpu);
+void vma_map_free(struct vma_to_fileoffset_map *map);
+
+/*
+ * Entry point for SPU profiling.
+ * cycles_reset is the SPU_CYCLES count value specified by the user.
+ */
+void start_spu_profiling(unsigned int cycles_reset);
+
+void stop_spu_profiling(void);
+
+ 
+/* add the necessary profiling hooks */
+int spu_sync_start(void);
+
+/* remove the hooks */
+int spu_sync_stop(void);
+ 
+/* Record SPU program counter samples to the oprofile event buffer. */
+void spu_sync_buffer(int spu_num, unsigned int * samples, 
+		     int num_samples);
+
+void set_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset);
+
+#endif    // PR_UTIL_H 
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_profiler.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_profiler.c	2007-02-05 09:32:25.708937424 -0600
@@ -0,0 +1,203 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Authors: Maynard Johnson <maynardj@us.ibm.com>
+ *          Carl Love <carll@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/hrtimer.h>
+#include <linux/smp.h>
+#include <linux/slab.h>
+#include <asm/cell-pmu.h>
+#include <asm/time.h>
+#include "pr_util.h"
+
+#define TRACE_ARRAY_SIZE 1024
+#define SCALE_SHIFT 14 
+
+static u32 * samples;
+
+static int spu_prof_running = 0;
+static unsigned int profiling_interval = 0;
+
+extern int spu_prof_num_nodes;
+
+
+#define NUM_SPU_BITS_TRBUF 16
+#define SPUS_PER_TB_ENTRY   4
+#define SPUS_PER_NODE       8
+
+void set_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset)
+{
+	unsigned long nsPerCyc;
+	if (!freq_khz)
+		freq_khz = ppc_proc_freq/1000;
+
+        /* To calculate a timeout in nanoseconds, the basic
+	 * formula is ns = cycles_reset * (NSEC_PER_SEC / cpu frequency).
+	 * To avoid floating point math, we use the scale math
+	 * technique as described in linux/jiffies.h.  We use
+	 * a scale factor of SCALE_SHIFT,which provides 4 decimal places
+	 * of precision, which is close enough for the purpose at hand.
+	 */
+
+	nsPerCyc = (USEC_PER_SEC << SCALE_SHIFT)/freq_khz;
+	profiling_interval = (nsPerCyc * cycles_reset) >> SCALE_SHIFT;
+
+}
+
+/*
+ * Extract SPU PC from trace buffer entry
+ */
+static void spu_pc_extract(int cpu, int entry)
+{
+        /* the trace buffer is 128 bits */
+	u64 trace_buffer[2];
+	u64 spu_pc_lower;  
+	u64 spu_pc_upper;
+	u64 spu_mask;
+	int spu;
+	int node_factor;
+	
+	spu_mask = 0xFFFF;
+	node_factor = cbe_cpu_to_node(cpu) * SPUS_PER_NODE;
+	
+	/* Each SPU PC is 16 bits; hence, four spus in each of 
+	 * the two 64-bit buffer entries that make up the
+	 * 128-bit trace_buffer entry.  Process the upper and
+	 * lower 64-bit values simultaneously.
+	 * trace[0] SPU PC contents are: 0 1 2 3
+	 * trace[1] SPU PC contents are: 4 5 6 7
+	 */
+
+	cbe_read_trace_buffer(cpu, trace_buffer);  
+
+	for (spu = SPUS_PER_TB_ENTRY-1; spu >= 0; spu--) {
+		spu_pc_lower = spu_mask & trace_buffer[0];
+		trace_buffer[0] = trace_buffer[0] >> NUM_SPU_BITS_TRBUF;
+
+		spu_pc_upper = spu_mask & trace_buffer[1];
+		trace_buffer[1] = trace_buffer[1] >> NUM_SPU_BITS_TRBUF;
+		
+		/* spu PC trace entry is upper 16 bits of the
+		 * 18 bit SPU program counter 
+		 */
+		spu_pc_lower = spu_pc_lower << 2;
+		spu_pc_upper = spu_pc_upper << 2;
+		
+		samples[((node_factor + spu) * TRACE_ARRAY_SIZE) + entry]
+			= (u32) spu_pc_lower;
+		samples[((node_factor + spu + SPUS_PER_TB_ENTRY) 
+			 * TRACE_ARRAY_SIZE) + entry] = (u32) spu_pc_upper;
+	}
+}
+
+static int cell_spu_pc_collection(int cpu)
+{
+	u32 trace_addr;
+	int entry;
+
+	/* process the collected SPU PC for the node */
+
+	entry = 0;
+
+	trace_addr = cbe_read_pm(cpu, trace_address);
+	while ((trace_addr & CBE_PM_TRACE_BUF_EMPTY) != 0x400)
+	{
+		/* there is data in the trace buffer to process */
+		spu_pc_extract(cpu, entry);
+
+		entry++;
+
+		if (entry >= TRACE_ARRAY_SIZE) 
+			/* spu_samples is full */
+			break;
+
+		trace_addr = cbe_read_pm(cpu, trace_address);
+	}
+	return(entry);
+}
+
+
+static int profile_spus(struct hrtimer * timer)
+{
+	ktime_t kt;
+	int cpu, node, k, num_samples, spu_num;
+	
+	if (!spu_prof_running)
+		goto stop;
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+
+		node = cbe_cpu_to_node(cpu);
+
+		num_samples = cell_spu_pc_collection(cpu);
+
+		if (num_samples == 0)
+			continue;
+
+		for (k = 0; k < SPUS_PER_NODE; k++) {
+			spu_num = k + (node * SPUS_PER_NODE);
+			spu_sync_buffer(spu_num, 
+					samples + (spu_num * TRACE_ARRAY_SIZE),
+					num_samples);
+		}
+	}
+	smp_wmb();
+
+	kt = ktime_set(0, profiling_interval);
+	if (!spu_prof_running)
+		goto stop;
+	hrtimer_forward(timer, timer->base->get_time(), kt);
+	return HRTIMER_RESTART;
+
+ stop:
+	printk(KERN_INFO "SPU_PROF: spu-prof timer ending\n");
+	return HRTIMER_NORESTART;
+}
+
+static struct hrtimer timer;
+/*
+ * Entry point for SPU profiling.
+ * NOTE:  SPU profiling is done system-wide, not per-CPU.
+ *
+ * cycles_reset is the count value specified by the user when
+ * setting up OProfile to count SPU_CYCLES.
+ */
+void start_spu_profiling(unsigned int cycles_reset) {
+
+	ktime_t kt;
+	
+	pr_debug("timer resolution: %lu\n", 
+		 TICK_NSEC);
+	kt = ktime_set(0, profiling_interval);
+	hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_REL);
+	timer.expires = kt;
+	timer.function = profile_spus;
+
+        /* Allocate arrays for collecting SPU PC samples */
+	samples = (u32 *) kzalloc(spu_prof_num_nodes * SPUS_PER_NODE * 
+				  TRACE_ARRAY_SIZE * sizeof(u32), GFP_KERNEL);
+
+	spu_prof_running = 1;
+	hrtimer_start(&timer, kt, HRTIMER_REL);
+}
+
+void stop_spu_profiling(void) 
+{
+	spu_prof_running = 0;
+	hrtimer_cancel(&timer);
+	kfree(samples);
+	pr_debug("SPU_PROF: stop_spu_profiling issued\n");
+}
+
+
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_task_sync.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_task_sync.c	2007-02-06 16:43:27.832908640 -0600
@@ -0,0 +1,425 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* The purpose of this file is to handle SPU event task switching
+ * and to record SPU context information into the OProfile
+ * event buffer. 
+ *
+ * Additionally, the spu_sync_buffer function is provided as a helper
+ * for recoding actual SPU program counter samples to the event buffer.
+ */
+
+#include <linux/notifier.h>
+#include <linux/list.h>
+#include <linux/numa.h>
+#include <linux/mm.h>
+#include <linux/dcookies.h>
+#include <linux/spinlock.h>
+#include <linux/kref.h>
+#include <linux/oprofile.h>
+#include "pr_util.h"
+
+#define RELEASE_ALL 9999
+
+static spinlock_t buffer_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t cache_lock = SPIN_LOCK_UNLOCKED;
+static int num_spu_nodes;
+int spu_prof_num_nodes;
+
+/* Container for caching information about an active SPU task.
+ * 
+ */
+struct cached_info {
+	struct vma_to_fileoffset_map * map;
+	struct spu * the_spu;   /* needed to access pointer to local_store */
+	struct kref cache_ref;
+};
+
+static struct cached_info * spu_info[MAX_NUMNODES * 8];
+
+static void destroy_cached_info(struct kref * kref)
+{
+	struct cached_info * info;
+	info = container_of(kref, struct cached_info, cache_ref);
+	vma_map_free(info->map);
+	kfree(info);
+}
+
+/* Return the cached_info for the passed SPU number.
+ * 
+ */
+static struct cached_info * get_cached_info(struct spu * the_spu, int spu_num)
+{
+	struct cached_info * ret_info = NULL;
+	unsigned long flags = 0;
+	if (spu_num >= num_spu_nodes) {
+		printk(KERN_ERR "SPU_PROF: " 
+		       "%s, line %d: Invalid index %d into spu info cache\n",
+		       __FUNCTION__, __LINE__, spu_num); 
+		goto out;
+	}
+	spin_lock_irqsave(&cache_lock, flags);
+	if (!spu_info[spu_num] && the_spu)
+		spu_info[spu_num] = (struct cached_info *)
+			spu_get_profile_private(the_spu->ctx);
+
+	ret_info = spu_info[spu_num];
+	spin_unlock_irqrestore(&cache_lock, flags);
+ out:
+	return ret_info;
+}
+
+
+/* Looks for cached info for the passed spu.  If not found, the
+ * cached info is created for the passed spu.
+ * Returns 0 for success; otherwise, -1 for error.  
+ */ 
+static int
+prepare_cached_spu_info(struct spu * spu, unsigned int objectId)
+{
+	unsigned long flags = 0;
+	struct vma_to_fileoffset_map * new_map;
+	int retval = 0;
+	struct cached_info * info = get_cached_info(spu, spu->number);
+
+	if (info) {
+		pr_debug("Found cached SPU info.\n");
+		goto out;
+	}
+
+	/* Create cached_info and set spu_info[spu->number] to point to it.
+	 * spu->number is a system-wide value, not a per-node value.
+	 */
+	info = kzalloc(sizeof(struct cached_info), GFP_KERNEL);
+	if (!info) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: create vma_map failed\n",
+		       __FUNCTION__, __LINE__);
+		goto err_alloc;
+	}
+	new_map = create_vma_map(spu, objectId);
+	if (!new_map) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: create vma_map failed\n",
+		       __FUNCTION__, __LINE__);
+		goto err_alloc;
+	}
+
+	pr_debug("Created vma_map\n");
+	info->map = new_map;
+	info->the_spu = spu;
+	kref_init(&info->cache_ref);
+	spin_lock_irqsave(&cache_lock, flags);
+	spu_info[spu->number] = info;
+	spin_unlock_irqrestore(&cache_lock, flags);
+	/* Increment count before passing off ref to SPUFS. */
+	kref_get(&info->cache_ref);
+	spu_set_profile_private(spu->ctx, info, &info->cache_ref,
+				destroy_cached_info);
+	goto out;
+	
+err_alloc:
+	retval = -1;
+out:
+	return retval;
+}
+
+/*
+ * NOTE:  The caller is responsible for locking the
+ *	  cache_lock prior to calling this function.
+ */
+static int release_cached_info(int spu_index)
+{
+	int index, end;
+	if (spu_index == RELEASE_ALL) {
+		end = num_spu_nodes;
+		index = 0;
+	} else {
+	        if (spu_index >= num_spu_nodes) {
+        	        printk(KERN_ERR "SPU_PROF: "
+			       "%s, line %d: Invalid index %d into spu info cache\n",
+               	               __FUNCTION__, __LINE__, spu_index);
+	                goto out;
+	        }
+		end = spu_index +1;
+		index = spu_index;
+	}
+	for (; index < end; index++) {
+		if (spu_info[index]) {
+			kref_put(&spu_info[index]->cache_ref, destroy_cached_info);
+			spu_info[index] = NULL;
+		}
+	}
+
+out:
+	return 0;
+}
+
+/* The source code for fast_get_dcookie was "borrowed"
+ * from drivers/oprofile/buffer_sync.c.
+ */
+
+/* Optimisation. We can manage without taking the dcookie sem
+ * because we cannot reach this code without at least one
+ * dcookie user still being registered (namely, the reader
+ * of the event buffer).
+ */
+static inline unsigned long fast_get_dcookie(struct dentry * dentry,
+					     struct vfsmount * vfsmnt)
+{
+	unsigned long cookie;
+
+	if (dentry->d_cookie)
+		return (unsigned long)dentry;
+	get_dcookie(dentry, vfsmnt, &cookie);
+	return cookie;
+}
+
+/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
+ * which corresponds loosely to "application name". Also, determine
+ * the offset for the SPU ELF object.  If computed offset is 
+ * non-zero, it implies an embedded SPU object; otherwise, it's a
+ * separate SPU binary, in which case we retrieve it's dcookie.
+ */
+static unsigned long 
+get_exec_dcookie_and_offset(
+	struct spu * spu, unsigned int * offsetp,
+	unsigned long * spu_bin_dcookie,
+	unsigned int spu_ref)
+{
+	unsigned long cookie = 0;
+	unsigned int my_offset = 0;
+	struct vm_area_struct * vma;
+	struct mm_struct * mm = spu->mm;
+
+	if (!mm)
+		goto out;
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (!vma->vm_file)
+			continue;
+		if (!(vma->vm_flags & VM_EXECUTABLE))
+			continue;
+		cookie = fast_get_dcookie(vma->vm_file->f_dentry,
+					  vma->vm_file->f_vfsmnt);
+		pr_debug("got dcookie for %s\n",
+			 vma->vm_file->f_dentry->d_name.name);
+		break;
+	}
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->vm_start > spu_ref || vma->vm_end < spu_ref)
+			continue;
+		my_offset = spu_ref - vma->vm_start;
+		pr_debug("Found spu ELF at "
+			 " %X for file %s\n", my_offset,
+			 vma->vm_file->f_dentry->d_name.name);
+		*offsetp = my_offset;
+		if (my_offset == 0) {
+			if (!vma->vm_file) {
+				goto fail_no_spu_cookie;
+			}
+			*spu_bin_dcookie = fast_get_dcookie(
+				vma->vm_file->f_dentry,
+				vma->vm_file->f_vfsmnt);
+			pr_debug("got dcookie for %s\n",
+				 vma->vm_file->f_dentry->d_name.name);
+		}
+		break;			
+	}
+	
+ out:
+	return cookie;
+
+ fail_no_spu_cookie:
+	printk(KERN_ERR "SPU_PROF: "
+	       "%s, line %d: Cannot find dcookie for SPU binary\n",
+	       __FUNCTION__, __LINE__);
+	goto out;
+}
+
+
+
+/* This function finds or creates cached context information for the
+ * passed SPU and records SPU context information into the OProfile
+ * event buffer.
+ */
+static int process_context_switch(struct spu * spu, unsigned int objectId)
+{
+	unsigned long flags;
+	int retval = 0;
+	unsigned int offset = 0;
+	unsigned long spu_cookie = 0, app_dcookie = 0;
+	retval = prepare_cached_spu_info(spu, objectId);
+	if (retval == -1) {
+		goto out;
+	}
+        /* Get dcookie first because a mutex_lock is taken in that
+	 * code path, so interrupts must not be disabled.
+	 */
+	app_dcookie = get_exec_dcookie_and_offset(spu, &offset,
+						  &spu_cookie, objectId);
+
+        /* Record context info in event buffer */
+	spin_lock_irqsave(&buffer_lock, flags);
+	add_event_entry(ESCAPE_CODE);
+	add_event_entry(SPU_CTX_SWITCH_CODE);
+	add_event_entry(spu->number);
+	add_event_entry(spu->pid);
+	add_event_entry(spu->tgid);
+	add_event_entry(app_dcookie);
+
+	add_event_entry(ESCAPE_CODE);
+	if (offset) {
+	  /* When offset is non-zero,  this means the SPU ELF was embedded;
+	   * otherwise, it was loaded from a separate binary file.  For the
+	   * embedded case, we record the offset of the SPU ELF into the PPU
+	   * executable; for the non-embedded case, we record a dcookie that
+	   * points to the location of the SPU binary that was loaded.
+	   */
+		add_event_entry(SPU_OFFSET_CODE);
+		add_event_entry(offset);
+	} else {
+		add_event_entry(SPU_COOKIE_CODE);
+		add_event_entry(spu_cookie);
+	}
+	spin_unlock_irqrestore(&buffer_lock, flags);
+	smp_wmb();
+out:
+	return retval;
+}
+
+/* 
+ * This function is invoked on either a bind_context or unbind_context.  
+ * If called for an unbind_context, the val arg is 0; otherwise, 
+ * it is the object-id value for the spu context.
+ * The data arg is of type 'struct spu *'.
+ */
+static int spu_active_notify(struct notifier_block * self, unsigned long val,
+			     void * data)
+{
+	int retval;
+	unsigned long flags = 0;
+	struct spu * the_spu = data;
+	pr_debug("SPU event notification arrived\n");
+	if (!val){
+		spin_lock_irqsave(&cache_lock, flags);
+		retval = release_cached_info(the_spu->number);
+		spin_unlock_irqrestore(&cache_lock, flags);
+	} else {
+		retval = process_context_switch(the_spu, val);
+	}
+	return retval;
+}
+
+static struct notifier_block spu_active = {
+	.notifier_call = spu_active_notify,
+};
+
+/* The main purpose of this function is to synchronize
+ * OProfile with SPUFS by registering to be notified of
+ * SPU task switches.
+ *
+ * NOTE: When profiling SPUs, we must ensure that only
+ * spu_sync_start is invoked and not the generic sync_start
+ * in drivers/oprofile/oprof.c.  A return value of
+ * SKIP_GENERIC_SYNC or SYNC_START_ERROR will
+ * accomplish this.
+ */
+int spu_sync_start(void) {
+	int ret = SKIP_GENERIC_SYNC;
+	int register_ret;
+	unsigned long flags = 0;
+	spu_prof_num_nodes = number_of_online_nodes();
+	num_spu_nodes = spu_prof_num_nodes * 8;
+
+	spin_lock_irqsave(&buffer_lock, flags);
+	add_event_entry(ESCAPE_CODE);
+	add_event_entry(SPU_PROFILING_CODE);
+	add_event_entry(num_spu_nodes);
+	spin_unlock_irqrestore(&buffer_lock, flags);
+
+        /* Register for SPU events  */
+	register_ret = spu_switch_event_register(&spu_active);
+	if (register_ret) {
+		ret = SYNC_START_ERROR;
+		goto out;
+	}
+
+	pr_debug("spu_sync_start -- running.\n");
+out:
+	return ret;	
+}
+
+/* Record SPU program counter samples to the oprofile event buffer. */
+void spu_sync_buffer(int spu_num, unsigned int * samples, 
+		     int num_samples)
+{
+	unsigned long flags = 0;
+	int i;
+	struct vma_to_fileoffset_map * map;
+	struct spu * the_spu;
+	unsigned long long spu_num_ll = spu_num;
+	unsigned long long spu_num_shifted = spu_num_ll << 32;
+	struct cached_info * c_info = get_cached_info(NULL, spu_num);
+	if (c_info == NULL) {
+        /* This legitimately happens when the SPU task ends before all 
+	 * samples are recorded.  No big deal -- so we just drop a few samples.
+	 */
+		pr_debug("SPU_PROF: No cached SPU contex "
+			  "for SPU #%d. Dropping samples.\n", spu_num);
+		return ;
+	}
+
+	map = c_info->map;
+	the_spu = c_info->the_spu;
+	spin_lock_irqsave(&buffer_lock, flags);
+	for (i = 0; i < num_samples; i++) {
+		unsigned long long file_offset;
+		unsigned int sample = *(samples+i);
+		if (sample == 0)
+			continue;
+		file_offset = vma_map_lookup(
+			map, sample, the_spu);
+		/* For now, we'll drop samples that can't be mapped.
+		 * This can happen for generated stubs executed from
+		 * the SPU stack.  Do we need to record these somehow?
+		 */
+		if (unlikely(file_offset == -1))
+			continue;
+		add_event_entry(file_offset | spu_num_shifted);
+	}
+	spin_unlock_irqrestore(&buffer_lock, flags);
+}
+
+
+int spu_sync_stop(void)
+{
+	unsigned long flags = 0;
+	int ret = spu_switch_event_unregister(&spu_active);
+	if (ret) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: spu_switch_event_unregister returned %d\n",
+		       __FUNCTION__, __LINE__, ret);
+		goto out;
+	} 
+
+	spin_lock_irqsave(&cache_lock, flags);
+	ret = release_cached_info(RELEASE_ALL);
+	spin_unlock_irqrestore(&cache_lock, flags);
+out:
+	pr_debug("spu_sync_stop -- done.\n");
+	return ret;
+}
+
+
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/vma_map.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/vma_map.c	2007-02-01 17:21:46.944872872 -0600
@@ -0,0 +1,229 @@
+ /*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* The code in this source file is responsible for generating
+ * vma-to-fileOffset maps for both overlay and non-overlay SPU
+ * applications.
+ */
+
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/elf.h>
+#include "pr_util.h"
+
+
+void vma_map_free(struct vma_to_fileoffset_map *map)
+{
+	while (map) {
+		struct vma_to_fileoffset_map *next = map->next;
+		kfree(map);
+		map = next;
+	}
+}
+
+unsigned int
+vma_map_lookup(struct vma_to_fileoffset_map *map, unsigned int vma,
+	       const struct spu * aSpu)
+{
+	u32 offset = -1;
+	u32 ovly_grd;
+	for (; map; map = map->next) {
+		if (vma < map->vma || vma >= map->vma + map->size)
+			continue;
+
+		if (map->guard_ptr) {
+			ovly_grd = *(u32 *)(aSpu->local_store + map->guard_ptr);
+			if (ovly_grd != map->guard_val)
+				continue;
+		}
+		break;
+	}
+
+	if (likely(map != NULL)) {
+		offset = vma - map->vma + map->offset;
+	}
+	return offset;
+}
+
+static struct vma_to_fileoffset_map *
+vma_map_add(struct vma_to_fileoffset_map *map, unsigned int vma,
+	    unsigned int size, unsigned int offset, unsigned int guard_ptr,
+	    unsigned int guard_val)
+{
+	struct vma_to_fileoffset_map *new = kzalloc(sizeof(struct vma_to_fileoffset_map), GFP_KERNEL);
+	if (!new) {
+		printk(KERN_ERR "SPU_PROF: %s, line %d: malloc failed\n",
+		       __FUNCTION__, __LINE__);
+		vma_map_free(map);
+		return NULL;
+	}
+
+	new->next = map;
+	new->vma = vma;
+	new->size = size;
+	new->offset = offset;
+	new->guard_ptr = guard_ptr;
+	new->guard_val = guard_val;
+
+	return new;
+}
+
+
+/* Parse SPE ELF header and generate a list of vma_maps.
+ * A pointer to the first vma_map in the generated list
+ * of vma_maps is returned.  */
+struct vma_to_fileoffset_map * create_vma_map(const struct spu * aSpu, 
+					      unsigned long spu_elf_start)
+{
+	static const unsigned char expected[EI_PAD] = {
+		[EI_MAG0] = ELFMAG0,
+		[EI_MAG1] = ELFMAG1,
+		[EI_MAG2] = ELFMAG2,
+		[EI_MAG3] = ELFMAG3,
+		[EI_CLASS] = ELFCLASS32,
+		[EI_DATA] = ELFDATA2MSB,
+		[EI_VERSION] = EV_CURRENT,
+		[EI_OSABI] = ELFOSABI_NONE
+	};
+
+	struct vma_to_fileoffset_map *map = NULL;
+	unsigned int overlay_tbl_offset = -1;
+	unsigned long phdr_start, shdr_start;
+	Elf32_Ehdr ehdr;
+	Elf32_Phdr phdr;
+	Elf32_Shdr shdr, shdr_str;
+	Elf32_Sym sym;
+	int i, j;
+	char name[32];
+
+	unsigned int ovly_table_sym = 0;
+	unsigned int ovly_buf_table_sym = 0;
+	unsigned int ovly_table_end_sym = 0;
+	unsigned int ovly_buf_table_end_sym = 0;
+	unsigned long ovly_table;
+	unsigned int n_ovlys;
+
+	struct {
+		unsigned int vma;
+		unsigned int size;
+		unsigned int offset;
+		unsigned int buf;
+	} ovly;
+
+	/* Get and validate ELF header.  */
+
+	copy_from_user(&ehdr, (void *) spu_elf_start, sizeof (ehdr));
+	if (memcmp(ehdr.e_ident, expected, EI_PAD) != 0) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected value parsing SPU ELF\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	if (ehdr.e_machine != 23) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected value parsing SPU ELF\n",
+		       __FUNCTION__,  __LINE__);
+
+		return NULL;
+	}
+	if (ehdr.e_type != ET_EXEC) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected value parsing SPU ELF\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	phdr_start = spu_elf_start + ehdr.e_phoff;
+	shdr_start = spu_elf_start + ehdr.e_shoff;
+
+	/* Traverse program headers.  */
+	for (i = 0; i < ehdr.e_phnum; i++) {
+		copy_from_user(&phdr, (void *) (phdr_start + i * sizeof(phdr)), 
+			       sizeof(phdr));
+		if (phdr.p_type != PT_LOAD)
+			continue;
+		if (phdr.p_flags & (1 << 27))
+			continue;
+
+		map = vma_map_add(map, phdr.p_vaddr, phdr.p_memsz, 
+				  phdr.p_offset, 0, 0);
+		if (!map)
+			return NULL;
+	}
+
+	pr_debug("SPU_PROF: Created non-overlay maps\n");	
+	/* Traverse section table and search for overlay-related symbols.  */
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		copy_from_user(&shdr, (void *) (shdr_start + i * sizeof(shdr)), 
+			       sizeof(shdr));
+		if (shdr.sh_type != SHT_SYMTAB)
+			continue;
+		if (shdr.sh_entsize != sizeof (sym))
+			continue;
+
+		copy_from_user(&shdr_str, 
+			       (void *) (shdr_start + shdr.sh_link * sizeof(shdr)),
+			       sizeof(shdr));
+		if (shdr_str.sh_type != SHT_STRTAB)
+			return NULL;
+
+		for (j = 0; j < shdr.sh_size / sizeof (sym); j++) {
+			copy_from_user(&sym, (void *) (spu_elf_start +
+						       shdr.sh_offset + j * sizeof (sym)),
+				       sizeof (sym));
+			copy_from_user(name, (void *) (spu_elf_start + shdr_str.sh_offset + 
+						       sym.st_name),
+				       20);
+			if (memcmp(name, "_ovly_table", 12) == 0)
+				ovly_table_sym = sym.st_value;
+			if (memcmp(name, "_ovly_buf_table", 16) == 0)
+				ovly_buf_table_sym = sym.st_value;
+			if (memcmp(name, "_ovly_table_end", 16) == 0)
+				ovly_table_end_sym = sym.st_value;
+			if (memcmp(name, "_ovly_buf_table_end", 20) == 0)
+				ovly_buf_table_end_sym = sym.st_value;
+		}
+	}
+
+	/* If we don't have overlays, we're done.  */
+	if (ovly_table_sym == 0 || ovly_buf_table_sym == 0
+	    || ovly_table_end_sym == 0 || ovly_buf_table_end_sym == 0) {
+		pr_debug("SPU_PROF: No overlay table found\n");
+		return map;
+	}
+	else {
+		pr_debug("SPU_PROF: Overlay table found\n");
+	}
+
+	overlay_tbl_offset = vma_map_lookup(map, ovly_table_sym, aSpu);
+	if (overlay_tbl_offset < 0) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Error finding SPU overlay table\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	ovly_table = spu_elf_start + overlay_tbl_offset;
+	n_ovlys = (ovly_table_end_sym - ovly_table_sym) / sizeof (ovly);
+
+	/* Traverse overlay table.  */
+	for (i = 0; i < n_ovlys; i++) {
+		copy_from_user(&ovly, (void *) (ovly_table + i * sizeof (ovly)),
+			       sizeof (ovly));
+		map = vma_map_add(map, ovly.vma, ovly.size, ovly.offset,
+				   ovly_buf_table_sym + (ovly.buf - 1) * 4, i + 1);
+		if (!map)
+			return NULL;
+	}
+	
+	return map;
+}
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/common.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/common.c	2007-01-18 16:43:14.429510072 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/common.c	2007-02-01 17:21:46.946872568 -0600
@@ -150,6 +150,8 @@
 #ifdef CONFIG_PPC_CELL_NATIVE
 		case PPC_OPROFILE_CELL:
 			model = &op_model_cell;
+			ops->sync_start = model->sync_start;
+			ops->sync_stop = model->sync_stop;
 			break;
 #endif
 		case PPC_OPROFILE_RS64:
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Kconfig	2007-01-18 16:43:14.426510528 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig	2007-02-03 17:05:51.967892936 -0600
@@ -7,7 +7,8 @@
 
 config OPROFILE
 	tristate "OProfile system profiling (EXPERIMENTAL)"
-	depends on PROFILING
+	default m
+	depends on SPU_FS && PROFILING
 	help
 	  OProfile is a profiling system capable of profiling the
 	  whole system, include the kernel, kernel modules, libraries,
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/Makefile
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Makefile	2007-01-18 16:43:14.429510072 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/Makefile	2007-02-01 17:21:46.948872264 -0600
@@ -11,7 +11,8 @@
 		timer_int.o )
 
 oprofile-y := $(DRIVER_OBJS) common.o backtrace.o
-oprofile-$(CONFIG_PPC_CELL_NATIVE) += op_model_cell.o
+oprofile-$(CONFIG_PPC_CELL_NATIVE) += op_model_cell.o \
+					cell/spu_profiler.o cell/vma_map.o cell/spu_task_sync.o
 oprofile-$(CONFIG_PPC64) += op_model_rs64.o op_model_power4.o
 oprofile-$(CONFIG_FSL_BOOKE) += op_model_fsl_booke.o
 oprofile-$(CONFIG_6xx) += op_model_7450.o
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/op_model_cell.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/op_model_cell.c	2007-02-01 17:21:38.388840624 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/op_model_cell.c	2007-02-03 15:59:38.555810464 -0600
@@ -37,6 +37,16 @@
 #include <asm/system.h>
 
 #include "../platforms/cell/interrupt.h"
+#include "cell/pr_util.h"
+
+/* spu_cycle_reset is the number of cycles between samples.
+ * This variable is used for SPU profiling and should ONLY be set
+ * at the beginning of cell_reg_setup; otherwise, it's read-only.
+ */
+static unsigned int spu_cycle_reset = 0;
+
+#define NUM_SPUS_PER_NODE    8
+#define SPU_CYCLES_EVENT_NUM 2        /*  event number for SPU_CYCLES */
 
 #define PPU_CYCLES_EVENT_NUM 1	/*  event number for CYCLES */
 #define PPU_CYCLES_GRP_NUM   1  /* special group number for identifying
@@ -50,7 +60,6 @@
 #define NUM_TRACE_BUS_WORDS 4
 #define NUM_INPUT_BUS_WORDS 2
 
-
 struct pmc_cntrl_data {
 	unsigned long vcntr;
 	unsigned long evnts;
@@ -140,12 +149,21 @@
 /*
  * Firmware interface functions
  */
+
 static int
 rtas_ibm_cbe_perftools(int subfunc, int passthru,
 		       void *address, unsigned long length)
 {
 	u64 paddr = __pa(address);
 
+	pm_rtas_token = rtas_token("ibm,cbe-perftools");  
+
+	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
+	  printk(KERN_ERR
+		 "%s: rtas token ibm,cbe-perftools unknown\n",
+		 __FUNCTION__);
+	}
+
 	return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru,
 			 paddr >> 32, paddr & 0xffffffff, length);
 }
@@ -486,7 +504,12 @@
 	       struct op_system_config *sys, int num_ctrs)
 {
 	int i, j, cpu;
+	spu_cycle_reset = 0;
 
+	if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
+		spu_cycle_reset = ctr[0].count;
+		return;
+	}
 	pm_rtas_token = rtas_token("ibm,cbe-perftools");
 	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
 		printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE\n",
@@ -572,6 +595,8 @@
 	;
 }
 
+
+
 /* This function is called once for each cpu */
 static void cell_cpu_setup(struct op_counter_config *cntr)
 {
@@ -579,6 +604,9 @@
 	u32 num_enabled = 0;
 	int i;
 
+	if (spu_cycle_reset)
+		return;
+
 	/* There is one performance monitor per processor chip (i.e. node),
 	 * so we only need to perform this function once per node.
 	 */
@@ -613,11 +641,216 @@
 	;
 }
 
-static void cell_global_start(struct op_counter_config *ctr)
+#define size 24
+#define ENTRIES  (0x1<<8) /* 256 */
+#define MAXLFSR  0xFFFFFF
+
+int initial_lfsr[] =
+{16777215, 3797240, 13519805, 11602690, 6497030, 7614675, 2328937, 2889445,
+ 12364575, 8723156, 2450594, 16280864, 14742496, 10904589, 6434212, 4996256,
+ 5814270, 13014041, 9825245, 410260, 904096, 15151047, 15487695, 3061843,
+ 16482682, 7938572, 4893279, 9390321, 4320879, 5686402, 1711063, 10176714,
+ 4512270, 1057359, 16700434, 5731602, 2070114, 16030890, 1208230, 15603106,
+ 11857845, 6470172, 1362790, 7316876, 8534496, 1629197, 10003072, 1714539,
+ 1814669, 7106700, 5427154, 3395151, 3683327, 12950450, 16620273, 12122372,
+ 7194999, 9952750, 3608260, 13604295, 2266835, 14943567, 7079230, 777380,
+ 4516801, 1737661, 8730333, 13796927, 3247181, 9950017, 3481896, 16527555,
+ 13116123, 14505033, 9781119, 4860212, 7403253, 13264219, 12269980, 100120,
+ 664506, 607795, 8274553, 13133688, 6215305, 13208866, 16439693, 3320753,
+ 8773582, 13874619, 1784784, 4513501, 11002978, 9318515, 3038856, 14254582,
+ 15484958, 15967857, 13504461, 13657322, 14724513, 13955736, 5695315, 7330509,
+ 12630101, 6826854, 439712, 4609055, 13288878, 1309632, 4996398, 11392266,
+ 793740, 7653789, 2472670, 14641200, 5164364, 5482529, 10415855, 1629108,
+ 2012376, 13661123, 14655718, 9534083, 16637925, 2537745, 9787923, 12750103,
+ 4660370, 3283461, 14862772, 7034955, 6679872, 8918232, 6506913, 103649,
+ 6085577, 13324033, 14251613, 11058220, 11998181, 3100233, 468898, 7104918,
+ 12498413, 14408165, 1208514, 15712321, 3088687, 14778333, 3632503, 11151952,
+ 98896, 9159367, 8866146, 4780737, 4925758, 12362320, 4122783, 8543358,
+ 7056879, 10876914, 6282881, 1686625, 5100373, 4573666, 9265515, 13593840,
+ 5853060, 1188880, 4237111, 15765555, 14344137, 4608332, 6590210, 13745050,
+ 10916568, 12340402, 7145275, 4417153, 2300360, 12079643, 7608534, 15238251,
+ 4947424, 7014722, 3984546, 7168073, 10759589, 16293080, 3757181, 4577717,
+ 5163790, 2488841, 4650617, 3650022, 5440654, 1814617, 6939232, 15540909,
+ 501788, 1060986, 5058235, 5078222, 3734500, 10762065, 390862, 5172712,
+ 1070780, 7904429, 1669757, 3439997, 2956788, 14944927, 12496638, 994152,
+ 8901173, 11827497, 4268056, 15725859, 1694506, 5451950, 2892428, 1434298,
+ 9048323, 13558747, 15083840, 8154495, 15830901, 391127, 14970070, 2451434,
+ 2080347, 10775644, 14599429, 12540753, 4813943, 16140655, 2421772, 12724304,
+ 12935733, 7206473, 5697333, 10328104, 2418008, 13547986, 284246, 1732363,
+ 16375319, 8109554, 16372365, 14346072, 1835890, 13059499, 2442500, 4110674};
+
+/*
+ * The hardware uses an LFSR counting sequence to determine when to capture
+ * the SPU PCs.  The SPU PC capture is done when the LFSR sequence reaches the
+ * last value in the sequence.  An LFSR sequence is like a puesdo random 
+ * number sequence where each number occurs once in the sequence but the 
+ * sequence is not in numerical order.  To reduce the calculation time, a 
+ * sequence of 256 precomputed values in the LFSR sequence are stored in a
+ * table.  The nearest precomputed value is used as the initial point from
+ * which to caculate the desired LFSR value that is n from the end of the 
+ * sequence.  The lookup table reduces the maximum number of iterations in 
+ * the loop from 2^24 to 2^16.
+ */
+static int calculate_lfsr(int n)
 {
-	u32 cpu;
+  int i;
+
+  int start_lfsr_index;
+  unsigned int newlfsr0;
+  unsigned int lfsr = MAXLFSR;
+  unsigned int binsize = (MAXLFSR+1)/ENTRIES;
+  unsigned int howmany; 
+
+  start_lfsr_index = (MAXLFSR - n) / binsize;
+  lfsr = initial_lfsr[start_lfsr_index];
+  howmany = (MAXLFSR - n) - (start_lfsr_index * (binsize));
+
+  for (i = 2; i < howmany+2; i++) {
+    newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^
+		((lfsr >> (size - 1 - 1)) & 1) ^
+		(((lfsr >> (size - 1 - 6)) & 1) ^
+		 ((lfsr >> (size - 1 - 23)) & 1)));
+
+    lfsr >>= 1;
+    lfsr = lfsr | (newlfsr0 << (size - 1)); 
+  }
+  return lfsr;
+}
+
+static void pm_rtas_activate_spu_profiling(u32 node)
+{
+	int ret, i;
+	struct pm_signal pm_signal_local[NR_PHYS_CTRS];
+
+	/* Set up the rtas call to configure the debug bus to 
+	 * route the SPU PCs.  Setup the pm_signal for each SPU */
+	for (i = 0; i < NUM_SPUS_PER_NODE; i++) {
+		pm_signal_local[i].cpu = node;
+		pm_signal_local[i].signal_group = 41;
+		pm_signal_local[i].bus_word = 1 << i / 2; /* spu i on 
+							   * word (i/2) 
+							   */
+		pm_signal_local[i].sub_unit = i;	/* spu i */
+		pm_signal_local[i].bit = 63;
+	}
+
+	pm_rtas_token = rtas_token("ibm,cbe-perftools");
+	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
+		printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE \n",
+		       __FUNCTION__);
+	}
+
+	ret = rtas_ibm_cbe_perftools(SUBFUNC_ACTIVATE, PASSTHRU_ENABLE,
+				     pm_signal_local,
+				     8 * sizeof(struct pm_signal)); //FIXME 8 to #define
+
+	if (ret)
+		printk(KERN_WARNING "%s: rtas returned: %d\n",
+		       __FUNCTION__, ret);
+
+}
+
+#ifdef CONFIG_CPU_FREQ
+static int
+oprof_cpufreq_notify(struct notifier_block *nb, unsigned long val, void *data)
+{
+	int ret = 0;
+	struct cpufreq_freqs * frq = data;
+	if ((val == CPUFREQ_PRECHANGE && frq->old < frq->new) ||
+	    (val == CPUFREQ_POSTCHANGE && frq->old > frq->new) ||
+	    (val == CPUFREQ_RESUMECHANGE || val == CPUFREQ_SUSPENDCHANGE))
+		set_profiling_frequency(frq->new, spu_cycle_reset);
+	return ret;
+}
+
+static struct notifier_block cpu_freq_notifier_block = {
+	.notifier_call	= oprof_cpufreq_notify
+};
+#endif
+
+static void cell_global_start_spu(struct op_counter_config *ctr)
+{
+	int subfunc, rtn_value;
+	unsigned int lfsr_value;
+	int cpu;
+	int ret = 0;
+	unsigned int cpu_khzfreq = 0;
+
+	/* The SPU profiling uses time-based profiling based on
+	 * cpu frequency, so if configured with the CPU_FREQ
+	 * option, we should detect frequency changes and react
+	 * accordingly.
+	 */
+#ifdef CONFIG_CPU_FREQ
+	ret = cpufreq_register_notifier(&cpu_freq_notifier_block,
+					CPUFREQ_TRANSITION_NOTIFIER);
+	if (ret < 0)
+		printk(KERN_ERR "CPU freq change registration failed: %d\n",
+		       ret);
+	else
+		cpu_khzfreq = cpufreq_quick_get(smp_processor_id());
+#endif
+
+	set_profiling_frequency(cpu_khzfreq, spu_cycle_reset);
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+		/* Setup SPU cycle-based profiling.
+		 * Set perf_mon_control bit 0 to a zero before
+		 * enabling spu collection hardware.
+		 */
+		cbe_write_pm(cpu, pm_control, 0);
+
+		pm_rtas_activate_spu_profiling(cbe_cpu_to_node(cpu));
+
+		if (spu_cycle_reset > 0xFFFFFE) 
+				lfsr_value = calculate_lfsr(1);  /* use largest possible 
+								  *  value 
+								  */
+		else 
+		    lfsr_value = calculate_lfsr(spu_cycle_reset);
+
+		if (lfsr_value == 0) {  /* must use a non zero value.  Zero
+					 * disables data collection.
+					 */
+				lfsr_value = calculate_lfsr(1);  /* use largest possible 
+								  * value 
+								 */
+		}
+
+		lfsr_value = lfsr_value << 8; /* shift lfsr to correct 
+					       * register location
+					       */
+		
+		pm_rtas_token = rtas_token("ibm,cbe-spu-perftools");  
+
+		if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
+			printk(KERN_ERR
+			       "%s: rtas token ibm,cbe-spu-perftools unknown\n",
+			       __FUNCTION__);
+		}
+
+		subfunc = 2;	// 2 - activate SPU tracing, 3 - deactivate
+
+		rtn_value = rtas_call(pm_rtas_token, 3, 1, NULL, subfunc,
+			  cbe_cpu_to_node(cpu), lfsr_value);
+
+		if (rtn_value != 0)
+			printk(KERN_ERR
+			       "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n",
+			       __FUNCTION__, rtn_value);
+	}
+
+	start_spu_profiling(spu_cycle_reset);
+
+	oprofile_running = 1;
+}
+
+static void cell_global_start_ppu(struct op_counter_config *ctr)
+{
+	u32 cpu, i;
 	u32 interrupt_mask = 0;
-	u32 i;
 
 	/* This routine gets called once for the system.
 	 * There is one performance monitor per node, so we
@@ -658,7 +891,61 @@
 	start_virt_cntrs();
 }
 
-static void cell_global_stop(void)
+
+static void cell_global_start(struct op_counter_config *ctr)
+{
+	if (spu_cycle_reset) {
+		cell_global_start_spu(ctr);
+	} else {
+		cell_global_start_ppu(ctr);
+	}
+}
+
+static void cell_global_stop_spu(void)
+{
+	int subfunc, rtn_value;
+	unsigned int lfsr_value;
+	int cpu;
+
+	oprofile_running = 0;
+
+#ifdef CONFIG_CPU_FREQ
+	cpufreq_unregister_notifier(&cpu_freq_notifier_block,
+				    CPUFREQ_TRANSITION_NOTIFIER);
+#endif
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+
+		pm_rtas_token = rtas_token("ibm,cbe-spu-perftools");  
+
+		if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
+			printk(KERN_ERR
+			       "%s: rtas token ibm,cbe-spu-perftools unknown\n",
+			       __FUNCTION__);
+		}
+
+		subfunc = 3;	// 2 - activate SPU tracing, 3 - deactivate
+		lfsr_value = 0x8f100000;
+
+		rtn_value =
+		    rtas_call(pm_rtas_token, 3, 1, NULL, subfunc,
+			      cbe_cpu_to_node(cpu), lfsr_value);
+
+		if (rtn_value != 0)
+			printk
+			    ("ERROR, rtas call ibm,cbe-spu-perftools failed, return = %d\n",
+			     rtn_value);
+
+		/* Deactivate the signals */
+		pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
+	}
+
+	stop_spu_profiling();
+}
+
+static void cell_global_stop_ppu(void)
 {
 	int cpu;
 
@@ -686,6 +973,16 @@
 	}
 }
 
+static void cell_global_stop(void)
+{
+	if (spu_cycle_reset) {
+		cell_global_stop_spu();
+	} else {
+		cell_global_stop_ppu();
+	}
+
+}
+
 static void
 cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
 {
@@ -754,10 +1051,35 @@
 	spin_unlock_irqrestore(&virt_cntr_lock, flags);
 }
 
+/* This function is called from the generic OProfile
+ * driver.  When profiling PPUs, we need to do the
+ * generic sync start; otherwise, do spu_sync_start.
+ */
+static int cell_sync_start(void)
+{
+	if (spu_cycle_reset)
+		return spu_sync_start();
+	else
+		return DO_GENERIC_SYNC;
+}
+
+static int cell_sync_stop(void)
+{
+	if (spu_cycle_reset)
+		return spu_sync_stop();
+	else
+		return 1;
+}
+
+
 struct op_powerpc_model op_model_cell = {
 	.reg_setup = cell_reg_setup,
 	.cpu_setup = cell_cpu_setup,
 	.global_start = cell_global_start,
 	.global_stop = cell_global_stop,
+	.sync_start = cell_sync_start,
+	.sync_stop = cell_sync_stop,
 	.handle_interrupt = cell_handle_interrupt,
 };
+
+
Index: linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/sched.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/platforms/cell/spufs/sched.c	2007-02-01 17:21:41.943834416 -0600
+++ linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/sched.c	2007-02-01 17:21:46.957870896 -0600
@@ -129,6 +129,7 @@
 	ctx->spu = spu;
 	ctx->ops = &spu_hw_ops;
 	spu->pid = current->pid;
+	spu->tgid = current->tgid;
 	spu->prio = current->prio;
 	spu->mm = ctx->owner;
 	mm_needs_global_tlbie(spu->mm);
@@ -161,6 +162,7 @@
 	spu->dma_callback = NULL;
 	spu->mm = NULL;
 	spu->pid = 0;
+	spu->tgid = 0;
 	spu->prio = MAX_PRIO;
 	ctx->ops = &spu_backing_ops;
 	ctx->spu = NULL;
Index: linux-2.6.20-rc1/drivers/oprofile/buffer_sync.c
===================================================================
--- linux-2.6.20-rc1.orig/drivers/oprofile/buffer_sync.c	2007-01-18 16:43:11.675529376 -0600
+++ linux-2.6.20-rc1/drivers/oprofile/buffer_sync.c	2007-02-01 17:21:46.960870440 -0600
@@ -26,6 +26,7 @@
 #include <linux/profile.h>
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/oprofile.h>
  
 #include "oprofile_stats.h"
 #include "event_buffer.h"
Index: linux-2.6.20-rc1/drivers/oprofile/event_buffer.h
===================================================================
--- linux-2.6.20-rc1.orig/drivers/oprofile/event_buffer.h	2007-01-18 16:43:11.673529680 -0600
+++ linux-2.6.20-rc1/drivers/oprofile/event_buffer.h	2007-02-01 17:21:46.962870136 -0600
@@ -19,28 +19,10 @@
  
 /* wake up the process sleeping on the event file */
 void wake_up_buffer_waiter(void);
- 
-/* Each escaped entry is prefixed by ESCAPE_CODE
- * then one of the following codes, then the
- * relevant data.
- */
-#define ESCAPE_CODE			~0UL
-#define CTX_SWITCH_CODE 		1
-#define CPU_SWITCH_CODE 		2
-#define COOKIE_SWITCH_CODE 		3
-#define KERNEL_ENTER_SWITCH_CODE	4
-#define KERNEL_EXIT_SWITCH_CODE		5
-#define MODULE_LOADED_CODE		6
-#define CTX_TGID_CODE			7
-#define TRACE_BEGIN_CODE		8
-#define TRACE_END_CODE			9
- 
+  
 #define INVALID_COOKIE ~0UL
 #define NO_COOKIE 0UL
 
-/* add data to the event buffer */
-void add_event_entry(unsigned long data);
- 
 extern struct file_operations event_buffer_fops;
  
 /* mutex between sync_cpu_buffers() and the
Index: linux-2.6.20-rc1/drivers/oprofile/oprof.c
===================================================================
--- linux-2.6.20-rc1.orig/drivers/oprofile/oprof.c	2007-01-18 16:43:11.675529376 -0600
+++ linux-2.6.20-rc1/drivers/oprofile/oprof.c	2007-02-01 17:21:46.964869832 -0600
@@ -53,9 +53,23 @@
 	 * us missing task deaths and eventually oopsing
 	 * when trying to process the event buffer.
 	 */
+	if (oprofile_ops.sync_start) {
+		int sync_ret = oprofile_ops.sync_start();
+		switch (sync_ret) {
+			case 0: goto post_sync;
+				break;
+			case 1: goto do_generic;
+				break;
+			case -1: goto out3;
+				break;
+			default: goto out3;
+		}
+	}
+do_generic:
 	if ((err = sync_start()))
 		goto out3;
 
+post_sync:
 	is_setup = 1;
 	mutex_unlock(&start_mutex);
 	return 0;
@@ -118,7 +132,19 @@
 void oprofile_shutdown(void)
 {
 	mutex_lock(&start_mutex);
+        if (oprofile_ops.sync_stop) {
+                int sync_ret = oprofile_ops.sync_stop();
+                switch (sync_ret) {
+                        case 0: goto post_sync;
+                                break;
+                        case 1: goto do_generic;
+                                break;
+			default: goto post_sync;
+                }
+        }
+do_generic:
 	sync_stop();
+post_sync:
 	if (oprofile_ops.shutdown)
 		oprofile_ops.shutdown();
 	is_setup = 0;
Index: linux-2.6.20-rc1/include/asm-powerpc/oprofile_impl.h
===================================================================
--- linux-2.6.20-rc1.orig/include/asm-powerpc/oprofile_impl.h	2007-01-18 16:43:19.315566704 -0600
+++ linux-2.6.20-rc1/include/asm-powerpc/oprofile_impl.h	2007-02-01 17:21:46.966869528 -0600
@@ -47,6 +47,8 @@
         void (*global_start) (struct op_counter_config *);
 	void (*stop) (void);
 	void (*global_stop) (void);
+	int (*sync_start)(void);
+	int (*sync_stop)(void);
 	void (*handle_interrupt) (struct pt_regs *,
 				  struct op_counter_config *);
 	int num_counters;
Index: linux-2.6.20-rc1/include/asm-powerpc/spu.h
===================================================================
--- linux-2.6.20-rc1.orig/include/asm-powerpc/spu.h	2007-02-01 17:21:41.950833352 -0600
+++ linux-2.6.20-rc1/include/asm-powerpc/spu.h	2007-02-05 08:34:38.498856800 -0600
@@ -128,6 +128,7 @@
 	struct spu_runqueue *rq;
 	unsigned long long timestamp;
 	pid_t pid;
+	pid_t tgid;
 	int prio;
 	int class_0_pending;
 	spinlock_t register_lock;
@@ -153,6 +154,11 @@
 int spu_irq_class_0_bottom(struct spu *spu);
 int spu_irq_class_1_bottom(struct spu *spu);
 void spu_irq_setaffinity(struct spu *spu, int cpu);
+void * spu_get_profile_private(struct spu_context * ctx);
+void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
+			     struct kref * prof_info_kref, 
+			     void (* prof_info_release) (struct kref * kref));
+
 
 /* system callbacks from the SPU */
 struct spu_syscall_block {
Index: linux-2.6.20-rc1/include/linux/oprofile.h
===================================================================
--- linux-2.6.20-rc1.orig/include/linux/oprofile.h	2007-01-18 16:43:18.379575976 -0600
+++ linux-2.6.20-rc1/include/linux/oprofile.h	2007-02-01 17:21:46.970868920 -0600
@@ -17,6 +17,28 @@
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
  
+/* Each escaped entry is prefixed by ESCAPE_CODE
+ * then one of the following codes, then the
+ * relevant data.
+ * These #defines live in this file so that arch-specific
+ * buffer sync'ing code can access them.  
+ */
+#define ESCAPE_CODE                     ~0UL
+#define CTX_SWITCH_CODE                 1
+#define CPU_SWITCH_CODE                 2
+#define COOKIE_SWITCH_CODE              3
+#define KERNEL_ENTER_SWITCH_CODE        4
+#define KERNEL_EXIT_SWITCH_CODE         5
+#define MODULE_LOADED_CODE              6
+#define CTX_TGID_CODE                   7
+#define TRACE_BEGIN_CODE                8
+#define TRACE_END_CODE                  9
+#define XEN_ENTER_SWITCH_CODE          10
+#define SPU_PROFILING_CODE             11
+#define SPU_CTX_SWITCH_CODE            12
+#define SPU_OFFSET_CODE                13
+#define SPU_COOKIE_CODE                14
+
 struct super_block;
 struct dentry;
 struct file_operations;
@@ -35,6 +57,14 @@
 	int (*start)(void);
 	/* Stop delivering interrupts. */
 	void (*stop)(void);
+	/* Arch-specific buffer sync functions.
+	 * Return value = 0:  Success
+	 * Return value = -1: Failure
+	 * Return value = 1:  Run generic sync function
+	 */
+	int (*sync_start)(void);
+	int (*sync_stop)(void);
+
 	/* Initiate a stack backtrace. Optional. */
 	void (*backtrace)(struct pt_regs * const regs, unsigned int depth);
 	/* CPU identification string. */
@@ -56,6 +86,13 @@
 void oprofile_arch_exit(void);
 
 /**
+ * Add data to the event buffer.
+ * The data passed is free-form, but typically consists of
+ * file offsets, dcookies, context information, and ESCAPE codes.
+ */
+void add_event_entry(unsigned long data);
+ 
+/**
  * Add a sample. This may be called from any context. Pass
  * smp_processor_id() as cpu.
  */
Index: linux-2.6.20-rc1/kernel/hrtimer.c
===================================================================
--- linux-2.6.20-rc1.orig/kernel/hrtimer.c	2007-01-18 16:43:05.808489704 -0600
+++ linux-2.6.20-rc1/kernel/hrtimer.c	2007-02-01 17:21:46.973868464 -0600
@@ -335,6 +335,7 @@
 
 	return orun;
 }
+EXPORT_SYMBOL_GPL(hrtimer_forward);
 
 /*
  * enqueue_hrtimer - internal function to (re)start a timer
Index: linux-2.6.20-rc1/arch/powerpc/kernel/time.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/kernel/time.c	2007-02-02 15:47:08.624906680 -0600
+++ linux-2.6.20-rc1/arch/powerpc/kernel/time.c	2007-02-02 17:06:28.183894912 -0600
@@ -122,6 +122,7 @@
 static long timezone_offset;
 
 unsigned long ppc_proc_freq;
+EXPORT_SYMBOL(ppc_proc_freq);
 unsigned long ppc_tb_freq;
 
 static u64 tb_last_jiffy __cacheline_aligned_in_smp;
Index: linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/spufs.h
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/platforms/cell/spufs/spufs.h	2007-02-01 17:21:41.945834112 -0600
+++ linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/spufs.h	2007-02-05 08:06:01.793907392 -0600
@@ -75,6 +75,9 @@
 
 	struct list_head gang_list;
 	struct spu_gang *gang;
+	void * profile_private;		/* To be used only by profiler */
+	struct kref * prof_priv_kref;
+	void (* prof_priv_release) (struct kref *kref);
 };
 
 struct spu_gang {
Index: linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/context.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/platforms/cell/spufs/context.c	2007-02-05 14:42:04.359859432 -0600
+++ linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/context.c	2007-02-06 16:44:05.983965096 -0600
@@ -22,6 +22,7 @@
 
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 #include <linux/slab.h>
 #include <asm/spu.h>
 #include <asm/spu_csa.h>
@@ -71,6 +72,8 @@
 	spu_fini_csa(&ctx->csa);
 	if (ctx->gang)
 		spu_gang_remove_ctx(ctx->gang, ctx);
+	if (ctx->prof_priv_kref)
+		kref_put(ctx->prof_priv_kref, ctx->prof_priv_release);
 	kfree(ctx);
 }
 
@@ -200,3 +203,29 @@
 
 	downgrade_write(&ctx->state_sema);
 }
+
+/* This interface allows a profiler (e.g., OProfile) to store
+ * spu_context information needed for profiling, allowing it to
+ * be saved across context save/restore operation.
+ *
+ * Assumes the caller has already incremented the ref count to
+ * profile_info; then spu_context_destroy must call kref_put
+ * on prof_info_kref.
+ */
+void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
+			     struct kref * prof_info_kref,
+			     void (* prof_info_release) (struct kref * kref))
+{
+	ctx->profile_private = profile_info;
+	ctx->prof_priv_kref = prof_info_kref;
+	ctx->prof_priv_release = prof_info_release;
+}
+EXPORT_SYMBOL_GPL(spu_set_profile_private);
+
+void * spu_get_profile_private(struct spu_context * ctx)
+{
+	return ctx->profile_private;
+}
+EXPORT_SYMBOL_GPL(spu_get_profile_private);
+
+

^ permalink raw reply	[flat|nested] 66+ messages in thread

end of thread, other threads:[~2007-02-28  1:44 UTC | newest]

Thread overview: 66+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-14 23:52 [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch Carl Love
2007-02-15 14:37 ` Arnd Bergmann
2007-02-15 14:37   ` Arnd Bergmann
2007-02-15 16:15   ` Maynard Johnson
2007-02-15 16:15     ` Maynard Johnson
2007-02-15 18:13     ` Arnd Bergmann
2007-02-15 18:13       ` Arnd Bergmann
2007-02-15 20:21   ` Carl Love
2007-02-15 20:21     ` Carl Love
2007-02-15 21:03     ` Arnd Bergmann
2007-02-15 21:03       ` Arnd Bergmann
2007-02-15 21:50     ` Paul E. McKenney
2007-02-15 21:50       ` Paul E. McKenney
2007-02-16  0:33       ` Arnd Bergmann
2007-02-16  0:33         ` Arnd Bergmann
2007-02-16  0:32   ` Maynard Johnson
2007-02-16  0:32     ` Maynard Johnson
2007-02-16 17:14     ` Arnd Bergmann
2007-02-16 17:14       ` Arnd Bergmann
2007-02-16 21:43       ` Maynard Johnson
2007-02-16 21:43         ` Maynard Johnson
2007-02-18 23:18         ` Maynard Johnson
2007-02-18 23:18           ` Maynard Johnson
  -- strict thread matches above, loose matches on Subject: below --
2007-02-22  0:02 Carl Love
2007-02-26 23:50 ` Arnd Bergmann
2007-02-26 23:50   ` Arnd Bergmann
2007-02-27  1:31   ` Michael Ellerman
2007-02-27  1:31     ` Michael Ellerman
2007-02-27 16:52   ` Maynard Johnson
2007-02-27 16:52     ` Maynard Johnson
2007-02-28  1:44     ` Arnd Bergmann
2007-02-28  1:44       ` Arnd Bergmann
2007-02-06  0:28 [RFC,PATCH] CELL PPU " Carl Love
2007-02-06 23:02 ` [Cbe-oss-dev] [RFC, PATCH] CELL " Carl Love
2007-02-06 23:02   ` Carl Love
2007-02-07 15:41   ` Maynard Johnson
2007-02-07 15:41     ` Maynard Johnson
2007-02-07 22:48     ` Michael Ellerman
2007-02-07 22:48       ` Michael Ellerman
2007-02-08 15:03       ` Maynard Johnson
2007-02-08 15:03         ` Maynard Johnson
2007-02-08 14:18   ` Milton Miller
2007-02-08 14:18     ` Milton Miller
2007-02-08 17:21     ` Arnd Bergmann
2007-02-08 17:21       ` Arnd Bergmann
2007-02-08 18:01       ` Adrian Reber
2007-02-08 18:01         ` Adrian Reber
2007-02-08 22:51       ` Carl Love
2007-02-08 22:51         ` Carl Love
2007-02-09  2:46         ` Milton Miller
2007-02-09  2:46           ` Milton Miller
2007-02-09 16:17           ` Carl Love
2007-02-09 16:17             ` Carl Love
2007-02-11 22:46             ` Milton Miller
2007-02-11 22:46               ` Milton Miller
2007-02-12 16:38               ` Carl Love
2007-02-12 16:38                 ` Carl Love
2007-02-09 18:47       ` Milton Miller
2007-02-09 18:47         ` Milton Miller
2007-02-09 19:10         ` Arnd Bergmann
2007-02-09 19:10           ` Arnd Bergmann
2007-02-09 19:46           ` Milton Miller
2007-02-09 19:46             ` Milton Miller
2007-02-08 23:59     ` Maynard Johnson
2007-02-08 23:59       ` Maynard Johnson
2007-02-09 18:03       ` Milton Miller
2007-02-09 18:03         ` Milton Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.