* [Linux-ia64] SAL error record logging/decoding
@ 2003-05-07 23:41 Bjorn Helgaas
  2003-05-08  0:05 ` David Mosberger
                   ` (21 more replies)
  0 siblings, 22 replies; 23+ messages in thread
From: Bjorn Helgaas @ 2003-05-07 23:41 UTC (permalink / raw)
  To: linux-ia64

[-- Attachment #1: Type: text/plain, Size: 2001 bytes --]

The MCA/INIT/CMC/CPE log decoding currently in arch/ia64/kernel/mca.c
has some problems:

	- It doesn't know much about OEM-specific sections.
	- At boot-time, it sometimes takes so long to print
	  the log to the console that the BSP erroneously
	  assumes an AP is stuck.  This sometimes causes
	  *another* MCA.
	- The log goes ONLY to the console, where the output
	  may be lost.

So here's some fodder for discussion.  I don't claim that this is ready
for prime time; I just want to get some feedback on whether this
is a reasonable approach.

The attached patch (against 2.4.21-rc1) makes the raw, binary
error records straight from SAL available via files in /proc:

	/proc/sal/cpu<n>/{mca,init,cmc,cpe}

If you read the file, you get the raw data.  If you write "clear" to
it, you invalidate the current error record (which as I read the spec,
may potentially make another, pending record available to be read).

The idea is that

	- An rc script run at boot-time can save all the logs in
	  files, clearing each afterwards.
	- A user-level analysis tool can decode them as needed
	  (perhaps also run from the same rc script above).
	- The user-level analyzer need not be open-source, if
	  people are worried about IP in the OEM-specific sections.
	- A baseline open-source analyzer can provide at least the
	  functionality available today in the kernel decoder.
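
To make that concrete, a boot-time saver along those lines might look
roughly like the sketch below (illustrative only -- the destination
path, the record size limit, and the CPU bound are assumptions, not
part of the patch):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

static const char *types[] = { "mca", "init", "cmc", "cpe" };

static void
save_and_clear(int cpu, const char *type)
{
	char src[64], dst[128], buf[16384];	/* assumed max record size */
	int in, out;
	ssize_t n;

	snprintf(src, sizeof(src), "/proc/sal/cpu%d/%s", cpu, type);
	in = open(src, O_RDWR);
	if (in < 0)
		return;				/* CPU not present */

	n = read(in, buf, sizeof(buf));
	if (n > 0) {
		snprintf(dst, sizeof(dst), "/var/log/sal/cpu%d-%s.rec", cpu, type);
		out = open(dst, O_WRONLY | O_CREAT | O_TRUNC, 0600);
		if (out >= 0) {
			if (write(out, buf, n) == n)
				write(in, "clear", 5);	/* clear only after the copy is safe */
			close(out);
		}
	}
	close(in);
}

int
main(void)
{
	int cpu, t;

	for (cpu = 0; cpu < 256; cpu++)		/* assumed CPU bound */
		for (t = 0; t < 4; t++)
			save_and_clear(cpu, types[t]);
	return 0;
}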

So, attached are the kernel patch against 2.4.21-rc1 and a simple
user program ("salinfo") to decode the logs.  Note that the kernel
patch removes the SAL clear_state_info calls from mca.c, so the error
records will be preserved until the user program can read them.
This feels like the right thing to me (only a user program
can know that the logs have been saved somewhere safe), but
no doubt there are issues here.

The user-space analyzer is derived from the current kernel code
in mca.c and should produce identical output.  For now, I left
all the code in the kernel as well, but ultimately it could be
removed.

Bjorn

[-- Attachment #2: diffs --]
[-- Type: text/x-diff, Size: 6379 bytes --]

===== arch/ia64/kernel/mca.c 1.23 vs edited =====
--- 1.23/arch/ia64/kernel/mca.c	Fri Apr 18 04:07:09 2003
+++ edited/arch/ia64/kernel/mca.c	Fri May  2 11:24:15 2003
@@ -156,10 +156,6 @@
 	 */
 
 	platform_err = ia64_log_print(sal_info_type, (prfunc_t)printk);
-	/* temporary: only clear SAL logs on hardware-corrected errors
-		or if we're logging an error after an MCA-initiated reboot */
-	if ((sal_info_type > 1) || (called_from_init))
-		ia64_sal_clear_state_info(sal_info_type);
 
 	return platform_err;
 }
@@ -1235,9 +1231,6 @@
 	proc_ptr = &plog_ptr->proc_err;
 
 	ia64_process_min_state_save(&SAL_LPI_PSI_INFO(proc_ptr)->min_state_area);
-
-	/* Clear the INIT SAL logs now that they have been saved in the OS buffer */
-	ia64_sal_clear_state_info(SAL_INFO_TYPE_INIT);
 
 	init_handler_platform(proc_ptr, pt, sw);	/* call platform specific routines */
 }
===== arch/ia64/kernel/salinfo.c 1.1 vs edited =====
--- 1.1/arch/ia64/kernel/salinfo.c	Thu Sep 12 10:43:47 2002
+++ edited/arch/ia64/kernel/salinfo.c	Tue May  6 14:53:28 2003
@@ -4,6 +4,8 @@
  * Creates entries in /proc/sal for various system features.
  *
  * Copyright (c) 2001 Silicon Graphics, Inc.  All rights reserved.
+ * Copyright (c) 2003 Hewlett-Packard Co
+ *	Bjorn Helgaas <bjorn_helgaas@hp.com>
  *
  * 10/30/2001	jbarnes@sgi.com		copied much of Stephane's palinfo
  *					code to create this file
@@ -12,8 +14,10 @@
 #include <linux/types.h>
 #include <linux/proc_fs.h>
 #include <linux/module.h>
+#include <linux/smp.h>
 
 #include <asm/sal.h>
+#include <asm/uaccess.h>
 
 MODULE_AUTHOR("Jesse Barnes <jbarnes@sgi.com>");
 MODULE_DESCRIPTION("/proc interface to IA-64 SAL features");
@@ -40,25 +44,191 @@
 
 #define NR_SALINFO_ENTRIES (sizeof(salinfo_entries)/sizeof(salinfo_entry_t))
 
-/*
- * One for each feature and one more for the directory entry...
- */
-static struct proc_dir_entry *salinfo_proc_entries[NR_SALINFO_ENTRIES + 1];
+static char *salinfo_log_name[] = {
+	"mca",
+	"init",
+	"cmc",
+	"cpe",
+};
+
+static struct proc_dir_entry *salinfo_proc_entries[
+	ARRAY_SIZE(salinfo_entries) +			/* /proc/sal/bus_lock */
+	(NR_CPUS * ARRAY_SIZE(salinfo_log_name)) +	/* /proc/sal/cpu0/mca */
+	NR_CPUS +					/* /proc/sal/cpu0 */
+	1];						/* /proc/sal */
+
+struct salinfo_log_data {
+	int	type;
+	u8	*log_buffer;
+	u64	log_size;
+};
+
+static void
+salinfo_log_read_cpu(void *context)
+{
+	struct salinfo_log_data *info = context;
+	u64 size;
+
+	size = ia64_sal_get_state_info_size(info->type);
+	info->log_buffer = kmalloc(size, GFP_ATOMIC);
+	if (!info->log_buffer)
+		return;
+
+	info->log_size = ia64_sal_get_state_info(info->type, (u64 *) info->log_buffer);
+}
+
+static ssize_t
+salinfo_log_read(struct file *file, char *buffer, size_t count, loff_t *ppos)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct proc_dir_entry *entry = (struct proc_dir_entry *) inode->u.generic_ip;
+	struct salinfo_log_data info;
+	int cpu, ret;
+	void *data;
+	size_t size;
+
+	if (!suser())
+		return -EPERM;
+
+	MOD_INC_USE_COUNT;
+
+	cpu = (u64) entry->data >> 16;
+	info.type = (u64) entry->data & 0xffff;
+
+	if (cpu == smp_processor_id())
+		salinfo_log_read_cpu(&info);
+	else {
+#ifdef CONFIG_SMP
+		smp_call_function_single(cpu, salinfo_log_read_cpu, &info, 0, 1);
+#else
+		printk(KERN_ERR "%s: trying to read CPU %d data from %d\n",
+			__FUNCTION__, cpu, smp_processor_id());
+		info.log_buffer = 0;
+#endif
+	}
+
+	if (!info.log_buffer || *ppos >= info.log_size) {
+		ret = 0;
+		goto out;
+	}
+
+	data = info.log_buffer + file->f_pos;
+	size = info.log_size - file->f_pos;
+	if (size > count)
+		size = count;
+
+	if (copy_to_user(buffer, data, size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	*ppos += size;
+	ret = size;
+
+out:
+	kfree(info.log_buffer);
+
+	MOD_DEC_USE_COUNT;
+
+	return ret;
+}
+
+static void
+salinfo_log_write_cpu(void *context)
+{
+	u64 type = (u64) context;
+
+	ia64_sal_clear_state_info(type);
+}
+
+static ssize_t
+salinfo_log_write(struct file *file, const char *buffer, size_t count, loff_t *ppos)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct proc_dir_entry *entry = (struct proc_dir_entry *) inode->u.generic_ip;
+	char cmd[16];
+	int cpu;
+	u64 type;
+
+	if (!suser())
+		return -EPERM;
+
+	if (ppos != &file->f_pos)
+		return -ESPIPE;
+
+	memset(cmd, 0, sizeof(cmd));
+	if (copy_from_user(cmd, buffer, sizeof(cmd)))
+		return -EFAULT;
+
+	if (strncmp(cmd, "clear", 5))
+		return count;
+
+	MOD_INC_USE_COUNT;
+
+	cpu = (u64) entry->data >> 16;
+	type = (u64) entry->data & 0xffff;
+
+	if (cpu == smp_processor_id())
+		salinfo_log_write_cpu((void *) type);
+	else {
+#ifdef CONFIG_SMP
+		smp_call_function_single(cpu, salinfo_log_write_cpu, (void *) type, 0, 1);
+#else
+		printk(KERN_ERR "%s: trying to clear CPU %d data from %d\n",
+			__FUNCTION__, cpu, smp_processor_id());
+#endif
+	}
+
+	MOD_DEC_USE_COUNT;
+
+	return count;
+}
+
+static struct file_operations salinfo_log_fops = {
+	.read  = salinfo_log_read,
+	.write = salinfo_log_write,
+};
 
 static int __init
 salinfo_init(void)
 {
 	struct proc_dir_entry *salinfo_dir; /* /proc/sal dir entry */
 	struct proc_dir_entry **sdir = salinfo_proc_entries; /* keeps track of every entry */
-	int i;
+	struct proc_dir_entry *cpu_dir, *entry;
+#define CPUSTR "cpu%d"
+	char name[sizeof(CPUSTR)];
+	int i, j;
 
 	salinfo_dir = proc_mkdir("sal", NULL);
+	if (!salinfo_dir)
+		return 0;
 
 	for (i=0; i < NR_SALINFO_ENTRIES; i++) {
 		/* pass the feature bit in question as misc data */
 		*sdir++ = create_proc_read_entry (salinfo_entries[i].name, 0, salinfo_dir,
 						  salinfo_read, (void *)salinfo_entries[i].feature);
 	}
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_online(i))
+			continue;
+
+		sprintf(name, CPUSTR, i);
+		cpu_dir = proc_mkdir(name, salinfo_dir);
+		if (!cpu_dir)
+			continue;
+
+		for (j = 0; j < ARRAY_SIZE(salinfo_log_name); j++) {
+			entry = create_proc_entry(salinfo_log_name[j], 0, cpu_dir);
+			if (entry) {
+				entry->proc_fops = &salinfo_log_fops;
+				entry->data = (void *) ((u64) i << 16 | j);
+				*sdir++ = entry;
+			}
+		}
+		*sdir++ = cpu_dir;
+	}
+
 	*sdir++ = salinfo_dir;
 
 	return 0;
@@ -69,7 +239,7 @@
 {
 	int i = 0;
 
-	for (i = 0; i < NR_SALINFO_ENTRIES ; i++) {
+	for (i = 0; i < ARRAY_SIZE(salinfo_proc_entries); i++) {
 		if (salinfo_proc_entries[i])
 			remove_proc_entry (salinfo_proc_entries[i]->name, NULL);
 	}

[-- Attachment #3: salinfo.tar.gz --]
[-- Type: application/x-tgz, Size: 32027 bytes --]


* Re: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
@ 2003-05-08  0:05 ` David Mosberger
  2003-05-08  0:13 ` Luck, Tony
                   ` (20 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: David Mosberger @ 2003-05-08  0:05 UTC (permalink / raw)
  To: linux-ia64

>>>>> On Wed, 7 May 2003 17:41:08 -0600, Bjorn Helgaas <bjorn_helgaas@hp.com> said:

  Bjorn> The attached patch (against 2.4.21-rc1) makes the raw, binary
  Bjorn> error records straight from SAL available via files in /proc:

  Bjorn> 	/proc/sal/cpu<n>/{mca,init,cmc,cpe}

  Bjorn> If you read the file, you get the raw data.  If you write
  Bjorn> "clear" to it, you invalidate the current error record (which
  Bjorn> as I read the spec, may potentially make another, pending
  Bjorn> record available to be read).

Looks excellent to me, except: wouldn't you want to make this a
filesystem instead?  (Do I sound like Al Viro? ;-)

So instead of /proc/sal you'd do:

	mount -t salfs dummy /wherever

and then you'd get cpu<n>/{mca,init,cmc,cpe} in /wherever.  As of
2.5.67, we have:

	$ cat /proc/filesystems |grep nodev
	nodev   sysfs
	nodev   rootfs
	nodev   bdev
	nodev   proc
	nodev   sockfs
	nodev   usbfs
	nodev   usbdevfs
	nodev   futexfs
	nodev   tmpfs
	nodev   pipefs
	nodev   eventpollfs
	nodev   binfmt_misc
	nodev   devpts
	nodev   ramfs
	nodev   nfs
	nodev   nfs4
	nodev   nfsd
	nodev   autofs
	nodev   rpc_pipefs

So clearly there is a precedent.  Plus it would avoid all the
inefficiencies of /proc.

	--david



* RE: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
  2003-05-08  0:05 ` David Mosberger
@ 2003-05-08  0:13 ` Luck, Tony
  2003-05-08 19:32 ` Bjorn Helgaas
                   ` (19 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Luck, Tony @ 2003-05-08  0:13 UTC (permalink / raw)
  To: linux-ia64

> From: Bjorn Helgaas [mailto:bjorn_helgaas@hp.com] 
> 
> The MCA/INIT/CMC/CPE log decoding currently in arch/ia64/kernel/mca.c
> has some problems:
> 
> 	- It doesn't know much about OEM-specific sections.
> 	- At boot-time, it sometimes takes so long to print
> 	  the log to the console that the BSP erroneously
> 	  assumes an AP is stuck.  This sometimes causes
> 	  *another* MCA.
> 	- The log goes ONLY to the console, where the output
> 	  may be lost.
> 
> So here's some fodder for discussion.  I don't claim that 
> this is ready
> for prime time; I just want to get some feedback on whether this
> is a reasonable approach.
> 
> The attached patch (against 2.4.21-rc1) makes the raw, binary
> error records straight from SAL available via files in /proc:
> 
> 	/proc/sal/cpu<n>/{mca,init,cmc,cpe}
> 
> If you read the file, you get the raw data.  If you write "clear" to
> it, you invalidate the current error record (which as I read the spec,
> may potentially make another, pending record available to be read).
> 
> The idea is that
> 
> 	- An rc script run at boot-time can save all the logs in
> 	  files, clearing each afterwards.
> 	- A user-level analysis tool can decode them as needed
> 	  (perhaps also run from the same rc script above).
> 	- The user-level analyzer need not be open-source, if
> 	  people are worried about IP in the OEM-specific sections.
> 	- A baseline open-source analyzer can provide at least the
> 	  functionality available today in the kernel decoder.
> 
> So, attached are the kernel patch against 2.4.21-rc1 and a simple
> user program ("salinfo") to decode the logs.  Note that the kernel
> patch removes the SAL clear_state_info calls from mca.c, so the error
> records will be preserved until the user program can read them.
> This feels like the right thing to me (only a user program
> can know that the logs have been saved somewhere safe), but
> no doubt there are issues here.
> 
> The user-space analyzer is derived from the current kernel code
> in mca.c and should produce identical output.  For now, I left
> all the code in the kernel as well, but ultimately it could be
> removed.

Definitely a step in the right direction.  SAL error records are
much too big, ugly and verbose to have them run through "printk"
to the console. Parsing in userland is great too.

I've also hit some issues with MCA recovery where printing the
error information from within the MCA handler tripped into other
problems (perhaps because of the time taken as you suggest).  So
I've been pondering some such mechanism too.

When to clear record from the SAL error log is a thorny question.
There are two conflicting goals:
1) Making sure that we minimize the chance that we lose error
information ... i.e. we would like to be sure that the error
record was saved to some permanent storage before we clear it

2) We need to clear records from the SAL log as soon as we can to
make space for subsequent records to be logged (and to reveal other
records that are already in the log).

I think the fact that we need to clear a record to see the next one
might force us into taking a few risks of losing a message ... which
makes me believe that we need a mechanism to read and delete an error
record from the log and buffer it someplace until it can be picked up
from /proc (rather than using the "clear" command to the /proc
interface that you suggest).

-Tony




* Re: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
  2003-05-08  0:05 ` David Mosberger
  2003-05-08  0:13 ` Luck, Tony
@ 2003-05-08 19:32 ` Bjorn Helgaas
  2003-05-20 22:58 ` Bjorn Helgaas
                   ` (18 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Bjorn Helgaas @ 2003-05-08 19:32 UTC (permalink / raw)
  To: linux-ia64

On Wednesday 07 May 2003 6:13 pm, Luck, Tony wrote:
> When to clear record from the SAL error log is a thorny question.
> There are two conflicting goals:
> 1) Making sure that we minimize the chance that we lose error
> information ... i.e. we would like to be sure that the error
> record was saved to some permanent storage before we clear it
> 
> 2) We need to clear records from the SAL log as soon as we can to
> make space for subsequent records to be logged (and to reveal other
> records that are already in the log).
> 
> I think the fact that we need to clear a record to see the next one
> might force us into taking a few risks of losing a message ... which
> makes me believe that we need a mechanism to read and delete an error
> record from the log and buffer it someplace until it can be picked up
> from /proc (rather than using the "clear" command to the /proc
> interface that you suggest).

I actually implemented such a read/buffer/clear mechanism, but
the buffer management makes it much more complicated and I couldn't
see any benefit, based on the following reasoning:

There's always a window between SAL_CHECK (where the error records
are created, consuming buffer space) and SAL_CLEAR_STATE_INFO (where
the buffer space is freed).  Information about events that occur in
that window may be lost, regardless of whether the error records are
cleared by the kernel or by a user application.

I'm unconvinced by the argument that the kernel should call
SAL_CLEAR_STATE_INFO in order to reduce (but not eliminate)
the window.

Here's a likely scenario that shows why I think we have to make
sure the log gets to stable storage before we clear it:

	- MCA occurs
	- Linux reboots
	- Kernel calls SAL_GET_STATE_INFO, copies records to buffer
	- Kernel calls SAL_CLEAR_STATE_INFO
	- Kernel panics because MCA corrupted root filesystem

Now the MCA error records are lost, and it's not even because SAL
ran out of buffer space!  We might argue that for this reason, the
kernel ought to decode the records to the console, but even then
the console output might not be logged, and vital OEM data might
not be decoded at all.

With my proposal, we at least have the possibility of dumping the
error records from the EFI user interface, even if we can no longer
boot the kernel.

Bjorn




* Re: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
                   ` (2 preceding siblings ...)
  2003-05-08 19:32 ` Bjorn Helgaas
@ 2003-05-20 22:58 ` Bjorn Helgaas
  2003-05-21 18:06 ` Luck, Tony
                   ` (17 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Bjorn Helgaas @ 2003-05-20 22:58 UTC (permalink / raw)
  To: linux-ia64

I'd like to make some sort of forward progress on this issue
before releasing a 2.4.21 ia64 patch.  By default this will
be to apply the original patch I posted on May 7.

I'm not trying to railroad anything here, but 2.4.21 is
getting close, and I'd rather not wait until 2.4.22.  So
if there are any major issues with the proposal, please
raise them!  The only real issue so far has been when
the error records should be cleared from the firmware log,
and I'm certainly open to more discussion on that.

As for 2.5, I'm willing to turn it into a filesystem if
that's the Right Thing (though it feels like all the
existing /proc/pal, /proc/sal, and /proc/efi stuff should
be converted at the same time, which is more than I really
had in mind).  In any case, it will take me some time to
learn about how that infrastructure works.

Bjorn




* RE: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
                   ` (3 preceding siblings ...)
  2003-05-20 22:58 ` Bjorn Helgaas
@ 2003-05-21 18:06 ` Luck, Tony
  2003-05-21 20:48 ` Luck, Tony
                   ` (16 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Luck, Tony @ 2003-05-21 18:06 UTC (permalink / raw)
  To: linux-ia64

> From: Bjorn Helgaas [mailto:bjorn_helgaas@hp.com]
> 
> I'd like to make some sort of forward progress on this issue
> before releasing a 2.4.21 ia64 patch.  By default this will
> be to apply the original patch I posted on May 7.
> 
> I'm not trying to railroad anything here, but 2.4.21 is
> getting close, and I'd rather not wait until 2.4.22.  So
> if there are any major issues with the proposal, please
> raise them!  The only real issue so far has been when
> the error records should be cleared from the firmware log,
> and I'm certainly open to more discussion on that.
> 
> As for 2.5, I'm willing to turn it into a filesystem if
> that's the Right Thing (though it feels like all the
> existing /proc/pal, /proc/sal, and /proc/efi stuff should
> be converted at the same time, which is more than I really
> had in mind).  In any case, it will take me some time to
> learn about how that infrastructure works.

My only reservation with this is making sure that we deal
with error records in a rapid manner, so that a sequence
of errors occurring in quick succession has less chance
of overflowing the SAL log.

Is there any easy way to get a notification that there is
a record waiting in one of the /proc/sal/cpuX/* files ...
so that a daemon process can wake promptly to pick up the
error, log it, and issue the "clear" to free space for the
next error?  Does /proc support a poll/select mechanism that
could be used here?  Or perhaps there could be a top level
file /proc/sal/error which blocks on read from the daemon
and doesn't return until there's a record to be read? Maybe
the returned data from the read would be the cpu and error
class to tell the daemon which file to read?

Without some such method, the daemon needs to keep reading
all the /proc/sal/cpuX/* files at regular intervals.  This
doesn't look like it scales well. A hypothetical 256 cpu
machine has 1024 such files that need to be checked, so even
with a poll interval of 10 seconds for each file, we'd need
to issue 102 reads per second from the daemon.

-Tony



* RE: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
                   ` (4 preceding siblings ...)
  2003-05-21 18:06 ` Luck, Tony
@ 2003-05-21 20:48 ` Luck, Tony
  2003-05-21 21:51 ` Luck, Tony
                   ` (15 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Luck, Tony @ 2003-05-21 20:48 UTC (permalink / raw)
  To: linux-ia64

> I'd like to make some sort of forward progress on this issue
> before releasing a 2.4.21 ia64 patch.  By default this will
> be to apply the original patch I posted on May 7.

A coding nit ...

The #ifdef CONFIG_SMP in the read/write functions seems a little
convoluted, resulting in the weird can't-ever-happen error message
for the non-SMP case that tries to read/write from the wrong
cpu.  Why not write each in the form:

#ifdef CONFIG_SMP
        if (cpu == smp_processor_id())
                salinfo_log_read_cpu(&info);
        else
                smp_call_function_single(cpu, salinfo_log_read_cpu, &info, 0, 1);
#else
        salinfo_log_read_cpu(&info);
#endif

-Tony



* RE: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
                   ` (5 preceding siblings ...)
  2003-05-21 20:48 ` Luck, Tony
@ 2003-05-21 21:51 ` Luck, Tony
  2003-05-22 21:29 ` Bjorn Helgaas
                   ` (14 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Luck, Tony @ 2003-05-21 21:51 UTC (permalink / raw)
  To: linux-ia64

Some minor issues with the "salinfo" tool.

1) It doesn't compile :-(

 mca.c: In function `ia64_log_processor_info_print':
 mca.c:961: `printf' undeclared (first use in this function)
 mca.c:961: (Each undeclared identifier is reported only once
 mca.c:961: for each function it appears in.)
 make: *** [mca.o] Error 1

I added an "extern int printf(char *, ...);" declaration rather
than risking including <stdio.h>

2) I crashed my machine with an injected machine check, and
then rebooted.  All four of the /proc/sal/cpuX/mca files had
a copy of the same error record.  Echoing "clear" to one of
them made them all go away.

I think this is normal ... but it may require some interesting
documentation to say why things work like this.

3) The salinfo tool uses exponential increases in the size of the
read that it tries from the /proc/sal/cpuX/mca file.  My particular
error record was 5560 bytes long and strace reports:

  read(3, ""..., 1024) = 1024
  read(3, ""..., 1024) = 1024
  read(3, ""..., 2048) = 2048
  read(3, ""..., 4096) = 1464
  read(3, "", 2632)    = 0

A hypothetically large enough record would result in salinfo reading
more than a page in one piece through /proc, which I think breaks the
way arch/ia64/kernel/salinfo.c is interfacing with /proc.  Perhaps
the salinfo utility should just grow the buffer in 1k increments with
	alloc += 1024;
rather than using
	alloc *= 2;

4) Reading this way is also kind of weird in that every partial read
results in the kernel going back to re-fetch the data from the SAL
with another call to ia64_sal_get_state_info().  One kludgy fix would
be to have the salinfo tool use "getpagesize()" as the initial size
and increment for the buffer it uses (at least for kernels with a 16k
page size ... error records should generally be small enough for a
single slurp). Though we'd still do one extra call to get the nbytes==0
return to signify the EOF (unless we assume the partial read got us
all the data?)
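
To make 3) and 4) concrete, the reading loop on the salinfo side might
look roughly like this sketch -- getpagesize() as the starting size and
the 1k growth step are just the suggestions above, nothing tested:

#include <stdlib.h>
#include <unistd.h>

/* Sketch only: read one /proc/sal/cpuX/* record into a growing buffer,
 * starting at one page and growing in 1k steps instead of doubling. */
static char *
read_record(int fd, size_t *len)
{
	size_t alloc = getpagesize();		/* usually enough for one slurp */
	size_t used = 0;
	char *buf = malloc(alloc), *tmp;
	ssize_t n;

	if (!buf)
		return NULL;

	while ((n = read(fd, buf + used, alloc - used)) > 0) {
		used += n;
		if (used == alloc) {		/* record bigger than expected */
			alloc += 1024;
			tmp = realloc(buf, alloc);
			if (!tmp) {
				free(buf);
				return NULL;
			}
			buf = tmp;
		}
	}
	*len = used;				/* n == 0 signals EOF */
	return buf;
}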



* Re: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
                   ` (6 preceding siblings ...)
  2003-05-21 21:51 ` Luck, Tony
@ 2003-05-22 21:29 ` Bjorn Helgaas
  2003-05-23  0:24 ` Bjorn Helgaas
                   ` (13 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Bjorn Helgaas @ 2003-05-22 21:29 UTC (permalink / raw)
  To: linux-ia64

On Wednesday 21 May 2003 2:48 pm, Luck, Tony wrote:
> The #ifdef CONFIG_SMP in the read/write functions seems a little
> convoluted, resulting in the weird can't ever happen error message
> for the non-SMP case that is trying to read/write from the wrong
> cpu.  Why not write each in the form:
> 
> #ifdef CONFIG_SMP
>         if (cpu == smp_processor_id())
>                 salinfo_log_read_cpu(&info);
>         else
>                 smp_call_function_single(cpu, salinfo_log_read_cpu, &info, 0, 1);
> #else
>         salinfo_log_read_cpu(&info);
> #endif

Nice!  I copied the original from somewhere (can't
remember where ATM), but yours is much better.  Thanks!

Bjorn



* Re: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
                   ` (7 preceding siblings ...)
  2003-05-22 21:29 ` Bjorn Helgaas
@ 2003-05-23  0:24 ` Bjorn Helgaas
  2003-05-23 15:42 ` Luck, Tony
                   ` (12 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Bjorn Helgaas @ 2003-05-23  0:24 UTC (permalink / raw)
  To: linux-ia64

On Wednesday 21 May 2003 3:51 pm, Luck, Tony wrote:
> Some minor issues with the "salinfo" tool.
> 
> 1) It doesn't compile :-(

I compiled for me (on debian), but I'll add the prototype.

> 2) I crashed my machine with an injected machine check, and
> then rebooted.  All four of the /proc/sal/cpuX/mca files had
> a copy of the same error record.  Echoing "clear" to one of
> them made them all go away.

Hmm...  this sounds like a reflection of the underlying firmware
behavior.  I tried this on a 2-way HP box, and the cpu0/mca
file was different than cpu1/mca, and clearing one did not
clear the other.

> I think this is normal ... but it may require some interesting
> documentation to say why things work like this.

Why do you think that's normal?  It sounds pretty strange
to me.

> 3) The salinfo tool uses exponential increases in the size of the
> read that it tries from the /proc/sal/cpuX/mca file.  
> ...
> A hypothetically large enough record would result in salinfo reading
> more than a page in one piece through /proc, which I think breaks the
> way arch/ia64/kernel/salinfo.c is interfacing with /proc.

I actually expected that to be a problem, but I copied the
code from the /proc/acpi/dsdt stuff, and it seems to be
able to export over 40K of data on my x86 laptop just fine.
So maybe both ACPI and my salinfo stuff are broken, but
I haven't seen any complaints about the ACPI version.
(A weak argument, I know; I just don't know very much
about doing things in /proc :-)

> 4) Reading this way is also kind of weird in that every partial read
> results in the kernel going back to re-fetch the data from the SAL
> with another call to ia64_sal_get_state_info().  One kludgy fix would
> be to have the salinfo tool use "getpagesize()" as the initial size
> and increment for the buffer it uses (at least for kernels with a 16k
> page size ... error records should generally be small enough for a
> single slurp). Though we'd still do one extra call to get the nbytes==0
> return to signify the EOF (unless we assume the partial read got us
> all the data?)

I think making the initial size 8K or 16K seems reasonable.  I
wanted to minimize the management of the kernel buffer, but
I suppose we could do the allocate/get_state_info at open-time,
and deallocate in close.  I'll look at that tomorrow.
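
A rough sketch of the open/close idea (illustrative only -- the struct
and the release hook are made-up names, and the suser() check, the
cross-CPU dispatch, and locking are omitted here):

/* Cache the record at open(), free it at release();
 * read() would then copy out of file->private_data. */
struct salinfo_cached_log {
	u8	*buf;
	u64	size;
};

static int
salinfo_log_open(struct inode *inode, struct file *file)
{
	struct proc_dir_entry *entry = (struct proc_dir_entry *) inode->u.generic_ip;
	u64 type = (u64) entry->data & 0xffff;
	struct salinfo_cached_log *log;

	log = kmalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return -ENOMEM;

	log->buf = kmalloc(ia64_sal_get_state_info_size(type), GFP_KERNEL);
	if (!log->buf) {
		kfree(log);
		return -ENOMEM;
	}
	log->size = ia64_sal_get_state_info(type, (u64 *) log->buf);

	file->private_data = log;
	return 0;
}

static int
salinfo_log_release(struct inode *inode, struct file *file)
{
	struct salinfo_cached_log *log = file->private_data;

	kfree(log->buf);
	kfree(log);
	return 0;
}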

Bjorn



* RE: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
                   ` (8 preceding siblings ...)
  2003-05-23  0:24 ` Bjorn Helgaas
@ 2003-05-23 15:42 ` Luck, Tony
  2003-05-28 23:26 ` Bjorn Helgaas
                   ` (11 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Luck, Tony @ 2003-05-23 15:42 UTC (permalink / raw)
  To: linux-ia64

> > 2) I crashed my machine with an injected machine check, and
> > then rebooted.  All four of the /proc/sal/cpuX/mca files had
> > a copy of the same error record.  Echoing "clear" to one of
> > them made them all go away.
> 
> Hmm...  this sounds like a reflection of the underlying firmware
> behavior.  I tried this on a 2-way HP box, and the cpu0/mca
> file was different than cpu1/mca, and clearing one did not
> clear the other.
> 
> > I think this is normal ... but it may require some interesting
> > documentation to say why things work like this.
> 
> Why do you think that's normal?  It sounds pretty strange
> to me.

I think that a fatal error record that is retrieved after the
reboot isn't really attached to any particular CPU ... so I can
see the same thing whichever cpu calls into SAL to look at the
log.  Since there is only one record there, clearing it from any
cpu makes it go away globally.  But I'll have to re-read a lot of
SAL spec to see if that is:
	1) intended behaviour
	2) a quirky, but legal SAL implementation
	3) a bug

> > 3) The salinfo tool uses exponential increases in the size of the
> > read that it tries from the /proc/sal/cpuX/mca file.  
> > ...
> > A hypothetically large enough record would result in salinfo reading
> > more than a page in one piece through /proc, which I think 
> breaks the
> > way arch/ia64/kernel/salinfo.c is interfacing with /proc.
> 
> I actually expected that to be a problem, but I copied the
> code from the /proc/acpi/dsdt stuff, and it seems to be
> able to export over 40K of data on my x86 laptop just fine.
> So maybe both ACPI and my salinfo stuff are broken, but
> I haven't seen any complaints about the ACPI version.
> (A weak argument, I know; I just don't know very much
> about doing things in /proc :-)

I'm not fully up to speed on /proc either ... I think your code
is right after all; I was just mixed up with an alternate "read"
interface to /proc that was intended for simpler use and had a
one-page limit.
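
(For reference, the simpler interface I had in mind is the read_proc
style callback -- the same kind of hook the existing /proc/sal feature
entries use via create_proc_read_entry() -- which hands the handler a
single page to fill; a sketch with made-up names is below.  Installing
full file_operations through entry->proc_fops, as your log files do,
avoids that limit.)

/* Sketch of the page-limited /proc read interface: the callback
 * fills at most one page supplied by procfs. */
static int
salinfo_feature_read_proc(char *page, char **start, off_t off, int count,
			  int *eof, void *data)
{
	int len = sprintf(page, "feature 0x%lx\n", (unsigned long) data);

	*eof = 1;
	return len;
}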

> > 4) Reading this way is also kind of weird in that every partial read
> > results in the kernel going back to re-fetch the data from the SAL
> > with another call to ia64_sal_get_state_info().  One kludgy 
> fix would
> > be to have the salinfo tool use "getpagesize()" as the initial size
> > and increment for the buffer it uses (at least for kernels 
> with a 16k
> > page size ... error records should generally be small enough for a
> > single slurp). Though we'd still do one extra call to get 
> the nbytes==0
> > return to signify the EOF (unless we assume the partial read got us
> > all the data?)
> 
> I think making the initial size 8K or 16K seems reasonable.  I
> wanted to minimize the management of the kernel buffer, but
> I suppose we could do the allocate/get_state_info at open-time,
> and deallocate in close.  I'll look at that tomorrow.

If this comes together cleanly, then great ... otherwise don't sweat
this too much ... if reading SAL error records is in your performance
path, then your machine is in deep trouble!

-Tony



* Re: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
                   ` (9 preceding siblings ...)
  2003-05-23 15:42 ` Luck, Tony
@ 2003-05-28 23:26 ` Bjorn Helgaas
  2003-05-29  0:07 ` Keith Owens
                   ` (10 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Bjorn Helgaas @ 2003-05-28 23:26 UTC (permalink / raw)
  To: linux-ia64

[-- Attachment #1: Type: text/plain, Size: 11561 bytes --]

On Wednesday 21 May 2003 12:06 pm, Luck, Tony wrote:
> Is there any easy way to get a notification that there is
> a record waiting in one of the /proc/sal/cpuX/* files ...

That's a good question that hadn't even occurred to me.
Here's another pass; see what you think:

	- Added support for poll(2) on the cpuX/* files
	- App keeps all the (4*NR_CPUS) files open
	- poll(2) tells app which files have data to be read
	- Added wakeup at the place we currently call
	  ia64_log_print

It got a little more involved than I hoped, because I didn't
want to call get_state_info() for all (4*NR_CPUS) files every
time the app called poll(2).
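
For the application side, the daemon loop this enables might look
roughly like the sketch below (the CPU bound, buffer size, and the
save-to-disk step are placeholders, not part of the patch):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <poll.h>

#define MAX_FDS	1024			/* 4 log types * 256 CPUs, say */

static const char *types[] = { "mca", "init", "cmc", "cpe" };

int
main(void)
{
	struct pollfd pfd[MAX_FDS];
	char path[64], buf[16384];
	ssize_t n;
	int nfds = 0, fd, cpu, t, i;

	for (cpu = 0; cpu < 256 && nfds < MAX_FDS; cpu++) {
		for (t = 0; t < 4; t++) {
			snprintf(path, sizeof(path), "/proc/sal/cpu%d/%s",
				 cpu, types[t]);
			fd = open(path, O_RDWR);
			if (fd < 0)
				continue;	/* CPU not online */
			pfd[nfds].fd = fd;
			pfd[nfds].events = POLLIN;
			nfds++;
		}
	}

	while (poll(pfd, nfds, -1) >= 0) {
		for (i = 0; i < nfds; i++) {
			if (!(pfd[i].revents & POLLIN))
				continue;
			n = read(pfd[i].fd, buf, sizeof(buf));
			if (n > 0) {
				/* ... save buf[0..n) to stable storage ... */
				write(pfd[i].fd, "clear", 5);
			}
			lseek(pfd[i].fd, 0, SEEK_SET);	/* rewind for the next event */
		}
	}
	return 0;
}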

Oh, and I removed the MCA printks from the kernel.  I'll
go back and pull all the other printks out too, but I'll
propose that as a separate patch to avoid cluttering this
one.

Later, Tony wrote:
> 4) Reading this way is also kind of weird in that every partial read
> results in the kernel going back to re-fetch the data from the SAL
> with another call to ia64_sal_get_state_info().

I thought about trying to avoid this by caching the records in
open(), but then the app has to close and reopen the file in
order to see new data.  I really want to avoid managing buffers
of this stuff in the kernel, and I couldn't figure out a good
way to avoid the extra get_state_info() calls without maintaining
a lot more state.  With the current patch, we do about 4 or 5
get_state_info() calls per event:

	- ia64_os_mca_tlb_error_check() in mca_asm.S
	- ia64_log_get() in mca.c ISR
	- salinfo_log_read_cpu() when app reads record
	- salinfo_log_read_cpu() (to get EOF indication
	  after a short read)
	- salinfo_log_clear_cpu() when app clears record
	  (to see whether this makes another record available)

and I think we can get rid of the one in mca.c by just doing
the appropriate wakeup.  If the kernel doesn't decode anything,
there's probably no need to even get the record.

That still seems like a lot, but at least it scales with the
number of events that occur, not the number of CPUs and event
types.

Bjorn


===== arch/ia64/kernel/mca.c 1.26 vs edited =====
--- 1.26/arch/ia64/kernel/mca.c	Fri May 16 04:33:42 2003
+++ edited/arch/ia64/kernel/mca.c	Wed May 28 09:48:33 2003
@@ -130,12 +130,14 @@
  */
 static int cmc_polling_enabled = 1;
 
+extern void salinfo_log_wakeup(int);
+
 /*
  *  ia64_mca_log_sal_error_record
  *
- *  This function retrieves a specified error record type from SAL, sends it to
- *  the system log, and notifies SALs to clear the record from its non-volatile
- *  memory.
+ *  This function retrieves a specified error record type from SAL,
+ *  wakes up any processes waiting for error records, and sends it to
+ *  the system log.
  *
  *  Inputs  :   sal_info_type   (Type of error record MCA/CMC/CPE/INIT)
  *  Outputs :   platform error status
@@ -155,11 +157,8 @@
 	 * 3. set ia64_os_mca_recovery_successful flag, if applicable
 	 */
 
+	salinfo_log_wakeup(sal_info_type);
 	platform_err = ia64_log_print(sal_info_type, (prfunc_t)printk);
-	/* temporary: only clear SAL logs on hardware-corrected errors
-		or if we're logging an error after an MCA-initiated reboot */
-	if ((sal_info_type > 1) || (called_from_init))
-		ia64_sal_clear_state_info(sal_info_type);
 
 	return platform_err;
 }
@@ -369,9 +368,7 @@
  *  ia64_mca_check_errors
  *
  *  External entry to check for error records which may have been posted by SAL
- *  for a prior failure which resulted in a machine shutdown before an the
- *  error could be logged.  This function must be called after the filesystem
- *  is initialized.
+ *  for a prior failure.
  *
  *  Inputs  :   None
  *
@@ -383,6 +380,7 @@
 	/*
 	 *  If there is an MCA error record pending, get it and log it.
 	 */
+	printk("CPU %d: checking for saved MCA error records\n", smp_processor_id());
 	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA, 1);
 
 	return 0;
@@ -1250,9 +1248,6 @@
 
 	ia64_process_min_state_save(&SAL_LPI_PSI_INFO(proc_ptr)->min_state_area);
 
-	/* Clear the INIT SAL logs now that they have been saved in the OS buffer */
-	ia64_sal_clear_state_info(SAL_INFO_TYPE_INIT);
-
 	init_handler_platform(proc_ptr, pt, sw);	/* call platform specific routines */
 }
 
@@ -2302,10 +2297,8 @@
 
 	switch(sal_info_type) {
 	      case SAL_INFO_TYPE_MCA:
-		prfunc("+BEGIN HARDWARE ERROR STATE AT MCA\n");
-		platform_err = ia64_log_platform_info_print(IA64_LOG_CURR_BUFFER(sal_info_type),
-							    prfunc);
-		prfunc("+END HARDWARE ERROR STATE AT MCA\n");
+		prfunc("+CPU %d: SAL log contains MCA error record\n", smp_processor_id());
+		ia64_log_rec_header_print(IA64_LOG_CURR_BUFFER(sal_info_type), prfunc);
 		break;
 	      case SAL_INFO_TYPE_INIT:
 		prfunc("+MCA INIT ERROR LOG (UNIMPLEMENTED)\n");
===== arch/ia64/kernel/salinfo.c 1.1 vs edited =====
--- 1.1/arch/ia64/kernel/salinfo.c	Thu Sep 12 10:43:47 2002
+++ edited/arch/ia64/kernel/salinfo.c	Wed May 28 15:46:59 2003
@@ -4,16 +4,21 @@
  * Creates entries in /proc/sal for various system features.
  *
  * Copyright (c) 2001 Silicon Graphics, Inc.  All rights reserved.
+ * Copyright (c) 2003 Hewlett-Packard Co
+ *	Bjorn Helgaas <bjorn.helgaas@hp.com>
  *
  * 10/30/2001	jbarnes@sgi.com		copied much of Stephane's palinfo
  *					code to create this file
  */
 
 #include <linux/types.h>
+#include <linux/poll.h>
 #include <linux/proc_fs.h>
 #include <linux/module.h>
+#include <linux/smp.h>
 
 #include <asm/sal.h>
+#include <asm/uaccess.h>
 
 MODULE_AUTHOR("Jesse Barnes <jbarnes@sgi.com>");
 MODULE_DESCRIPTION("/proc interface to IA-64 SAL features");
@@ -40,25 +45,236 @@
 
 #define NR_SALINFO_ENTRIES (sizeof(salinfo_entries)/sizeof(salinfo_entry_t))
 
-/*
- * One for each feature and one more for the directory entry...
- */
-static struct proc_dir_entry *salinfo_proc_entries[NR_SALINFO_ENTRIES + 1];
+static char *salinfo_log_name[] = {
+	"mca",
+	"init",
+	"cmc",
+	"cpe",
+};
+
+static struct proc_dir_entry *salinfo_proc_entries[
+	ARRAY_SIZE(salinfo_entries) +			/* /proc/sal/bus_lock */
+	(NR_CPUS * ARRAY_SIZE(salinfo_log_name)) +	/* /proc/sal/cpu0/mca */
+	NR_CPUS +					/* /proc/sal/cpu0 */
+	1];						/* /proc/sal */
+
+struct salinfo_log_data {
+	int	type;
+	u8	*log_buffer;
+	u64	log_size;
+};
+
+struct salinfo_wait_queue {
+	int			cpu;
+	int			type;
+	int			event_ready;
+	wait_queue_head_t	queue;
+};
+
+static struct salinfo_wait_queue *salinfo_wait[NR_CPUS][ARRAY_SIZE(salinfo_log_name)];
+
+void
+salinfo_log_wakeup(int type)
+{
+	int cpu = smp_processor_id();
+
+	if (type < ARRAY_SIZE(salinfo_log_name)) {
+		struct salinfo_wait_queue *wait = salinfo_wait[cpu][type];
+
+		if (wait) {
+			wait->event_ready = 1;
+			wake_up_interruptible(&wait->queue);
+		}
+	}
+}
+
+static int
+salinfo_log_open(struct inode *inode, struct file *file)
+{
+	if (!suser())
+		return -EPERM;
+	return 0;
+}
+
+static void
+call_on_cpu(int cpu, void (*fn)(void *), void *arg)
+{
+#ifdef CONFIG_SMP
+	if (cpu == smp_processor_id())
+		(*fn)(arg);
+	else
+		smp_call_function_single(cpu, fn, arg, 0, 1);
+#else
+	(*fn)(arg);
+#endif
+}
+
+static void
+salinfo_log_read_cpu(void *context)
+{
+	struct salinfo_log_data *info = context;
+	struct salinfo_wait_queue *wait = salinfo_wait[smp_processor_id()][info->type];
+	u64 size;
+
+	size = ia64_sal_get_state_info_size(info->type);
+	info->log_buffer = kmalloc(size, GFP_ATOMIC);
+	if (!info->log_buffer)
+		return;
+
+	wait->event_ready = 0;
+	info->log_size = ia64_sal_get_state_info(info->type, (u64 *) info->log_buffer);
+	if (info->log_size)
+		salinfo_log_wakeup(info->type);
+}
+
+static ssize_t
+salinfo_log_read(struct file *file, char *buffer, size_t count, loff_t *ppos)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct proc_dir_entry *entry = (struct proc_dir_entry *) inode->u.generic_ip;
+	struct salinfo_wait_queue *wait = entry->data;
+	struct salinfo_log_data info;
+	int ret;
+	void *data;
+	size_t size;
+
+	info.type = wait->type;
+	call_on_cpu(wait->cpu, salinfo_log_read_cpu, &info);
+
+	if (!info.log_buffer || *ppos >= info.log_size) {
+		ret = 0;
+		goto out;
+	}
+
+	data = info.log_buffer + file->f_pos;
+	size = info.log_size - file->f_pos;
+	if (size > count)
+		size = count;
+
+	if (copy_to_user(buffer, data, size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	*ppos += size;
+	ret = size;
+
+out:
+	kfree(info.log_buffer);
+	return ret;
+}
+
+static void
+salinfo_log_clear_cpu(void *context)
+{
+	struct salinfo_wait_queue *wait = context;
+	struct salinfo_log_data info;
+
+	wait->event_ready = 0;		/* avoid race with another wakeup */
+	ia64_sal_clear_state_info(wait->type);
+
+	info.type = wait->type;
+	salinfo_log_read_cpu(&info);
+	if (info.log_buffer && info.log_size)
+		salinfo_log_wakeup(wait->type);
+
+	kfree(info.log_buffer);
+}
+
+static ssize_t
+salinfo_log_write(struct file *file, const char *buffer, size_t count, loff_t *ppos)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct proc_dir_entry *entry = (struct proc_dir_entry *) inode->u.generic_ip;
+	struct salinfo_wait_queue *wait = entry->data;
+	char cmd[16];
+
+	if (ppos != &file->f_pos)
+		return -ESPIPE;
+
+	memset(cmd, 0, sizeof(cmd));
+	if (copy_from_user(cmd, buffer, sizeof(cmd)))
+		return -EFAULT;
+
+	if (strncmp(cmd, "clear", 5))
+		return count;
+
+	call_on_cpu(wait->cpu, salinfo_log_clear_cpu, wait);
+	return count;
+}
+
+static unsigned int
+salinfo_log_poll(struct file *file, poll_table *polltab)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct proc_dir_entry *entry = (struct proc_dir_entry *) inode->u.generic_ip;
+	struct salinfo_wait_queue *wait = entry->data;
+	unsigned int mask;
+
+	poll_wait(file, &wait->queue, polltab);
+	mask = POLLOUT | POLLWRNORM;
+	if (wait->event_ready)
+		mask |= POLLIN | POLLRDNORM;
+	return mask;
+}
+
+static struct file_operations salinfo_log_fops = {
+	.open  = salinfo_log_open,
+	.read  = salinfo_log_read,
+	.write = salinfo_log_write,
+	.poll  = salinfo_log_poll,
+};
 
 static int __init
 salinfo_init(void)
 {
 	struct proc_dir_entry *salinfo_dir; /* /proc/sal dir entry */
 	struct proc_dir_entry **sdir = salinfo_proc_entries; /* keeps track of every entry */
-	int i;
+	struct proc_dir_entry *cpu_dir, *entry;
+	struct salinfo_wait_queue *wait;
+#define CPUSTR "cpu%d"
+	char name[sizeof(CPUSTR)];
+	int i, j;
 
 	salinfo_dir = proc_mkdir("sal", NULL);
+	if (!salinfo_dir)
+		return 0;
 
 	for (i=0; i < NR_SALINFO_ENTRIES; i++) {
 		/* pass the feature bit in question as misc data */
 		*sdir++ = create_proc_read_entry (salinfo_entries[i].name, 0, salinfo_dir,
 						  salinfo_read, (void *)salinfo_entries[i].feature);
 	}
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_online(i))
+			continue;
+
+		sprintf(name, CPUSTR, i);
+		cpu_dir = proc_mkdir(name, salinfo_dir);
+		if (!cpu_dir)
+			continue;
+
+		for (j = 0; j < ARRAY_SIZE(salinfo_log_name); j++) {
+			wait = kmalloc(sizeof(*wait), GFP_KERNEL);
+			if (!wait)
+				continue;
+
+			entry = create_proc_entry(salinfo_log_name[j], 0, cpu_dir);
+			if (entry) {
+				wait->cpu = i;
+				wait->type = j;
+				wait->event_ready = 1;	/* assume we missed one */
+				init_waitqueue_head(&wait->queue);
+				salinfo_wait[i][j] = wait;
+				entry->data = wait;
+				entry->proc_fops = &salinfo_log_fops;
+				*sdir++ = entry;
+			}
+		}
+		*sdir++ = cpu_dir;
+	}
+
 	*sdir++ = salinfo_dir;
 
 	return 0;
@@ -69,7 +285,7 @@
 {
 	int i = 0;
 
-	for (i = 0; i < NR_SALINFO_ENTRIES ; i++) {
+	for (i = 0; i < ARRAY_SIZE(salinfo_proc_entries); i++) {
 		if (salinfo_proc_entries[i])
 			remove_proc_entry (salinfo_proc_entries[i]->name, NULL);
 	}


[-- Attachment #2: salinfo-0.1.tar.gz --]
[-- Type: application/x-tgz, Size: 33333 bytes --]


* Re: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
                   ` (10 preceding siblings ...)
  2003-05-28 23:26 ` Bjorn Helgaas
@ 2003-05-29  0:07 ` Keith Owens
  2003-05-29  1:34 ` Bjorn Helgaas
                   ` (9 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Keith Owens @ 2003-05-29  0:07 UTC (permalink / raw)
  To: linux-ia64

On Wed, 28 May 2003 17:26:02 -0600, 
Bjorn Helgaas <bjorn_helgaas@hp.com> wrote:
>+#define CPUSTR "cpu%d"
>+	char name[sizeof(CPUSTR)];

Too small.  name[] will overflow at cpu 100.




* Re: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
                   ` (11 preceding siblings ...)
  2003-05-29  0:07 ` Keith Owens
@ 2003-05-29  1:34 ` Bjorn Helgaas
  2003-05-29  1:37 ` Keith Owens
                   ` (8 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Bjorn Helgaas @ 2003-05-29  1:34 UTC (permalink / raw)
  To: linux-ia64

On Wednesday 28 May 2003 6:07 pm, Keith Owens wrote:
> >+#define CPUSTR "cpu%d"
> >+	char name[sizeof(CPUSTR)];
> 
> Too small.  name[] will overflow at cpu 100.

Thanks.  I remember thinking about that when I copied the code,
but forgot to fix it.  How about something like this:

--- 1.12/arch/ia64/kernel/palinfo.c	Mon Apr 21 04:52:56 2003
+++ edited/arch/ia64/kernel/palinfo.c	Wed May 28 19:02:32 2003
@@ -914,7 +914,7 @@
 	struct proc_dir_entry **pdir = palinfo_proc_entries;
 	struct proc_dir_entry *palinfo_dir, *cpu_dir;
 	int i, j;
-	char cpustr[sizeof(CPUSTR)];
+	char cpustr[sizeof(CPUSTR) + 2];
 
 	printk(KERN_INFO "PAL Information Facility v%s\n", PALINFO_VERSION);
 
@@ -928,7 +928,7 @@
 
 		if (!cpu_online(i)) continue;
 
-		sprintf(cpustr,CPUSTR, i);
+		snprintf(cpustr, sizeof(cpustr), CPUSTR, i);
 
 		cpu_dir = proc_mkdir(cpustr, palinfo_dir);
 
--- arch/ia64/kernel/salinfo.c.orig	2003-05-28 19:03:04.000000000 -0600
+++ arch/ia64/kernel/salinfo.c	2003-05-28 19:03:21.000000000 -0600
@@ -233,7 +233,7 @@
 	struct proc_dir_entry *cpu_dir, *entry;
 	struct salinfo_wait_queue *wait;
 #define CPUSTR "cpu%d"
-	char name[sizeof(CPUSTR)];
+	char name[sizeof(CPUSTR) + 2];
 	int i, j;
 
 	salinfo_dir = proc_mkdir("sal", NULL);
@@ -250,7 +250,7 @@
 		if (!cpu_online(i))
 			continue;
 
-		sprintf(name, CPUSTR, i);
+		snprintf(name, sizeof(name), CPUSTR, i);
 		cpu_dir = proc_mkdir(name, salinfo_dir);
 		if (!cpu_dir)
 			continue;




* Re: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
                   ` (12 preceding siblings ...)
  2003-05-29  1:34 ` Bjorn Helgaas
@ 2003-05-29  1:37 ` Keith Owens
  2003-05-29 20:49 ` Luck, Tony
                   ` (7 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Keith Owens @ 2003-05-29  1:37 UTC (permalink / raw)
  To: linux-ia64

On Wed, 28 May 2003 19:34:01 -0600, 
Bjorn Helgaas <bjorn_helgaas@hp.com> wrote:
>On Wednesday 28 May 2003 6:07 pm, Keith Owens wrote:
>> >+#define CPUSTR "cpu%d"
>> >+	char name[sizeof(CPUSTR)];
>> 
>> Too small.  name[] will overflow at cpu 100.
>
>Thanks.  I remember thinking about that when I copied the code,
>but forgot to fix it.  How about something like this:
>+	char cpustr[sizeof(CPUSTR) + 2];

That will do until we go past cpu 9,999.  "10,000 cpus should be enough
for anybody" :)




* RE: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
                   ` (13 preceding siblings ...)
  2003-05-29  1:37 ` Keith Owens
@ 2003-05-29 20:49 ` Luck, Tony
  2003-05-29 21:31 ` Bjorn Helgaas
                   ` (6 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Luck, Tony @ 2003-05-29 20:49 UTC (permalink / raw)
  To: linux-ia64

Digging back in this thread to last Thursday ...

> > 2) I crashed my machine with an injected machine check, and
> > then rebooted.  All four of the /proc/sal/cpuX/mca files had
> > a copy of the same error record.  Echoing "clear" to one of
> > them made them all go away.
> 
> Hmm...  this sounds like a reflection of the underlying firmware
> behavior.  I tried this on a 2-way HP box, and the cpu0/mca
> file was different than cpu1/mca, and clearing one did not
> clear the other.
> 
> > I think this is normal ... but it may require some interesting
> > documentation to say why things work like this.
> 
> Why do you think that's normal?  It sounds pretty strange
> to me.

I asked a SAL expert here who said:

 "The SAL spec does not require that the SAL_GET_STATE_INFO API
  be called on the processor where the error was detected (for
  recoverable and fatal errors).  So in this case, the SAL has
  logged it to flash before handing off to the OS.  When the OS
  calls SAL_GET_STATE_INFO, it just retrieves the last error in
  the queue from the flash image.  The processor section of the
  error record has a field for the processor LID --- so you can
  check if the right processor observed the error."


What error did you inject in the case that you describe above
where you saw different independent records in cpu0/mca and
cpu1/mca?

-Tony



* Re: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
                   ` (14 preceding siblings ...)
  2003-05-29 20:49 ` Luck, Tony
@ 2003-05-29 21:31 ` Bjorn Helgaas
  2003-05-29 21:47 ` Luck, Tony
                   ` (5 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Bjorn Helgaas @ 2003-05-29 21:31 UTC (permalink / raw)
  To: linux-ia64

On Thursday 29 May 2003 2:49 pm, Luck, Tony wrote:
> Digging back in this thread to last Thursday ...
> 
> > > 2) I crashed my machine with an injected machine check, and
> > > then rebooted.  All four of the /proc/sal/cpuX/mca files had
> > > a copy of the same error record.  Echoing "clear" to one of
> > > them made them all go away.
> > 
> > > I think this is normal ... but it may require some interesting
> > > documentation to say why things work like this.
> > 
> > Why do you think that's normal?  It sounds pretty strange
> > to me.
> 
> I asked a SAL expert here who said:
> 
>  "The SAL spec does not require that the SAL_GET_STATE_INFO API
>   be called on the processor where the error was detected (for
>   recoverable and fatal errors).  So in this case, the SAL has
>   logged it to flash before handing off to the OS.  When the OS
>   calls SAL_GET_STATE_INFO, it just retrieves the last error in
>   the queue from the flash image.  The processor section of the
>   error record has a field for the processor LID --- so you can
>   check if the right processor observed the error."

The SAL spec says

  In an MP environment, processor record information pertains to the
  processor on which this call is executed and the platform information
  pertains to the platform.

I interpret this to mean that a GET_STATE_INFO call can return
platform information no matter which CPU makes the call, but that
processor information can only be returned on the processor that
took the error.

So if you injected a platform MCA that created no processor
error sections, it makes sense to me that you'd see the same
thing in each file, and that clearing one would clear them all.

The salinfo code only sets the "event_ready" flag for the CPU
that calls salinfo_log_wakeup(), so assuming that only one CPU
calls ia64_mca_ucmc_handler(), the user's poll(2) will indicate
only one file ready to read.  The daemon would read that file
and clear it.  So it would see only one error record, which is
probably what everybody expects.

> What error did you inject in the case that you describe above
> where you saw different independent records in cpu0/mca and
> cpu1/mca?

I just did my usual "dd if=/dev/mem of=/dev/null".  This MCAs
when we walk into a memory hole, but the MCA is detected by
the processor, not the platform.  So I'm guessing what I see is
that one CPU returns both platform and processor sections,
and the other returns only platform sections.  It's not clear
to me why the other CPU has a platform error section, or
how it should work to clear these.

Bjorn



* RE: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
                   ` (15 preceding siblings ...)
  2003-05-29 21:31 ` Bjorn Helgaas
@ 2003-05-29 21:47 ` Luck, Tony
  2003-05-29 22:38 ` Bjorn Helgaas
                   ` (4 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Luck, Tony @ 2003-05-29 21:47 UTC (permalink / raw)
  To: linux-ia64

So some SAL records are platform related, and some
are tied to a specific CPU.  And different SAL
implementations might fuzz the issue further by
making cpu dependent records visible at the platform
level.

Stepping back from this (a few miles) for a broader
view.  What benefit do we gain at the application
level by making all the mca/init/cmci/cpei files
visible on a per-cpu basis?

For platform level errors, this just causes confusion
as the same record is definitely available on all cpus.
But if your application is "poll"ing all the files, only
one needs to read&clear.

For cpu-level errors (assuming that a SAL implementation
keeps them in separate queues) ... I'm not certain why
the application needs to be able to read from a cpu-specific
file.  If all the error records were funneled into a
single file, would we lose anything?

-Tony



* Re: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
                   ` (16 preceding siblings ...)
  2003-05-29 21:47 ` Luck, Tony
@ 2003-05-29 22:38 ` Bjorn Helgaas
  2003-05-29 23:33 ` Luck, Tony
                   ` (3 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Bjorn Helgaas @ 2003-05-29 22:38 UTC (permalink / raw)
  To: linux-ia64

On Thursday 29 May 2003 3:47 pm, Luck, Tony wrote:
> ... What benefit do we gain at the application
> level by making all the mca/init/cmci/cpei files
> visible on a per-cpu basis?

I really like the idea of having a file be an exact binary
image of the buffer from SAL, i.e., no extra headers, etc.

> For platform level errors, this just causes confusion
> as the same record is definitely available on all cpus.
> But if your application is "poll"ing all the files, only
> one needs to read&clear.

If the application is using poll(2), it will only see the
record available on one of the files.  If the application
does its own periodic polling *and* it reads all the
files before clearing any of them, it will see several
copies.

> ...  If all the error records were funneled into a
> single file, would we lose anything?

There is a certain appeal to using a single file, at least from
the application perspective.  Let's run this up the flagpole
and see whether anybody salutes:

	- we export two files: "control" and "data"
	- app uses poll(2) on "control"
	- SAL log events set a bit for CPU and event type
	  and do a wakeup
	- app returns from poll()
	- app reads "control"
	- kernel supplies "cpu 5 cpe" as read(2) data
	- app writes same data ("cpu 5 cpe") to "control"
	- app reads "data"
	- kernel calls GET_STATE_INFO and supplies
	  raw data to app
	- app writes "clear cpu 5 cpe" to "control"
	- kernel clears CPU/event bit, calls CLEAR_STATE_INFO,
	  and calls GET_STATE_INFO, does wakeup if more data

Is that too ugly for words?  It keeps the unadorned SAL data,
requires only two files, and could probably even be driven from
a shell script (if we make read(2) on "control" blocking).  It
feels sort of Plan 9-ish, which is always appealing.  Plus, it
avoids the problem of having hundreds of "cpuXXXX" directories
on all those monster SGI boxes :-)
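
A rough user-space sketch of that flow, just to make it concrete
(the /proc/sal paths, the message strings, and the buffer size are
placeholders, and error handling is omitted):

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <fcntl.h>
#include <poll.h>

static char record[64 * 1024];		/* placeholder size */

int main(void)
{
	char msg[64], clear[80];
	int cfd, dfd;
	ssize_t n;

	cfd = open("/proc/sal/control", O_RDWR);
	if (cfd < 0)
		return 1;

	for (;;) {
		struct pollfd pfd = { .fd = cfd, .events = POLLIN };

		poll(&pfd, 1, -1);			/* wait for a SAL log event */
		n = read(cfd, msg, sizeof(msg) - 1);	/* e.g. "cpu 5 cpe" */
		if (n <= 0)
			continue;
		msg[n] = '\0';

		write(cfd, msg, n);			/* select that record */

		dfd = open("/proc/sal/data", O_RDONLY);
		n = read(dfd, record, sizeof(record));	/* raw GET_STATE_INFO data */
		close(dfd);
		/* ... save the n bytes somewhere safe before clearing ... */

		snprintf(clear, sizeof(clear), "clear %s", msg);
		write(cfd, clear, strlen(clear));	/* kernel does CLEAR_STATE_INFO */
	}
}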

There might be fairness issues if events occur faster than the
app reads them -- might have to round-robin through the
CPUs when supplying "control" data.  Or we could use a pair
of files for each type of event, i.e., /proc/sal/mca/{control,data}.

Bjorn



^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
                   ` (17 preceding siblings ...)
  2003-05-29 22:38 ` Bjorn Helgaas
@ 2003-05-29 23:33 ` Luck, Tony
  2003-05-30 11:56 ` Matthew Wilcox
                   ` (2 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Luck, Tony @ 2003-05-29 23:33 UTC (permalink / raw)
  To: linux-ia64

> There is a certain appeal to using a single file, at least from
> the application perspective.  Let's run this up the flagpole
> and see whether anybody salutes:
> 
> 	- we export two files: "control" and "data"
> 	- app uses poll(2) on "control"
> 	- SAL log events set a bit for CPU and event type
> 	  and do a wakeup
> 	- app returns from poll()
> 	- app reads "control"
> 	- kernel supplies "cpu 5 cpe" as read(2) data
> 	- app writes same data ("cpu 5 cpe") to "control"
> 	- app reads "data"
> 	- kernel calls GET_STATE_INFO and supplies
> 	  raw data to app
> 	- app writes "clear cpu 5 cpe" to "control"
> 	- kernel clears CPU/event bit, calls CLEAR_STATE_INFO,
> 	  and calls GET_STATE_INFO, does wakeup if more data
> 
> Is that too ugly for words?  It keeps the unadorned SAL data,
> requires only two files, and could probably even be driven from
> a shell script (if we make read(2) on "control" blocking).  It
> feels sort of Plan 9-ish, which is always appealing.  Plus, it
> avoids the problem of having hundreds of "cpuXXXX" directories
> on all those monster SGI boxes :-)
> 
> There might be fairness issues if events occur faster than the
> app reads them -- might have to round-robin through the
> CPUs when supplying "control" data.  Or we could use a pair
> of files for each type of event, i.e., /proc/sal/mca/{control,data}.

Seems odd that we read "cpu 5 cpe" from the "control" file, and then
turn around to write the same thing back to the same file.

Perhaps you are overloading the function of the control file?  With
three files, {event,control,data}, you get all the above functionality,
but with an interface that is simpler to understand.

Applications block reading the "event" file (or use poll(2) on it) to
find out when there is something worth reading.  When they read, they
see the "cpu 5 cpe" type message.  They write that to "control", and
then read from "data".

Or, back to two files: {event,data} ... we read from "event" to see
what happened.  Writing to "data" controls what will be read from "data".
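
For concreteness, one pass through the three-file version might look
something like this from user space (the file names and the "cpu 5 cpe"
string are only illustrative):

#include <sys/types.h>
#include <unistd.h>
#include <fcntl.h>

static char record[64 * 1024];		/* placeholder size */

int main(void)
{
	char msg[64];
	ssize_t n;
	int efd = open("/proc/sal/event", O_RDONLY);
	int cfd = open("/proc/sal/control", O_WRONLY);
	int dfd = open("/proc/sal/data", O_RDONLY);

	if (efd < 0 || cfd < 0 || dfd < 0)
		return 1;

	n = read(efd, msg, sizeof(msg));	/* blocks until e.g. "cpu 5 cpe" */
	if (n > 0) {
		write(cfd, msg, n);		/* tell the kernel which record we want */
		read(dfd, record, sizeof(record));	/* raw SAL data for that record */
	}
	return 0;
}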

-Tony



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
                   ` (18 preceding siblings ...)
  2003-05-29 23:33 ` Luck, Tony
@ 2003-05-30 11:56 ` Matthew Wilcox
  2003-05-30 20:27 ` Bjorn Helgaas
  2003-05-30 20:31 ` Bjorn Helgaas
  21 siblings, 0 replies; 23+ messages in thread
From: Matthew Wilcox @ 2003-05-30 11:56 UTC (permalink / raw)
  To: linux-ia64

On Thu, May 29, 2003 at 04:33:49PM -0700, Luck, Tony wrote:
> Seems odd that we read "cpu 5 cpe" from the "control" file, and then
> turn around to write the same thing back to the same file.

Agreed.

> Applications block reading the "event" file (or use poll(2) on it) to
> find out that it is worth reading.  When they read, they see the "cpu
> 5 cpe" type message.  They write that to "control", and then read from
> "data".
> 
> Or, back to two files: {event,data} ... we read from "event" to see
> what happened.  Writing to "data" controls what will be read from "data".

I like this alternative better.  But there's a bit of a race condition if
two programs are attempting to use these files at the same time.

-- 
"It's not Hollywood.  War is real, war is primarily not about defeat or
victory, it is about death.  I've seen thousands and thousands of dead bodies.
Do you think I want to have an academic debate on this subject?" -- Robert Fisk


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
                   ` (19 preceding siblings ...)
  2003-05-30 11:56 ` Matthew Wilcox
@ 2003-05-30 20:27 ` Bjorn Helgaas
  2003-05-30 20:31 ` Bjorn Helgaas
  21 siblings, 0 replies; 23+ messages in thread
From: Bjorn Helgaas @ 2003-05-30 20:27 UTC (permalink / raw)
  To: linux-ia64

On Friday 30 May 2003 5:56 am, Matthew Wilcox wrote:
> > Or, back to two files: {event,data} ... we read from "event" to see
> > what happened.  Writing to "data" controls what will be read from "data".
> 
> I like this alternative better.  But there's a bit of a race condition if
> two programs are attempting to use these files at the same time.

OK, I coded this up.  Actually, I made two files per event type,
so we have

	/proc/sal/mca/event
	/proc/sal/mca/data
	/proc/sal/init/event
	... etc

The "event" files are read-only and reads block if no error records
are available.  When records become available, the data returned
looks like "read 0".  If several CPUs have records available, reads
round-robin through them.

The "data" files are read/write and single-open, so a smart app can
keep it open to prevent races.  If you write the data you got from
an "event" file (i.e., "read 0"), subsequent reads from "data" return
error records from the indicated CPU.  You can also write "clear 0"
to clear the error record.

So a simple-minded loop looks like this:

	EVENT=/proc/sal/cpe/event
	DATA=/proc/sal/cpe/data
	LOG=/var/log/sal/cpe/log

	while read ACTION CPU < $EVENT; do
	    echo $ACTION $CPU > $DATA
	    cat  $DATA > $LOG
	    echo clear $CPU > $DATA
	    decode $LOG
	done

Of course a better logger would keep the data file open to
prevent races, generate new log files for each event, etc.
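
A rough sketch of what such a logger might look like (the log-file
naming, buffer size, and most error handling are hand-waved):

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <fcntl.h>

static char record[128 * 1024];		/* placeholder size */

int main(void)
{
	char msg[64], path[128];
	int efd, dfd, lfd, cpu, seq = 0;
	ssize_t n, len;

	efd = open("/proc/sal/cpe/event", O_RDONLY);
	dfd = open("/proc/sal/cpe/data", O_RDWR);	/* held open: single-open locks out others */
	if (efd < 0 || dfd < 0)
		return 1;

	while ((n = read(efd, msg, sizeof(msg) - 1)) > 0) {
		msg[n] = '\0';				/* e.g. "read 5" */
		write(dfd, msg, n);			/* select that CPU's record */

		len = read(dfd, record, sizeof(record));
		if (len <= 0)
			continue;

		snprintf(path, sizeof(path), "/var/log/sal/cpe/%d.raw", seq++);
		lfd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0600);
		if (lfd >= 0) {
			write(lfd, record, len);	/* record is on disk ... */
			close(lfd);
			if (sscanf(msg, "read %d", &cpu) == 1) {
				snprintf(msg, sizeof(msg), "clear %d", cpu);
				write(dfd, msg, strlen(msg));	/* ... so it's safe to clear */
			}
		}
		lseek(dfd, 0, SEEK_SET);		/* rewind for the next record */
	}
	return 0;
}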

I didn't implement poll(2), but we could if it seems necessary.

The basic logging/decoding package is here:

    ftp://kernel.org/pub/linux/kernel/people/helgaas/salinfo-0.1.tar.gz

Bjorn


===== arch/ia64/kernel/mca.c 1.26 vs edited =====
--- 1.26/arch/ia64/kernel/mca.c	Fri May 16 04:33:42 2003
+++ edited/arch/ia64/kernel/mca.c	Thu May 29 19:24:29 2003
@@ -130,12 +130,14 @@
  */
 static int cmc_polling_enabled = 1;
 
+extern void salinfo_log_wakeup(int);
+
 /*
  *  ia64_mca_log_sal_error_record
  *
- *  This function retrieves a specified error record type from SAL, sends it to
- *  the system log, and notifies SALs to clear the record from its non-volatile
- *  memory.
+ *  This function retrieves a specified error record type from SAL,
+ *  wakes up any processes waiting for error records, and sends it to
+ *  the system log.
  *
  *  Inputs  :   sal_info_type   (Type of error record MCA/CMC/CPE/INIT)
  *  Outputs :   platform error status
@@ -155,11 +157,8 @@
 	 * 3. set ia64_os_mca_recovery_successful flag, if applicable
 	 */
 
+	salinfo_log_wakeup(sal_info_type);
 	platform_err = ia64_log_print(sal_info_type, (prfunc_t)printk);
-	/* temporary: only clear SAL logs on hardware-corrected errors
-		or if we're logging an error after an MCA-initiated reboot */
-	if ((sal_info_type > 1) || (called_from_init))
-		ia64_sal_clear_state_info(sal_info_type);
 
 	return platform_err;
 }
@@ -369,9 +368,7 @@
  *  ia64_mca_check_errors
  *
  *  External entry to check for error records which may have been posted by SAL
- *  for a prior failure which resulted in a machine shutdown before an the
- *  error could be logged.  This function must be called after the filesystem
- *  is initialized.
+ *  for a prior failure.
  *
  *  Inputs  :   None
  *
@@ -383,6 +380,7 @@
 	/*
 	 *  If there is an MCA error record pending, get it and log it.
 	 */
+	printk("CPU %d: checking for saved MCA error records\n", smp_processor_id());
 	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA, 1);
 
 	return 0;
@@ -1250,9 +1248,6 @@
 
 	ia64_process_min_state_save(&SAL_LPI_PSI_INFO(proc_ptr)->min_state_area);
 
-	/* Clear the INIT SAL logs now that they have been saved in the OS buffer */
-	ia64_sal_clear_state_info(SAL_INFO_TYPE_INIT);
-
 	init_handler_platform(proc_ptr, pt, sw);	/* call platform specific routines */
 }
 
@@ -2302,10 +2297,8 @@
 
 	switch(sal_info_type) {
 	      case SAL_INFO_TYPE_MCA:
-		prfunc("+BEGIN HARDWARE ERROR STATE AT MCA\n");
-		platform_err = ia64_log_platform_info_print(IA64_LOG_CURR_BUFFER(sal_info_type),
-							    prfunc);
-		prfunc("+END HARDWARE ERROR STATE AT MCA\n");
+		prfunc("+CPU %d: SAL log contains MCA error record\n", smp_processor_id());
+		ia64_log_rec_header_print(IA64_LOG_CURR_BUFFER(sal_info_type), prfunc);
 		break;
 	      case SAL_INFO_TYPE_INIT:
 		prfunc("+MCA INIT ERROR LOG (UNIMPLEMENTED)\n");
===== arch/ia64/kernel/salinfo.c 1.1 vs edited =====
--- 1.1/arch/ia64/kernel/salinfo.c	Thu Sep 12 10:43:47 2002
+++ edited/arch/ia64/kernel/salinfo.c	Fri May 30 12:46:28 2003
@@ -4,6 +4,8 @@
  * Creates entries in /proc/sal for various system features.
  *
  * Copyright (c) 2001 Silicon Graphics, Inc.  All rights reserved.
+ * Copyright (c) 2003 Hewlett-Packard Co
+ *	Bjorn Helgaas <bjorn.helgaas@hp.com>
  *
  * 10/30/2001	jbarnes@sgi.com		copied much of Stephane's palinfo
  *					code to create this file
@@ -12,8 +14,10 @@
 #include <linux/types.h>
 #include <linux/proc_fs.h>
 #include <linux/module.h>
+#include <linux/smp.h>
 
 #include <asm/sal.h>
+#include <asm/uaccess.h>
 
 MODULE_AUTHOR("Jesse Barnes <jbarnes@sgi.com>");
 MODULE_DESCRIPTION("/proc interface to IA-64 SAL features");
@@ -40,25 +44,326 @@
 
 #define NR_SALINFO_ENTRIES (sizeof(salinfo_entries)/sizeof(salinfo_entry_t))
 
-/*
- * One for each feature and one more for the directory entry...
- */
-static struct proc_dir_entry *salinfo_proc_entries[NR_SALINFO_ENTRIES + 1];
+static char *salinfo_log_name[] = {
+	"mca",
+	"init",
+	"cmc",
+	"cpe",
+};
+
+static struct proc_dir_entry *salinfo_proc_entries[
+	ARRAY_SIZE(salinfo_entries) +			/* /proc/sal/bus_lock */
+	ARRAY_SIZE(salinfo_log_name) +			/* /proc/sal/{mca,...} */
+	(2 * ARRAY_SIZE(salinfo_log_name)) +		/* /proc/sal/mca/{event,data} */
+	1];						/* /proc/sal */
+
+struct salinfo_log_data {
+	int	type;
+	u8	*log_buffer;
+	u64	log_size;
+};
+
+struct salinfo_event {
+	int			type;
+	int			cpu;		/* next CPU to check */
+	volatile unsigned long	cpu_mask;
+	wait_queue_head_t	queue;
+};
+
+static struct salinfo_event *salinfo_event[ARRAY_SIZE(salinfo_log_name)];
+
+struct salinfo_data {
+	int	open;		/* single-open to prevent races */
+	int	type;
+	int	cpu;		/* "current" cpu for reads */
+};
+
+static struct salinfo_data salinfo_data[ARRAY_SIZE(salinfo_log_name)];
+
+static spinlock_t data_lock;
+
+void
+salinfo_log_wakeup(int type)
+{
+	if (type < ARRAY_SIZE(salinfo_log_name)) {
+		struct salinfo_event *event = salinfo_event[type];
+
+		if (event) {
+			set_bit(smp_processor_id(), &event->cpu_mask);
+			wake_up_interruptible(&event->queue);
+		}
+	}
+}
+
+static int
+salinfo_event_open(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *entry = (struct proc_dir_entry *) inode->u.generic_ip;
+	struct salinfo_event *event = entry->data;
+
+	if (!suser())
+		return -EPERM;
+	return 0;
+}
+
+static ssize_t
+salinfo_event_read(struct file *file, char *buffer, size_t count, loff_t *ppos)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct proc_dir_entry *entry = (struct proc_dir_entry *) inode->u.generic_ip;
+	struct salinfo_event *event = entry->data;
+	char cmd[32];
+	size_t size;
+	int i, n, cpu = -1;
+
+retry:
+	if (!event->cpu_mask) {
+		if (file->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+		interruptible_sleep_on(&event->queue);
+		if (signal_pending(current))
+			return -EINTR;
+	}
+
+	n = event->cpu;
+	for (i = 0; i < NR_CPUS; i++) {
+		if (event->cpu_mask & 1UL << n) {
+			cpu = n;
+			break;
+		}
+		if (++n == NR_CPUS)
+			n = 0;
+	}
+
+	if (cpu == -1)
+		goto retry;
+
+	/* for next read, start checking at next CPU */
+	event->cpu = cpu;
+	if (++event->cpu == NR_CPUS)
+		event->cpu = 0;
+
+	snprintf(cmd, sizeof(cmd), "read %d\n", cpu);
+
+	size = strlen(cmd);
+	if (size > count)
+		size = count;
+	if (copy_to_user(buffer, cmd, size))
+		return -EFAULT;
+
+	return size;
+}
+
+static struct file_operations salinfo_event_fops = {
+	.open  = salinfo_event_open,
+	.read  = salinfo_event_read,
+};
+
+static int
+salinfo_log_open(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *entry = (struct proc_dir_entry *) inode->u.generic_ip;
+	struct salinfo_data *data = entry->data;
+
+	if (!suser())
+		return -EPERM;
+
+	spin_lock(&data_lock);
+	if (data->open) {
+		spin_unlock(&data_lock);
+		return -EBUSY;
+	}
+	data->open = 1;
+	spin_unlock(&data_lock);
+
+	return 0;
+}
+
+static int
+salinfo_log_release(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *entry = (struct proc_dir_entry *) inode->u.generic_ip;
+	struct salinfo_data *data = entry->data;
+
+	spin_lock(&data_lock);
+	data->open = 0;
+	spin_unlock(&data_lock);
+	return 0;
+}
+
+static void
+call_on_cpu(int cpu, void (*fn)(void *), void *arg)
+{
+	if (cpu == smp_processor_id())
+		(*fn)(arg);
+#ifdef CONFIG_SMP
+	else if (cpu_online(cpu))	/* cpu may be unvalidated */
+		smp_call_function_single(cpu, fn, arg, 0, 1);
+#endif
+}
+
+static void
+salinfo_log_read_cpu(void *context)
+{
+	struct salinfo_log_data *info = context;
+	struct salinfo_event *event = salinfo_event[info->type];
+	u64 size;
+
+	size = ia64_sal_get_state_info_size(info->type);
+	info->log_buffer = kmalloc(size, GFP_ATOMIC);
+	if (!info->log_buffer)
+		return;
+
+	clear_bit(smp_processor_id(), &event->cpu_mask);
+	info->log_size = ia64_sal_get_state_info(info->type, (u64 *) info->log_buffer);
+	if (info->log_size)
+		salinfo_log_wakeup(info->type);
+}
+
+static ssize_t
+salinfo_log_read(struct file *file, char *buffer, size_t count, loff_t *ppos)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct proc_dir_entry *entry = (struct proc_dir_entry *) inode->u.generic_ip;
+	struct salinfo_data *data = entry->data;
+	struct salinfo_log_data info;
+	int ret;
+	void *saldata;
+	size_t size;
+
+	info.type = data->type;
+	call_on_cpu(data->cpu, salinfo_log_read_cpu, &info);
+	if (!info.log_buffer || *ppos >= info.log_size) {
+		ret = 0;
+		goto out;
+	}
+
+	saldata = info.log_buffer + file->f_pos;
+	size = info.log_size - file->f_pos;
+	if (size > count)
+		size = count;
+	if (copy_to_user(buffer, saldata, size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	*ppos += size;
+	ret = size;
+
+out:
+	kfree(info.log_buffer);
+	return ret;
+}
+
+static void
+salinfo_log_clear_cpu(void *context)
+{
+	struct salinfo_data *data = context;
+	struct salinfo_event *event = salinfo_event[data->type];
+	struct salinfo_log_data info;
+
+	clear_bit(smp_processor_id(), &event->cpu_mask);
+	ia64_sal_clear_state_info(data->type);
+
+	/* clearing one record may make another visible */
+	info.type = data->type;
+	salinfo_log_read_cpu(&info);
+	if (info.log_buffer && info.log_size)
+		salinfo_log_wakeup(data->type);
+
+	kfree(info.log_buffer);
+}
+
+static ssize_t
+salinfo_log_write(struct file *file, const char *buffer, size_t count, loff_t *ppos)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct proc_dir_entry *entry = (struct proc_dir_entry *) inode->u.generic_ip;
+	struct salinfo_data *data = entry->data;
+	char cmd[32];
+	size_t size;
+	int cpu;
+
+	size = sizeof(cmd);
+	if (count < size)
+		size = count;
+	if (copy_from_user(cmd, buffer, size))
+		return -EFAULT;
+
+	if (sscanf(cmd, "read %d", &cpu) == 1)
+		data->cpu = cpu;
+	else if (sscanf(cmd, "clear %d", &cpu) == 1)
+		call_on_cpu(cpu, salinfo_log_clear_cpu, data);
+	else if (sscanf(cmd, "wake %d", &cpu) == 1)	// FIXME for debug
+		call_on_cpu(cpu, salinfo_log_wakeup, data->type);
+
+	return count;
+}
+
+static struct file_operations salinfo_data_fops = {
+	.open    = salinfo_log_open,
+	.release = salinfo_log_release,
+	.read    = salinfo_log_read,
+	.write   = salinfo_log_write,
+};
 
 static int __init
 salinfo_init(void)
 {
 	struct proc_dir_entry *salinfo_dir; /* /proc/sal dir entry */
 	struct proc_dir_entry **sdir = salinfo_proc_entries; /* keeps track of every entry */
-	int i;
+	struct proc_dir_entry *dir, *entry;
+	struct salinfo_event *event;
+	struct salinfo_data *data;
+	int i, j;
 
 	salinfo_dir = proc_mkdir("sal", NULL);
+	if (!salinfo_dir)
+		return 0;
 
 	for (i=0; i < NR_SALINFO_ENTRIES; i++) {
 		/* pass the feature bit in question as misc data */
 		*sdir++ = create_proc_read_entry (salinfo_entries[i].name, 0, salinfo_dir,
 						  salinfo_read, (void *)salinfo_entries[i].feature);
 	}
+
+	for (i = 0; i < ARRAY_SIZE(salinfo_log_name); i++) {
+		dir = proc_mkdir(salinfo_log_name[i], salinfo_dir);
+		if (!dir)
+			continue;
+
+		entry = create_proc_entry("event", S_IRUSR, dir);
+		if (!entry)
+			continue;
+
+		event = kmalloc(sizeof(*event), GFP_KERNEL);
+		if (!event)
+			continue;
+		memset(event, 0, sizeof(*event));
+		event->type = i;
+		init_waitqueue_head(&event->queue);
+		salinfo_event[i] = event;
+		/* we missed any events before now */
+		for (j = 0; j < NR_CPUS; j++)
+			if (cpu_online(j))
+				set_bit(j, &event->cpu_mask);
+		entry->data = event;
+		entry->proc_fops = &salinfo_event_fops;
+		*sdir++ = entry;
+
+		entry = create_proc_entry("data", S_IRUSR | S_IWUSR, dir);
+		if (!entry)
+			continue;
+
+		data = &salinfo_data[i];
+		data->type = i;
+		entry->data = data;
+		entry->proc_fops = &salinfo_data_fops;
+		*sdir++ = entry;
+
+		*sdir++ = dir;
+	}
+
 	*sdir++ = salinfo_dir;
 
 	return 0;
@@ -69,7 +374,7 @@
 {
 	int i = 0;
 
-	for (i = 0; i < NR_SALINFO_ENTRIES ; i++) {
+	for (i = 0; i < ARRAY_SIZE(salinfo_proc_entries); i++) {
 		if (salinfo_proc_entries[i])
 			remove_proc_entry (salinfo_proc_entries[i]->name, NULL);
 	}



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Linux-ia64] SAL error record logging/decoding
  2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
                   ` (20 preceding siblings ...)
  2003-05-30 20:27 ` Bjorn Helgaas
@ 2003-05-30 20:31 ` Bjorn Helgaas
  21 siblings, 0 replies; 23+ messages in thread
From: Bjorn Helgaas @ 2003-05-30 20:31 UTC (permalink / raw)
  To: linux-ia64

> The basic logging/decoding package is here:
> 
>     ftp://kernel.org/pub/linux/kernel/people/helgaas/salinfo-0.1.tar.gz

Oops, I mean here:

    ftp://kernel.org/pub/linux/kernel/people/helgaas/salinfo-0.2.tar.gz

0.1 was for the previous round (/proc/sal/cpu0/mca, etc).


^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2003-05-30 20:31 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-05-07 23:41 [Linux-ia64] SAL error record logging/decoding Bjorn Helgaas
2003-05-08  0:05 ` David Mosberger
2003-05-08  0:13 ` Luck, Tony
2003-05-08 19:32 ` Bjorn Helgaas
2003-05-20 22:58 ` Bjorn Helgaas
2003-05-21 18:06 ` Luck, Tony
2003-05-21 20:48 ` Luck, Tony
2003-05-21 21:51 ` Luck, Tony
2003-05-22 21:29 ` Bjorn Helgaas
2003-05-23  0:24 ` Bjorn Helgaas
2003-05-23 15:42 ` Luck, Tony
2003-05-28 23:26 ` Bjorn Helgaas
2003-05-29  0:07 ` Keith Owens
2003-05-29  1:34 ` Bjorn Helgaas
2003-05-29  1:37 ` Keith Owens
2003-05-29 20:49 ` Luck, Tony
2003-05-29 21:31 ` Bjorn Helgaas
2003-05-29 21:47 ` Luck, Tony
2003-05-29 22:38 ` Bjorn Helgaas
2003-05-29 23:33 ` Luck, Tony
2003-05-30 11:56 ` Matthew Wilcox
2003-05-30 20:27 ` Bjorn Helgaas
2003-05-30 20:31 ` Bjorn Helgaas
