linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] PPC64: EEH Recovery
@ 2005-01-06 19:24 Linas Vepstas
  2005-01-17 20:14 ` Linas Vepstas
  0 siblings, 1 reply; 8+ messages in thread
From: Linas Vepstas @ 2005-01-06 19:24 UTC (permalink / raw)
  To: paulus, anton, akpm; +Cc: linuxppc64-dev, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 406 bytes --]


Hi Paul,

The patch below implements hotplug style EEH error recovery. 
It's split into two pieces: a part that needs to be applied to the
PPC64 arch tree, and a part that needs to be applied to the 
RPA PHP hotplug tree. The PPC64 part needs to go in first.

Assuming this doesn't generate a round of discussion, please
forward upstream to akpm/torvalds.

Signed-off-by: Linas Vepstas <linas@linas.org>



[-- Attachment #2: eeh-recovery-bk-ppc64-3.patch --]
[-- Type: text/plain, Size: 19418 bytes --]

===== arch/ppc64/kernel/eeh.c 1.41 vs edited =====
--- 1.41/arch/ppc64/kernel/eeh.c	2005-01-06 13:05:42 -06:00
+++ edited/arch/ppc64/kernel/eeh.c	2005-01-06 13:08:03 -06:00
@@ -17,21 +17,19 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  */
 
-#include <linux/bootmem.h>
+#include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/list.h>
-#include <linux/mm.h>
 #include <linux/notifier.h>
 #include <linux/pci.h>
 #include <linux/proc_fs.h>
 #include <linux/rbtree.h>
 #include <linux/seq_file.h>
-#include <linux/spinlock.h>
+#include <asm/atomic.h>
 #include <asm/eeh.h>
 #include <asm/io.h>
 #include <asm/machdep.h>
 #include <asm/rtas.h>
-#include <asm/atomic.h>
 #include "pci.h"
 
 #undef DEBUG
@@ -89,7 +87,6 @@ static struct notifier_block *eeh_notifi
  * attempts we allow before panicking.
  */
 #define EEH_MAX_FAILS	1000
-static atomic_t eeh_fail_count;
 
 /* RTAS tokens */
 static int ibm_set_eeh_option;
@@ -106,6 +103,10 @@ static spinlock_t slot_errbuf_lock = SPI
 static int eeh_error_buf_size;
 
 /* System monitoring statistics */
+static DEFINE_PER_CPU(unsigned long, no_device);
+static DEFINE_PER_CPU(unsigned long, no_dn);
+static DEFINE_PER_CPU(unsigned long, no_cfg_addr);
+static DEFINE_PER_CPU(unsigned long, ignored_check);
 static DEFINE_PER_CPU(unsigned long, total_mmio_ffs);
 static DEFINE_PER_CPU(unsigned long, false_positives);
 static DEFINE_PER_CPU(unsigned long, ignored_failures);
@@ -224,9 +225,9 @@ pci_addr_cache_insert(struct pci_dev *de
 	while (*p) {
 		parent = *p;
 		piar = rb_entry(parent, struct pci_io_addr_range, rb_node);
-		if (alo < piar->addr_lo) {
+		if (ahi < piar->addr_lo) {
 			p = &parent->rb_left;
-		} else if (ahi > piar->addr_hi) {
+		} else if (alo > piar->addr_hi) {
 			p = &parent->rb_right;
 		} else {
 			if (dev != piar->pcidev ||
@@ -244,6 +245,11 @@ pci_addr_cache_insert(struct pci_dev *de
 	piar->addr_hi = ahi;
 	piar->pcidev = dev;
 	piar->flags = flags;
+	
+#ifdef DEBUG 
+	printk (KERN_DEBUG "PIAR: insert range=[%lx:%lx] dev=%s\n", 
+	               alo, ahi, pci_name (dev));
+#endif
 
 	rb_link_node(&piar->rb_node, parent, p);
 	rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
@@ -368,6 +374,7 @@ void pci_addr_cache_remove_device(struct
  */
 void __init pci_addr_cache_build(void)
 {
+	struct device_node *dn;
 	struct pci_dev *dev = NULL;
 
 	spin_lock_init(&pci_io_addr_cache_root.piar_lock);
@@ -378,6 +385,14 @@ void __init pci_addr_cache_build(void)
 			continue;
 		}
 		pci_addr_cache_insert_device(dev);
+		
+		/* Save the BAR's; firmware doesn't restore these after EEH reset */
+		dn = pci_device_to_OF_node(dev);
+		if (dn) {
+			int i;
+			for (i = 0; i < 16; i++) 
+				pci_read_config_dword(dev, i * 4, &dn->config_space[i]);
+		}
 	}
 
 #ifdef DEBUG
@@ -389,6 +404,32 @@ void __init pci_addr_cache_build(void)
 /* --------------------------------------------------------------- */
 /* Above lies the PCI Address Cache. Below lies the EEH event infrastructure */
 
+void eeh_slot_error_detail (struct device_node *dn, int severity)
+{
+	unsigned long flags;
+	int rc;
+
+	if (!dn) return;
+
+	/* Log the error with the rtas logger */
+	spin_lock_irqsave(&slot_errbuf_lock, flags);
+	memset(slot_errbuf, 0, eeh_error_buf_size);
+
+	rc = rtas_call(ibm_slot_error_detail,
+	               8, 1, NULL, dn->eeh_config_addr,
+	               BUID_HI(dn->phb->buid),
+	               BUID_LO(dn->phb->buid), NULL, 0,
+	               virt_to_phys(slot_errbuf),
+	               eeh_error_buf_size,
+	               severity);
+
+	if (rc == 0)
+		log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
+	spin_unlock_irqrestore(&slot_errbuf_lock, flags);
+}
+
+EXPORT_SYMBOL(eeh_slot_error_detail);
+
 /**
  * eeh_register_notifier - Register to find out about EEH events.
  * @nb: notifier block to callback on events
@@ -484,11 +525,9 @@ static void eeh_event_handler(void *dumm
 		       "%s %s\n", event->reset_state,
 		       pci_name(event->dev), pci_pretty_name(event->dev));
 
-		atomic_set(&eeh_fail_count, 0);
-		notifier_call_chain (&eeh_notifier_chain,
-				     EEH_NOTIFY_FREEZE, event);
-
 		__get_cpu_var(slot_resets)++;
+		notifier_call_chain (&eeh_notifier_chain,
+		           EEH_NOTIFY_FREEZE, event);
 
 		pci_dev_put(event->dev);
 		kfree(event);
@@ -496,8 +535,8 @@ static void eeh_event_handler(void *dumm
 }
 
 /**
- * eeh_token_to_phys - convert EEH address token to phys address
- * @token i/o token, should be address in the form 0xE....
+ * eeh_token_to_phys - convert I/O address to phys address
+ * @token i/o address, should be address in the form 0xA....
  */
 static inline unsigned long eeh_token_to_phys(unsigned long token)
 {
@@ -512,6 +551,17 @@ static inline unsigned long eeh_token_to
 	return pa | (token & (PAGE_SIZE-1));
 }
 
+static inline struct pci_dev * eeh_get_pci_dev(struct device_node *dn)
+{
+	struct pci_dev *dev = NULL;
+
+	for_each_pci_dev(dev) {
+		if (pci_device_to_OF_node(dev) == dn)
+			return dev;
+	}
+	return NULL;
+}
+
 /**
  * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze
  * @dn device node
@@ -532,7 +582,7 @@ int eeh_dn_check_failure(struct device_n
 	int ret;
 	int rets[3];
 	unsigned long flags;
-	int rc, reset_state;
+	int reset_state;
 	struct eeh_event  *event;
 
 	__get_cpu_var(total_mmio_ffs)++;
@@ -540,16 +590,20 @@ int eeh_dn_check_failure(struct device_n
 	if (!eeh_subsystem_enabled)
 		return 0;
 
-	if (!dn)
+	if (!dn) {
+		__get_cpu_var(no_dn)++;
 		return 0;
+	}
 
 	/* Access to IO BARs might get this far and still not want checking. */
 	if (!(dn->eeh_mode & EEH_MODE_SUPPORTED) ||
 	    dn->eeh_mode & EEH_MODE_NOCHECK) {
+		__get_cpu_var(ignored_check)++;
 		return 0;
 	}
 
 	if (!dn->eeh_config_addr) {
+		__get_cpu_var(no_cfg_addr)++;
 		return 0;
 	}
 
@@ -558,8 +612,9 @@ int eeh_dn_check_failure(struct device_n
 	 * slot, we know it's bad already, we don't need to check...
 	 */
 	if (dn->eeh_mode & EEH_MODE_ISOLATED) {
-		atomic_inc(&eeh_fail_count);
-		if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) {
+		dn->eeh_freeze_count ++;
+		if (dn->eeh_freeze_count >= EEH_MAX_FAILS) {
+			dump_stack();
 			/* re-read the slot reset state */
 			if (read_slot_reset_state(dn, rets) != 0)
 				rets[0] = -1;	/* reset state unknown */
@@ -581,34 +636,25 @@ int eeh_dn_check_failure(struct device_n
 		return 0;
 	}
 
-	/* prevent repeated reports of this failure */
+	/* Prevent repeated reports of this failure */
 	dn->eeh_mode |= EEH_MODE_ISOLATED;
 
 	reset_state = rets[0];
+	/* Log the error with the rtas logger */
+	if (dn->eeh_freeze_count < EEH_MAX_ALLOWED_FREEZES) {
+		eeh_slot_error_detail (dn, 1 /* Temporary Error */);
+	} else {
+		eeh_slot_error_detail (dn, 2 /* Permanent Error */);
+   }
 
-	spin_lock_irqsave(&slot_errbuf_lock, flags);
-	memset(slot_errbuf, 0, eeh_error_buf_size);
-
-	rc = rtas_call(ibm_slot_error_detail,
-	               8, 1, NULL, dn->eeh_config_addr,
-	               BUID_HI(dn->phb->buid),
-	               BUID_LO(dn->phb->buid), NULL, 0,
-	               virt_to_phys(slot_errbuf),
-	               eeh_error_buf_size,
-	               1 /* Temporary Error */);
-
-	if (rc == 0)
-		log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
-	spin_unlock_irqrestore(&slot_errbuf_lock, flags);
-
-	printk(KERN_INFO "EEH: MMIO failure (%d) on device: %s %s\n",
-	       rets[0], dn->name, dn->full_name);
 	event = kmalloc(sizeof(*event), GFP_ATOMIC);
 	if (event == NULL) {
-		eeh_panic(dev, reset_state);
+		printk (KERN_ERR "EEH: out of memory, event not handled\n");
 		return 1;
  	}
 
+	if (!dev)
+		dev = eeh_get_pci_dev (dn);
 	event->dev = dev;
 	event->dn = dn;
 	event->reset_state = reset_state;
@@ -634,7 +680,6 @@ EXPORT_SYMBOL(eeh_dn_check_failure);
  * @token i/o token, should be address in the form 0xA....
  * @val value, should be all 1's (XXX why do we need this arg??)
  *
- * Check for an eeh failure at the given token address.
  * Check for an EEH failure at the given token address.  Call this
  * routine if the result of a read was all 0xff's and you want to
  * find out if this is due to an EEH slot freeze event.  This routine
@@ -642,6 +687,7 @@ EXPORT_SYMBOL(eeh_dn_check_failure);
  *
  * Note this routine is safe to call in an interrupt context.
  */
+
 unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val)
 {
 	unsigned long addr;
@@ -651,8 +697,10 @@ unsigned long eeh_check_failure(const vo
 	/* Finding the phys addr + pci device; this is pretty quick. */
 	addr = eeh_token_to_phys((unsigned long __force) token);
 	dev = pci_get_device_by_addr(addr);
-	if (!dev)
+	if (!dev) {
+		__get_cpu_var(no_device)++;
 		return val;
+	}
 
 	dn = pci_device_to_OF_node(dev);
 	eeh_dn_check_failure (dn, dev);
@@ -663,6 +711,172 @@ unsigned long eeh_check_failure(const vo
 
 EXPORT_SYMBOL(eeh_check_failure);
 
+/* ------------------------------------------------------------- */
+/* The code below deals with error recovery */
+
+void
+rtas_set_slot_reset(struct device_node *dn)
+{
+	int token = rtas_token ("ibm,set-slot-reset");
+	int rc;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return;
+	rc = rtas_call(token,4,1, NULL,
+	               dn->eeh_config_addr,
+	               BUID_HI(dn->phb->buid),
+	               BUID_LO(dn->phb->buid),
+	               1);
+	if (rc) {
+		printk (KERN_WARNING "EEH: Unable to reset the failed slot\n");
+		return;
+	}
+	
+	/* The PCI bus requires that the reset be held high for at least
+	 * a 100 milliseconds. We wait a bit longer 'just in case'.
+	 */
+   msleep (200);
+	
+	rc = rtas_call(token,4,1, NULL,
+	               dn->eeh_config_addr,
+	               BUID_HI(dn->phb->buid),
+	               BUID_LO(dn->phb->buid),
+	               0);
+}
+
+EXPORT_SYMBOL(rtas_set_slot_reset);
+
+void
+rtas_configure_bridge(struct device_node *dn)
+{
+	int token = rtas_token ("ibm,configure-bridge");
+	int rc;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return;
+	rc = rtas_call(token,3,1, NULL,
+	               dn->eeh_config_addr,
+	               BUID_HI(dn->phb->buid),
+	               BUID_LO(dn->phb->buid));
+	if (rc) {
+		printk (KERN_WARNING "EEH: Unable to configure device bridge\n");
+	}
+}
+
+EXPORT_SYMBOL(rtas_configure_bridge);
+
+/* ------------------------------------------------------- */
+/** Save and restore of PCI BARs
+ * 
+ * Although firmware will set up BARs during boot, it doesn't
+ * set up device BAR's after a device reset, although it will,
+ * if requested, set up bridge configuration. Thus, we need to 
+ * configure the PCI devices ourselves.  Config-space setup is 
+ * stored in the PCI structures which are normally deleted during
+ * device removal.  Thus, the "save" routine references the
+ * structures so that they aren't deleted. 
+ */
+
+
+struct eeh_cfg_tree
+{
+	struct eeh_cfg_tree *sibling;
+	struct eeh_cfg_tree *child;
+	struct device_node *dn;
+	int is_bridge;
+};
+
+/** 
+ * eeh_save_bars - save the PCI config space info
+ */
+struct eeh_cfg_tree * eeh_save_bars(struct device_node *dn)
+{
+	struct pci_dev *dev;
+	struct eeh_cfg_tree *cnode;
+
+	dev = eeh_get_pci_dev(dn);
+	if (!dev)
+		return NULL;
+	
+	cnode = kmalloc(sizeof(struct eeh_cfg_tree), GFP_KERNEL);
+	if (!cnode) 
+		return NULL;
+	
+	cnode->is_bridge = 0;
+	
+	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) 
+		cnode->is_bridge = 1;
+			  
+	of_node_get(dn);
+	cnode->dn = dn;
+	
+	cnode->sibling = NULL;
+	cnode->child = NULL;
+
+	if (dn->child) {
+		cnode->child = eeh_save_bars (dn->child);
+	}
+	if (dn->sibling) {
+		cnode->sibling = eeh_save_bars (dn->sibling);
+	}
+
+	return cnode;
+}
+EXPORT_SYMBOL(eeh_save_bars);
+
+/**
+ * __restore_bars - Restore the Base Address Registers
+ * Loads the PCI configuration space base address registers, 
+ * the expansion ROM base address, the latency timer, and etc.
+ * from the saved values in the device node.
+ */
+static inline void __restore_bars (struct device_node *dn)
+{
+	int i;
+	for (i=4; i<10; i++) {
+		rtas_write_config(dn, i*4, 4, dn->config_space[i]);
+	}
+
+	/* 12 == Expansion ROM Address */
+	rtas_write_config(dn, 12*4, 4, dn->config_space[12]);
+	
+#define SAVED_BYTE(OFF) (((u8 *)(dn->config_space))[OFF])
+	
+	rtas_write_config (dn, PCI_CACHE_LINE_SIZE, 1, 
+	            SAVED_BYTE(PCI_CACHE_LINE_SIZE));
+	
+	rtas_write_config (dn, PCI_LATENCY_TIMER, 1, 
+	            SAVED_BYTE(PCI_LATENCY_TIMER));
+	
+	rtas_write_config (dn, PCI_INTERRUPT_LINE, 1, 
+	            SAVED_BYTE(PCI_INTERRUPT_LINE));
+}
+
+/** 
+ * eeh_restore_bars - restore the PCI config space info
+ */
+void eeh_restore_bars(struct eeh_cfg_tree *tree)
+{
+	if (!(tree->is_bridge))
+		__restore_bars (tree->dn);
+	
+	if (tree->child)
+		eeh_restore_bars (tree->child);
+
+	if (tree->sibling)
+		eeh_restore_bars (tree->sibling);
+
+	of_node_put (tree->dn);
+	kfree (tree);
+}
+EXPORT_SYMBOL(eeh_restore_bars);
+
+/* ------------------------------------------------------------- */
+/* The code below deals with enabling EEH for devices during  the
+ * early boot sequence.  EEH must be enabled before any PCI probing
+ * can be done.
+ */
+
 struct eeh_early_enable_info {
 	unsigned int buid_hi;
 	unsigned int buid_lo;
@@ -829,7 +1043,9 @@ void eeh_add_device_early(struct device_
 		return;
 	phb = dn->phb;
 	if (NULL == phb || 0 == phb->buid) {
-		printk(KERN_WARNING "EEH: Expected buid but found none\n");
+		printk(KERN_WARNING "EEH: Expected buid but found none for %s\n",
+		                dn->full_name);
+		dump_stack();
 		return;
 	}
 
@@ -848,6 +1064,9 @@ EXPORT_SYMBOL(eeh_add_device_early);
  */
 void eeh_add_device_late(struct pci_dev *dev)
 {
+	int i;
+	struct device_node *dn;
+
 	if (!dev || !eeh_subsystem_enabled)
 		return;
 
@@ -857,6 +1076,11 @@ void eeh_add_device_late(struct pci_dev 
 #endif
 
 	pci_addr_cache_insert_device (dev);
+
+	/* Save the BAR's; firmware doesn't restore these after EEH reset */
+	dn = pci_device_to_OF_node(dev);
+	for (i = 0; i < 16; i++)
+		pci_read_config_dword(dev, i * 4, &dn->config_space[i]);
 }
 EXPORT_SYMBOL(eeh_add_device_late);
 
@@ -886,12 +1110,17 @@ static int proc_eeh_show(struct seq_file
 	unsigned int cpu;
 	unsigned long ffs = 0, positives = 0, failures = 0;
 	unsigned long resets = 0;
+	unsigned long no_dev = 0, no_dn = 0, no_cfg = 0, no_check = 0;
 
 	for_each_cpu(cpu) {
 		ffs += per_cpu(total_mmio_ffs, cpu);
 		positives += per_cpu(false_positives, cpu);
 		failures += per_cpu(ignored_failures, cpu);
 		resets += per_cpu(slot_resets, cpu);
+		no_dev += per_cpu(no_device, cpu);
+		no_dn += per_cpu(no_dn, cpu);
+		no_cfg += per_cpu(no_cfg_addr, cpu);
+		no_check += per_cpu(ignored_check, cpu);
 	}
 
 	if (0 == eeh_subsystem_enabled) {
@@ -899,13 +1128,17 @@ static int proc_eeh_show(struct seq_file
 		seq_printf(m, "eeh_total_mmio_ffs=%ld\n", ffs);
 	} else {
 		seq_printf(m, "EEH Subsystem is enabled\n");
-		seq_printf(m, "eeh_total_mmio_ffs=%ld\n"
+		seq_printf(m, 
+				"no device=%ld\n"
+				"no device node=%ld\n"
+				"no config address=%ld\n"
+				"check not wanted=%ld\n"
+				"eeh_total_mmio_ffs=%ld\n"
 			   "eeh_false_positives=%ld\n"
 			   "eeh_ignored_failures=%ld\n"
-			   "eeh_slot_resets=%ld\n"
-				"eeh_fail_count=%d\n",
-			   ffs, positives, failures, resets,
-				eeh_fail_count.counter);
+			   "eeh_slot_resets=%ld\n",
+				no_dev, no_dn, no_cfg, no_check,
+			   ffs, positives, failures, resets);
 	}
 
 	return 0;
===== arch/ppc64/kernel/pSeries_pci.c 1.59 vs edited =====
--- 1.59/arch/ppc64/kernel/pSeries_pci.c	2004-11-15 21:29:10 -06:00
+++ edited/arch/ppc64/kernel/pSeries_pci.c	2005-01-05 13:41:09 -06:00
@@ -102,7 +102,7 @@ static int rtas_pci_read_config(struct p
 	return PCIBIOS_DEVICE_NOT_FOUND;
 }
 
-static int rtas_write_config(struct device_node *dn, int where, int size, u32 val)
+int rtas_write_config(struct device_node *dn, int where, int size, u32 val)
 {
 	unsigned long buid, addr;
 	int ret;
===== include/asm-ppc64/eeh.h 1.23 vs edited =====
--- 1.23/include/asm-ppc64/eeh.h	2004-10-25 18:17:38 -05:00
+++ edited/include/asm-ppc64/eeh.h	2005-01-05 13:47:55 -06:00
@@ -22,8 +22,8 @@
 
 #include <linux/init.h>
 #include <linux/list.h>
-#include <linux/string.h>
 #include <linux/notifier.h>
+#include <linux/string.h>
 
 struct pci_dev;
 struct device_node;
@@ -33,6 +33,10 @@ struct device_node;
 #define EEH_MODE_NOCHECK	(1<<1)
 #define EEH_MODE_ISOLATED	(1<<2)
 
+/* Max number of EEH freezes allowed before we consider the device
+ * to be permanently disabled. */
+#define EEH_MAX_ALLOWED_FREEZES 5
+
 #ifdef CONFIG_PPC_PSERIES
 extern void __init eeh_init(void);
 unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val);
@@ -57,6 +61,34 @@ void eeh_add_device_early(struct device_
 void eeh_add_device_late(struct pci_dev *);
 
 /**
+ * eeh_slot_error_detail -- record an EEH error condition to the log
+ * @severity: 1 if temporary, 2 if permanent failure.
+ *
+ * Obtains the EEH error details from the RTAS subsystem,
+ * and then logs these details with the RTAS error log system.
+ */
+void eeh_slot_error_detail (struct device_node *dn, int severity);
+
+/** 
+ * rtas_set_slot_reset -- unfreeze a frozen slot
+ *
+ * Clear the EEH-frozen condition on a slot.  This routine
+ * does this by asserting the PCI #RST line for 1/8th of 
+ * a second; this routine will sleep while the adapter is 
+ * being reset.
+ */
+void rtas_set_slot_reset (struct device_node *dn);
+
+/**
+ * rtas_configure_bridge -- firmware initialization of pci bridge
+ * 
+ * Ask the firmware to configure any PCI bridge devices 
+ * located behind the indicated node. Required after a 
+ * pci device reset.
+ */
+void rtas_configure_bridge(struct device_node *dn);
+
+/**
  * eeh_remove_device - undo EEH setup for the indicated pci device
  * @dev: pci device to be removed
  *
@@ -91,6 +123,13 @@ struct eeh_event {
 /** Register to find out about EEH events. */
 int eeh_register_notifier(struct notifier_block *nb);
 int eeh_unregister_notifier(struct notifier_block *nb);
+
+/** Save and restore device configuration info across
+ *  device resets.
+ */
+struct eeh_cfg_tree;
+struct eeh_cfg_tree * eeh_save_bars(struct device_node *dn);
+void eeh_restore_bars(struct eeh_cfg_tree *tree);
 
 /**
  * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
===== include/asm-ppc64/prom.h 1.24 vs edited =====
--- 1.24/include/asm-ppc64/prom.h	2004-11-25 00:42:42 -06:00
+++ edited/include/asm-ppc64/prom.h	2005-01-05 13:41:09 -06:00
@@ -164,8 +164,10 @@ struct device_node {
 	int	status;			/* Current device status (non-zero is bad) */
 	int	eeh_mode;		/* See eeh.h for possible EEH_MODEs */
 	int	eeh_config_addr;
+	int	eeh_freeze_count;   /* number of times this device froze up. */
 	struct  pci_controller *phb;	/* for pci devices */
 	struct	iommu_table *iommu_table;	/* for phb's or bridges */
+	u32      config_space[16]; /* saved PCI config space */
 
 	struct	property *properties;
 	struct	device_node *parent;
===== include/asm-ppc64/rtas.h 1.25 vs edited =====
--- 1.25/include/asm-ppc64/rtas.h	2004-11-25 00:42:42 -06:00
+++ edited/include/asm-ppc64/rtas.h	2005-01-05 13:41:09 -06:00
@@ -241,4 +241,6 @@ extern void rtas_stop_self(void);
 /* RMO buffer reserved for user-space RTAS use */
 extern unsigned long rtas_rmo_buf;
 
+extern int rtas_write_config(struct device_node *dn, int where, int size, u32 val);
+
 #endif /* _PPC64_RTAS_H */

[-- Attachment #3: eeh-recovery-bk-hotplug-3.patch --]
[-- Type: text/plain, Size: 8853 bytes --]

===== drivers/pci/hotplug/rpaphp.h 1.11 vs edited =====
--- 1.11/drivers/pci/hotplug/rpaphp.h	2004-10-06 11:43:44 -05:00
+++ edited/drivers/pci/hotplug/rpaphp.h	2005-01-05 13:41:09 -06:00
@@ -126,6 +126,8 @@ extern int register_pci_slot(struct slot
 extern int rpaphp_unconfig_pci_adapter(struct slot *slot);
 extern int rpaphp_get_pci_adapter_status(struct slot *slot, int is_init, u8 * value);
 extern struct hotplug_slot *rpaphp_find_hotplug_slot(struct pci_dev *dev);
+extern void init_eeh_handler (void);
+extern void exit_eeh_handler (void);
 
 /* rpaphp_core.c */
 extern int rpaphp_add_slot(struct device_node *dn);
===== drivers/pci/hotplug/rpaphp_core.c 1.18 vs edited =====
--- 1.18/drivers/pci/hotplug/rpaphp_core.c	2004-10-06 11:43:44 -05:00
+++ edited/drivers/pci/hotplug/rpaphp_core.c	2005-01-05 13:41:09 -06:00
@@ -443,12 +443,18 @@ static int __init rpaphp_init(void)
 {
 	info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
 
+	/* Get set to handle EEH events. */
+	init_eeh_handler();
+
 	/* read all the PRA info from the system */
 	return init_rpa();
 }
 
 static void __exit rpaphp_exit(void)
 {
+	/* Let EEH know we are going away. */
+	exit_eeh_handler();
+
 	cleanup_slots();
 }
 
===== drivers/pci/hotplug/rpaphp_pci.c 1.17 vs edited =====
--- 1.17/drivers/pci/hotplug/rpaphp_pci.c	2004-11-18 02:36:18 -06:00
+++ edited/drivers/pci/hotplug/rpaphp_pci.c	2005-01-05 15:30:29 -06:00
@@ -22,8 +22,12 @@
  * Send feedback to <lxie@us.ibm.com>
  *
  */
+#include <linux/delay.h>
+#include <linux/notifier.h>
 #include <linux/pci.h>
+#include <asm/eeh.h>
 #include <asm/pci-bridge.h>
+#include <asm/prom.h>
 #include <asm/rtas.h>
 #include "../pci.h"		/* for pci_add_new_bus */
 
@@ -62,6 +66,7 @@ int rpaphp_claim_resource(struct pci_dev
 		    root ? "Address space collision on" :
 		    "No parent found for",
 		    resource, dtype, pci_name(dev), res->start, res->end);
+		dump_stack();
 	}
 	return err;
 }
@@ -184,6 +189,19 @@ rpaphp_fixup_new_pci_devices(struct pci_
 
 static int rpaphp_pci_config_bridge(struct pci_dev *dev);
 
+static void rpaphp_eeh_add_bus_device(struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		eeh_add_device_late(dev);
+		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+			struct pci_bus *subbus = dev->subordinate;
+			if (bus)
+				rpaphp_eeh_add_bus_device (subbus);
+		}
+	}
+}
+
 /*****************************************************************************
  rpaphp_pci_config_slot() will  configure all devices under the 
  given slot->dn and return the the first pci_dev.
@@ -211,6 +229,8 @@ rpaphp_pci_config_slot(struct device_nod
 		}
 		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) 
 			rpaphp_pci_config_bridge(dev);
+
+		rpaphp_eeh_add_bus_device(bus);
 	}
 	return dev;
 }
@@ -219,7 +239,6 @@ static int rpaphp_pci_config_bridge(stru
 {
 	u8 sec_busno;
 	struct pci_bus *child_bus;
-	struct pci_dev *child_dev;
 
 	dbg("Enter %s:  BRIDGE dev=%s\n", __FUNCTION__, pci_name(dev));
 
@@ -236,11 +255,7 @@ static int rpaphp_pci_config_bridge(stru
 	/* do pci_scan_child_bus */
 	pci_scan_child_bus(child_bus);
 
-	list_for_each_entry(child_dev, &child_bus->devices, bus_list) {
-		eeh_add_device_late(child_dev);
-	}
-
-	 /* fixup new pci devices without touching bus struct */
+	/* Fixup new pci devices without touching bus struct */
 	rpaphp_fixup_new_pci_devices(child_bus, 0);
 
 	/* Make the discovered devices available */
@@ -278,7 +293,7 @@ static void print_slot_pci_funcs(struct 
 	return;
 }
 #else
-static void print_slot_pci_funcs(struct slot *slot)
+static inline void print_slot_pci_funcs(struct slot *slot)
 {
 	return;
 }
@@ -360,7 +375,6 @@ static void rpaphp_eeh_remove_bus_device
 			if (pdev)
 				rpaphp_eeh_remove_bus_device(pdev);
 		}
-
 	}
 	return;
 }
@@ -562,36 +576,154 @@ exit:
 	return retval;
 }
 
-struct hotplug_slot *rpaphp_find_hotplug_slot(struct pci_dev *dev)
+/**
+ * rpaphp_find_slot - find and return the slot holding the device
+ * @dev: pci device for which we want the slot structure.
+ */
+static struct slot *rpaphp_find_slot(struct pci_dev *dev)
 {
-	struct list_head	*tmp, *n;
-	struct slot		*slot;
+	struct list_head *tmp, *n;
+	struct slot	*slot;
 
 	list_for_each_safe(tmp, n, &rpaphp_slot_head) {
 		struct pci_bus *bus;
 		struct list_head *ln;
 
 		slot = list_entry(tmp, struct slot, rpaphp_slot_list);
-		if (slot->bridge == NULL) {
-			if (slot->dev_type == PCI_DEV) {
-				printk(KERN_WARNING "PCI slot missing bridge %s %s \n", 
-				                    slot->name, slot->location);
-			}
+		
+		/* PHB slots don't have bridges */
+		if (slot->bridge == NULL)
 			continue;
-		}
+
+		/* the PCI device could be the PHB itself */
+		if (slot->bridge == dev)
+			return slot;
 
 		bus = slot->bridge->subordinate;
 		if (!bus) {
+			printk (KERN_WARNING "PCI bridge is missing bus: %s %s\n",
+			    pci_name (slot->bridge), pci_pretty_name (slot->bridge));
 			continue;  /* should never happen? */
 		}
+
 		for (ln = bus->devices.next; ln != &bus->devices; ln = ln->next) {
-                                struct pci_dev *pdev = pci_dev_b(ln);
-				if (pdev == dev)
-					return slot->hotplug_slot;
+			struct pci_dev *pdev = pci_dev_b(ln);
+			if (pdev == dev)
+				return slot;
 		}
 	}
 
 	return NULL;
 }
 
-EXPORT_SYMBOL_GPL(rpaphp_find_hotplug_slot);
+/* ------------------------------------------------------- */
+/**
+ * handle_eeh_events -- reset a PCI device after hard lockup.
+ *
+ * pSeries systems will isolate a PCI slot if the PCI-Host
+ * bridge detects address or data parity errors, DMA's 
+ * occurring to wild addresses (which usually happen due to
+ * bugs in device drivers or in PCI adapter firmware).
+ * Slot isolations also occur if #SERR, #PERR or other misc
+ * PCI-related errors are detected.
+ * 
+ * Recovery process consists of unplugging the device driver
+ * (which generated hotplug events to userspace), then issuing
+ * a PCI #RST to the device, then reconfiguring the PCI config 
+ * space for all bridges & devices under this slot, and then 
+ * finally restarting the device drivers (which cause a second
+ * set of hotplug events to go out to userspace).
+ */
+int handle_eeh_events (struct notifier_block *self, 
+                       unsigned long reason, void *ev)
+{
+	int freeze_count=0;
+	struct eeh_event *event = ev;
+	struct slot *frozen_slot;
+	struct eeh_cfg_tree * saved_bars;
+
+debug=1;
+	frozen_slot = rpaphp_find_slot(event->dev);
+	if (!frozen_slot)
+	{
+		printk (KERN_ERR 
+			"EEH: Cannot find PCI slot for EEH error! dev=%p dn=%p\n", 
+			event->dev, event->dn);
+		if (event->dev)
+			printk("EEH: above message for pci device %s %s\n", 
+				pci_name(event->dev), pci_pretty_name (event->dev));
+		if (event->dn)
+			printk ("EEH: above message for dn %s\n", event->dn->full_name);
+		return 1;
+	}
+
+	/* Keep a copy of the config space registers */
+	saved_bars = eeh_save_bars(frozen_slot->dn);
+	of_node_get(event->dn);
+	pci_dev_get(event->dev);
+
+	if (frozen_slot->dn->child)
+		freeze_count = frozen_slot->dn->child->eeh_freeze_count;
+	rpaphp_unconfig_pci_adapter (frozen_slot);
+
+	freeze_count ++;
+	if (freeze_count > EEH_MAX_ALLOWED_FREEZES) {
+		/* 
+		 * About 90% of all real-life EEH failures in the field
+		 * are due to poorly seated PCI cards. Only 10% or so are
+		 * due to actual, failed cards 
+		 */
+		printk (KERN_ERR
+		   "EEH: device %s:%s has failed %d times \n"
+			"and has been permanently disabled.  Please try reseating\n"
+		   "this device or replacing it.\n",
+			pci_name (event->dev),
+			pci_pretty_name (event->dev),
+			freeze_count);
+		goto rdone;
+	}
+	printk (KERN_WARNING
+	   "EEH: This device has failed %d times since last reoobt: %s:%s\n",
+		freeze_count,
+		pci_name (event->dev),
+		pci_pretty_name (event->dev));
+
+	/* Reset the pci controller. (Asserts RST#; resets config space). 
+	 * Reconfigure bridges and devices */
+	rtas_set_slot_reset (event->dn);
+	rtas_configure_bridge(event->dn);
+	eeh_restore_bars(saved_bars);
+
+	/* Give the system 5 seconds to finish running the user-space
+	 * hotplug scripts, e.g. ifdown for ethernet.  Yes, this is a hack, 
+	 * but if we don't do this, weird things happen.
+	 */
+	ssleep (5);
+
+	rpaphp_enable_pci_slot (frozen_slot);
+
+	/* Store the freeze count with the pci adapter, and not the slot.
+	 * This way, if the device is replaced, the count is cleared.
+	 */
+	if (frozen_slot->dn->child)
+		frozen_slot->dn->child->eeh_freeze_count = freeze_count;
+
+rdone:
+	of_node_put(event->dn);
+	pci_dev_put(event->dev);
+	return 0;
+}
+
+static struct notifier_block eeh_block;
+
+void __init init_eeh_handler (void)
+{
+	eeh_block.notifier_call = handle_eeh_events;
+	eeh_register_notifier (&eeh_block);
+}
+
+void __exit exit_eeh_handler (void)
+{
+	eeh_unregister_notifier (&eeh_block);
+}
+

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] PPC64: EEH Recovery
  2005-01-06 19:24 [PATCH] PPC64: EEH Recovery Linas Vepstas
@ 2005-01-17 20:14 ` Linas Vepstas
  2005-01-19  6:06   ` Paul Mackerras
  0 siblings, 1 reply; 8+ messages in thread
From: Linas Vepstas @ 2005-01-17 20:14 UTC (permalink / raw)
  To: paulus, anton, akpm; +Cc: linuxppc64-dev, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 506 bytes --]


Andrew,

The attached file describes PCI bus EEH "Extended Error Handling"
concepts and operation;  could you drop this into the kernel
documentation tree, at
linux-2.6/Documentation/powerpc/eeh-pci-error-recovery.txt ?

Signed-off-by: Linas Vepstas <linas@linas.org>

--linas

p.s.  It was not clear to me if the EEH patch previously sent 
(6 January 2005, same subject line) will be wending its way into 
the main Torvalds kernel tree, or not.  I hadn't really gotten
confirmation one way or another.



[-- Attachment #2: eeh-pci-error-recovery.txt --]
[-- Type: text/plain, Size: 15280 bytes --]



                      PCI Bus EEH Error Recovery
                      --------------------------
                           Linas Vepstas
                       <linas@austin.ibm.com>
                          12 January 2005


Overview:
---------
The IBM POWER-based pSeries and iSeries computers include PCI bus 
controller chips that have extended capabilities for detecting and 
reporting a large variety of PCI bus error conditions.  These features 
go under the name of "EEH", for "Extended Error Handling".  The EEH
hardware features allow PCI bus errors to be cleared and a PCI
card to be "rebooted", without also having to reboot the operating
system.  

This is in contrast to traditional PCI error handling, where the 
PCI chip is wired directly to the CPU, and an error would cause 
a CPU machine-check/check-stop condition, halting the CPU entirely. 
Another "traditional" technique is to ignore such errors, which
can lead to data corruption, both of user data or of kernel data,
hung/unresponsive adapters, or system crashes/lockups.  Thus, 
the idea behind EEH is that the operating system can become more
reliable and robust by protecting it from PCI errors, and giving
the OS the ability to "reboot"/recover individual PCI devices.

Future systems from other vendors, based on the PCI-E specification,
may contain similar features. 


Causes of EEH Errors
--------------------
EEH was originally designed to guard against hardware failure, such 
as PCI cards dying from heat, humidity, dust, vibration and bad 
electrical connections. The vast majority of EEH errors seen in
"real life" are due either to poorly seated PCI cards or,
unfortunately quite commonly, to device driver bugs, device firmware
bugs, and sometimes PCI card hardware bugs.

The most common software bug is one that causes the device to
attempt to DMA to a location in system memory that has not been
reserved for DMA access for that card.  Catching this is a powerful
feature of EEH, as it prevents what would otherwise have been silent
memory corruption caused by the bad DMA.  A number of device driver
bugs have been found and fixed in this way over the past few 
years.  Other possible causes of EEH errors include data or 
address line parity errors (for example, due to poor electrical 
connectivity due to a poorly seated card), and PCI-X split-completion 
errors (due to software, device firmware, or device PCI hardware bugs). 
The vast majority of "true hardware failures" can be cured by
physically removing and re-seating the PCI card.


Detection and Recovery
----------------------
In the following discussion, a generic overview of how to detect 
and recover from EEH errors will be presented. This is followed
by an overview of how the current implementation in the Linux
kernel does it.  The actual implementation is subject to change,
and some of the finer points are still being debated.  These 
may in turn be swayed if or when other architectures implement 
similar functionality.

When a PCI Host Bridge (PHB, the bus controller connecting the 
PCI bus to the system CPU electronics complex) detects a PCI error
condition, it will "isolate" the affected PCI card.  Isolation 
will block all writes (either to the card from the system, or 
from the card to the system), and it will cause all reads to 
return all-ff's (0xff, 0xffff, 0xffffffff for 8/16/32-bit reads).
This value was chosen because it is the same value you would
get if the device was physically unplugged from the slot.
This includes access to PCI memory, I/O space, and PCI config 
space.  Interrupts, however, will continue to be delivered.

Detection and recovery are performed with the aid of ppc64 
firmware.  The programming interfaces in the Linux kernel 
into the firmware are referred to as RTAS (Run-Time Abstraction 
Services).  The Linux kernel does not (should not) access
the EEH function in the PCI chipsets directly, primarily because 
there are a number of different chipsets out there, each with 
different interfaces and quirks. The firmware provides a 
uniform abstraction layer that will work with all pSeries 
and iSeries hardware (and be forwards-compatible).

If the OS or device driver suspects that a PCI slot has been 
EEH-isolated, there is a firmware call it can make to determine if 
this is the case. If so, then the device driver should put itself 
into a consistent state (given that it won't be able to complete any 
pending work) and start recovery of the card.  Recovery normally 
would consist of resetting the PCI device (holding the PCI #RST
line high for two seconds), followed by setting up the device 
config space (the base address registers (BAR's), latency timer, 
cache line size, interrupt line, and so on).  This is followed by a 
reinitialization of the device driver.  In a worst-case scenario, 
the power to the card can be toggled, at least on hot-plug-capable 
slots.  In principle, layers far above the device driver probably 
do not need to know that the PCI card has been "rebooted" in this 
way; ideally, there should be at most a pause in Ethernet/disk/USB 
I/O while the card is being reset. 

If the card cannot be recovered after three or four resets, the 
kernel/device driver should assume the worst-case scenario, that the 
card has died completely, and report this error to the sysadmin.  
In addition, error messages are reported through RTAS and also through 
syslogd (/var/log/messages) to alert the sysadmin of PCI resets.
The correct way to deal with failed adapters is to use the standard
PCI hotplug tools to remove and replace the dead card.


Current PPC64 Linux EEH Implementation
--------------------------------------
At this time, a generic EEH recovery mechanism has been implemented,
so that individual device drivers do not need to be modified to support
EEH recovery.  This generic mechanism piggy-backs on the PCI hotplug
infrastructure, and percolates events up through the hotplug/udev
infrastructure.  Following is a detailed description of how this is
accomplished.

EEH must be enabled in the PHB's very early during the boot process, 
and if a PCI slot is hot-plugged. The former is performed by 
eeh_init() in arch/ppc64/kernel/eeh.c, and the latter by
drivers/pci/hotplug/pSeries_pci.c calling in to the eeh.c code.
EEH must be enabled before a PCI scan of the device can proceed.
Current Power5 hardware will not work unless EEH is enabled,
although older Power4 can run with it disabled.  Effectively,
EEH can no longer be turned off.  PCI devices *must* be 
registered with the EEH code; the EEH code needs to know about
the I/O address ranges of the PCI device in order to detect an 
error.  Given an arbitrary address, the routine 
pci_get_device_by_addr() will find the pci device associated 
with that address (if any).

The default include/asm-ppc64/io.h macros readb(), inb(), insb(), 
etc. include a check to see if the I/O read returned all-0xff's.
If so, these make a call to eeh_dn_check_failure(), which in turn
asks the firmware if the all-ff's value is the sign of a true EEH 
error.  If it is not, processing continues as normal.  The grand 
total number of these false alarms or "false positives" can be
seen in /proc/ppc64/eeh (subject to change).  Normally, almost 
all of these occur during boot, when the PCI bus is scanned, where
a large number of 0xff reads are part of the bus scan procedure.

If a frozen slot is detected, code in arch/ppc64/kernel/eeh.c will 
print a stack trace to syslog (/var/log/messages).  This stack trace 
has proven to be very useful to device-driver authors for finding 
out at what point the EEH error was detected, as the error itself
usually occurs slightly beforehand.

Next, it uses the Linux kernel notifier chain/work queue mechanism to
allow any interested parties to find out about the failure.  Device 
drivers, or other parts of the kernel, can use 
eeh_register_notifier(struct notifier_block *) to find out about EEH 
events.  The event will include a pointer to the pci device, the 
device node and some state info.  Receivers of the event can "do as 
they wish"; the default handler will be described further in this
section.

To assist in the recovery of the device, eeh.c exports the
following functions:

rtas_set_slot_reset() -- assert the  PCI #RST line for 1/8th of a second
rtas_configure_bridge() -- ask firmware to configure any PCI bridges
   located topologically under the pci slot.
eeh_save_bars() and eeh_restore_bars(): save and restore the PCI
   config-space info for a device and any devices under it. 
 

A handler for the EEH notifier_block events is implemented in
drivers/pci/hotplug/pSeries_pci.c, called handle_eeh_events().
It saves the device BAR's and then calls rpaphp_unconfig_pci_adapter().
This last call causes the device driver for the card to be stopped,
which causes hotplug events to go out to user space. This triggers
user-space scripts that might issue commands such as "ifdown eth0"
for ethernet cards, and so on.  This handler then sleeps for 5 seconds,
hoping to give the user-space scripts enough time to complete.
It then resets the PCI card, reconfigures the device BAR's, and
any bridges underneath. It then calls rpaphp_enable_pci_slot(),
which restarts the device driver and triggers more user-space
events (for example, calling "ifup eth0" for ethernet cards).


Device Shutdown and User-Space Events
-------------------------------------
This section documents what happens when a pci slot is unconfigured,
focusing on how the device driver gets shut down, and on how the 
events get delivered to user-space scripts.
 
Following is an example sequence of events that cause a device driver
close function to be called during the first phase of an EEH reset.  
The following sequence is an example of the pcnet32 device driver.

    rpaphp_unconfig_pci_adapter (struct slot *)   // in rpaphp_pci.c
    {
      calls
      pci_remove_bus_device (struct pci_dev *) // in /drivers/pci/remove.c
      { 
        calls
        pci_destroy_dev (struct pci_dev *) 
        {
          calls 
          device_unregister (&dev->dev) // in /drivers/base/core.c
          {
            calls
            device_del (struct device *)
            {
              calls 
              bus_remove_device() // in /drivers/base/bus.c
              {
                calls 
                device_release_driver()
                {
                  calls 
                  struct device_driver->remove() which is just
                  pci_device_remove()  // in /drivers/pci/pci_driver.c
                  {
                    calls
                    struct pci_driver->remove() which is just
                    pcnet32_remove_one() // in /drivers/net/pcnet32.c  
                    {
                      calls
                      unregister_netdev() // in /net/core/dev.c
                      {
                        calls 
                        dev_close()  // in /net/core/dev.c
                        { 
                           calls dev->stop();
                           which is just pcnet32_close() // in pcnet32.c
                           {
                             which does what you wanted
                             to stop the device
                           }
                        }
                     }
                   which
                   frees pcnet32 device driver memory
                }
     }}}}}}


    in drivers/pci/pci_driver.c, 
    struct device_driver->remove() is just pci_device_remove() 
    which calls struct pci_driver->remove() which is pcnet32_remove_one()
    which calls unregister_netdev()  (in net/core/dev.c)
    which calls dev_close()  (in net/core/dev.c) 
    which calls dev->stop() which is pcnet32_close() 
    which then does the appropriate shutdown. 
    
---
Following is the analogous stack trace for events sent to user-space
when the pci device is unconfigured.

rpaphp_unconfig_pci_adapter() {              // in rpaphp_pci.c
  calls
  pci_remove_bus_device (struct pci_dev *) { // in /drivers/pci/remove.c
    calls 
    pci_destroy_dev (struct pci_dev *) {
      calls 
      device_unregister (&dev->dev) {      // in /drivers/base/core.c 
        calls
        device_del(struct device * dev) {  // in /drivers/base/core.c
          calls
          kobject_del() {                  // in /lib/kobject.c
            calls
            kobject_hotplug() {            // in /lib/kobject.c
              calls
              kset_hotplug() {             // in /lib/kobject.c
                calls 
                kset->hotplug_ops->hotplug() which is really just
                a call to 
                dev_hotplug() {           // in /drivers/base/core.c
                  calls 
                  dev->bus->hotplug() which is really just a call to 
                  pci_hotplug () {      // in drivers/pci/hotplug.c
                    which prints device name, etc....
                 }
               }
               then kset_hotplug() calls 
                call_usermodehelper () with 
                   argv[0]=hotplug_path[] which is "/sbin/hotplug"
             --> event to userspace, 
           }
         }
         kobject_del() then calls sysfs_remove_dir(), which would
         cause any user-space daemon that was watching sysfs (/sys)
         to notice the delete event.
  

Pro's and Con's of the Current Design
-------------------------------------
There are several issues with the current EEH software recovery design,
which may be addressed in future revisions.  But first, note that the 
big plus of the current design is that no changes need to be made to 
individual device drivers, so that the current design throws a wide net.
The biggest negative of the design is that it potentially disturbs 
network daemons and file systems that didn't need to be disturbed.

-- A minor complaint is that resetting the network card causes 
   user-space back-to-back ifdown/ifup burps that potentially disturb 
   network daemons, that didn't need to even know that the pci
   card was being rebooted.

-- A more serious concern is that the same reset, for SCSI devices,
   causes havoc to mounted file systems.  Scripts cannot post-facto
   unmount a file system without flushing pending buffers, but this 
   is impossible, because I/O has already been stopped.  Thus, 
   ideally, the reset should happen at or below the block layer,
   so that the file systems are not disturbed.

   Reiserfs does not tolerate errors returned from the block device.
   Ext3fs seems to be tolerant, retrying reads/writes until it does
   succeed. Both have been only lightly tested in this scenario.

   The SCSI-generic subsystem already has built-in code for performing
   SCSI device resets, SCSI bus resets, and SCSI host-bus-adapter 
   (HBA) resets.  These are cascaded into a chain of attempted 
   resets if a SCSI command fails. These are completely hidden
   from the block layer.  It would be very natural to add an EEH 
   reset into this chain of events.

-- If a SCSI error occurs for the root device, all is lost unless
   the sysadmin had the foresight to run /bin, /sbin, /etc, /var 
   and so on, out of ramdisk/tmpfs.


Conclusions
-----------
There's forward progress ... 


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] PPC64: EEH Recovery
  2005-01-17 20:14 ` Linas Vepstas
@ 2005-01-19  6:06   ` Paul Mackerras
  2005-01-19 16:00     ` Nathan Fontenot
                       ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Paul Mackerras @ 2005-01-19  6:06 UTC (permalink / raw)
  To: Linas Vepstas; +Cc: anton, akpm, linuxppc64-dev, linux-kernel

Linas Vepstas writes:

> p.s.  It was not clear to me if the EEH patch previously sent 
> (6 January 2005, same subject line) will be wending its way into 
> the main Torvalds kernel tree, or not.  I hadn't really gotten
> confirmation one way or another.

I'm not really totally happy with it yet, on a number of fronts:

1. You're adding more PCI-specific stuff to the device_node struct,
   which I don't like.  I would prefer that the device_node tree
   contains basically just what we get from OF, and that we have a
   separate struct for storing ppc64-specific information for each PCI
   device.  Fixing that is outside the scope of your patch, though.

2. I don't see why the device nodes for the PCI subtree being reset
   would go away, and thus I don't see the need for your eeh_cfg_tree
   struct.

3. Is there a good reason why we can't use the assigned-addresses
   property on the relevant device tree nodes to tell us what to set
   the BARs to?

4. I think the 5 second sleep is quite bogus, and shows that we have
   the flow of control wrong.  In particular I think it should be a
   userland write to a sysfs file that kicks off the restart process
   rather than it just happening after 5 seconds.  Anyway, what
   process or thread is executing that 5 second sleep?  Is it keventd
   or something?

5. AFAICS userland will get an unplug notification for the device, but
   nothing to indicate that is due to an EEH slot isolation event.  I
   think userland should be told about EEH events.

Regards,
Paul.


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] PPC64: EEH Recovery
  2005-01-19  6:06   ` Paul Mackerras
@ 2005-01-19 16:00     ` Nathan Fontenot
  2005-01-20 22:39     ` Linas Vepstas
  2005-01-20 22:48     ` Linas Vepstas
  2 siblings, 0 replies; 8+ messages in thread
From: Nathan Fontenot @ 2005-01-19 16:00 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Linas Vepstas, akpm, linuxppc64-dev, anton, linux-kernel


Paul Mackerras wrote:

> 5. AFAICS userland will get an unplug notification for the device, but
>    nothing to indicate that is due to an EEH slot isolation event.  I
>    think userland should be told about EEH events.
> 

Currently there is a way for userland to determine if a hotplug event 
they receive is due to an EEH slot isolation event.  It's not very 
pretty and requires the rtas_errd daemon to be running.

The RTAS event generated from the EEH event is logged to 
/var/log/platform by rtas_errd.  Userland scripts would have to search 
the file for a recent EEH event matching their device to make this 
determination.  This isn't as nice as a direct notification but is what 
we have at this point.

-- 
Nathan Fontenot

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] PPC64: EEH Recovery
  2005-01-19  6:06   ` Paul Mackerras
  2005-01-19 16:00     ` Nathan Fontenot
@ 2005-01-20 22:39     ` Linas Vepstas
  2005-01-21  2:50       ` Paul Mackerras
  2005-01-20 22:48     ` Linas Vepstas
  2 siblings, 1 reply; 8+ messages in thread
From: Linas Vepstas @ 2005-01-20 22:39 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: anton, akpm, linuxppc64-dev, linux-kernel


On Wed, Jan 19, 2005 at 05:06:05PM +1100, Paul Mackerras was heard to remark:
> Linas Vepstas writes:
> 
> > p.s.  It was not clear to me if the EEH patch previously sent 
> > (6 January 2005, same subject line) will be wending its way into 
> > the main Torvalds kernel tree, or not.  I hadn't really gotten
> > confirmation one way or another.
> 
> I'm not really totally happy with it yet, on a number of fronts:
> 
> 1. You're adding more PCI-specific stuff to the device_node struct,
>    which I don't like.  I would prefer that the device_node tree
>    contains basically just what we get from OF, and that we have a
>    separate struct for storing ppc64-specific information for each PCI
>    device.  Fixing that is outside the scope of your patch, though.

I wrote this down on my to-do list.  Its the sort of thing that 
evaporates from my consciousness when other things come along,
but I'll give it a shot.  

> 2. I don't see why the device nodes for the PCI subtree being reset
>    would go away, and thus I don't see the need for your eeh_cfg_tree
>    struct.

Its not the reset, its the hot-plug remove.  The hot plug code assumes
that you are going to physically remove the device from the slot, so
it removes the device_node as part of the "unconfig".  

Of course, I found this out only after performing a null-pointer deref.
Not only does the node go away, but all of the various pointers it holds
are zeroed in the process.  

The cfg tree holds on to those pointers, so that I wouldn't have to
muck with the device_node removal code to do something tricky.

> 3. Is there a good reason why we can't use the assigned-addresses
>    property on the relevant device tree nodes to tell us what to set
>    the BARs to?

Yes, the reason is that after a reset, that property doesn't hold any 
decent data.   I discussed this with the firmware developers, and their
response was that it is the kernel's responsibility to compute 
(or save/restore) such values.  (Except for bridges, which they will do for us).

> 4. I think the 5 second sleep is quite bogus, and shows that we have
>    the flow of control wrong.  

:)  Yes, well, indeed it is.  Don't look at me, not my idea.

> In particular I think it should be a
>    userland write to a sysfs file that kicks off the restart process
>    rather than it just happening after 5 seconds.  Anyway, what
>    process or thread is executing that 5 second sleep?  Is it keventd
>    or something?

Its a workqueue.

> 5. AFAICS userland will get an unplug notification for the device, but
>    nothing to indicate that is due to an EEH slot isolation event.  I
>    think userland should be told about EEH events.

In principle, I'd agree. In practice, this would seem to require changes
or additions or enhancements to udev that I don't quite understand, as
well as potential changes to udev scripts.  Maybe I don't understand
sysfs sufficiently well.  I am very tempted to punt on this, and wait 
for the Intel-backed PCI-E code to get to this point, and then do whatever 
they're doing.

--linas

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] PPC64: EEH Recovery
  2005-01-19  6:06   ` Paul Mackerras
  2005-01-19 16:00     ` Nathan Fontenot
  2005-01-20 22:39     ` Linas Vepstas
@ 2005-01-20 22:48     ` Linas Vepstas
  2 siblings, 0 replies; 8+ messages in thread
From: Linas Vepstas @ 2005-01-20 22:48 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: anton, akpm, linuxppc64-dev, linux-kernel

On Wed, Jan 19, 2005 at 05:06:05PM +1100, Paul Mackerras was heard to remark:
> Linas Vepstas writes:
> 
> > p.s.  It was not clear to me if the EEH patch previously sent 
> > (6 January 2005, same subject line) will be wending its way into 
> > the main Torvalds kernel tree, or not.  I hadn't really gotten
> > confirmation one way or another.
> 
> I'm not really totally happy with it yet, on a number of fronts:

[...]

I forgot to mention: while I agree with some/many of these points,
especially with regards to recovery, I'd also like to note that the 
patch was mailed in two independent parts:  

-- a number of generic infrastructure routines, all in a ppc64 patch, and
-- the code that actually performs the recovery, as a patch to 
   the drivers/pci/hotplug subsystem.

While the actual recovery code is controversial (e.g. no support of 
scsi recovery), I'd like to at least get in the generic
infrastructure pieces.  

--linas

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] PPC64: EEH Recovery
  2005-01-20 22:39     ` Linas Vepstas
@ 2005-01-21  2:50       ` Paul Mackerras
  0 siblings, 0 replies; 8+ messages in thread
From: Paul Mackerras @ 2005-01-21  2:50 UTC (permalink / raw)
  To: Linas Vepstas; +Cc: anton, akpm, linuxppc64-dev, linux-kernel

Linas Vepstas writes:

> > 2. I don't see why the device nodes for the PCI subtree being reset
> >    would go away, and thus I don't see the need for your eeh_cfg_tree
> >    struct.
> 
> Its not the reset, its the hot-plug remove.  The hot plug code assumes
> that you are going to physically remove the device from the slot, so
> it removes the device_node as part of the "unconfig".  

OK, I missed that.  It seems a bit bogus to me.  Could you point me at
where in the code this happens?

> > 3. Is there a good reason why we can't use the assigned-addresses
> >    property on the relevant device tree nodes to tell us what to set
> >    the BARs to?
> 
> Yes, the reason is that after a reset, that property doesn't hold any 
> decent data.   I discussed this with the firmware developers, and their
> response was that it is the kernel's responsibility to compute 
> (or save/restore) such values.  (Except for bridges, which they will do for us).

The not holding any decent data is a consequence of the device nodes
getting thrown away, isn't it?  I fail to see how resetting the device
can of itself affect our copy of the device tree.

> > In particular I think it should be a
> >    userland write to a sysfs file that kicks off the restart process
> >    rather than it just happening after 5 seconds.  Anyway, what
> >    process or thread is executing that 5 second sleep?  Is it keventd
> >    or something?
> 
> Its a workqueue.

Which get run in keventd's context.  In other words no other
workqueues will get run during the 5 second sleep, or at least not on
that cpu.

Paul.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH] PPC64: EEH Recovery
@ 2004-11-17 23:52 Linas Vepstas
  0 siblings, 0 replies; 8+ messages in thread
From: Linas Vepstas @ 2004-11-17 23:52 UTC (permalink / raw)
  To: paulus, anton; +Cc: linuxppc64-dev, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 407 bytes --]


Hi Paul,

The patch below implements hotplug style EEH error recovery. 
Its split into two pieces: a part that needs to be applied to the
PPC64 arch tree, and a part that needs to be applied to the 
RPA PHP hotplug tree.  The PPC64 part needs to go in first.

Assuming this doesn't generate a round of discussion, please
forward upstream to akpm/torvalds.

 Signed-off-by: Linas Vepstas <linas@linas.org>


[-- Attachment #2: eeh-recovery-bk-ppc64.patch --]
[-- Type: text/plain, Size: 15635 bytes --]

===== arch/ppc64/kernel/eeh.c 1.40 vs edited =====
--- 1.40/arch/ppc64/kernel/eeh.c	2004-10-25 14:47:50 -05:00
+++ edited/arch/ppc64/kernel/eeh.c	2004-11-17 17:31:41 -06:00
@@ -17,21 +17,19 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  */
 
-#include <linux/bootmem.h>
+#include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/list.h>
-#include <linux/mm.h>
 #include <linux/notifier.h>
 #include <linux/pci.h>
 #include <linux/proc_fs.h>
 #include <linux/rbtree.h>
 #include <linux/seq_file.h>
-#include <linux/spinlock.h>
+#include <asm/atomic.h>
 #include <asm/eeh.h>
 #include <asm/io.h>
 #include <asm/machdep.h>
 #include <asm/rtas.h>
-#include <asm/atomic.h>
 #include "pci.h"
 
 #undef DEBUG
@@ -89,7 +87,6 @@ static struct notifier_block *eeh_notifi
  * attempts we allow before panicking.
  */
 #define EEH_MAX_FAILS	1000
-static atomic_t eeh_fail_count;
 
 /* RTAS tokens */
 static int ibm_set_eeh_option;
@@ -223,9 +220,9 @@ pci_addr_cache_insert(struct pci_dev *de
 	while (*p) {
 		parent = *p;
 		piar = rb_entry(parent, struct pci_io_addr_range, rb_node);
-		if (alo < piar->addr_lo) {
+		if (ahi < piar->addr_lo) {
 			p = &parent->rb_left;
-		} else if (ahi > piar->addr_hi) {
+		} else if (alo > piar->addr_hi) {
 			p = &parent->rb_right;
 		} else {
 			if (dev != piar->pcidev ||
@@ -243,6 +240,11 @@ pci_addr_cache_insert(struct pci_dev *de
 	piar->addr_hi = ahi;
 	piar->pcidev = dev;
 	piar->flags = flags;
+	
+#ifdef DEBUG 
+	printk (KERN_DEBUG "PIAR: insert range=[%lx:%lx] dev=%s\n", 
+	               alo, ahi, pci_name (dev));
+#endif
 
 	rb_link_node(&piar->rb_node, parent, p);
 	rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
@@ -377,6 +379,9 @@ void __init pci_addr_cache_build(void)
 			continue;
 		}
 		pci_addr_cache_insert_device(dev);
+		
+		/* Save the BAR's; firmware doesn't restore these after EEH reset */
+		pci_save_state (dev);
 	}
 
 #ifdef DEBUG
@@ -388,6 +393,32 @@ void __init pci_addr_cache_build(void)
 /* --------------------------------------------------------------- */
 /* Above lies the PCI Address Cache. Below lies the EEH event infrastructure */
 
+void eeh_slot_error_detail (struct device_node *dn, int severity)
+{
+	unsigned long flags;
+	int rc;
+
+	if (!dn) return;
+
+	/* Log the error with the rtas logger */
+	spin_lock_irqsave(&slot_errbuf_lock, flags);
+	memset(slot_errbuf, 0, eeh_error_buf_size);
+
+	rc = rtas_call(ibm_slot_error_detail,
+	               8, 1, NULL, dn->eeh_config_addr,
+	               BUID_HI(dn->phb->buid),
+	               BUID_LO(dn->phb->buid), NULL, 0,
+	               virt_to_phys(slot_errbuf),
+	               eeh_error_buf_size,
+	               severity);
+
+	if (rc == 0)
+		log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
+	spin_unlock_irqrestore(&slot_errbuf_lock, flags);
+}
+
+EXPORT_SYMBOL(eeh_slot_error_detail);
+
 /**
  * eeh_register_notifier - Register to find out about EEH events.
  * @nb: notifier block to callback on events
@@ -462,11 +493,9 @@ static void eeh_event_handler(void *dumm
 		       "%s %s\n", event->reset_state,
 		       pci_name(event->dev), pci_pretty_name(event->dev));
 
-		atomic_set(&eeh_fail_count, 0);
-		notifier_call_chain (&eeh_notifier_chain,
-				     EEH_NOTIFY_FREEZE, event);
-
 		__get_cpu_var(slot_resets)++;
+		notifier_call_chain (&eeh_notifier_chain,
+		           EEH_NOTIFY_FREEZE, event);
 
 		pci_dev_put(event->dev);
 		kfree(event);
@@ -510,7 +539,7 @@ int eeh_dn_check_failure(struct device_n
 	int ret;
 	int rets[2];
 	unsigned long flags;
-	int rc, reset_state;
+	int reset_state;
 	struct eeh_event  *event;
 
 	__get_cpu_var(total_mmio_ffs)++;
@@ -530,14 +559,15 @@ int eeh_dn_check_failure(struct device_n
 	if (!dn->eeh_config_addr) {
 		return 0;
 	}
-
+	
 	/*
 	 * If we already have a pending isolation event for this
 	 * slot, we know it's bad already, we don't need to check...
 	 */
 	if (dn->eeh_mode & EEH_MODE_ISOLATED) {
-		atomic_inc(&eeh_fail_count);
-		if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) {
+		dn->eeh_freeze_count ++;
+		if (dn->eeh_freeze_count >= EEH_MAX_FAILS) {
+			dump_stack();
 			/* re-read the slot reset state */
 			rets[0] = -1;
 			rtas_call(ibm_read_slot_reset_state, 3, 3, rets,
@@ -565,28 +595,17 @@ int eeh_dn_check_failure(struct device_n
 		return 0;
 	}
 
-	/* prevent repeated reports of this failure */
+	/* Prevent repeated reports of this failure */
 	dn->eeh_mode |= EEH_MODE_ISOLATED;
 
 	reset_state = rets[0];
+	/* Log the error with the rtas logger */
+	if (dn->eeh_freeze_count < EEH_MAX_ALLOWED_FREEZES) {
+		eeh_slot_error_detail (dn, 1 /* Temporary Error */);
+	} else {
+		eeh_slot_error_detail (dn, 2 /* Permanent Error */);
+   }
 
-	spin_lock_irqsave(&slot_errbuf_lock, flags);
-	memset(slot_errbuf, 0, eeh_error_buf_size);
-
-	rc = rtas_call(ibm_slot_error_detail,
-	               8, 1, NULL, dn->eeh_config_addr,
-	               BUID_HI(dn->phb->buid),
-	               BUID_LO(dn->phb->buid), NULL, 0,
-	               virt_to_phys(slot_errbuf),
-	               eeh_error_buf_size,
-	               1 /* Temporary Error */);
-
-	if (rc == 0)
-		log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
-	spin_unlock_irqrestore(&slot_errbuf_lock, flags);
-
-	printk(KERN_INFO "EEH: MMIO failure (%d) on device: %s %s\n",
-	       rets[0], dn->name, dn->full_name);
 	event = kmalloc(sizeof(*event), GFP_ATOMIC);
 	if (event == NULL) {
 		eeh_panic(dev, reset_state);
@@ -618,7 +637,6 @@ EXPORT_SYMBOL(eeh_dn_check_failure);
  * @token i/o token, should be address in the form 0xA....
  * @val value, should be all 1's (XXX why do we need this arg??)
  *
- * Check for an eeh failure at the given token address.
  * Check for an EEH failure at the given token address.  Call this
  * routine if the result of a read was all 0xff's and you want to
  * find out if this is due to an EEH slot freeze event.  This routine
@@ -626,6 +644,7 @@ EXPORT_SYMBOL(eeh_dn_check_failure);
  *
  * Note this routine is safe to call in an interrupt context.
  */
+
 unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val)
 {
 	unsigned long addr;
@@ -635,7 +654,7 @@ unsigned long eeh_check_failure(const vo
 	/* Finding the phys addr + pci device; this is pretty quick. */
 	addr = eeh_token_to_phys((unsigned long __force) token);
 	dev = pci_get_device_by_addr(addr);
-	if (!dev)
+	if (!dev) 
 		return val;
 
 	dn = pci_device_to_OF_node(dev);
@@ -647,6 +666,174 @@ unsigned long eeh_check_failure(const vo
 
 EXPORT_SYMBOL(eeh_check_failure);
 
+/* ------------------------------------------------------------- */
+/* The code below deals with error recovery */
+
+void
+rtas_set_slot_reset(struct device_node *dn)
+{
+	int token = rtas_token ("ibm,set-slot-reset");
+	int rc;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return;
+	rc = rtas_call(token,4,1, NULL,
+	               dn->eeh_config_addr,
+	               BUID_HI(dn->phb->buid),
+	               BUID_LO(dn->phb->buid),
+	               1);
+	if (rc) {
+		printk (KERN_WARNING "EEH: Unable to reset the failed slot\n");
+		return;
+	}
+	
+	/* The PCI bus requires that the reset be held high for at least
+	 * a 100 milliseconds. We wait a bit longer 'just in case'.
+	 */
+   msleep (200);
+	
+	rc = rtas_call(token,4,1, NULL,
+	               dn->eeh_config_addr,
+	               BUID_HI(dn->phb->buid),
+	               BUID_LO(dn->phb->buid),
+	               0);
+}
+
+EXPORT_SYMBOL(rtas_set_slot_reset);
+
+void
+rtas_configure_bridge(struct device_node *dn)
+{
+	int token = rtas_token ("ibm,configure-bridge");
+	int rc;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return;
+	rc = rtas_call(token,3,1, NULL,
+	               dn->eeh_config_addr,
+	               BUID_HI(dn->phb->buid),
+	               BUID_LO(dn->phb->buid));
+	if (rc) {
+		printk (KERN_WARNING "EEH: Unable to configure device bridge\n");
+	}
+}
+
+EXPORT_SYMBOL(rtas_configure_bridge);
+
+/* ------------------------------------------------------- */
+/** Save and restore of PCI BARs
+ * 
+ * Although firmware will set up BARs during boot, it doesn't
+ * set up device BAR's after a device reset, although it will,
+ * if requested, set up bridge configuration. Thus, we need to 
+ * configure the PCI devices ourselves.  Config-space setup is 
+ * stored in the PCI structures which are normally deleted during
+ * device removal.  Thus, the "save" routine references the
+ * structures so that they aren't deleted. 
+ */
+
+
+struct eeh_cfg_tree
+{
+	struct eeh_cfg_tree *sibling;
+	struct eeh_cfg_tree *child;
+	struct pci_dev *dev;
+	struct device_node *dn;
+};
+
+static inline struct pci_dev * eeh_get_pci_dev(struct device_node *dn)
+{
+	struct pci_dev *dev = NULL;
+	char bus_id[BUS_ID_SIZE];
+
+	sprintf(bus_id, "%04x:%02x:%02x.%d",dn->phb->global_number,
+		dn->busno, PCI_SLOT(dn->devfn), PCI_FUNC(dn->devfn));
+
+	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+		if (!strcmp(pci_name(dev), bus_id)) 
+			return dev;
+	}
+	return NULL;
+}
+
+/** 
+ * eeh_save_bars - save the PCI config space info
+ */
+struct eeh_cfg_tree * eeh_save_bars(struct device_node *dn)
+{
+	struct eeh_cfg_tree *cnode;
+	struct pci_dev *dev;
+	
+	dev = eeh_get_pci_dev (dn);
+	if (!dev) 
+		return NULL;
+
+	cnode = kmalloc(sizeof(struct eeh_cfg_tree), GFP_KERNEL);
+	if (!cnode) 
+		return NULL;
+	
+	cnode->dev = dev;
+	
+	of_node_get(dn);
+	cnode->dn = dn;
+	
+	cnode->sibling = NULL;
+	cnode->child = NULL;
+
+	if (dn->child) {
+		cnode->child = eeh_save_bars (dn->child);
+	}
+	if (dn->sibling) {
+		cnode->sibling = eeh_save_bars (dn->sibling);
+	}
+
+	return cnode;
+}
+EXPORT_SYMBOL(eeh_save_bars);
+
+/**
+ * __restore_bars - Restore the Base Address Registers
+ * Loads the PCI configuration space base address registers 
+ * and the expansion ROM base address from the array 
+ * passed as the second argument.
+ */
+static inline void __restore_bars (struct device_node *dn, u32 *cfg_hdr)
+{
+	int i;
+	for (i=4; i<10; i++) {
+		rtas_write_config(dn, i*4, 4, cfg_hdr[i]);
+	}
+	rtas_write_config(dn, 12*4, 4, cfg_hdr[12]);
+}
+
+/** 
+ * eeh_restore_bars - restore the PCI config space info
+ */
+void eeh_restore_bars(struct eeh_cfg_tree *tree)
+{
+	if (tree->dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
+		__restore_bars (tree->dn, tree->dev->saved_config_space);
+	}
+	
+	if (tree->child) {
+		eeh_restore_bars (tree->child);
+	}
+	if (tree->sibling) {
+		eeh_restore_bars (tree->sibling);
+	}
+
+	of_node_put (tree->dn);
+	pci_dev_put (tree->dev);
+	kfree (tree);
+}
+EXPORT_SYMBOL(eeh_restore_bars);
+
+/* ------------------------------------------------------------- */
+/* The code below deals with enabling EEH for devices during  the
+ * early boot sequence.  EEH must be enabled before any PCI probing
+ * can be done.
+ */
+
 struct eeh_early_enable_info {
 	unsigned int buid_hi;
 	unsigned int buid_lo;
@@ -840,6 +1027,9 @@ void eeh_add_device_late(struct pci_dev 
 #endif
 
 	pci_addr_cache_insert_device (dev);
+
+	/* Save the BAR's; firmware doesn't restore these after EEH reset */
+	pci_save_state (dev);
 }
 EXPORT_SYMBOL(eeh_add_device_late);
 
@@ -885,10 +1075,8 @@ static int proc_eeh_show(struct seq_file
 		seq_printf(m, "eeh_total_mmio_ffs=%ld\n"
 			   "eeh_false_positives=%ld\n"
 			   "eeh_ignored_failures=%ld\n"
-			   "eeh_slot_resets=%ld\n"
-				"eeh_fail_count=%d\n",
-			   ffs, positives, failures, resets,
-				eeh_fail_count.counter);
+			   "eeh_slot_resets=%ld\n",
+			   ffs, positives, failures, resets);
 	}
 
 	return 0;
===== arch/ppc64/kernel/pSeries_pci.c 1.59 vs edited =====
--- 1.59/arch/ppc64/kernel/pSeries_pci.c	2004-11-15 21:29:10 -06:00
+++ edited/arch/ppc64/kernel/pSeries_pci.c	2004-11-17 16:18:02 -06:00
@@ -102,7 +102,7 @@ static int rtas_pci_read_config(struct p
 	return PCIBIOS_DEVICE_NOT_FOUND;
 }
 
-static int rtas_write_config(struct device_node *dn, int where, int size, u32 val)
+int rtas_write_config(struct device_node *dn, int where, int size, u32 val)
 {
 	unsigned long buid, addr;
 	int ret;
@@ -125,6 +125,7 @@ static int rtas_write_config(struct devi
 
 	return PCIBIOS_SUCCESSFUL;
 }
+EXPORT_SYMBOL(rtas_write_config);
 
 static int rtas_pci_write_config(struct pci_bus *bus,
 				 unsigned int devfn,
===== include/asm-ppc64/eeh.h 1.23 vs edited =====
--- 1.23/include/asm-ppc64/eeh.h	2004-10-25 18:17:38 -05:00
+++ edited/include/asm-ppc64/eeh.h	2004-11-17 16:10:58 -06:00
@@ -22,8 +22,8 @@
 
 #include <linux/init.h>
 #include <linux/list.h>
-#include <linux/string.h>
 #include <linux/notifier.h>
+#include <linux/string.h>
 
 struct pci_dev;
 struct device_node;
@@ -33,6 +33,10 @@ struct device_node;
 #define EEH_MODE_NOCHECK	(1<<1)
 #define EEH_MODE_ISOLATED	(1<<2)
 
+/* Max number of EEH freezes allowed before we consider the device
+ * to be permanently disabled. */
+#define EEH_MAX_ALLOWED_FREEZES 5
+
 #ifdef CONFIG_PPC_PSERIES
 extern void __init eeh_init(void);
 unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val);
@@ -57,6 +61,34 @@ void eeh_add_device_early(struct device_
 void eeh_add_device_late(struct pci_dev *);
 
 /**
+ * eeh_slot_error_detail -- record an EEH error condition to the log
+ * @severity: 1 if temporary, 2 if permanent failure.
+ *
+ * Obtains the EEH error details from the RTAS subsystem,
+ * and then logs these details with the RTAS error log system.
+ */
+void eeh_slot_error_detail (struct device_node *dn, int severity);
+
+/** 
+ * rtas_set_slot_reset -- unfreeze a frozen slot
+ *
+ * Clear the EEH-frozen condition on a slot.  This routine
+ * does this by asserting the PCI #RST line for 1/8th of 
+ * a second; this routine will sleep while the adapter is 
+ * being reset.
+ */
+void rtas_set_slot_reset (struct device_node *dn);
+
+/**
+ * rtas_configure_bridge -- firmware initialization of pci bridge
+ * 
+ * Ask the firmware to configure any PCI bridge devices 
+ * located behind the indicated node. Required after a 
+ * pci device reset.
+ */
+void rtas_configure_bridge(struct device_node *dn);
+
+/**
  * eeh_remove_device - undo EEH setup for the indicated pci device
  * @dev: pci device to be removed
  *
@@ -91,6 +123,13 @@ struct eeh_event {
 /** Register to find out about EEH events. */
 int eeh_register_notifier(struct notifier_block *nb);
 int eeh_unregister_notifier(struct notifier_block *nb);
+
+/** Save and restore device configuration info across
+ *  device resets
+ */
+struct eeh_cfg_tree;
+struct eeh_cfg_tree * eeh_save_bars(struct device_node *dn);
+void eeh_restore_bars(struct eeh_cfg_tree *tree);
 
 /**
  * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
===== include/asm-ppc64/prom.h 1.23 vs edited =====
--- 1.23/include/asm-ppc64/prom.h	2004-10-24 20:55:43 -05:00
+++ edited/include/asm-ppc64/prom.h	2004-11-17 16:00:37 -06:00
@@ -162,6 +162,7 @@ struct device_node {
 	int	status;			/* Current device status (non-zero is bad) */
 	int	eeh_mode;		/* See eeh.h for possible EEH_MODEs */
 	int	eeh_config_addr;
+	int	eeh_freeze_count;   /* number of times this device froze up. */
 	struct  pci_controller *phb;	/* for pci devices */
 	struct	iommu_table *iommu_table;	/* for phb's or bridges */
 
===== include/asm-ppc64/rtas.h 1.24 vs edited =====
--- 1.24/include/asm-ppc64/rtas.h	2004-09-22 00:42:53 -05:00
+++ edited/include/asm-ppc64/rtas.h	2004-11-17 16:00:37 -06:00
@@ -241,4 +241,6 @@ extern void rtas_stop_self(void);
 /* RMO buffer reserved for user-space RTAS use */
 extern unsigned long rtas_rmo_buf;
 
+extern int rtas_write_config(struct device_node *dn, int where, int size, u32 val);
+
 #endif /* _PPC64_RTAS_H */

[-- Attachment #3: eeh-recovery-bk-hotplug.patch --]
[-- Type: text/plain, Size: 7784 bytes --]

===== drivers/pci/hotplug/rpaphp.h 1.11 vs edited =====
--- 1.11/drivers/pci/hotplug/rpaphp.h	2004-10-06 11:43:44 -05:00
+++ edited/drivers/pci/hotplug/rpaphp.h	2004-11-17 16:00:37 -06:00
@@ -126,6 +126,8 @@ extern int register_pci_slot(struct slot
 extern int rpaphp_unconfig_pci_adapter(struct slot *slot);
 extern int rpaphp_get_pci_adapter_status(struct slot *slot, int is_init, u8 * value);
 extern struct hotplug_slot *rpaphp_find_hotplug_slot(struct pci_dev *dev);
+extern void init_eeh_handler (void);
+extern void exit_eeh_handler (void);
 
 /* rpaphp_core.c */
 extern int rpaphp_add_slot(struct device_node *dn);
===== drivers/pci/hotplug/rpaphp_core.c 1.18 vs edited =====
--- 1.18/drivers/pci/hotplug/rpaphp_core.c	2004-10-06 11:43:44 -05:00
+++ edited/drivers/pci/hotplug/rpaphp_core.c	2004-11-17 16:00:37 -06:00
@@ -443,12 +443,18 @@ static int __init rpaphp_init(void)
 {
 	info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
 
+	/* Get set to handle EEH events. */
+	init_eeh_handler();
+
 	/* read all the PRA info from the system */
 	return init_rpa();
 }
 
 static void __exit rpaphp_exit(void)
 {
+	/* Let EEH know we are going away. */
+	exit_eeh_handler();
+
 	cleanup_slots();
 }
 
===== drivers/pci/hotplug/rpaphp_pci.c 1.16 vs edited =====
--- 1.16/drivers/pci/hotplug/rpaphp_pci.c	2004-10-19 11:54:38 -05:00
+++ edited/drivers/pci/hotplug/rpaphp_pci.c	2004-11-17 17:23:39 -06:00
@@ -22,8 +22,12 @@
  * Send feedback to <lxie@us.ibm.com>
  *
  */
+#include <linux/delay.h>
+#include <linux/notifier.h>
 #include <linux/pci.h>
+#include <asm/eeh.h>
 #include <asm/pci-bridge.h>
+#include <asm/prom.h>
 #include <asm/rtas.h>
 #include "../pci.h"		/* for pci_add_new_bus */
 
@@ -63,6 +67,7 @@ int rpaphp_claim_resource(struct pci_dev
 		    root ? "Address space collision on" :
 		    "No parent found for",
 		    resource, dtype, pci_name(dev), res->start, res->end);
+		dump_stack();
 	}
 	return err;
 }
@@ -185,6 +190,19 @@ rpaphp_fixup_new_pci_devices(struct pci_
 
 static int rpaphp_pci_config_bridge(struct pci_dev *dev);
 
+/* Register each device on @bus, and on every bus below it, with EEH. */
+static void rpaphp_eeh_add_bus_device(struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		eeh_add_device_late(dev);
+		/* Check the bridge's own secondary bus; @bus was already used above. */
+		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE && dev->subordinate)
+			rpaphp_eeh_add_bus_device (dev->subordinate);
+	}
+}
+
 /*****************************************************************************
  rpaphp_pci_config_slot() will  configure all devices under the 
  given slot->dn and return the the first pci_dev.
@@ -212,6 +230,8 @@ rpaphp_pci_config_slot(struct device_nod
 		}
 		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) 
 			rpaphp_pci_config_bridge(dev);
+
+		rpaphp_eeh_add_bus_device(bus);
 	}
 	return dev;
 }
@@ -220,7 +240,6 @@ static int rpaphp_pci_config_bridge(stru
 {
 	u8 sec_busno;
 	struct pci_bus *child_bus;
-	struct pci_dev *child_dev;
 
 	dbg("Enter %s:  BRIDGE dev=%s\n", __FUNCTION__, pci_name(dev));
 
@@ -237,11 +256,7 @@ static int rpaphp_pci_config_bridge(stru
 	/* do pci_scan_child_bus */
 	pci_scan_child_bus(child_bus);
 
-	list_for_each_entry(child_dev, &child_bus->devices, bus_list) {
-		eeh_add_device_late(child_dev);
-	}
-
-	 /* fixup new pci devices without touching bus struct */
+	/* Fixup new pci devices without touching bus struct */
 	rpaphp_fixup_new_pci_devices(child_bus, 0);
 
 	/* Make the discovered devices available */
@@ -279,7 +294,7 @@ static void print_slot_pci_funcs(struct 
 	return;
 }
 #else
-static void print_slot_pci_funcs(struct slot *slot)
+static inline void print_slot_pci_funcs(struct slot *slot)
 {
 	return;
 }
@@ -361,7 +376,6 @@ static void rpaphp_eeh_remove_bus_device
 			if (pdev)
 				rpaphp_eeh_remove_bus_device(pdev);
 		}
-
 	}
 	return;
 }
@@ -563,10 +577,14 @@ exit:
 	return retval;
 }
 
-struct hotplug_slot *rpaphp_find_hotplug_slot(struct pci_dev *dev)
+/**
+ * rpaphp_find_slot - find and return the slot holding the device
+ * @dev: pci device for which we want the slot structure.
+ */
+static struct slot *rpaphp_find_slot(struct pci_dev *dev)
 {
-	struct list_head	*tmp, *n;
-	struct slot		*slot;
+	struct list_head *tmp, *n;
+	struct slot	*slot;
 
 	list_for_each_safe(tmp, n, &rpaphp_slot_head) {
 		struct pci_bus *bus;
@@ -585,14 +603,109 @@ struct hotplug_slot *rpaphp_find_hotplug
 		if (!bus) {
 			continue;  /* should never happen? */
 		}
+
 		for (ln = bus->devices.next; ln != &bus->devices; ln = ln->next) {
-                                struct pci_dev *pdev = pci_dev_b(ln);
-				if (pdev == dev)
-					return slot->hotplug_slot;
+			struct pci_dev *pdev = pci_dev_b(ln);
+			if (pdev == dev)
+				return slot;
 		}
 	}
 
 	return NULL;
 }
 
-EXPORT_SYMBOL_GPL(rpaphp_find_hotplug_slot);
+/* ------------------------------------------------------- */
+/**
+ * handle_eeh_events -- reset a PCI device after hard lockup.
+ * @self: notifier block the handler was registered with (unused)
+ * @reason: notifier reason code (unused)
+ * @ev: the struct eeh_event naming the frozen device and its node
+ *
+ * pSeries systems will isolate a PCI slot if the PCI-Host bridge detects
+ * address or data parity errors, or DMAs occurring to wild addresses
+ * (which usually happen due to bugs in device drivers or in PCI adapter
+ * firmware).  Slot isolations also occur if #SERR, #PERR or other misc
+ * PCI-related errors are detected.
+ *
+ * Recovery process consists of unplugging the device driver (which
+ * generates hotplug events to userspace), then issuing a PCI #RST to the
+ * device, then reconfiguring the PCI config space for all bridges &
+ * devices under this slot, and then finally restarting the device drivers
+ * (which cause a second set of hotplug events to go out to userspace).
+ * Returns 1 if no slot holds the device, 0 otherwise.
+ */
+int handle_eeh_events (struct notifier_block *self,
+                       unsigned long reason, void *ev)
+{
+	struct eeh_event *event = ev;
+	struct slot *frozen_slot;
+	struct eeh_cfg_tree *saved_bars = NULL;
+	int disabled;
+
+	frozen_slot = rpaphp_find_slot(event->dev);
+	if (!frozen_slot) {
+		printk (KERN_ERR "EEH: Cannot find PCI slot for EEH error! dev=%p dn=%p\n",
+			event->dev, event->dn);
+		return 1;
+	}
+
+	of_node_get(event->dn);
+	pci_dev_get(event->dev);
+	event->dn->eeh_freeze_count ++;
+	disabled = event->dn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES;
+
+	/* Only eeh_restore_bars() frees the saved tree, so build it just for
+	 * devices that will be restored; otherwise it would leak.  The save stays
+	 * ahead of the unconfigure, presumably as the pci devices are gone after. */
+	if (!disabled)
+		saved_bars = eeh_save_bars(frozen_slot->dn);
+
+	rpaphp_unconfig_pci_adapter (frozen_slot);
+
+	if (disabled) {
+		/* About 90% of all real-life EEH failures in the field
+		 * are due to poorly seated PCI cards. Only 10% or so are
+		 * due to actual, failed cards */
+		printk (KERN_ERR
+		   "EEH: device %s:%s has failed %d times \n"
+			"and has been permanently disabled.  Please try reseating\n"
+		   "this device or replacing it.\n",
+			pci_name (event->dev), pci_pretty_name (event->dev), EEH_MAX_ALLOWED_FREEZES);
+		goto rdone;
+	}
+
+	/* Reset the pci controller. (Asserts RST#; resets config space).
+	 * Reconfigure bridges and devices */
+	rtas_set_slot_reset (event->dn);
+	rtas_configure_bridge(event->dn);
+	if (saved_bars)
+		eeh_restore_bars(saved_bars);
+
+	/* Give the system 5 seconds to finish running the user-space
+	 * hotplug scripts, e.g. ifdown for ethernet.  Yes, this is a hack,
+	 * but if we don't do this, weird things happen. */
+	ssleep (5);
+	rpaphp_enable_pci_slot (frozen_slot);
+
+	/* The new device node is different than the old one;
+	 * copy over the freeze count, so that we don't lose track of it. */
+	frozen_slot->dn->eeh_freeze_count = event->dn->eeh_freeze_count;
+rdone:
+	of_node_put(event->dn);
+	pci_dev_put(event->dev);
+	return 0;
+}
+
+/* EEH event notifier; the init/exit hooks below register and unregister it. */
+static struct notifier_block eeh_block = { .notifier_call = handle_eeh_events };
+
+void __init init_eeh_handler (void)
+{
+	eeh_register_notifier (&eeh_block);
+}
+
+void __exit exit_eeh_handler (void)
+{
+	eeh_unregister_notifier (&eeh_block);
+}
+

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2005-01-21  2:45 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-01-06 19:24 [PATCH] PPC64: EEH Recovery Linas Vepstas
2005-01-17 20:14 ` Linas Vepstas
2005-01-19  6:06   ` Paul Mackerras
2005-01-19 16:00     ` Nathan Fontenot
2005-01-20 22:39     ` Linas Vepstas
2005-01-21  2:50       ` Paul Mackerras
2005-01-20 22:48     ` Linas Vepstas
  -- strict thread matches above, loose matches on Subject: below --
2004-11-17 23:52 Linas Vepstas

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).