aboutsummaryrefslogtreecommitdiff
path: root/arch/powerpc/platforms/pseries/eeh_event.c
diff options
context:
space:
mode:
authorLinas Vepstas <linas@austin.ibm.com>2006-04-18 21:05:21 -0700
committerPaul Mackerras <paulus@samba.org>2006-04-22 18:46:13 +1000
commitac325acd50013fa8f4953208cbb96504dec9b12a (patch)
tree6c08470d68be38504c6aadae168b873efb39e8db /arch/powerpc/platforms/pseries/eeh_event.c
parent4bd174fe1cca738f53cf8bb9ac3cb327b1f516ed (diff)
[PATCH] powerpc/pseries: clear PCI failure counter if no new failures
The current PCI error recovery system keeps track of the number of PCI card resets, and refuses to bring a card back up if this number is too large. The goal of doing this was to avoid an infinite loop of resets if a card is obviously dead. However, if the failures are rare, but the machine has a high uptime, this mechanism might still be triggered; this is too harsh. This patch will avoids this problem by decrementing the fail count after an hour. Thus, as long as a pci card BSOD's less than 6 times an hour, it will continue to be reset indefinitely. If it's failure rate is greater than that, it will be taken off-line permanently. This patch is larger than it might otherwise be because it changes indentation by removing a pointless while-loop. The while loop is not needed, as the handler is invoked once fo each event (by schedule_work()); the loop is leftover cruft from an earlier implementation. Signed-off-by: Linas Vepstas <linas@austin.ibm.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Paul Mackerras <paulus@samba.org>
Diffstat (limited to 'arch/powerpc/platforms/pseries/eeh_event.c')
-rw-r--r--arch/powerpc/platforms/pseries/eeh_event.c50
1 files changed, 28 insertions, 22 deletions
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c
index a1bda6f96fd..a0b39640a00 100644
--- a/arch/powerpc/platforms/pseries/eeh_event.c
+++ b/arch/powerpc/platforms/pseries/eeh_event.c
@@ -18,6 +18,7 @@
* Copyright (c) 2005 Linas Vepstas <linas@linas.org>
*/
+#include <linux/delay.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/pci.h>
@@ -56,38 +57,43 @@ static int eeh_event_handler(void * dummy)
{
unsigned long flags;
struct eeh_event *event;
+ struct pci_dn *pdn;
daemonize ("eehd");
+ set_current_state(TASK_INTERRUPTIBLE);
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
+ spin_lock_irqsave(&eeh_eventlist_lock, flags);
+ event = NULL;
- spin_lock_irqsave(&eeh_eventlist_lock, flags);
- event = NULL;
+ /* Unqueue the event, get ready to process. */
+ if (!list_empty(&eeh_eventlist)) {
+ event = list_entry(eeh_eventlist.next, struct eeh_event, list);
+ list_del(&event->list);
+ }
+ spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
- /* Unqueue the event, get ready to process. */
- if (!list_empty(&eeh_eventlist)) {
- event = list_entry(eeh_eventlist.next, struct eeh_event, list);
- list_del(&event->list);
- }
- spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+ if (event == NULL)
+ return 0;
- if (event == NULL)
- break;
+ /* Serialize processing of EEH events */
+ mutex_lock(&eeh_event_mutex);
+ eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);
- /* Serialize processing of EEH events */
- mutex_lock(&eeh_event_mutex);
- eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);
+ printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
+ pci_name(event->dev));
- printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
- pci_name(event->dev));
+ pdn = handle_eeh_events(event);
- handle_eeh_events(event);
+ eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);
+ pci_dev_put(event->dev);
+ kfree(event);
+ mutex_unlock(&eeh_event_mutex);
- eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);
- pci_dev_put(event->dev);
- kfree(event);
- mutex_unlock(&eeh_event_mutex);
+ /* If there are no new errors after an hour, clear the counter. */
+ if (pdn && pdn->eeh_freeze_count>0) {
+ msleep_interruptible (3600*1000);
+ if (pdn->eeh_freeze_count>0)
+ pdn->eeh_freeze_count--;
}
return 0;