mirror of
https://github.com/adulau/aha.git
synced 2024-12-27 19:26:25 +00:00
[WATCHDOG] hpwdt: Add NMI sourcing
Add NMI sourcing functionality (Can only be active if nmi_watchdog is inactive). Signed-off-by: Thomas Mingarelli <thomas.mingarelli@hp.com> Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
This commit is contained in:
parent
55e8ddecec
commit
47bece87b1
2 changed files with 128 additions and 15 deletions
84
Documentation/watchdog/hpwdt.txt
Normal file
84
Documentation/watchdog/hpwdt.txt
Normal file
|
@ -0,0 +1,84 @@
|
|||
Last reviewed: 06/02/2009
|
||||
|
||||
HP iLO2 NMI Watchdog Driver
|
||||
NMI sourcing for iLO2 based ProLiant Servers
|
||||
Documentation and Driver by
|
||||
Thomas Mingarelli <thomas.mingarelli@hp.com>
|
||||
|
||||
The HP iLO2 NMI Watchdog driver is a kernel module that provides basic
|
||||
watchdog functionality and the added benefit of NMI sourcing. Both the
|
||||
watchdog functionality and the NMI sourcing capability need to be enabled
|
||||
by the user. Remember that the two modes are not dependent on one another.
|
||||
A user can have the NMI sourcing without the watchdog timer and vice-versa.
|
||||
|
||||
Watchdog functionality is enabled like any other common watchdog driver. That
|
||||
is, an application needs to be started that kicks off the watchdog timer. A
|
||||
basic application exists in the Documentation/watchdog/src directory called
|
||||
watchdog-test.c. Simply compile the C file and kick it off. If the system
|
||||
gets into a bad state and hangs, the HP ProLiant iLO 2 timer register will
|
||||
not be updated in a timely fashion and a hardware system reset (also known as
|
||||
an Automatic Server Recovery (ASR)) event will occur.
|
||||
|
||||
The hpwdt driver also has three (3) module parameters. They are the following:
|
||||
|
||||
soft_margin - allows the user to set the watchdog timer value
|
||||
allow_kdump - allows the user to save off a kernel dump image after an NMI
|
||||
nowayout - basic watchdog parameter that does not allow the timer to
|
||||
be restarted or an impending ASR to be escaped.
|
||||
|
||||
NOTE: More information about watchdog drivers in general, including the ioctl
|
||||
interface to /dev/watchdog can be found in
|
||||
Documentation/watchdog/watchdog-api.txt and Documentation/IPMI.txt.
|
||||
|
||||
The NMI sourcing capability is disabled when the driver discovers that the
|
||||
nmi_watchdog is turned on (nmi_watchdog = 1). This is due to the inability to
|
||||
distinguish between "NMI Watchdog Ticks" and "HW generated NMI events" in the
|
||||
Linux kernel. What this means is that the hpwdt nmi handler code is called
|
||||
each time the NMI signal fires off. This could amount to several thousands of
|
||||
NMIs in a matter of seconds. If a user sees the Linux kernel's "dazed and
|
||||
confused" message in the logs or if the system gets into a hung state, then
|
||||
the user should reboot with nmi_watchdog=0.
|
||||
|
||||
1. If the kernel has not been booted with nmi_watchdog turned off then
|
||||
edit /boot/grub/menu.lst and place the nmi_watchdog=0 at the end of the
|
||||
currently booting kernel line.
|
||||
2. Reboot the server.
|
||||
|
||||
Now, the hpwdt can successfully receive and source the NMI and provide a log
|
||||
message that details the reason for the NMI (as determined by the HP BIOS).
|
||||
|
||||
Below is a list of NMIs the HP BIOS understands along with the associated
|
||||
code (reason):
|
||||
|
||||
No source found 00h
|
||||
|
||||
Uncorrectable Memory Error 01h
|
||||
|
||||
ASR NMI 1Bh
|
||||
|
||||
PCI Parity Error 20h
|
||||
|
||||
NMI Button Press 27h
|
||||
|
||||
SB_BUS_NMI 28h
|
||||
|
||||
ILO Doorbell NMI 29h
|
||||
|
||||
ILO IOP NMI 2Ah
|
||||
|
||||
ILO Watchdog NMI 2Bh
|
||||
|
||||
Proc Throt NMI 2Ch
|
||||
|
||||
Front Side Bus NMI 2Dh
|
||||
|
||||
PCI Express Error 2Fh
|
||||
|
||||
DMA controller NMI 30h
|
||||
|
||||
Hypertransport/CSI Error 31h
|
||||
|
||||
|
||||
|
||||
-- Tom Mingarelli
|
||||
(thomas.mingarelli@hp.com)
|
|
@ -19,6 +19,7 @@
|
|||
#include <linux/interrupt.h>
|
||||
#include <linux/io.h>
|
||||
#include <linux/irq.h>
|
||||
#include <linux/nmi.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/miscdevice.h>
|
||||
#include <linux/mm.h>
|
||||
|
@ -47,7 +48,7 @@
|
|||
#define PCI_BIOS32_PARAGRAPH_LEN 16
|
||||
#define PCI_ROM_BASE1 0x000F0000
|
||||
#define ROM_SIZE 0x10000
|
||||
#define HPWDT_VERSION "1.01"
|
||||
#define HPWDT_VERSION "1.1.1"
|
||||
|
||||
struct bios32_service_dir {
|
||||
u32 signature;
|
||||
|
@ -119,6 +120,7 @@ static int nowayout = WATCHDOG_NOWAYOUT;
|
|||
static char expect_release;
|
||||
static unsigned long hpwdt_is_open;
|
||||
static unsigned int allow_kdump;
|
||||
static int hpwdt_nmi_sourcing;
|
||||
|
||||
static void __iomem *pci_mem_addr; /* the PCI-memory address */
|
||||
static unsigned long __iomem *hpwdt_timer_reg;
|
||||
|
@ -468,21 +470,22 @@ static int hpwdt_pretimeout(struct notifier_block *nb, unsigned long ulReason,
|
|||
if (ulReason != DIE_NMI && ulReason != DIE_NMI_IPI)
|
||||
return NOTIFY_OK;
|
||||
|
||||
spin_lock_irqsave(&rom_lock, rom_pl);
|
||||
if (!die_nmi_called)
|
||||
asminline_call(&cmn_regs, cru_rom_addr);
|
||||
die_nmi_called = 1;
|
||||
spin_unlock_irqrestore(&rom_lock, rom_pl);
|
||||
if (cmn_regs.u1.ral == 0) {
|
||||
printk(KERN_WARNING "hpwdt: An NMI occurred, "
|
||||
"but unable to determine source.\n");
|
||||
} else {
|
||||
if (allow_kdump)
|
||||
hpwdt_stop();
|
||||
panic("An NMI occurred, please see the Integrated "
|
||||
"Management Log for details.\n");
|
||||
if (hpwdt_nmi_sourcing) {
|
||||
spin_lock_irqsave(&rom_lock, rom_pl);
|
||||
if (!die_nmi_called)
|
||||
asminline_call(&cmn_regs, cru_rom_addr);
|
||||
die_nmi_called = 1;
|
||||
spin_unlock_irqrestore(&rom_lock, rom_pl);
|
||||
if (cmn_regs.u1.ral == 0) {
|
||||
printk(KERN_WARNING "hpwdt: An NMI occurred, "
|
||||
"but unable to determine source.\n");
|
||||
} else {
|
||||
if (allow_kdump)
|
||||
hpwdt_stop();
|
||||
panic("An NMI occurred, please see the Integrated "
|
||||
"Management Log for details.\n");
|
||||
}
|
||||
}
|
||||
|
||||
return NOTIFY_OK;
|
||||
}
|
||||
|
||||
|
@ -627,11 +630,37 @@ static struct notifier_block die_notifier = {
|
|||
* Init & Exit
|
||||
*/
|
||||
|
||||
#ifdef ARCH_HAS_NMI_WATCHDOG
|
||||
/*
 * Decide whether the driver may source NMIs.
 *
 * @dev: PCI device being probed; used only as the target of the warning.
 *
 * Sets hpwdt_nmi_sourcing to 1 when nmi_watchdog_active() reports that
 * the kernel's nmi_watchdog is not running. Otherwise the flag is left
 * untouched and a warning tells the user how to enable the feature.
 */
static void __devinit hpwdt_check_nmi_sourcing(struct pci_dev *dev)
{
	/* nmi_watchdog is running: NMI sourcing stays off. */
	if (nmi_watchdog_active()) {
		dev_warn(&dev->dev,
			 "NMI sourcing is disabled. To enable this functionality you must reboot with nmi_watchdog=0.\n");
		return;
	}

	/* nmi_watchdog is turned off, so our NMI sourcing can be enabled. */
	hpwdt_nmi_sourcing = 1;
}
|
||||
#else
|
||||
/*
 * Variant compiled when ARCH_HAS_NMI_WATCHDOG is not defined.
 *
 * @dev: PCI device being probed; used only as the target of the warning.
 *
 * Only emits a warning; hpwdt_nmi_sourcing is not modified here.
 */
static void __devinit hpwdt_check_nmi_sourcing(struct pci_dev *dev)
{
	dev_warn(&dev->dev,
		 "NMI sourcing is disabled. Your kernel does not support a NMI Watchdog.\n");
}
|
||||
#endif
|
||||
|
||||
static int __devinit hpwdt_init_one(struct pci_dev *dev,
|
||||
const struct pci_device_id *ent)
|
||||
{
|
||||
int retval;
|
||||
|
||||
/*
|
||||
* Check if we can do NMI sourcing or not
|
||||
*/
|
||||
hpwdt_check_nmi_sourcing(dev);
|
||||
|
||||
/*
|
||||
* First let's find out if we are on an iLO2 server. We will
|
||||
* not run on a legacy ASM box.
|
||||
|
|
Loading…
Reference in a new issue