x86, mce: Add boot options for corrected errors

This patch introduces three boot options (no_cmci, dont_log_ce
and ignore_ce) to control handling for corrected errors.

The "mce=no_cmci" boot option disables the CMCI feature.

Since CMCI is a new feature, having a boot control to disable
it will help if the hardware is misbehaving.

The "mce=dont_log_ce" boot option disables logging for corrected
errors. All reported corrected errors will be cleared silently.
This option will be useful if you never care about corrected
errors.

The "mce=ignore_ce" boot option disables features for corrected
errors, i.e. the polling timer and CMCI.  Corrected events are
not cleared by the OS; they are left in the bank MSRs.

Disabling these features is usually not recommended. However, it
helps when there is a conflict with the BIOS or a hardware
monitoring application that clears corrected events in the
banks instead of the OS.

[ And trivial cleanup (space -> tab) for doc is included. ]

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
LKML-Reference: <4A30ACDF.5030408@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
This commit is contained in:
Hidetoshi Seto 2009-06-11 16:06:07 +09:00 committed by Ingo Molnar
parent 77e26cca20
commit 62fdac5913
4 changed files with 52 additions and 8 deletions

View file

@ -7,12 +7,36 @@ Machine check
Please see Documentation/x86/x86_64/machinecheck for sysfs runtime tunables. Please see Documentation/x86/x86_64/machinecheck for sysfs runtime tunables.
mce=off disable machine check mce=off
mce=bootlog Enable logging of machine checks left over from booting. Disable machine check
Disabled by default on AMD because some BIOS leave bogus ones. mce=no_cmci
If your BIOS doesn't do that it's a good idea to enable though Disable CMCI(Corrected Machine Check Interrupt) that
to make sure you log even machine check events that result Intel processor supports. Usually this disablement is
in a reboot. On Intel systems it is enabled by default. not recommended, but it might be handy if your hardware
is misbehaving.
Note that you'll get more problems without CMCI than with
due to the shared banks, i.e. you might get duplicated
error logs.
mce=dont_log_ce
Don't make logs for corrected errors. All events reported
as corrected are silently cleared by OS.
This option will be useful if you have no interest in any
corrected errors.
mce=ignore_ce
Disable features for corrected errors, e.g. polling timer
and CMCI. Events reported as corrected are not cleared
by the OS and remain in their error banks.
Usually this disablement is not recommended, however if
there is an agent checking/clearing corrected errors
(e.g. BIOS or hardware monitoring applications), conflicting
with OS's error handling, and you cannot deactivate the agent,
then this option will be a help.
mce=bootlog
Enable logging of machine checks left over from booting.
Disabled by default on AMD because some BIOS leave bogus ones.
If your BIOS doesn't do that it's a good idea to enable though
to make sure you log even machine check events that result
in a reboot. On Intel systems it is enabled by default.
mce=nobootlog mce=nobootlog
Disable boot machine check logging. Disable boot machine check logging.
mce=tolerancelevel[,monarchtimeout] (number,number) mce=tolerancelevel[,monarchtimeout] (number,number)

View file

@ -119,6 +119,8 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) #define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
#ifdef CONFIG_X86_MCE_INTEL #ifdef CONFIG_X86_MCE_INTEL
extern int mce_cmci_disabled;
extern int mce_ignore_ce;
void mce_intel_feature_init(struct cpuinfo_x86 *c); void mce_intel_feature_init(struct cpuinfo_x86 *c);
void cmci_clear(void); void cmci_clear(void);
void cmci_reenable(void); void cmci_reenable(void);

View file

@ -84,6 +84,9 @@ static int rip_msr;
static int mce_bootlog = -1; static int mce_bootlog = -1;
static int monarch_timeout = -1; static int monarch_timeout = -1;
static int mce_panic_timeout; static int mce_panic_timeout;
static int mce_dont_log_ce;
int mce_cmci_disabled;
int mce_ignore_ce;
int mce_ser; int mce_ser;
static char trigger[128]; static char trigger[128];
@ -526,7 +529,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
* Don't get the IP here because it's unlikely to * Don't get the IP here because it's unlikely to
* have anything to do with the actual error location. * have anything to do with the actual error location.
*/ */
if (!(flags & MCP_DONTLOG)) { if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
mce_log(&m); mce_log(&m);
add_taint(TAINT_MACHINE_CHECK); add_taint(TAINT_MACHINE_CHECK);
} }
@ -1307,6 +1310,9 @@ static void mce_init_timer(void)
struct timer_list *t = &__get_cpu_var(mce_timer); struct timer_list *t = &__get_cpu_var(mce_timer);
int *n = &__get_cpu_var(next_interval); int *n = &__get_cpu_var(next_interval);
if (mce_ignore_ce)
return;
*n = check_interval * HZ; *n = check_interval * HZ;
if (!*n) if (!*n)
return; return;
@ -1517,7 +1523,10 @@ static struct miscdevice mce_log_device = {
}; };
/* /*
* mce=off disables machine check * mce=off Disables machine check
* mce=no_cmci Disables CMCI
* mce=dont_log_ce Clears corrected events silently, no log created for CEs.
* mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
* mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
* monarchtimeout is how long to wait for other CPUs on machine * monarchtimeout is how long to wait for other CPUs on machine
* check, or 0 to not wait * check, or 0 to not wait
@ -1532,6 +1541,12 @@ static int __init mcheck_enable(char *str)
str++; str++;
if (!strcmp(str, "off")) if (!strcmp(str, "off"))
mce_disabled = 1; mce_disabled = 1;
else if (!strcmp(str, "no_cmci"))
mce_cmci_disabled = 1;
else if (!strcmp(str, "dont_log_ce"))
mce_dont_log_ce = 1;
else if (!strcmp(str, "ignore_ce"))
mce_ignore_ce = 1;
else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
mce_bootlog = (str[0] == 'b'); mce_bootlog = (str[0] == 'b');
else if (isdigit(str[0])) { else if (isdigit(str[0])) {

View file

@ -57,6 +57,9 @@ static int cmci_supported(int *banks)
{ {
u64 cap; u64 cap;
if (mce_cmci_disabled || mce_ignore_ce)
return 0;
/* /*
* Vendor check is not strictly needed, but the initial * Vendor check is not strictly needed, but the initial
* initialization is vendor keyed and this * initialization is vendor keyed and this