From 8edc5cc5ec880c96de8e6686fb0d7a5231e91c05 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Mon, 12 May 2008 15:43:34 +0200 Subject: x86: remove 6 bank limitation in 64 bit MCE reporting code Eliminate the 6 bank restriction in 64 bit mce reporting code. This restriction is artificial (due to static creation of sysfs files) and 32 bit code does not have any such restriction. This change helps in reporting the details of machine checks on a machine check exception with errors in bank 6 and above on CPUs that support those banks. Without the patch, machine check errors in those banks are not reported. We still have 128 (MCE_EXTENDED_BANK) bank restriction instead of max 256 supported in hardware. That is not changed in the patch below as it will have some user level mcelog utility dependency, with bank 128 being used for thermal reporting currently. The patch below does not create sysfs control (bankNctl) for banks higher than 6 as well. That needs some pre-cleanup in /sysfs mce layout, removal of per cpu /sysfs entries for bankctl as they are really global system level control today. That change will follow. This basic change is critical to report the detailed errors on banks higher than 6. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_64.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index e07e8c068ae..f1f3f5e163b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -31,7 +31,7 @@ #include #define MISC_MCELOG_MINOR 227 -#define NR_BANKS 6 +#define NR_SYSFS_BANKS 6 atomic_t mce_entry; @@ -46,7 +46,7 @@ static int mce_dont_init; */ static int tolerant = 1; static int banks; -static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; +static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; @@ -209,7 +209,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) barrier(); for (i = 0; i < banks; i++) { - if (!bank[i]) + if (i < NR_SYSFS_BANKS && !bank[i]) continue; m.misc = 0; @@ -444,9 +444,10 @@ static void mce_init(void *dummy) rdmsrl(MSR_IA32_MCG_CAP, cap); banks = cap & 0xff; - if (banks > NR_BANKS) { - printk(KERN_INFO "MCE: warning: using only %d banks\n", banks); - banks = NR_BANKS; + if (banks > MCE_EXTENDED_BANK) { + printk(KERN_INFO "MCE: warning: using only %d banks\n", + MCE_EXTENDED_BANK); + banks = MCE_EXTENDED_BANK; } /* Use accurate RIP reporting if available. */ if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) @@ -462,7 +463,7 @@ static void mce_init(void *dummy) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } } @@ -766,7 +767,10 @@ DEFINE_PER_CPU(struct sys_device, device_mce); } \ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); -/* TBD should generate these dynamically based on number of available banks */ +/* + * TBD should generate these dynamically based on number of available banks. + * Have only 6 contol banks in /sysfs until then. + */ ACCESSOR(bank0ctl,bank[0],mce_restart()) ACCESSOR(bank1ctl,bank[1],mce_restart()) ACCESSOR(bank2ctl,bank[2],mce_restart()) -- cgit v1.2.3 From b4b3bd96f26586e53ab5482f1869221dd1b5ac36 Mon Sep 17 00:00:00 2001 From: Daniel Rahn Date: Fri, 6 Jun 2008 09:42:36 +0200 Subject: x86: correctly report NR_BANKS in mce_64.c attached is a no-brainer that makes kernel correctly report NR_BANKS for MCE. We are right now limited to NR_BANKS==6, but the error message will use the available number of banks instead of the defined maximum. For a Nehalem based system it will print: "MCE: warning: using only 9 banks" while the correct message would be "MCE: warning: using only 6 banks" Signed-off-by: Pavel Machek Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu/mcheck') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index f1f3f5e163b..8c8299ce7ad 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -445,9 +445,9 @@ static void mce_init(void *dummy) rdmsrl(MSR_IA32_MCG_CAP, cap); banks = cap & 0xff; if (banks > MCE_EXTENDED_BANK) { + banks = MCE_EXTENDED_BANK; printk(KERN_INFO "MCE: warning: using only %d banks\n", MCE_EXTENDED_BANK); - banks = MCE_EXTENDED_BANK; } /* Use accurate RIP reporting if available. */ if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) -- cgit v1.2.3 From fe94ae995d33a4df35b6b9cd0504e87d7e37c8de Mon Sep 17 00:00:00 2001 From: Paolo Ciarrocchi Date: Sat, 14 Jun 2008 14:06:19 +0200 Subject: x86: coding style fixes to arch/x86/kernel/cpu/mcheck/p4.c Before: total: 16 errors, 34 warnings, 257 lines checked After: total: 0 errors, 2 warnings, 257 lines checked No changes in the compiled code: paolo@paolo-desktop:~/linux.trees.git$ size /tmp/p4* text data bss dec hex filename 2644 4 4 2652 a5c /tmp/p4.o.after 2644 4 4 2652 a5c /tmp/p4.o.before paolo@paolo-desktop:~/linux.trees.git$ md5sum /tmp/p4* 13f1b21c4246b31a28aaff38184586ca /tmp/p4.o.after 13f1b21c4246b31a28aaff38184586ca /tmp/p4.o.before Signed-off-by: Paolo Ciarrocchi Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/p4.c | 90 ++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 45 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck') diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index cb03345554a..eef001ad3bd 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include @@ -32,12 +32,12 @@ struct intel_mce_extended_msrs { /* u32 *reserved[]; */ }; -static int mce_num_extended_msrs = 0; +static int mce_num_extended_msrs; #ifdef CONFIG_X86_MCE_P4THERMAL static void unexpected_thermal_interrupt(struct pt_regs *regs) -{ +{ printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", smp_processor_id()); add_taint(TAINT_MACHINE_CHECK); @@ -83,7 +83,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) * be some SMM goo which handles it, so we can't even put a handler * since it might be delivered via SMI already -zwanem. */ - rdmsr (MSR_IA32_MISC_ENABLE, l, h); + rdmsr(MSR_IA32_MISC_ENABLE, l, h); h = apic_read(APIC_LVTTHMR); if ((l & (1<<3)) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", @@ -91,7 +91,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) return; /* -EBUSY */ } - /* check whether a vector already exists, temporarily masked? */ + /* check whether a vector already exists, temporarily masked? */ if (h & APIC_VECTOR_MASK) { printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already " "installed\n", @@ -104,18 +104,18 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ apic_write_around(APIC_LVTTHMR, h); - rdmsr (MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); + rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); + wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); /* ok we're good to go... */ vendor_thermal_interrupt = intel_thermal_interrupt; - - rdmsr (MSR_IA32_MISC_ENABLE, l, h); - wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h); - l = apic_read (APIC_LVTTHMR); - apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); + + l = apic_read(APIC_LVTTHMR); + apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); /* enable thermal throttle processing */ atomic_set(&therm_throt_en, 1); @@ -129,28 +129,28 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) { u32 h; - rdmsr (MSR_IA32_MCG_EAX, r->eax, h); - rdmsr (MSR_IA32_MCG_EBX, r->ebx, h); - rdmsr (MSR_IA32_MCG_ECX, r->ecx, h); - rdmsr (MSR_IA32_MCG_EDX, r->edx, h); - rdmsr (MSR_IA32_MCG_ESI, r->esi, h); - rdmsr (MSR_IA32_MCG_EDI, r->edi, h); - rdmsr (MSR_IA32_MCG_EBP, r->ebp, h); - rdmsr (MSR_IA32_MCG_ESP, r->esp, h); - rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h); - rdmsr (MSR_IA32_MCG_EIP, r->eip, h); + rdmsr(MSR_IA32_MCG_EAX, r->eax, h); + rdmsr(MSR_IA32_MCG_EBX, r->ebx, h); + rdmsr(MSR_IA32_MCG_ECX, r->ecx, h); + rdmsr(MSR_IA32_MCG_EDX, r->edx, h); + rdmsr(MSR_IA32_MCG_ESI, r->esi, h); + rdmsr(MSR_IA32_MCG_EDI, r->edi, h); + rdmsr(MSR_IA32_MCG_EBP, r->ebp, h); + rdmsr(MSR_IA32_MCG_ESP, r->esp, h); + rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h); + rdmsr(MSR_IA32_MCG_EIP, r->eip, h); } -static void intel_machine_check(struct pt_regs * regs, long error_code) +static void intel_machine_check(struct pt_regs *regs, long error_code) { - int recover=1; + int recover = 1; u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; int i; - rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); + rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); if (mcgstl & (1<<0)) /* Recoverable ? */ - recover=0; + recover = 0; printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl); @@ -191,20 +191,20 @@ static void intel_machine_check(struct pt_regs * regs, long error_code) } if (recover & 2) - panic ("CPU context corrupt"); + panic("CPU context corrupt"); if (recover & 1) - panic ("Unable to continue"); + panic("Unable to continue"); printk(KERN_EMERG "Attempting to continue.\n"); - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not + /* + * Do not clear the MSR_IA32_MCi_STATUS if the error is not * recoverable/continuable.This will allow BIOS to look at the MSRs * for errors if the OS could not log the error. */ - for (i=0; i> 16) & 0xff; - printk (KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" + printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" " available\n", smp_processor_id(), mce_num_extended_msrs); -- cgit v1.2.3 From 5175676a2d012ca5e5ad5eaedbfc1da5d5660d2a Mon Sep 17 00:00:00 2001 From: Paolo Ciarrocchi Date: Sat, 14 Jun 2008 14:37:14 +0200 Subject: x86: coding style fixes to arch/x86/kernel/cpu/mcheck/k7.c Before: total: 6 errors, 13 warnings, 105 lines checked After: total: 0 errors, 0 warnings, 105 lines checked paolo@paolo-desktop:~/linux.trees.git$ size /tmp/k7* text data bss dec hex filename 1135 0 0 1135 46f /tmp/k7.o.after 1135 0 0 1135 46f /tmp/k7.o.before paolo@paolo-desktop:~/linux.trees.git$ md5sum /tmp/k7* 87b14954045aa37dbaee6fb7e022ed9a /tmp/k7.o.after 87b14954045aa37dbaee6fb7e022ed9a /tmp/k7.o.before Signed-off-by: Paolo Ciarrocchi Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/k7.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck') diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index e633c9c2b76..f390c9f6635 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c @@ -9,23 +9,23 @@ #include #include -#include +#include #include #include #include "mce.h" /* Machine Check Handler For AMD Athlon/Duron */ -static void k7_machine_check(struct pt_regs * regs, long error_code) +static void k7_machine_check(struct pt_regs *regs, long error_code) { - int recover=1; + int recover = 1; u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; int i; - rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); + rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); if (mcgstl & (1<<0)) /* Recoverable ? */ - recover=0; + recover = 0; printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl); @@ -60,12 +60,12 @@ static void k7_machine_check(struct pt_regs * regs, long error_code) } if (recover&2) - panic ("CPU context corrupt"); + panic("CPU context corrupt"); if (recover&1) - panic ("Unable to continue"); - printk (KERN_EMERG "Attempting to continue.\n"); + panic("Unable to continue"); + printk(KERN_EMERG "Attempting to continue.\n"); mcgstl &= ~(1<<2); - wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth); + wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); } @@ -81,25 +81,25 @@ void amd_mcheck_init(struct cpuinfo_x86 *c) machine_check_vector = k7_machine_check; wmb(); - printk (KERN_INFO "Intel machine check architecture supported.\n"); - rdmsr (MSR_IA32_MCG_CAP, l, h); + printk(KERN_INFO "Intel machine check architecture supported.\n"); + rdmsr(MSR_IA32_MCG_CAP, l, h); if (l & (1<<8)) /* Control register present ? */ - wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); nr_mce_banks = l & 0xff; /* Clear status for MC index 0 separately, we don't touch CTL, * as some K7 Athlons cause spurious MCEs when its enabled. */ if (boot_cpu_data.x86 == 6) { - wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0); + wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); i = 1; } else i = 0; - for (; i Date: Tue, 24 Jun 2008 17:12:56 -0700 Subject: x86, mce_64.c: mce_cpu_quirks being ignored Quirks getting ignored was a bug. Below patch fixes the bug, until we have the dynamic banks support. Sysfs choice configuration should not have any issues with the earlier patch as we look for NR_SYSFS_BANKS in do_machine_check(). Signed-off-by: Venkatesh Pallipadi Cc: Andi Kleen Cc: Max Asbock Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_64.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu/mcheck') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 8c8299ce7ad..501ca1cea27 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -463,7 +463,11 @@ static void mce_init(void *dummy) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); + if (i < NR_SYSFS_BANKS) + wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + else + wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } } -- cgit v1.2.3