From f607e3a03c90e8c050cb0c12ec9967c2925cc812 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 19 Aug 2008 13:34:59 -0700 Subject: Revert "[CPUFREQ][2/2] preregister support for powernow-k8" This reverts commit 34ae7f35a21694aa5cb8829dc5142c39d73d6ba0, which has been reported to cause a number of problems. During suspend and resume, it apparently causes a crash in a CPU hotplug notifier to happen, although the exact details are sketchy because of the inability to get good traces during the suspend sequence. See buzilla entries http://bugzilla.kernel.org/show_bug.cgi?id=11296 http://bugzilla.kernel.org/show_bug.cgi?id=11339 for more examples and details. [ Mark: "Revert the patch for now. I'm still looking into getting a reliable reproduction and I do not have a fix at this time." ] Requested-by: Rafael J. Wysocki Acked-by: Mark Langsdorf Acked-by: Dave Jones Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 109 ++++++++++-------------------- arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 3 +- 2 files changed, 37 insertions(+), 75 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 4e7271999a7..84bb395038d 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -737,63 +737,44 @@ static int find_psb_table(struct powernow_k8_data *data) #ifdef CONFIG_X86_POWERNOW_K8_ACPI static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { - if (!data->acpi_data->state_count || (cpu_family == CPU_HW_PSTATE)) + if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) return; - data->irt = (data->acpi_data->states[index].control >> IRT_SHIFT) & IRT_MASK; - data->rvo = (data->acpi_data->states[index].control >> RVO_SHIFT) & RVO_MASK; - data->exttype = (data->acpi_data->states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; - data->plllock = (data->acpi_data->states[index].control >> PLL_L_SHIFT) & PLL_L_MASK; - data->vidmvs = 1 << ((data->acpi_data->states[index].control >> MVS_SHIFT) & MVS_MASK); - data->vstable = (data->acpi_data->states[index].control >> VST_SHIFT) & VST_MASK; -} - - -static struct acpi_processor_performance *acpi_perf_data; -static int preregister_valid; - -static int powernow_k8_cpu_preinit_acpi(void) -{ - acpi_perf_data = alloc_percpu(struct acpi_processor_performance); - if (!acpi_perf_data) - return -ENODEV; - - if (acpi_processor_preregister_performance(acpi_perf_data)) - return -ENODEV; - else - preregister_valid = 1; - return 0; + data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & IRT_MASK; + data->rvo = (data->acpi_data.states[index].control >> RVO_SHIFT) & RVO_MASK; + data->exttype = (data->acpi_data.states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; + data->plllock = (data->acpi_data.states[index].control >> PLL_L_SHIFT) & PLL_L_MASK; + data->vidmvs = 1 << ((data->acpi_data.states[index].control >> MVS_SHIFT) & MVS_MASK); + data->vstable = (data->acpi_data.states[index].control >> VST_SHIFT) & VST_MASK; } static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { struct cpufreq_frequency_table *powernow_table; int ret_val; - int cpu = 0; - data->acpi_data = percpu_ptr(acpi_perf_data, cpu); - if (acpi_processor_register_performance(data->acpi_data, data->cpu)) { + if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { dprintk("register performance failed: bad ACPI data\n"); return -EIO; } /* verify the data contained in the ACPI structures */ - if (data->acpi_data->state_count <= 1) { + if (data->acpi_data.state_count <= 1) { dprintk("No ACPI P-States\n"); goto err_out; } - if ((data->acpi_data->control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || - (data->acpi_data->status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { + if ((data->acpi_data.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || + (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { dprintk("Invalid control/status registers (%x - %x)\n", - data->acpi_data->control_register.space_id, - data->acpi_data->status_register.space_id); + data->acpi_data.control_register.space_id, + data->acpi_data.status_register.space_id); goto err_out; } /* fill in data->powernow_table */ powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) - * (data->acpi_data->state_count + 1)), GFP_KERNEL); + * (data->acpi_data.state_count + 1)), GFP_KERNEL); if (!powernow_table) { dprintk("powernow_table memory alloc failure\n"); goto err_out; @@ -806,12 +787,12 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) if (ret_val) goto err_out_mem; - powernow_table[data->acpi_data->state_count].frequency = CPUFREQ_TABLE_END; - powernow_table[data->acpi_data->state_count].index = 0; + powernow_table[data->acpi_data.state_count].frequency = CPUFREQ_TABLE_END; + powernow_table[data->acpi_data.state_count].index = 0; data->powernow_table = powernow_table; /* fill in data */ - data->numps = data->acpi_data->state_count; + data->numps = data->acpi_data.state_count; if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu) print_basics(data); powernow_k8_acpi_pst_values(data, 0); @@ -819,31 +800,16 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) /* notify BIOS that we exist */ acpi_processor_notify_smm(THIS_MODULE); - /* determine affinity, from ACPI if available */ - if (preregister_valid) { - if ((data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ALL) || - (data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ANY)) - data->starting_core_affinity = data->acpi_data->shared_cpu_map; - else - data->starting_core_affinity = cpumask_of_cpu(data->cpu); - } else { - /* best guess from family if not */ - if (cpu_family == CPU_HW_PSTATE) - data->starting_core_affinity = cpumask_of_cpu(data->cpu); - else - data->starting_core_affinity = per_cpu(cpu_core_map, data->cpu); - } - return 0; err_out_mem: kfree(powernow_table); err_out: - acpi_processor_unregister_performance(data->acpi_data, data->cpu); + acpi_processor_unregister_performance(&data->acpi_data, data->cpu); /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */ - data->acpi_data->state_count = 0; + data->acpi_data.state_count = 0; return -ENODEV; } @@ -855,10 +821,10 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; - for (i = 0; i < data->acpi_data->state_count; i++) { + for (i = 0; i < data->acpi_data.state_count; i++) { u32 index; - index = data->acpi_data->states[i].control & HW_PSTATE_MASK; + index = data->acpi_data.states[i].control & HW_PSTATE_MASK; if (index > data->max_hw_pstate) { printk(KERN_ERR PFX "invalid pstate %d - bad value %d.\n", i, index); printk(KERN_ERR PFX "Please report to BIOS manufacturer\n"); @@ -874,7 +840,7 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf powernow_table[i].index = index; - powernow_table[i].frequency = data->acpi_data->states[i].core_frequency * 1000; + powernow_table[i].frequency = data->acpi_data.states[i].core_frequency * 1000; } return 0; } @@ -883,16 +849,16 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf { int i; int cntlofreq = 0; - for (i = 0; i < data->acpi_data->state_count; i++) { + for (i = 0; i < data->acpi_data.state_count; i++) { u32 fid; u32 vid; if (data->exttype) { - fid = data->acpi_data->states[i].status & EXT_FID_MASK; - vid = (data->acpi_data->states[i].status >> VID_SHIFT) & EXT_VID_MASK; + fid = data->acpi_data.states[i].status & EXT_FID_MASK; + vid = (data->acpi_data.states[i].status >> VID_SHIFT) & EXT_VID_MASK; } else { - fid = data->acpi_data->states[i].control & FID_MASK; - vid = (data->acpi_data->states[i].control >> VID_SHIFT) & VID_MASK; + fid = data->acpi_data.states[i].control & FID_MASK; + vid = (data->acpi_data.states[i].control >> VID_SHIFT) & VID_MASK; } dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); @@ -933,10 +899,10 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf cntlofreq = i; } - if (powernow_table[i].frequency != (data->acpi_data->states[i].core_frequency * 1000)) { + if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) { printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n", powernow_table[i].frequency, - (unsigned int) (data->acpi_data->states[i].core_frequency * 1000)); + (unsigned int) (data->acpi_data.states[i].core_frequency * 1000)); powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; continue; } @@ -946,12 +912,11 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { - if (data->acpi_data->state_count) - acpi_processor_unregister_performance(data->acpi_data, data->cpu); + if (data->acpi_data.state_count) + acpi_processor_unregister_performance(&data->acpi_data, data->cpu); } #else -static int powernow_k8_cpu_preinit_acpi(void) { return -ENODEV; } static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; } static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; } static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; } @@ -1136,7 +1101,7 @@ static int powernowk8_verify(struct cpufreq_policy *pol) static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { struct powernow_k8_data *data; - cpumask_t oldmask = CPU_MASK_ALL; + cpumask_t oldmask; int rc; if (!cpu_online(pol->cpu)) @@ -1209,7 +1174,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) /* run on any CPU again */ set_cpus_allowed_ptr(current, &oldmask); - pol->cpus = data->starting_core_affinity; + if (cpu_family == CPU_HW_PSTATE) + pol->cpus = cpumask_of_cpu(pol->cpu); + else + pol->cpus = per_cpu(cpu_core_map, pol->cpu); data->available_cores = &(pol->cpus); /* Take a crude guess here. @@ -1332,7 +1300,6 @@ static int __cpuinit powernowk8_init(void) } if (supported_cpus == num_online_cpus()) { - powernow_k8_cpu_preinit_acpi(); printk(KERN_INFO PFX "Found %d %s " "processors (%d cpu cores) (" VERSION ")\n", num_online_nodes(), @@ -1349,10 +1316,6 @@ static void __exit powernowk8_exit(void) dprintk("exit\n"); cpufreq_unregister_driver(&cpufreq_amd64_driver); - -#ifdef CONFIG_X86_POWERNOW_K8_ACPI - free_percpu(acpi_perf_data); -#endif } MODULE_AUTHOR("Paul Devriendt and Mark Langsdorf "); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index a62612cd4be..ab48cfed4d9 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -33,13 +33,12 @@ struct powernow_k8_data { #ifdef CONFIG_X86_POWERNOW_K8_ACPI /* the acpi table needs to be kept. it's only available if ACPI was * used to determine valid frequency/vid/fid states */ - struct acpi_processor_performance *acpi_data; + struct acpi_processor_performance acpi_data; #endif /* we need to keep track of associated cores, but let cpufreq * handle hotplug events - so just point at cpufreq pol->cpus * structure */ cpumask_t *available_cores; - cpumask_t starting_core_affinity; }; -- cgit v1.2.3 From c6744955d0ec0cb485c28c51eeb7185e260f6172 Mon Sep 17 00:00:00 2001 From: Samuel Sieb Date: Wed, 6 Aug 2008 22:06:29 -0700 Subject: x86: fix "kernel won't boot on a Cyrix MediaGXm (Geode)" Cyrix MediaGXm/Cx5530 Unicorn Revision 1.19.3B has stopped booting starting at v2.6.22. The reason is this commit: > commit f25f64ed5bd3c2932493681bdfdb483ea707da0a > Author: Juergen Beisert > Date: Sun Jul 22 11:12:38 2007 +0200 > > x86: Replace NSC/Cyrix specific chipset access macros by inlined functions. this commit activated a macro which was dormant before due to (buggy) macro side-effects. I've looked through various datasheets and found that the GXm and GXLV Geode processors don't have an incrementor. Remove the incrementor setup entirely. As the incrementor value differs according to clock speed and we would hope that the BIOS configures it correctly, it is probably the right solution. Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cyrix.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 3fd7a67bb06..e710a21bb6e 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -134,23 +134,6 @@ static void __cpuinit set_cx86_memwb(void) setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); } -static void __cpuinit set_cx86_inc(void) -{ - unsigned char ccr3; - - printk(KERN_INFO "Enable Incrementor on Cyrix/NSC processor.\n"); - - ccr3 = getCx86(CX86_CCR3); - setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - /* PCR1 -- Performance Control */ - /* Incrementor on, whatever that is */ - setCx86(CX86_PCR1, getCx86(CX86_PCR1) | 0x02); - /* PCR0 -- Performance Control */ - /* Incrementor Margin 10 */ - setCx86(CX86_PCR0, getCx86(CX86_PCR0) | 0x04); - setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ -} - /* * Configure later MediaGX and/or Geode processor. */ @@ -174,7 +157,6 @@ static void __cpuinit geode_configure(void) set_cx86_memwb(); set_cx86_reorder(); - set_cx86_inc(); local_irq_restore(flags); } -- cgit v1.2.3 From bde78a79a6eb015f33aa4660df1b06f5135def20 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 8 Jul 2008 09:51:56 -0700 Subject: x86: use WARN() in arch/x86/kernel Use WARN() instead of a printk+WARN_ON() pair; this way the message becomes part of the warning section for better reporting/collection. This also allowed the folding of some if()'s into the WARN() Signed-off-by: Arjan van de Ven Cc: akpm@linux-foundation.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 6f23969c8fa..b117d7f8a56 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -1496,11 +1496,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* kvm/qemu doesn't have mtrr set right, don't trim them all */ if (!highest_pfn) { - if (!kvm_para_available()) { - printk(KERN_WARNING + WARN(!kvm_para_available(), KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n"); - WARN_ON(1); - } return 0; } -- cgit v1.2.3 From 8323444b5dba3fe55e56a95d20d8f55c1d6745af Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Wed, 20 Aug 2008 16:45:53 -0700 Subject: x86: PAT Update validate_pat_support for intel CPUs Pentium III and Core Solo/Duo CPUs have an erratum " Page with PAT set to WC while associated MTRR is UC may consolidate to UC " which can result in WC setting in PAT to be ineffective. We will disable PAT on such CPUs, so that we can continue to use MTRR WC setting. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/addon_cpuid_features.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 84a8220a607..a6ef672adbb 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -56,9 +56,22 @@ void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: - if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) + /* + * There is a known erratum on Pentium III and Core Solo + * and Core Duo CPUs. + * " Page with PAT set to WC while associated MTRR is UC + * may consolidate to UC " + * Because of this erratum, it is better to stick with + * setting WC in MTRR rather than using PAT on these CPUs. + * + * Enable PAT WC only on P4, Core 2 or later CPUs. + */ + if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15)) return; - break; + + pat_disable("PAT WC disabled due to known CPU erratum."); + return; + case X86_VENDOR_AMD: case X86_VENDOR_CENTAUR: case X86_VENDOR_TRANSMETA: -- cgit v1.2.3 From 38cc1c3df77c1bb739a4766788eb9fa49f16ffdf Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 21 Aug 2008 20:24:24 -0700 Subject: x86: work around MTRR mask setting Joshua Hoblitt reported that only 3 GB of his 16 GB of RAM is usable. Booting with mtrr_show showed us the BIOS-initialized MTRR settings - which are all wrong. So the root cause is that the BIOS has not set the mask correctly: > [ 0.429971] MSR00000200: 00000000d0000000 > [ 0.433305] MSR00000201: 0000000ff0000800 > should be ==> [ 0.433305] MSR00000201: 0000003ff0000800 > > [ 0.436638] MSR00000202: 00000000e0000000 > [ 0.439971] MSR00000203: 0000000fe0000800 > should be ==> [ 0.439971] MSR00000203: 0000003fe0000800 > > [ 0.443304] MSR00000204: 0000000000000006 > [ 0.446637] MSR00000205: 0000000c00000800 > should be ==> [ 0.446637] MSR00000205: 0000003c00000800 > > [ 0.449970] MSR00000206: 0000000400000006 > [ 0.453303] MSR00000207: 0000000fe0000800 > should be ==> [ 0.453303] MSR00000207: 0000003fe0000800 > > [ 0.456636] MSR00000208: 0000000420000006 > [ 0.459970] MSR00000209: 0000000ff0000800 > should be ==> [ 0.459970] MSR00000209: 0000003ff0000800 So detect this borkage and add the prefix 111. Signed-off-by: Yinghai Lu Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 509bd3d9eac..43102e03e2d 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -379,6 +379,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type *type) { unsigned int mask_lo, mask_hi, base_lo, base_hi; + unsigned int tmp, hi; rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); if ((mask_lo & 0x800) == 0) { @@ -392,8 +393,18 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); /* Work out the shifted address mask. */ - mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT) - | mask_lo >> PAGE_SHIFT; + tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; + mask_lo = size_or_mask | tmp; + /* Expand tmp with high bits to all 1s*/ + hi = fls(tmp); + if (hi > 0) { + tmp |= ~((1<<(hi - 1)) - 1); + + if (tmp != mask_lo) { + WARN_ON("mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); + mask_lo = tmp; + } + } /* This works correctly if size is a power of two, i.e. a contiguous range. */ -- cgit v1.2.3 From 9754a5b840a209bc1f192d59f63e81b698a55ac8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 22 Aug 2008 08:22:23 +0200 Subject: x86: work around MTRR mask setting, v2 improve the debug printout: - make it actually display something - print it only once would be nice to have a WARN_ONCE() facility, to feed such things to kerneloops.org. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 43102e03e2d..cb7d3b6a80e 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -401,7 +401,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, tmp |= ~((1<<(hi - 1)) - 1); if (tmp != mask_lo) { - WARN_ON("mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); + static int once = 1; + + if (once) { + printk(KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); + once = 0; + } mask_lo = tmp; } } -- cgit v1.2.3 From 8735728ef8dc935c4fb351f913758fdbb62c308d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 22 Aug 2008 22:23:09 +0200 Subject: x86 MCE: Fix CPU hotplug problem with multiple multicore AMD CPUs During CPU hot-remove the sysfs directory created by threshold_create_bank(), defined in arch/x86/kernel/cpu/mcheck/mce_amd_64.c, has to be removed before its parent directory, created by mce_create_device(), defined in arch/x86/kernel/cpu/mcheck/mce_64.c . Moreover, when the CPU in question is hotplugged again, obviously the latter has to be created before the former. At present, the right ordering is not enforced, because all of these operations are carried out by CPU hotplug notifiers which are not appropriately ordered with respect to each other. This leads to serious problems on systems with two or more multicore AMD CPUs, among other things during suspend and hibernation. Fix the problem by placing threshold bank CPU hotplug callbacks in mce_cpu_callback(), so that they are invoked at the right places, if defined. Additionally, use kobject_del() to remove the sysfs directory associated with the kobject created by kobject_create_and_add() in threshold_create_bank(), to prevent the kernel from crashing during CPU hotplug operations on systems with two or more multicore AMD CPUs. This patch fixes bug #11337. Signed-off-by: Rafael J. Wysocki Acked-by: Andi Kleen Tested-by: Mark Langsdorf Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_64.c | 5 +++++ arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 18 +++++------------- 2 files changed, 10 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 65a339678ec..726a5fcdf34 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -759,6 +759,7 @@ static struct sysdev_class mce_sysclass = { }; DEFINE_PER_CPU(struct sys_device, device_mce); +void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata; /* Why are there no generic functions for this? */ #define ACCESSOR(name, var, start) \ @@ -883,9 +884,13 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, case CPU_ONLINE: case CPU_ONLINE_FROZEN: mce_create_device(cpu); + if (threshold_cpu_callback) + threshold_cpu_callback(action, cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: + if (threshold_cpu_callback) + threshold_cpu_callback(action, cpu); mce_remove_device(cpu); break; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 88736cadbaa..5eb390a4b2e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -628,6 +628,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) deallocate_threshold_block(cpu, bank); free_out: + kobject_del(b->kobj); kobject_put(b->kobj); kfree(b); per_cpu(threshold_banks, cpu)[bank] = NULL; @@ -645,14 +646,11 @@ static void threshold_remove_device(unsigned int cpu) } /* get notified when a cpu comes on/off */ -static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, + unsigned int cpu) { - /* cpu was unsigned int to begin with */ - unsigned int cpu = (unsigned long)hcpu; - if (cpu >= NR_CPUS) - goto out; + return; switch (action) { case CPU_ONLINE: @@ -666,14 +664,8 @@ static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, default: break; } - out: - return NOTIFY_OK; } -static struct notifier_block threshold_cpu_notifier __cpuinitdata = { - .notifier_call = threshold_cpu_callback, -}; - static __init int threshold_init_device(void) { unsigned lcpu = 0; @@ -684,7 +676,7 @@ static __init int threshold_init_device(void) if (err) return err; } - register_hotcpu_notifier(&threshold_cpu_notifier); + threshold_cpu_callback = amd_64_threshold_cpu_callback; return 0; } -- cgit v1.2.3 From b6734c35af028f06772c0b2c836c7d579e6d4dad Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 18 Aug 2008 17:39:32 -0700 Subject: x86: add NOPL as a synthetic CPU feature bit The long noops ("NOPL") are supposed to be detected by family >= 6. Unfortunately, several non-Intel x86 implementations, both hardware and software, don't obey this dictum. Instead, probe for NOPL directly by executing a NOPL instruction and see if we get #UD. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 32 +++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/common_64.c | 36 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/feature_names.c | 3 ++- 3 files changed, 69 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 80ab20d4fa3..0785b3c8d04 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_X86_LOCAL_APIC #include #include @@ -341,6 +342,35 @@ static void __init early_cpu_detect(void) early_get_cap(c); } +/* + * The NOPL instruction is supposed to exist on all CPUs with + * family >= 6, unfortunately, that's not true in practice because + * of early VIA chips and (more importantly) broken virtualizers that + * are not easy to detect. Hence, probe for it based on first + * principles. + */ +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) +{ + const u32 nopl_signature = 0x888c53b1; /* Random number */ + u32 has_nopl = nopl_signature; + + clear_cpu_cap(c, X86_FEATURE_NOPL); + if (c->x86 >= 6) { + asm volatile("\n" + "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */ + "2:\n" + " .section .fixup,\"ax\"\n" + "3: xor %0,%0\n" + " jmp 2b\n" + " .previous\n" + _ASM_EXTABLE(1b,3b) + : "+a" (has_nopl)); + + if (has_nopl == nopl_signature) + set_cpu_cap(c, X86_FEATURE_NOPL); + } +} + static void __cpuinit generic_identify(struct cpuinfo_x86 *c) { u32 tfms, xlvl; @@ -395,8 +425,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) } init_scattered_cpuid_features(c); + detect_nopl(c); } - } static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index dd6e3f15017..c3afba5a81a 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #ifdef CONFIG_X86_LOCAL_APIC #include @@ -215,6 +216,39 @@ static void __init early_cpu_support_print(void) } } +/* + * The NOPL instruction is supposed to exist on all CPUs with + * family >= 6, unfortunately, that's not true in practice because + * of early VIA chips and (more importantly) broken virtualizers that + * are not easy to detect. Hence, probe for it based on first + * principles. + * + * Note: no 64-bit chip is known to lack these, but put the code here + * for consistency with 32 bits, and to make it utterly trivial to + * diagnose the problem should it ever surface. + */ +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) +{ + const u32 nopl_signature = 0x888c53b1; /* Random number */ + u32 has_nopl = nopl_signature; + + clear_cpu_cap(c, X86_FEATURE_NOPL); + if (c->x86 >= 6) { + asm volatile("\n" + "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */ + "2:\n" + " .section .fixup,\"ax\"\n" + "3: xor %0,%0\n" + " jmp 2b\n" + " .previous\n" + _ASM_EXTABLE(1b,3b) + : "+a" (has_nopl)); + + if (has_nopl == nopl_signature) + set_cpu_cap(c, X86_FEATURE_NOPL); + } +} + static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); void __init early_cpu_init(void) @@ -313,6 +347,8 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_phys_bits = eax & 0xff; } + detect_nopl(c); + if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) cpu_devs[c->x86_vendor]->c_early_init(c); diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c index e43ad4ad4cb..c9017799497 100644 --- a/arch/x86/kernel/cpu/feature_names.c +++ b/arch/x86/kernel/cpu/feature_names.c @@ -39,7 +39,8 @@ const char * const x86_cap_flags[NCAPINTS*32] = { NULL, NULL, NULL, NULL, "constant_tsc", "up", NULL, "arch_perfmon", "pebs", "bts", NULL, NULL, - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "rep_good", NULL, NULL, NULL, + "nopl", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* Intel-defined (#2) */ -- cgit v1.2.3 From 12cf105cd66d95cf32c73cfa847a50bd1b700f23 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Thu, 4 Sep 2008 21:09:43 +0200 Subject: x86: delay early cpu initialization until cpuid is done Move early cpu initialization after cpu early get cap so the early cpu initialization can fix up cpu caps. Signed-off-by: Krzysztof Helt Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0785b3c8d04..8aab8517642 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -335,11 +335,11 @@ static void __init early_cpu_detect(void) get_cpu_vendor(c, 1); + early_get_cap(c); + if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) cpu_devs[c->x86_vendor]->c_early_init(c); - - early_get_cap(c); } /* -- cgit v1.2.3 From dd786dd12c99634055a9066f25ea957f29991c22 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 21:09:43 +0200 Subject: x86: move mtrr cpu cap setting early in early_init_xxxx Krzysztof Helt found MTRR is not detected on k6-2 root cause: we moved mtrr_bp_init() early for mtrr trimming, and in early_detect we only read the CPU capability from cpuid, so some cpu doesn't have that bit in cpuid. So we need to add early_init_xxxx to preset those bit before mtrr_bp_init for those earlier cpus. this patch is for v2.6.27 Reported-by: Krzysztof Helt Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 9 +++++---- arch/x86/kernel/cpu/centaur.c | 11 +++++++++++ arch/x86/kernel/cpu/cyrix.c | 32 ++++++++++++++++++++++++++++---- 3 files changed, 44 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index cae9cabc303..18514ed2610 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -31,6 +31,11 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1<<8)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); } + + /* Set MTRR capability flag if appropriate */ + if (c->x86_model == 13 || c->x86_model == 9 || + (c->x86_model == 8 && c->x86_mask >= 8)) + set_cpu_cap(c, X86_FEATURE_K6_MTRR); } static void __cpuinit init_amd(struct cpuinfo_x86 *c) @@ -166,10 +171,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) mbytes); } - /* Set MTRR capability flag if appropriate */ - if (c->x86_model == 13 || c->x86_model == 9 || - (c->x86_model == 8 && c->x86_mask >= 8)) - set_cpu_cap(c, X86_FEATURE_K6_MTRR); break; } diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e0f45edd6a5..a0534c04d38 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -314,6 +314,16 @@ enum { EAMD3D = 1<<20, }; +static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) +{ + switch (c->x86) { + case 5: + /* Emulate MTRRs using Centaur's MCR. */ + set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); + break; + } +} + static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { @@ -462,6 +472,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) static struct cpu_dev centaur_cpu_dev __cpuinitdata = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, + .c_early_init = early_init_centaur, .c_init = init_centaur, .c_size_cache = centaur_size_cache, }; diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index e710a21bb6e..898a5a2002e 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -15,13 +15,11 @@ /* * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU */ -static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) { unsigned char ccr2, ccr3; - unsigned long flags; /* we test for DEVID by checking whether CCR3 is writable */ - local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, ccr3 ^ 0x80); getCx86(0xc0); /* dummy to change bus */ @@ -44,9 +42,16 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) *dir0 = getCx86(CX86_DIR0); *dir1 = getCx86(CX86_DIR1); } - local_irq_restore(flags); } +static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +{ + unsigned long flags; + + local_irq_save(flags); + __do_cyrix_devid(dir0, dir1); + local_irq_restore(flags); +} /* * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in * order to identify the Cyrix CPU model after we're out of setup.c @@ -161,6 +166,24 @@ static void __cpuinit geode_configure(void) local_irq_restore(flags); } +static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c) +{ + unsigned char dir0, dir0_msn, dir1 = 0; + + __do_cyrix_devid(&dir0, &dir1); + dir0_msn = dir0 >> 4; /* identifies CPU "family" */ + + switch (dir0_msn) { + case 3: /* 6x86/6x86L */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + case 5: /* 6x86MX/M II */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + } +} static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) { @@ -416,6 +439,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { .c_vendor = "Cyrix", .c_ident = { "CyrixInstead" }, + .c_early_init = early_init_cyrix, .c_init = init_cyrix, .c_identify = cyrix_identify, }; -- cgit v1.2.3 From d04ec773d7ca1bbc05a2768be95c1cebe2b07757 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 6 Aug 2008 10:27:30 +0200 Subject: x86: pda_init(): fix memory leak when using CPU hotplug pda->irqstackptr is allocated whenever a CPU is set online. But it is never freed. This results in a memory leak of 16K for each CPU offline/online cycle. Fix is to allocate pda->irqstackptr only once. Signed-off-by: Andreas Herrmann Cc: akpm@linux-foundation.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common_64.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index c3afba5a81a..4f2eeb5652e 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -529,17 +529,20 @@ void pda_init(int cpu) /* others are initialized in smpboot.c */ pda->pcurrent = &init_task; pda->irqstackptr = boot_cpu_stack; + pda->irqstackptr += IRQSTACKSIZE - 64; } else { - pda->irqstackptr = (char *) - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); - if (!pda->irqstackptr) - panic("cannot allocate irqstack for cpu %d", cpu); + if (!pda->irqstackptr) { + pda->irqstackptr = (char *) + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); + if (!pda->irqstackptr) + panic("cannot allocate irqstack for cpu %d", + cpu); + pda->irqstackptr += IRQSTACKSIZE - 64; + } if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) pda->nodenumber = cpu_to_node(cpu); } - - pda->irqstackptr += IRQSTACKSIZE-64; } char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + -- cgit v1.2.3 From 23952a96ae738277f3139b63d622e22984589031 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 6 Aug 2008 10:29:37 +0200 Subject: x86: cpu_init(): fix memory leak when using CPU hotplug Exception stacks are allocated each time a CPU is set online. But the allocated space is never freed. Thus with one CPU hotplug offline/online cycle there is a memory leak of 24K (6 pages) for a CPU. Fix is to allocate exception stacks only once -- when the CPU is set online for the first time. Signed-off-by: Andreas Herrmann Cc: akpm@linux-foundation.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common_64.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 4f2eeb5652e..a11f5d4477c 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -640,19 +640,22 @@ void __cpuinit cpu_init(void) /* * set up and load the per-CPU TSS */ - for (v = 0; v < N_EXCEPTION_STACKS; v++) { + if (!orig_ist->ist[0]) { static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER }; - if (cpu) { - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); - if (!estacks) - panic("Cannot allocate exception stack %ld %d\n", - v, cpu); + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + if (cpu) { + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); + if (!estacks) + panic("Cannot allocate exception " + "stack %ld %d\n", v, cpu); + } + estacks += PAGE_SIZE << order[v]; + orig_ist->ist[v] = t->x86_tss.ist[v] = + (unsigned long)estacks; } - estacks += PAGE_SIZE << order[v]; - orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; } t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); -- cgit v1.2.3