From 973a2dd1d50a11d380086601f14e59116f93e8c5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:32 +0100 Subject: x86, mce: disable machine checks on suspend Impact: Bug fix During suspend it is not reliable to process machine check exceptions, because CPUs disappear but can still get machine check broadcasts. Also the system is slightly more likely to machine check them, but the handler is typically not a position to handle them in a meaningfull way. So disable them during suspend and enable them during resume. Also make sure they are always disabled on hot-unplugged CPUs. This new code assumes that suspend always hotunplugs all non BP CPUs. v2: Remove the WARN_ONs Thomas objected to. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 25cf624eccb..5ed80991ab9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -728,6 +728,29 @@ __setup("mce=", mcheck_enable); * Sysfs support */ +/* + * Disable machine checks on suspend and shutdown. We can't really handle + * them later. + */ +static int mce_disable(void) +{ + int i; + + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + return 0; +} + +static int mce_suspend(struct sys_device *dev, pm_message_t state) +{ + return mce_disable(); +} + +static int mce_shutdown(struct sys_device *dev) +{ + return mce_disable(); +} + /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. Only one CPU is active at this time, the others get readded later using CPU hotplug. */ @@ -752,6 +775,8 @@ static void mce_restart(void) } static struct sysdev_class mce_sysclass = { + .suspend = mce_suspend, + .shutdown = mce_shutdown, .resume = mce_resume, .name = "machinecheck", }; -- cgit v1.2.3 From 123aa76ec0cab5d4881cd8509faed43231e68801 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:27 +0100 Subject: x86, mce: don't disable machine checks during code patching Impact: low priority bug fix This removes part of a a patch I added myself some time ago. After some consideration the patch was a bad idea. In particular it stopped machine check exceptions during code patching. To quote the comment: * MCEs only happen when something got corrupted and in this * case we must do something about the corruption. * Ignoring it is worse than a unlikely patching race. * Also machine checks tend to be broadcast and if one CPU * goes into machine check the others follow quickly, so we don't * expect a machine check to cause undue problems during to code * patching. So undo the machine check related parts of 8f4e956b313dcccbc7be6f10808952345e3b638c NMIs are still disabled. This only removes code, the only additions are a new comment. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/alternative.c | 17 +++++++++++------ arch/x86/kernel/cpu/mcheck/mce_32.c | 14 -------------- arch/x86/kernel/cpu/mcheck/mce_64.c | 14 -------------- 3 files changed, 11 insertions(+), 34 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a84ac7b570e..5b8394a3a6b 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -414,9 +414,17 @@ void __init alternative_instructions(void) that might execute the to be patched code. Other CPUs are not running. */ stop_nmi(); -#ifdef CONFIG_X86_MCE - stop_mce(); -#endif + + /* + * Don't stop machine check exceptions while patching. + * MCEs only happen when something got corrupted and in this + * case we must do something about the corruption. + * Ignoring it is worse than a unlikely patching race. + * Also machine checks tend to be broadcast and if one CPU + * goes into machine check the others follow quickly, so we don't + * expect a machine check to cause undue problems during to code + * patching. + */ apply_alternatives(__alt_instructions, __alt_instructions_end); @@ -456,9 +464,6 @@ void __init alternative_instructions(void) (unsigned long)__smp_locks_end); restart_nmi(); -#ifdef CONFIG_X86_MCE - restart_mce(); -#endif } /** diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c index dfaebce3633..3552119b091 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/arch/x86/kernel/cpu/mcheck/mce_32.c @@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c) } } -static unsigned long old_cr4 __initdata; - -void __init stop_mce(void) -{ - old_cr4 = read_cr4(); - clear_in_cr4(X86_CR4_MCE); -} - -void __init restart_mce(void) -{ - if (old_cr4 & X86_CR4_MCE) - set_in_cr4(X86_CR4_MCE); -} - static int __init mcheck_disable(char *str) { mce_disabled = 1; diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 5ed80991ab9..25ccdbec86e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -680,20 +680,6 @@ static struct miscdevice mce_log_device = { &mce_chrdev_ops, }; -static unsigned long old_cr4 __initdata; - -void __init stop_mce(void) -{ - old_cr4 = read_cr4(); - clear_in_cr4(X86_CR4_MCE); -} - -void __init restart_mce(void) -{ - if (old_cr4 & X86_CR4_MCE) - set_in_cr4(X86_CR4_MCE); -} - /* * Old style boot options parsing. Only for compatibility. */ -- cgit v1.2.3 From 9bd984058088d6ef7af6946591a207e51a2f4890 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:28 +0100 Subject: x86, mce: always use separate work queue to run trigger Impact: Needed for bug fix in next patch This relaxes the requirement that mce_notify_user has to run in process context. Useful for future changes, but also leads to cleaner behaviour now. Now instead mce_notify_user can be called directly from interrupt (but not NMI) context. The work queue only uses a single global work struct, which can be done safely because it is always free to reuse before the trigger function is executed. This way no events can be lost. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 25ccdbec86e..18b379cf061 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -380,11 +380,17 @@ static void mcheck_timer(struct work_struct *work) schedule_delayed_work(&mcheck_work, next_interval); } +static void mce_do_trigger(struct work_struct *work) +{ + call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); +} + +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); + /* - * This is only called from process context. This is where we do - * anything we need to alert userspace about new MCEs. This is called - * directly from the poller and also from entry.S and idle, thanks to - * TIF_MCE_NOTIFY. + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context. */ int mce_notify_user(void) { @@ -394,9 +400,14 @@ int mce_notify_user(void) unsigned long now = jiffies; wake_up_interruptible(&mce_wait); - if (trigger[0]) - call_usermodehelper(trigger, trigger_argv, NULL, - UMH_NO_WAIT); + + /* + * There is no risk of missing notifications because + * work_pending is always cleared before the function is + * executed. + */ + if (trigger[0] && !work_pending(&mce_trigger_work)) + schedule_work(&mce_trigger_work); if (time_after_eq(now, last_print + (check_interval*HZ))) { last_print = now; -- cgit v1.2.3 From 52d168e28bc11dd026b620fe1767cadde5a747cd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:29 +0100 Subject: x86, mce: switch machine check polling to per CPU timer Impact: Higher priority bug fix The machine check poller runs a single timer and then broadcasted an IPI to all CPUs to check them. This leads to unnecessary synchronization between CPUs. The original CPU running the timer has to wait potentially a long time for all other CPUs answering. This is also real time unfriendly and in general inefficient. This was especially a problem on systems with a lot of events where the poller run with a higher frequency after processing some events. There could be more and more CPU time wasted with this, to the point of significantly slowing down machines. The machine check polling is actually fully independent per CPU, so there's no reason to not just do this all with per CPU timers. This patch implements that. Also switch the poller also to use standard timers instead of work queues. It was using work queues to be able to execute a user program on a event, but mce_notify_user() handles this case now with a separate callback. So instead always run the poll code in in a standard per CPU timer, which means that in the common case of not having to execute a trigger there will be less overhead. This allows to clean up the initialization significantly, because standard timers are already up when machine checks get init'ed. No multiple initialization functions. Thanks to Thomas Gleixner for some help. Cc: thockin@google.com v2: Use del_timer_sync() on cpu shutdown and don't try to handle migrated timers. v3: Add WARN_ON for timer running on unexpected CPU Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 68 ++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 18b379cf061..3f0550d16f3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -353,18 +353,17 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status) static int check_interval = 5 * 60; /* 5 minutes */ static int next_interval; /* in jiffies */ -static void mcheck_timer(struct work_struct *work); -static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); +static void mcheck_timer(unsigned long); +static DEFINE_PER_CPU(struct timer_list, mce_timer); -static void mcheck_check_cpu(void *info) +static void mcheck_timer(unsigned long data) { + struct timer_list *t = &per_cpu(mce_timer, data); + + WARN_ON(smp_processor_id() != data); + if (mce_available(¤t_cpu_data)) do_machine_check(NULL, 0); -} - -static void mcheck_timer(struct work_struct *work) -{ - on_each_cpu(mcheck_check_cpu, NULL, 1); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -377,7 +376,8 @@ static void mcheck_timer(struct work_struct *work) (int)round_jiffies_relative(check_interval*HZ)); } - schedule_delayed_work(&mcheck_work, next_interval); + t->expires = jiffies + next_interval; + add_timer(t); } static void mce_do_trigger(struct work_struct *work) @@ -436,16 +436,11 @@ static struct notifier_block mce_idle_notifier = { static __init int periodic_mcheck_init(void) { - next_interval = check_interval * HZ; - if (next_interval) - schedule_delayed_work(&mcheck_work, - round_jiffies_relative(next_interval)); - idle_notifier_register(&mce_idle_notifier); - return 0; + idle_notifier_register(&mce_idle_notifier); + return 0; } __initcall(periodic_mcheck_init); - /* * Initialize Machine Checks for a CPU. */ @@ -515,6 +510,20 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) } } +static void mce_init_timer(void) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + + /* data race harmless because everyone sets to the same value */ + if (!next_interval) + next_interval = check_interval * HZ; + if (!next_interval) + return; + setup_timer(t, mcheck_timer, smp_processor_id()); + t->expires = round_jiffies_relative(jiffies + next_interval); + add_timer(t); +} + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off. @@ -529,6 +538,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) mce_init(NULL); mce_cpu_features(c); + mce_init_timer(); } /* @@ -758,17 +768,19 @@ static int mce_resume(struct sys_device *dev) return 0; } +static void mce_cpu_restart(void *data) +{ + del_timer_sync(&__get_cpu_var(mce_timer)); + if (mce_available(¤t_cpu_data)) + mce_init(NULL); + mce_init_timer(); +} + /* Reinit MCEs after user configuration changes */ static void mce_restart(void) { - if (next_interval) - cancel_delayed_work(&mcheck_work); - /* Timer race is harmless here */ - on_each_cpu(mce_init, NULL, 1); next_interval = check_interval * HZ; - if (next_interval) - schedule_delayed_work(&mcheck_work, - round_jiffies_relative(next_interval)); + on_each_cpu(mce_cpu_restart, NULL, 1); } static struct sysdev_class mce_sysclass = { @@ -899,6 +911,7 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; + struct timer_list *t = &per_cpu(mce_timer, cpu); switch (action) { case CPU_ONLINE: @@ -913,6 +926,15 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, threshold_cpu_callback(action, cpu); mce_remove_device(cpu); break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + del_timer_sync(t); + break; + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + t->expires = round_jiffies_relative(jiffies + next_interval); + add_timer_on(t, cpu); + break; } return NOTIFY_OK; } -- cgit v1.2.3 From 5b4408fdaa62474dd9485cddb9126370d90d4b82 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:30 +0100 Subject: x86, mce: don't set up mce sysdev devices with mce=off Impact: bug fix, in this case the resume handler shouldn't run which avoids incorrectly reenabling machine checks on resume When MCEs are completely disabled on the command line don't set up the sysdev devices for them either. Includes a comment fix from Thomas Gleixner. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 3f0550d16f3..4e2b1bc5131 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -151,6 +151,8 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) static int mce_available(struct cpuinfo_x86 *c) { + if (mce_dont_init) + return 0; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } @@ -532,8 +534,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { mce_cpu_quirks(c); - if (mce_dont_init || - !mce_available(c)) + if (!mce_available(c)) return; mce_init(NULL); @@ -710,8 +711,7 @@ static int __init mcheck_disable(char *str) return 1; } -/* mce=off disables machine check. Note you can re-enable it later - using sysfs. +/* mce=off disables machine check. mce=TOLERANCELEVEL (number, see above) mce=bootlog Log MCEs from before booting. Disabled by default on AMD. mce=nobootlog Don't log MCEs from before booting. */ -- cgit v1.2.3 From d6b75584a3eaab8cb2ab3e8cf90c5e57c1928a85 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:31 +0100 Subject: x86, mce: disable machine checks on offlined CPUs Impact: Lower priority bug fix Offlined CPUs could still get machine checks, but the machine check handler cannot handle them properly, leading to an unconditional crash. Disable machine checks on CPUs that are going down. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 4e2b1bc5131..1db94c0d5aa 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -906,6 +906,27 @@ static __cpuinit void mce_remove_device(unsigned int cpu) cpu_clear(cpu, mce_device_initialized); } +/* Make sure there are no machine checks on offlined CPUs. */ +static void __cpuexit mce_disable_cpu(void *h) +{ + int i; + + if (!mce_available(¤t_cpu_data)) + return; + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); +} + +static void __cpuexit mce_reenable_cpu(void *h) +{ + int i; + + if (!mce_available(¤t_cpu_data)) + return; + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); +} + /* Get notified when a cpu comes on/off. Be hotplug friendly. */ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -929,11 +950,13 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: del_timer_sync(t); + smp_call_function_single(cpu, mce_disable_cpu, NULL, 1); break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies_relative(jiffies + next_interval); add_timer_on(t, cpu); + smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1); break; } return NOTIFY_OK; -- cgit v1.2.3 From ef41df4344ff952c79746d44a6126bd2cf7ed2bc Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 12 Feb 2009 13:39:34 +0100 Subject: x86, mce: fix a race condition in mce_read() Impact: bugfix Considering the situation as follow: before: mcelog.next == 1, mcelog.entry[0].finished = 1 +-------------------------------------------------------------------------- R W1 W2 W3 read mcelog.next (1) mcelog.next++ (2) (working on entry 1, finished == 0) mcelog.next = 0 mcelog.next++ (1) (working on entry 0) mcelog.next++ (2) (working on entry 1) <----------------- race ----------------> (done on entry 1, finished = 1) (done on entry 1, finished = 1) To fix the race condition, a cmpxchg loop is added to mce_read() to ensure no new MCE record can be added between mcelog.next reading and mcelog.next = 0. Signed-off-by: Huang Ying Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 41 ++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 1db94c0d5aa..870d08deccf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -595,7 +595,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, { unsigned long *cpu_tsc; static DEFINE_MUTEX(mce_read_mutex); - unsigned next; + unsigned prev, next; char __user *buf = ubuf; int i, err; @@ -614,25 +614,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, } err = 0; - for (i = 0; i < next; i++) { - unsigned long start = jiffies; - - while (!mcelog.entry[i].finished) { - if (time_after_eq(jiffies, start + 2)) { - memset(mcelog.entry + i,0, sizeof(struct mce)); - goto timeout; + prev = 0; + do { + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + + while (!mcelog.entry[i].finished) { + if (time_after_eq(jiffies, start + 2)) { + memset(mcelog.entry + i, 0, + sizeof(struct mce)); + goto timeout; + } + cpu_relax(); } - cpu_relax(); + smp_rmb(); + err |= copy_to_user(buf, mcelog.entry + i, + sizeof(struct mce)); + buf += sizeof(struct mce); +timeout: + ; } - smp_rmb(); - err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); - buf += sizeof(struct mce); - timeout: - ; - } - memset(mcelog.entry, 0, next * sizeof(struct mce)); - mcelog.next = 0; + memset(mcelog.entry + prev, 0, + (next - prev) * sizeof(struct mce)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); synchronize_sched(); -- cgit v1.2.3 From 0d7482e3d76522157c9d741d79fce22c401fa0c5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 17 Feb 2009 23:07:13 +0100 Subject: x86, mce: implement dynamic machine check banks support Impact: cleanup; making code future proof; memory saving on small systems This patch replaces the hardcoded max number of machine check banks with dynamic allocation depending on what the CPU reports. The sysfs data structures and the banks array are dynamically allocated. There is still a hard bank limit (128) because the mcelog protocol uses banks >= 128 as pseudo banks to escape other events. But we expect that 128 banks is beyond any reasonable CPU for now. This supersedes an earlier patch by Venki, but it solves the problem more completely by making the limit fully dynamic (up to the 128 boundary). This saves some memory on machines with less than 6 banks because they won't need sysdevs for unused ones and also allows to use sysfs to control these banks on possible future CPUs with more than 6 banks. This is an updated patch addressing Venki's comments. I also added in another patch from Thomas which fixed the error allocation path (that patch was previously separated) Cc: Venki Pallipadi Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 147 ++++++++++++++++++++++++++++-------- 1 file changed, 115 insertions(+), 32 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 870d08deccf..2297730bb51 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include #include #include @@ -32,7 +34,12 @@ #include #define MISC_MCELOG_MINOR 227 -#define NR_SYSFS_BANKS 6 + +/* + * To support more than 128 would need to escape the predefined + * Linux defined extended banks first. + */ +#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) atomic_t mce_entry; @@ -47,7 +54,7 @@ static int mce_dont_init; */ static int tolerant = 1; static int banks; -static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; +static u64 *bank; static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; @@ -212,7 +219,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) barrier(); for (i = 0; i < banks; i++) { - if (i < NR_SYSFS_BANKS && !bank[i]) + if (!bank[i]) continue; m.misc = 0; @@ -446,37 +453,54 @@ __initcall(periodic_mcheck_init); /* * Initialize Machine Checks for a CPU. */ -static void mce_init(void *dummy) +static int mce_cap_init(void) { u64 cap; - int i; + unsigned b; rdmsrl(MSR_IA32_MCG_CAP, cap); - banks = cap & 0xff; - if (banks > MCE_EXTENDED_BANK) { - banks = MCE_EXTENDED_BANK; - printk(KERN_INFO "MCE: warning: using only %d banks\n", - MCE_EXTENDED_BANK); + b = cap & 0xff; + if (b > MAX_NR_BANKS) { + printk(KERN_WARNING + "MCE: Using only %u machine check banks out of %u\n", + MAX_NR_BANKS, b); + b = MAX_NR_BANKS; + } + + /* Don't support asymmetric configurations today */ + WARN_ON(banks != 0 && b != banks); + banks = b; + if (!bank) { + bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); + if (!bank) + return -ENOMEM; + memset(bank, 0xff, banks * sizeof(u64)); } + /* Use accurate RIP reporting if available. */ if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) rip_msr = MSR_IA32_MCG_EIP; + return 0; +} + +static void mce_init(void *dummy) +{ + u64 cap; + int i; + /* Log the machine checks left over from the previous reset. This also clears all registers */ do_machine_check(NULL, mce_bootlog ? -1 : -2); set_in_cr4(X86_CR4_MCE); + rdmsrl(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - if (i < NR_SYSFS_BANKS) - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); - else - wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); - + wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } } @@ -486,10 +510,10 @@ static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { - if(c->x86 == 15) + if (c->x86 == 15 && banks > 4) /* disable GART TBL walk error reporting, which trips off incorrectly with the IOMMU & 3ware & Cerberus. */ - clear_bit(10, &bank[4]); + clear_bit(10, (unsigned long *)&bank[4]); if(c->x86 <= 17 && mce_bootlog < 0) /* Lots of broken BIOS around that don't clear them by default and leave crap in there. Don't log. */ @@ -532,11 +556,15 @@ static void mce_init_timer(void) */ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { - mce_cpu_quirks(c); - if (!mce_available(c)) return; + if (mce_cap_init() < 0) { + mce_dont_init = 1; + return; + } + mce_cpu_quirks(c); + mce_init(NULL); mce_cpu_features(c); mce_init_timer(); @@ -819,16 +847,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit } \ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); -/* - * TBD should generate these dynamically based on number of available banks. - * Have only 6 contol banks in /sysfs until then. - */ -ACCESSOR(bank0ctl,bank[0],mce_restart()) -ACCESSOR(bank1ctl,bank[1],mce_restart()) -ACCESSOR(bank2ctl,bank[2],mce_restart()) -ACCESSOR(bank3ctl,bank[3],mce_restart()) -ACCESSOR(bank4ctl,bank[4],mce_restart()) -ACCESSOR(bank5ctl,bank[5],mce_restart()) +static struct sysdev_attribute *bank_attrs; + +static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, + char *buf) +{ + u64 b = bank[attr - bank_attrs]; + return sprintf(buf, "%Lx\n", b); +} + +static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, + const char *buf, size_t siz) +{ + char *end; + u64 new = simple_strtoull(buf, &end, 0); + if (end == buf) + return -EINVAL; + bank[attr - bank_attrs] = new; + mce_restart(); + return end-buf; +} static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) @@ -855,8 +893,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); ACCESSOR(check_interval,check_interval,mce_restart()) static struct sysdev_attribute *mce_attributes[] = { - &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, - &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, &attr_tolerant.attr, &attr_check_interval, &attr_trigger, NULL }; @@ -886,11 +922,22 @@ static __cpuinit int mce_create_device(unsigned int cpu) if (err) goto error; } + for (i = 0; i < banks; i++) { + err = sysdev_create_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); + if (err) + goto error2; + } cpu_set(cpu, mce_device_initialized); return 0; +error2: + while (--i >= 0) { + sysdev_remove_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); + } error: - while (i--) { + while (--i >= 0) { sysdev_remove_file(&per_cpu(device_mce,cpu), mce_attributes[i]); } @@ -909,6 +956,9 @@ static __cpuinit void mce_remove_device(unsigned int cpu) for (i = 0; mce_attributes[i]; i++) sysdev_remove_file(&per_cpu(device_mce,cpu), mce_attributes[i]); + for (i = 0; i < banks; i++) + sysdev_remove_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); sysdev_unregister(&per_cpu(device_mce,cpu)); cpu_clear(cpu, mce_device_initialized); } @@ -973,6 +1023,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; +static __init int mce_init_banks(void) +{ + int i; + + bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, + GFP_KERNEL); + if (!bank_attrs) + return -ENOMEM; + + for (i = 0; i < banks; i++) { + struct sysdev_attribute *a = &bank_attrs[i]; + a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); + if (!a->attr.name) + goto nomem; + a->attr.mode = 0644; + a->show = show_bank; + a->store = set_bank; + } + return 0; + +nomem: + while (--i >= 0) + kfree(bank_attrs[i].attr.name); + kfree(bank_attrs); + bank_attrs = NULL; + return -ENOMEM; +} + static __init int mce_init_device(void) { int err; @@ -980,6 +1058,11 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; + + err = mce_init_banks(); + if (err) + return err; + err = sysdev_class_register(&mce_sysclass); if (err) return err; -- cgit v1.2.3 From b5f2fa4ea00a179ac1c2ff342ceeee261dd75e53 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:43:22 +0100 Subject: x86, mce: factor out duplicated struct mce setup into one function Impact: cleanup This merely factors out duplicated code to set up the initial struct mce state into a single function. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 23 ++++++++++++++--------- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 4 +--- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +- 3 files changed, 16 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 2297730bb51..fed875742b1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -65,6 +65,14 @@ static char *trigger_argv[2] = { trigger, NULL }; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +/* Do initial initialization of a struct mce */ +void mce_setup(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + m->cpu = smp_processor_id(); + rdtscll(m->tsc); +} + /* * Lockless MCE logging infrastructure. * This avoids deadlocks on printk locks without having to break locks. Also @@ -208,8 +216,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) || !banks) goto out2; - memset(&m, 0, sizeof(struct mce)); - m.cpu = smp_processor_id(); + mce_setup(&m); + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); /* if the restart IP is not valid, we're done for */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) @@ -225,7 +233,6 @@ void do_machine_check(struct pt_regs * regs, long error_code) m.misc = 0; m.addr = 0; m.bank = i; - m.tsc = 0; rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); if ((m.status & MCI_STATUS_VAL) == 0) @@ -252,8 +259,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); mce_get_rip(&m, regs); - if (error_code >= 0) - rdtscll(m.tsc); + if (error_code < 0) + m.tsc = 0; if (error_code != -2) mce_log(&m); @@ -341,15 +348,13 @@ void do_machine_check(struct pt_regs * regs, long error_code) * and historically has been the register value of the * MSR_IA32_THERMAL_STATUS (Intel) msr. */ -void mce_log_therm_throt_event(unsigned int cpu, __u64 status) +void mce_log_therm_throt_event(__u64 status) { struct mce m; - memset(&m, 0, sizeof(m)); - m.cpu = cpu; + mce_setup(&m); m.bank = MCE_THERMAL_BANK; m.status = status; - rdtscll(m.tsc); mce_log(&m); } #endif /* CONFIG_X86_MCE_INTEL */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 8ae8c4ff094..75d9dd25e3d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -197,9 +197,7 @@ asmlinkage void mce_threshold_interrupt(void) exit_idle(); irq_enter(); - memset(&m, 0, sizeof(m)); - rdtscll(m.tsc); - m.cpu = smp_processor_id(); + mce_setup(&m); /* assume first bank caused it */ for (bank = 0; bank < NR_BANKS; ++bank) { diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 4b48f251fd3..7f7f1015ef1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -24,7 +24,7 @@ asmlinkage void smp_thermal_interrupt(void) rdmsrl(MSR_IA32_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & 1)) - mce_log_therm_throt_event(smp_processor_id(), msr_val); + mce_log_therm_throt_event(msr_val); inc_irq_stat(irq_thermal_count); irq_exit(); -- cgit v1.2.3 From b79109c3bbcf52cac5103979b283b9e5df4e796c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:43:23 +0100 Subject: x86, mce: separate correct machine check poller and fatal exception handler Impact: cleanup, performance enhancement The machine check poller is diverging more and more from the fatal exception handler. Instead of adding more special cases separate the code paths completely. The corrected poll path is actually quite simple, and this doesn't result in much code duplication. This makes both handlers much easier to read and results in cleaner code flow. The exception handler now only needs to care about uncorrected errors, which also simplifies the handling of multiple errors. The corrected poller also now always runs in standard interrupt context and does not need to do anything special to handle NMI context. Minor behaviour changes: - MCG status is now not cleared on polling. - Only the banks which had corrected errors get cleared on polling - The exception handler only clears banks with errors now v2: Forward port to new patch order. Add "uc" argument. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 129 ++++++++++++++++++++++++++------ arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 2 +- 2 files changed, 109 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index fed875742b1..268b05edade 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -3,6 +3,8 @@ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. * Rest from unknown author(s). * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen */ #include @@ -189,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } /* - * The actual machine check handler + * Poll for corrected events or events that happened before reset. + * Those are just logged through /dev/mcelog. + * + * This is executed in standard interrupt context. + */ +void machine_check_poll(enum mcp_flags flags) +{ + struct mce m; + int i; + + mce_setup(&m); + + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + for (i = 0; i < banks; i++) { + if (!bank[i]) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + m.tsc = 0; + + barrier(); + rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + if (!(m.status & MCI_STATUS_VAL)) + continue; + + /* + * Uncorrected events are handled by the exception handler + * when it is enabled. But when the exception is disabled log + * everything. + * + * TBD do the same check for MCI_STATUS_EN here? + */ + if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) + continue; + + if (m.status & MCI_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + + if (!(flags & MCP_TIMESTAMP)) + m.tsc = 0; + /* + * Don't get the IP here because it's unlikely to + * have anything to do with the actual error location. + */ + + mce_log(&m); + add_taint(TAINT_MACHINE_CHECK); + + /* + * Clear state for this bank. + */ + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } + + /* + * Don't clear MCG_STATUS here because it's only defined for + * exceptions. + */ +} + +/* + * The actual machine check handler. This only handles real + * exceptions when something got corrupted coming in through int 18. + * + * This is executed in NMI context not subject to normal locking rules. This + * implies that most kernel services cannot be safely used. Don't even + * think about putting a printk in there! */ void do_machine_check(struct pt_regs * regs, long error_code) { @@ -207,13 +279,14 @@ void do_machine_check(struct pt_regs * regs, long error_code) * error. */ int kill_it = 0; + DECLARE_BITMAP(toclear, MAX_NR_BANKS); atomic_inc(&mce_entry); - if ((regs - && notify_die(DIE_NMI, "machine check", regs, error_code, + if (notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL) == NOTIFY_STOP) - || !banks) + goto out2; + if (!banks) goto out2; mce_setup(&m); @@ -227,6 +300,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) barrier(); for (i = 0; i < banks; i++) { + __clear_bit(i, toclear); if (!bank[i]) continue; @@ -238,6 +312,20 @@ void do_machine_check(struct pt_regs * regs, long error_code) if ((m.status & MCI_STATUS_VAL) == 0) continue; + /* + * Non uncorrected errors are handled by machine_check_poll + * Leave them alone. + */ + if ((m.status & MCI_STATUS_UC) == 0) + continue; + + /* + * Set taint even when machine check was not enabled. + */ + add_taint(TAINT_MACHINE_CHECK); + + __set_bit(i, toclear); + if (m.status & MCI_STATUS_EN) { /* if PCC was set, there's no way out */ no_way_out |= !!(m.status & MCI_STATUS_PCC); @@ -251,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code) no_way_out = 1; kill_it = 1; } + } else { + /* + * Machine check event was not enabled. Clear, but + * ignore. + */ + continue; } if (m.status & MCI_STATUS_MISCV) @@ -259,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); mce_get_rip(&m, regs); - if (error_code < 0) - m.tsc = 0; - if (error_code != -2) - mce_log(&m); + mce_log(&m); /* Did this bank cause the exception? */ /* Assume that the bank with uncorrectable errors did it, @@ -271,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) panicm = m; panicm_found = 1; } - - add_taint(TAINT_MACHINE_CHECK); } - /* Never do anything final in the polling timer */ - if (!regs) - goto out; - /* If we didn't find an uncorrectable error, pick the last one (shouldn't happen, just being safe). */ if (!panicm_found) @@ -325,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code) /* notify userspace ASAP */ set_thread_flag(TIF_MCE_NOTIFY); - out: /* the last thing we do is clear state */ - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + for (i = 0; i < banks; i++) { + if (test_bit(i, toclear)) + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } wrmsrl(MSR_IA32_MCG_STATUS, 0); out2: atomic_dec(&mce_entry); @@ -377,7 +463,7 @@ static void mcheck_timer(unsigned long data) WARN_ON(smp_processor_id() != data); if (mce_available(¤t_cpu_data)) - do_machine_check(NULL, 0); + machine_check_poll(MCP_TIMESTAMP); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -494,9 +580,10 @@ static void mce_init(void *dummy) u64 cap; int i; - /* Log the machine checks left over from the previous reset. - This also clears all registers */ - do_machine_check(NULL, mce_bootlog ? -1 : -2); + /* + * Log the machine checks left over from the previous reset. + */ + machine_check_poll(MCP_UC); set_in_cr4(X86_CR4_MCE); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 75d9dd25e3d..0069c653f4e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -231,7 +231,7 @@ asmlinkage void mce_threshold_interrupt(void) /* Log the machine check that caused the threshold event. */ - do_machine_check(NULL, 0); + machine_check_poll(MCP_TIMESTAMP); if (high & MASK_OVERFLOW_HI) { rdmsrl(address, m.misc); -- cgit v1.2.3 From f6d1826dfad0d15fd14a455facc80b91f2ee642f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 19 Feb 2009 15:44:58 -0800 Subject: x86, mce: use %ll instead of %L for 64-bit numbers Impact: Cleanup The standard spelling of a printf pattern for long long is "ll", not "L", which is for long double. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 268b05edade..60a114ca14f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -136,11 +136,11 @@ static void print_mce(struct mce *m) print_symbol("{%s}", m->ip); printk("\n"); } - printk(KERN_EMERG "TSC %Lx ", m->tsc); + printk(KERN_EMERG "TSC %llx ", m->tsc); if (m->addr) - printk("ADDR %Lx ", m->addr); + printk("ADDR %llx ", m->addr); if (m->misc) - printk("MISC %Lx ", m->misc); + printk("MISC %llx ", m->misc); printk("\n"); printk(KERN_EMERG "This is not a software problem!\n"); printk(KERN_EMERG "Run through mcelog --ascii to decode " @@ -945,7 +945,7 @@ static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { u64 b = bank[attr - bank_attrs]; - return sprintf(buf, "%Lx\n", b); + return sprintf(buf, "%llx\n", b); } static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, -- cgit v1.2.3 From 42f8faecf7a88371de0f30aebb052d1ae51762c0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 17 Feb 2009 11:46:42 +0800 Subject: x86: use percpu data for 4k hardirq and softirq stacks Impact: economize memory for large NR_CPUS percpu data is setup earlier than irq, we can use percpu data to economize memory. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- arch/x86/kernel/irq_32.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index e0f29be8ab0..90cd94aba69 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { } union irq_ctx { struct thread_info tinfo; u32 stack[THREAD_SIZE/sizeof(u32)]; -}; +} __attribute__((aligned(PAGE_SIZE))); -static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; -static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; +static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); +static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); -static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; -static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; +static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack); +static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack); static void call_on_stack(void *func, void *stack) { @@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) u32 *isp, arg1, arg2; curctx = (union irq_ctx *) current_thread_info(); - irqctx = hardirq_ctx[smp_processor_id()]; + irqctx = __get_cpu_var(hardirq_ctx); /* * this is where we switch to the IRQ stack. However, if we are @@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu) { union irq_ctx *irqctx; - if (hardirq_ctx[cpu]) + if (per_cpu(hardirq_ctx, cpu)) return; - irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; + irqctx = &per_cpu(hardirq_stack, cpu); irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); - hardirq_ctx[cpu] = irqctx; + per_cpu(hardirq_ctx, cpu) = irqctx; - irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE]; + irqctx = &per_cpu(softirq_stack, cpu); irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; irqctx->tinfo.preempt_count = 0; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); - softirq_ctx[cpu] = irqctx; + per_cpu(softirq_ctx, cpu) = irqctx; printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", - cpu, hardirq_ctx[cpu], softirq_ctx[cpu]); + cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); } void irq_ctx_exit(int cpu) { - hardirq_ctx[cpu] = NULL; + per_cpu(hardirq_ctx, cpu) = NULL; } asmlinkage void do_softirq(void) @@ -169,7 +170,7 @@ asmlinkage void do_softirq(void) if (local_softirq_pending()) { curctx = current_thread_info(); - irqctx = softirq_ctx[smp_processor_id()]; + irqctx = __get_cpu_var(softirq_ctx); irqctx->tinfo.task = curctx->task; irqctx->tinfo.previous_esp = current_stack_pointer; -- cgit v1.2.3 From b36128c830a8f5bd7d4981f5b0b69950f5928ee6 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: alloc_percpu: change percpu_ptr to per_cpu_ptr Impact: cleanup There are two allocated per-cpu accessor macros with almost identical spelling. The original and far more popular is per_cpu_ptr (44 files), so change over the other 4 files. tj: kill percpu_ptr() and update UP too Signed-off-by: Rusty Russell Cc: mingo@redhat.com Cc: lenb@kernel.org Cc: cpufreq@vger.kernel.org Signed-off-by: Tejun Heo --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 4b1c319d30c..22590cf688a 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (!data) return -ENOMEM; - data->acpi_data = percpu_ptr(acpi_perf_data, cpu); + data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); per_cpu(drv_data, cpu) = data; if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) -- cgit v1.2.3 From 11124411aa95827404d6bfdfc14c908e1b54513c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:09 +0900 Subject: x86: convert to the new dynamic percpu allocator Impact: use new dynamic allocator, unified access to static/dynamic percpu memory Convert to the new dynamic percpu allocator. * implement populate_extra_pte() for both 32 and 64 * update setup_per_cpu_areas() to use pcpu_setup_static() * define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() * define config HAVE_DYNAMIC_PER_CPU_AREA Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 62 +++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index d992e6cff73..2dce4355821 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -61,38 +61,56 @@ static inline void setup_percpu_segment(int cpu) */ void __init setup_per_cpu_areas(void) { - ssize_t size; - char *ptr; - int cpu; - - /* Copy section for each CPU (we discard the original) */ - size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + ssize_t size = __per_cpu_end - __per_cpu_start; + unsigned int nr_cpu_pages = DIV_ROUND_UP(size, PAGE_SIZE); + static struct page **pages; + size_t pages_size; + unsigned int cpu, i, j; + unsigned long delta; + size_t pcpu_unit_size; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); + pr_info("PERCPU: Allocating %zd bytes for static per cpu data\n", size); - pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); + pages_size = nr_cpu_pages * num_possible_cpus() * sizeof(pages[0]); + pages = alloc_bootmem(pages_size); + j = 0; for_each_possible_cpu(cpu) { + void *ptr; + + for (i = 0; i < nr_cpu_pages; i++) { #ifndef CONFIG_NEED_MULTIPLE_NODES - ptr = alloc_bootmem_pages(size); + ptr = alloc_bootmem_pages(PAGE_SIZE); #else - int node = early_cpu_to_node(cpu); - if (!node_online(node) || !NODE_DATA(node)) { - ptr = alloc_bootmem_pages(size); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d at %016lx\n", - cpu, __pa(ptr)); - } else { - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); - pr_debug("per cpu data for cpu%d on node%d at %016lx\n", - cpu, node, __pa(ptr)); - } + int node = early_cpu_to_node(cpu); + + if (!node_online(node) || !NODE_DATA(node)) { + ptr = alloc_bootmem_pages(PAGE_SIZE); + pr_info("cpu %d has no node %d or node-local " + "memory\n", cpu, node); + pr_debug("per cpu data for cpu%d at %016lx\n", + cpu, __pa(ptr)); + } else { + ptr = alloc_bootmem_pages_node(NODE_DATA(node), + PAGE_SIZE); + pr_debug("per cpu data for cpu%d on node%d " + "at %016lx\n", cpu, node, __pa(ptr)); + } #endif + memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); + pages[j++] = virt_to_page(ptr); + } + } + + pcpu_unit_size = pcpu_setup_static(populate_extra_pte, pages, size); - memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); - per_cpu_offset(cpu) = ptr - __per_cpu_start; + free_bootmem(__pa(pages), pages_size); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) { + per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); -- cgit v1.2.3 From ec5b3d32437571b8a742069a4cfd04edb6b6eda5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 23 Feb 2009 14:01:04 -0800 Subject: x86, mce: remove invalid __cpuinit/__cpuexit annotations Impact: Bug fix when CPU hotplug is disabled Correct the following broken __cpuinit/__cpuexit annotations: - mce_cpu_features() is called from mce_resume(), and so cannot be __cpuinit. - mce_disable_cpu() and mce_reenable_cpu() are called from mce_cpu_callback(), and so cannot be __cpuexit(). Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 60a114ca14f..0625993bf95 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -598,7 +598,7 @@ static void mce_init(void *dummy) } /* Add per CPU specific workarounds here */ -static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) +static void mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { @@ -1056,7 +1056,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu) } /* Make sure there are no machine checks on offlined CPUs. */ -static void __cpuexit mce_disable_cpu(void *h) +static void mce_disable_cpu(void *h) { int i; @@ -1066,7 +1066,7 @@ static void __cpuexit mce_disable_cpu(void *h) wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); } -static void __cpuexit mce_reenable_cpu(void *h) +static void mce_reenable_cpu(void *h) { int i; -- cgit v1.2.3 From 458a3e644c3327be529393982e24277eda8f1ac7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: x86: update populate_extra_pte() and add populate_extra_pmd() Impact: minor change to populate_extra_pte() and addition of pmd flavor Update populate_extra_pte() to return pointer to the pte_t for the specified address and add populate_extra_pmd() which only populates till the pmd and returns pointer to the pmd entry for the address. For 64bit, pud/pmd/pte fill functions are separated out from set_pte_vaddr[_pud]() and used for set_pte_vaddr[_pud]() and populate_extra_{pte|pmd}(). Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 2dce4355821..671e6528a82 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -41,6 +41,11 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +static void __init pcpu4k_populate_pte(unsigned long addr) +{ + populate_extra_pte(addr); +} + static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -104,7 +109,7 @@ void __init setup_per_cpu_areas(void) } } - pcpu_unit_size = pcpu_setup_static(populate_extra_pte, pages, size); + pcpu_unit_size = pcpu_setup_static(pcpu4k_populate_pte, pages, size); free_bootmem(__pa(pages), pages_size); -- cgit v1.2.3 From 8d408b4be37bc49c9086531f2ebe411cf5731746 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: percpu: give more latitude to arch specific first chunk initialization Impact: more latitude for first percpu chunk allocation The first percpu chunk serves the kernel static percpu area and may or may not contain extra room for further dynamic allocation. Initialization of the first chunk needs to be done before normal memory allocation service is up, so it has its own init path - pcpu_setup_static(). It seems archs need more latitude while initializing the first chunk for example to take advantage of large page mapping. This patch makes the following changes to allow this. * Define PERCPU_DYNAMIC_RESERVE to give arch hint about how much space to reserve in the first chunk for further dynamic allocation. * Rename pcpu_setup_static() to pcpu_setup_first_chunk(). * Make pcpu_setup_first_chunk() much more flexible by fetching page pointer by callback and adding optional @unit_size, @free_size and @base_addr arguments which allow archs to selectively part of chunk initialization to their likings. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 671e6528a82..d928e888720 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -41,6 +41,16 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +static struct page **pcpu4k_pages __initdata; +static int pcpu4k_nr_static_pages __initdata; + +static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) +{ + if (pageno < pcpu4k_nr_static_pages) + return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; + return NULL; +} + static void __init pcpu4k_populate_pte(unsigned long addr) { populate_extra_pte(addr); @@ -109,7 +119,10 @@ void __init setup_per_cpu_areas(void) } } - pcpu_unit_size = pcpu_setup_static(pcpu4k_populate_pte, pages, size); + pcpu4k_pages = pages; + pcpu4k_nr_static_pages = nr_cpu_pages; + pcpu_unit_size = pcpu_setup_first_chunk(pcpu4k_get_page, size, 0, 0, + NULL, pcpu4k_populate_pte); free_bootmem(__pa(pages), pages_size); -- cgit v1.2.3 From 5f5d8405d1c50f5cf7e1dbfe9c9b44e2f015c8fd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: x86: separate out setup_pcpu_4k() from setup_per_cpu_areas() Impact: modularize percpu first chunk allocation x86 is gonna have a few different strategies for the first chunk allocation. Modularize it by separating out the current allocation mechanism into pcpu_alloc_bootmem() and setup_pcpu_4k(). Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 144 +++++++++++++++++++++++++++++------------ 1 file changed, 102 insertions(+), 42 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index d928e888720..4a17c96f4f6 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,52 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +/** + * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu + * @cpu: cpu to allocate for + * @size: size allocation in bytes + * @align: alignment + * + * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper + * does the right thing for NUMA regardless of the current + * configuration. + * + * RETURNS: + * Pointer to the allocated area on success, NULL on failure. + */ +static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, + unsigned long align) +{ + const unsigned long goal = __pa(MAX_DMA_ADDRESS); +#ifdef CONFIG_NEED_MULTIPLE_NODES + int node = early_cpu_to_node(cpu); + void *ptr; + + if (!node_online(node) || !NODE_DATA(node)) { + ptr = __alloc_bootmem_nopanic(size, align, goal); + pr_info("cpu %d has no node %d or node-local memory\n", + cpu, node); + pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", + cpu, size, __pa(ptr)); + } else { + ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), + size, align, goal); + pr_debug("per cpu data for cpu%d %lu bytes on node%d at " + "%016lx\n", cpu, size, node, __pa(ptr)); + } + return ptr; +#else + return __alloc_bootmem_nopanic(size, align, goal); +#endif +} + +/* + * 4k page allocator + * + * This is the basic allocator. Static percpu area is allocated + * page-by-page and most of initialization is done by the generic + * setup function. + */ static struct page **pcpu4k_pages __initdata; static int pcpu4k_nr_static_pages __initdata; @@ -56,6 +103,51 @@ static void __init pcpu4k_populate_pte(unsigned long addr) populate_extra_pte(addr); } +static ssize_t __init setup_pcpu_4k(size_t static_size) +{ + size_t pages_size; + unsigned int cpu; + int i, j; + ssize_t ret; + + pcpu4k_nr_static_pages = PFN_UP(static_size); + + /* unaligned allocations can't be freed, round up to page size */ + pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() + * sizeof(pcpu4k_pages[0])); + pcpu4k_pages = alloc_bootmem(pages_size); + + /* allocate and copy */ + j = 0; + for_each_possible_cpu(cpu) + for (i = 0; i < pcpu4k_nr_static_pages; i++) { + void *ptr; + + ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); + if (!ptr) + goto enomem; + + memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); + pcpu4k_pages[j++] = virt_to_page(ptr); + } + + /* we're ready, commit */ + pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", + pcpu4k_nr_static_pages, static_size); + + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL, + pcpu4k_populate_pte); + goto out_free_ar; + +enomem: + while (--j >= 0) + free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); + ret = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pcpu4k_pages), pages_size); + return ret; +} + static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -76,56 +168,24 @@ static inline void setup_percpu_segment(int cpu) */ void __init setup_per_cpu_areas(void) { - ssize_t size = __per_cpu_end - __per_cpu_start; - unsigned int nr_cpu_pages = DIV_ROUND_UP(size, PAGE_SIZE); - static struct page **pages; - size_t pages_size; - unsigned int cpu, i, j; + size_t static_size = __per_cpu_end - __per_cpu_start; + unsigned int cpu; unsigned long delta; size_t pcpu_unit_size; + ssize_t ret; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); - pr_info("PERCPU: Allocating %zd bytes for static per cpu data\n", size); - - pages_size = nr_cpu_pages * num_possible_cpus() * sizeof(pages[0]); - pages = alloc_bootmem(pages_size); - - j = 0; - for_each_possible_cpu(cpu) { - void *ptr; - - for (i = 0; i < nr_cpu_pages; i++) { -#ifndef CONFIG_NEED_MULTIPLE_NODES - ptr = alloc_bootmem_pages(PAGE_SIZE); -#else - int node = early_cpu_to_node(cpu); - - if (!node_online(node) || !NODE_DATA(node)) { - ptr = alloc_bootmem_pages(PAGE_SIZE); - pr_info("cpu %d has no node %d or node-local " - "memory\n", cpu, node); - pr_debug("per cpu data for cpu%d at %016lx\n", - cpu, __pa(ptr)); - } else { - ptr = alloc_bootmem_pages_node(NODE_DATA(node), - PAGE_SIZE); - pr_debug("per cpu data for cpu%d on node%d " - "at %016lx\n", cpu, node, __pa(ptr)); - } -#endif - memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); - pages[j++] = virt_to_page(ptr); - } - } - pcpu4k_pages = pages; - pcpu4k_nr_static_pages = nr_cpu_pages; - pcpu_unit_size = pcpu_setup_first_chunk(pcpu4k_get_page, size, 0, 0, - NULL, pcpu4k_populate_pte); + /* allocate percpu area */ + ret = setup_pcpu_4k(static_size); + if (ret < 0) + panic("cannot allocate static percpu area (%zu bytes, err=%zd)", + static_size, ret); - free_bootmem(__pa(pages), pages_size); + pcpu_unit_size = ret; + /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; -- cgit v1.2.3 From 89c9215165ca609096e845926d9a18f1306176a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: x86: add embedding percpu first chunk allocator Impact: add better first percpu allocation for !NUMA On !NUMA, we can simply allocate contiguous memory and use it for the first chunk without mapping it into vmalloc area. As the memory area is covered by the large page physical memory mapping, it allows the dynamic perpcu allocator to not add any TLB overhead for the static percpu area and whatever falls into the first chunk and the implementation is very simple too. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 86 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 85 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 4a17c96f4f6..fd4c399675d 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -42,6 +42,35 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +/** + * pcpu_need_numa - determine percpu allocation needs to consider NUMA + * + * If NUMA is not configured or there is only one NUMA node available, + * there is no reason to consider NUMA. This function determines + * whether percpu allocation should consider NUMA or not. + * + * RETURNS: + * true if NUMA should be considered; otherwise, false. + */ +static bool __init pcpu_need_numa(void) +{ +#ifdef CONFIG_NEED_MULTIPLE_NODES + pg_data_t *last = NULL; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + int node = early_cpu_to_node(cpu); + + if (node_online(node) && NODE_DATA(node) && + last && last != NODE_DATA(node)) + return true; + + last = NODE_DATA(node); + } +#endif + return false; +} + /** * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu * @cpu: cpu to allocate for @@ -81,6 +110,59 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, #endif } +/* + * Embedding allocator + * + * The first chunk is sized to just contain the static area plus + * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using + * bootmem allocator and used as-is without being mapped into vmalloc + * area. This enables the first chunk to piggy back on the linear + * physical PMD mapping and doesn't add any additional pressure to + * TLB. + */ +static void *pcpue_ptr __initdata; +static size_t pcpue_unit_size __initdata; + +static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) +{ + return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + + ((size_t)pageno << PAGE_SHIFT)); +} + +static ssize_t __init setup_pcpu_embed(size_t static_size) +{ + unsigned int cpu; + + /* + * If large page isn't supported, there's no benefit in doing + * this. Also, embedding allocation doesn't play well with + * NUMA. + */ + if (!cpu_has_pse || pcpu_need_numa()) + return -EINVAL; + + /* allocate and copy */ + pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + pcpue_unit_size = max(pcpue_unit_size, PCPU_MIN_UNIT_SIZE); + pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, + PAGE_SIZE); + if (!pcpue_ptr) + return -ENOMEM; + + for_each_possible_cpu(cpu) + memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load, + static_size); + + /* we're ready, commit */ + pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", + pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size); + + return pcpu_setup_first_chunk(pcpue_get_page, static_size, + pcpue_unit_size, + pcpue_unit_size - static_size, pcpue_ptr, + NULL); +} + /* * 4k page allocator * @@ -178,7 +260,9 @@ void __init setup_per_cpu_areas(void) NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* allocate percpu area */ - ret = setup_pcpu_4k(static_size); + ret = setup_pcpu_embed(static_size); + if (ret < 0) + ret = setup_pcpu_4k(static_size); if (ret < 0) panic("cannot allocate static percpu area (%zu bytes, err=%zd)", static_size, ret); -- cgit v1.2.3 From 8ac837571491e239e64bd87863c1679d8002e8a2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:22 +0900 Subject: x86: add remapping percpu first chunk allocator Impact: add better first percpu allocation for NUMA On NUMA, embedding allocator can't be used as different units can't be made to fall in the correct NUMA nodes. To use large page mapping, each unit needs to be remapped. However, percpu areas are usually much smaller than large page size and unused space hurts a lot as the number of cpus grow. This allocator remaps large pages for each chunk but gives back unused part to the bootmem allocator making the large pages mapped twice. This adds slightly to the TLB pressure but is much better than using 4k mappings while still being NUMA-friendly. Ingo suggested that this would be the correct approach for NUMA. Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 137 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index fd4c399675d..2d946a8f78b 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -110,6 +110,133 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, #endif } +/* + * Remap allocator + * + * This allocator uses PMD page as unit. A PMD page is allocated for + * each cpu and each is remapped into vmalloc area using PMD mapping. + * As PMD page is quite large, only part of it is used for the first + * chunk. Unused part is returned to the bootmem allocator. + * + * So, the PMD pages are mapped twice - once to the physical mapping + * and to the vmalloc area for the first percpu chunk. The double + * mapping does add one more PMD TLB entry pressure but still is much + * better than only using 4k mappings while still being NUMA friendly. + */ +#ifdef CONFIG_NEED_MULTIPLE_NODES +static size_t pcpur_size __initdata; +static void **pcpur_ptrs __initdata; + +static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) +{ + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpur_size) + return NULL; + + return virt_to_page(pcpur_ptrs[cpu] + off); +} + +static ssize_t __init setup_pcpu_remap(size_t static_size) +{ + static struct vm_struct vm; + pg_data_t *last; + size_t ptrs_size; + unsigned int cpu; + ssize_t ret; + + /* + * If large page isn't supported, there's no benefit in doing + * this. Also, on non-NUMA, embedding is better. + */ + if (!cpu_has_pse || pcpu_need_numa()) + return -EINVAL; + + last = NULL; + for_each_possible_cpu(cpu) { + int node = early_cpu_to_node(cpu); + + if (node_online(node) && NODE_DATA(node) && + last && last != NODE_DATA(node)) + goto proceed; + + last = NODE_DATA(node); + } + return -EINVAL; + +proceed: + /* + * Currently supports only single page. Supporting multiple + * pages won't be too difficult if it ever becomes necessary. + */ + pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + if (pcpur_size > PMD_SIZE) { + pr_warning("PERCPU: static data is larger than large page, " + "can't use large page\n"); + return -EINVAL; + } + + /* allocate pointer array and alloc large pages */ + ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); + pcpur_ptrs = alloc_bootmem(ptrs_size); + + for_each_possible_cpu(cpu) { + pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); + if (!pcpur_ptrs[cpu]) + goto enomem; + + /* + * Only use pcpur_size bytes and give back the rest. + * + * Ingo: The 2MB up-rounding bootmem is needed to make + * sure the partial 2MB page is still fully RAM - it's + * not well-specified to have a PAT-incompatible area + * (unmapped RAM, device memory, etc.) in that hole. + */ + free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), + PMD_SIZE - pcpur_size); + + memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); + } + + /* allocate address and map */ + vm.flags = VM_ALLOC; + vm.size = num_possible_cpus() * PMD_SIZE; + vm_area_register_early(&vm, PMD_SIZE); + + for_each_possible_cpu(cpu) { + pmd_t *pmd; + + pmd = populate_extra_pmd((unsigned long)vm.addr + + cpu * PMD_SIZE); + set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), + PAGE_KERNEL_LARGE)); + } + + /* we're ready, commit */ + pr_info("PERCPU: Remapped at %p with large pages, static data " + "%zu bytes\n", vm.addr, static_size); + + ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE, + pcpur_size - static_size, vm.addr, NULL); + goto out_free_ar; + +enomem: + for_each_possible_cpu(cpu) + if (pcpur_ptrs[cpu]) + free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); + ret = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pcpur_ptrs), ptrs_size); + return ret; +} +#else +static ssize_t __init setup_pcpu_remap(size_t static_size) +{ + return -EINVAL; +} +#endif + /* * Embedding allocator * @@ -259,8 +386,14 @@ void __init setup_per_cpu_areas(void) pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); - /* allocate percpu area */ - ret = setup_pcpu_embed(static_size); + /* + * Allocate percpu area. If PSE is supported, try to make use + * of large page mappings. Please read comments on top of + * each allocator for details. + */ + ret = setup_pcpu_remap(static_size); + if (ret < 0) + ret = setup_pcpu_embed(static_size); if (ret < 0) ret = setup_pcpu_4k(static_size); if (ret < 0) -- cgit v1.2.3 From 41fdff322e26c4a86fe65cf577f2556a650cb7bc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:30 +0100 Subject: x86, mce, cmci: export MAX_NR_BANKS Impact: Cleanup (code movement) Move MAX_NR_BANKS into mce.h because it's needed there for followup patches. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index a4a7c686ce9..39f8bb525a7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -37,12 +37,6 @@ #define MISC_MCELOG_MINOR 227 -/* - * To support more than 128 would need to escape the predefined - * Linux defined extended banks first. - */ -#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) - atomic_t mce_entry; static int mce_dont_init; -- cgit v1.2.3 From b276268631af3a1b0df871e10d19d492f0513d4b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:31 +0100 Subject: x86, mce, cmci: factor out threshold interrupt handler Impact: cleanup; preparation for feature The mce_amd_64 code has an own private MC threshold vector with an own interrupt handler. Since Intel needs a similar handler it makes sense to share the vector because both can not be active at the same time. I factored the common APIC handler code into a separate file which can be used by both the Intel or AMD MC code. This is needed for the next patch which adds an Intel specific CMCI handler. This patch should be a nop for AMD, it just moves some code around. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 1 + arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 15 ++++++--------- arch/x86/kernel/cpu/mcheck/threshold.c | 24 ++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 9 deletions(-) create mode 100644 arch/x86/kernel/cpu/mcheck/threshold.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index d7d2323bbb6..b2f89829bbe 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o +obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index e82c8208b81..49705be9820 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = { static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ +static void amd_threshold_interrupt(void); + /* * CPU Initialization */ @@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) tr.reset = 0; tr.old_limit = 0; threshold_restart_bank(&tr); + + mce_threshold_vector = amd_threshold_interrupt; } } } @@ -187,16 +191,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) * the interrupt goes off when error_count reaches threshold_limit. * the handler will simply log mcelog w/ software defined bank number. */ -asmlinkage void mce_threshold_interrupt(void) +static void amd_threshold_interrupt(void) { unsigned int bank, block; struct mce m; u32 low = 0, high = 0, address = 0; - ack_APIC_irq(); - exit_idle(); - irq_enter(); - mce_setup(&m); /* assume first bank caused it */ @@ -241,13 +241,10 @@ asmlinkage void mce_threshold_interrupt(void) + bank * NR_BLOCKS + block; mce_log(&m); - goto out; + return; } } } -out: - inc_irq_stat(irq_threshold_count); - irq_exit(); } /* diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c new file mode 100644 index 00000000000..4319142413d --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -0,0 +1,24 @@ +/* Common corrected MCE threshold handler code */ +#include +#include +#include +#include +#include + +static void default_threshold_interrupt(void) +{ + printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n", + THRESHOLD_APIC_VECTOR); +} + +void (*mce_threshold_vector)(void) = default_threshold_interrupt; + +asmlinkage void mce_threshold_interrupt(void) +{ + ack_APIC_irq(); + exit_idle(); + irq_enter(); + inc_irq_stat(irq_threshold_count); + mce_threshold_vector(); + irq_exit(); +} -- cgit v1.2.3 From f9695df42cdbca78530b4458c38ecfdd0bb90079 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:32 +0100 Subject: x86, mce, cmci: avoid potential reentry of threshold interrupt Impact: minor bugfix The threshold handler on AMD (and soon on Intel) could be theoretically reentered by the hardware. This could lead to corrupted events because the machine check poll code assumes it is not reentered. Move the APIC ACK to the end of the interrupt handler to let the hardware avoid that. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/threshold.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 4319142413d..e4b8a3833fc 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -15,10 +15,11 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt; asmlinkage void mce_threshold_interrupt(void) { - ack_APIC_irq(); exit_idle(); irq_enter(); inc_irq_stat(irq_threshold_count); mce_threshold_vector(); irq_exit(); + /* Ack only at the end to avoid potential reentry */ + ack_APIC_irq(); } -- cgit v1.2.3 From 8457c84d68678cbfd4167a9073b89da58e48c037 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:33 +0100 Subject: x86, mce: replace machine check events logged interval with ratelimit Impact: behavior change, use common code Use a standard leaky bucket ratelimit for the machine check warning print interval instead of waiting every check_interval. Also decrease the limit to twice per minute. This interacts better with threshold interrupts because they can happen more often than check_interval. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 39f8bb525a7..9017609cadd 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -488,11 +489,11 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger); */ int mce_notify_user(void) { + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + clear_thread_flag(TIF_MCE_NOTIFY); if (test_and_clear_bit(0, ¬ify_user)) { - static unsigned long last_print; - unsigned long now = jiffies; - wake_up_interruptible(&mce_wait); /* @@ -503,10 +504,8 @@ int mce_notify_user(void) if (trigger[0] && !work_pending(&mce_trigger_work)) schedule_work(&mce_trigger_work); - if (time_after_eq(now, last_print + (check_interval*HZ))) { - last_print = now; + if (__ratelimit(&ratelimit)) printk(KERN_INFO "Machine check events logged\n"); - } return 1; } -- cgit v1.2.3 From ee031c31d6381d004bfd386c2e45821211507499 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:34 +0100 Subject: x86, mce, cmci: use polled banks bitmap in machine check poller Define a per cpu bitmap that contains the banks polled by the machine check poller. This is needed for the CMCI code in the next patches to be able to disable polling on specific banks. The bank by default contains all banks, so there is no behaviour change. Only future code will remove some banks from the polling set. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 16 ++++++++++++---- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 3 ++- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 9017609cadd..a8ff38bfa6e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -62,6 +62,11 @@ static char *trigger_argv[2] = { trigger, NULL }; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +/* MCA banks polled by the period polling timer for corrected events */ +DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { + [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL +}; + /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { @@ -191,7 +196,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) * * This is executed in standard interrupt context. */ -void machine_check_poll(enum mcp_flags flags) +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce m; int i; @@ -200,7 +205,7 @@ void machine_check_poll(enum mcp_flags flags) rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); for (i = 0; i < banks; i++) { - if (!bank[i]) + if (!bank[i] || !test_bit(i, *b)) continue; m.misc = 0; @@ -458,7 +463,8 @@ static void mcheck_timer(unsigned long data) WARN_ON(smp_processor_id() != data); if (mce_available(¤t_cpu_data)) - machine_check_poll(MCP_TIMESTAMP); + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -572,11 +578,13 @@ static void mce_init(void *dummy) { u64 cap; int i; + mce_banks_t all_banks; /* * Log the machine checks left over from the previous reset. */ - machine_check_poll(MCP_UC); + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC, &all_banks); set_in_cr4(X86_CR4_MCE); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 49705be9820..ee8bfcd3aa3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -231,7 +231,8 @@ static void amd_threshold_interrupt(void) /* Log the machine check that caused the threshold event. */ - machine_check_poll(MCP_TIMESTAMP); + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); if (high & MASK_OVERFLOW_HI) { rdmsrl(address, m.misc); -- cgit v1.2.3 From 88ccbedd9ca85d1aca6a6f99df48dce87b7c02d4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:36 +0100 Subject: x86, mce, cmci: add CMCI support Impact: Major new feature Intel CMCI (Corrected Machine Check Interrupt) is a new feature on Nehalem CPUs. It allows the CPU to trigger interrupts on corrected events, which allows faster reaction to them instead of with the traditional polling timer. Also use CMCI to discover shared banks. Machine check banks can be shared by CPU threads or even cores. Using the CMCI enable bit it is possible to detect the fact that another CPU already saw a specific bank. Use this to assign shared banks only to one CPU to avoid reporting duplicated events. On CPU hot unplug bank sharing is re discovered. This is done using a thread that cycles through all the CPUs. To avoid races between the poller and CMCI we only poll for banks that are not CMCI capable and only check CMCI owned banks on a interrupt. The shared banks ownership information is currently only used for CMCI interrupts, not polled banks. The sharing discovery code follows the algorithm recommended in the IA32 SDM Vol3a 14.5.2.1 The CMCI interrupt handler just calls the machine check poller to pick up the machine check event that caused the interrupt. I decided not to implement a separate threshold event like the AMD version has, because the threshold is always one currently and adding another event didn't seem to add any value. Some code inspired by Yunhong Jiang's Xen implementation, which was in term inspired by a earlier CMCI implementation by me. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 16 ++- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 205 ++++++++++++++++++++++++++++++ 2 files changed, 218 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index a8ff38bfa6e..bfbd5323a63 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -166,7 +166,7 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) panic(msg); } -static int mce_available(struct cpuinfo_x86 *c) +int mce_available(struct cpuinfo_x86 *c) { if (mce_dont_init) return 0; @@ -1060,9 +1060,12 @@ static __cpuinit void mce_remove_device(unsigned int cpu) static void mce_disable_cpu(void *h) { int i; + unsigned long action = *(unsigned long *)h; if (!mce_available(¤t_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) + cmci_clear(); for (i = 0; i < banks; i++) wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); } @@ -1070,9 +1073,12 @@ static void mce_disable_cpu(void *h) static void mce_reenable_cpu(void *h) { int i; + unsigned long action = *(unsigned long *)h; if (!mce_available(¤t_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) + cmci_reenable(); for (i = 0; i < banks; i++) wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); } @@ -1100,13 +1106,17 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: del_timer_sync(t); - smp_call_function_single(cpu, mce_disable_cpu, NULL, 1); + smp_call_function_single(cpu, mce_disable_cpu, &action, 1); break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies_relative(jiffies + next_interval); add_timer_on(t, cpu); - smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1); + smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); + break; + case CPU_POST_DEAD: + /* intentionally ignoring frozen here */ + cmci_rediscover(cpu); break; } return NOTIFY_OK; diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 1b1491a76b5..a518ec8c6f8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -1,6 +1,8 @@ /* * Intel specific MCE features. * Copyright 2004 Zwane Mwaikambo + * Copyright (C) 2008, 2009 Intel Corporation + * Author: Andi Kleen */ #include @@ -12,6 +14,7 @@ #include #include #include +#include asmlinkage void smp_thermal_interrupt(void) { @@ -84,7 +87,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) return; } +/* + * Support for Intel Correct Machine Check Interrupts. This allows + * the CPU to raise an interrupt when a corrected machine check happened. + * Normally we pick those up using a regular polling timer. + * Also supports reliable discovery of shared banks. + */ + +static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); + +/* + * cmci_discover_lock protects against parallel discovery attempts + * which could race against each other. + */ +static DEFINE_SPINLOCK(cmci_discover_lock); + +#define CMCI_THRESHOLD 1 + +static __cpuinit int cmci_supported(int *banks) +{ + u64 cap; + + /* + * Vendor check is not strictly needed, but the initial + * initialization is vendor keyed and this + * makes sure none of the backdoors are entered otherwise. + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + if (!cpu_has_apic || lapic_get_maxlvt() < 6) + return 0; + rdmsrl(MSR_IA32_MCG_CAP, cap); + *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); + return !!(cap & MCG_CMCI_P); +} + +/* + * The interrupt handler. This is called on every event. + * Just call the poller directly to log any events. + * This could in theory increase the threshold under high load, + * but doesn't for now. + */ +static void intel_threshold_interrupt(void) +{ + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + mce_notify_user(); +} + +static void print_update(char *type, int *hdr, int num) +{ + if (*hdr == 0) + printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); + *hdr = 1; + printk(KERN_CONT " %s:%d", type, num); +} + +/* + * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks + * on this CPU. Use the algorithm recommended in the SDM to discover shared + * banks. + */ +static __cpuinit void cmci_discover(int banks, int boot) +{ + unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); + int hdr = 0; + int i; + + spin_lock(&cmci_discover_lock); + for (i = 0; i < banks; i++) { + u64 val; + + if (test_bit(i, owned)) + continue; + + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + + /* Already owned by someone else? */ + if (val & CMCI_EN) { + if (test_and_clear_bit(i, owned) || boot) + print_update("SHD", &hdr, i); + __clear_bit(i, __get_cpu_var(mce_poll_banks)); + continue; + } + + val |= CMCI_EN | CMCI_THRESHOLD; + wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + + /* Did the enable bit stick? -- the bank supports CMCI */ + if (val & CMCI_EN) { + if (!test_and_set_bit(i, owned) || boot) + print_update("CMCI", &hdr, i); + __clear_bit(i, __get_cpu_var(mce_poll_banks)); + } else { + WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); + } + } + spin_unlock(&cmci_discover_lock); + if (hdr) + printk(KERN_CONT "\n"); +} + +/* + * Just in case we missed an event during initialization check + * all the CMCI owned banks. + */ +__cpuinit void cmci_recheck(void) +{ + unsigned long flags; + int banks; + + if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) + return; + local_irq_save(flags); + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + local_irq_restore(flags); +} + +/* + * Disable CMCI on this CPU for all banks it owns when it goes down. + * This allows other CPUs to claim the banks on rediscovery. + */ +void __cpuexit cmci_clear(void) +{ + int i; + int banks; + u64 val; + + if (!cmci_supported(&banks)) + return; + spin_lock(&cmci_discover_lock); + for (i = 0; i < banks; i++) { + if (!test_bit(i, __get_cpu_var(mce_banks_owned))) + continue; + /* Disable CMCI */ + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); + wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + __clear_bit(i, __get_cpu_var(mce_banks_owned)); + } + spin_unlock(&cmci_discover_lock); +} + +/* + * After a CPU went down cycle through all the others and rediscover + * Must run in process context. + */ +void __cpuexit cmci_rediscover(int dying) +{ + int banks; + int cpu; + cpumask_var_t old; + + if (!cmci_supported(&banks)) + return; + if (!alloc_cpumask_var(&old, GFP_KERNEL)) + return; + cpumask_copy(old, ¤t->cpus_allowed); + + for_each_online_cpu (cpu) { + if (cpu == dying) + continue; + if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu))) + continue; + /* Recheck banks in case CPUs don't all have the same */ + if (cmci_supported(&banks)) + cmci_discover(banks, 0); + } + + set_cpus_allowed_ptr(current, old); + free_cpumask_var(old); +} + +/* + * Reenable CMCI on this CPU in case a CPU down failed. + */ +void cmci_reenable(void) +{ + int banks; + if (cmci_supported(&banks)) + cmci_discover(banks, 0); +} + +static __cpuinit void intel_init_cmci(void) +{ + int banks; + + if (!cmci_supported(&banks)) + return; + + mce_threshold_vector = intel_threshold_interrupt; + cmci_discover(banks, 1); + /* + * For CPU #0 this runs with still disabled APIC, but that's + * ok because only the vector is set up. We still do another + * check for the banks later for CPU #0 just to make sure + * to not miss any events. + */ + apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); + cmci_recheck(); +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); + intel_init_cmci(); } -- cgit v1.2.3 From df20e2eb3e59b8625021a1bc8b1b53a4edc6008b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 24 Feb 2009 13:19:02 -0800 Subject: x86, mce, cmci: remove incorrect __cpuinit/__cpuexit annotations Impact: Bug fix on UP The MCE code is reinitialized from resume, so we can't use __cpuinit/__cpuexit for most of the code. Remove those annotations for anything downstream of mce_init(). Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index a518ec8c6f8..7a2e10fcfa3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -104,7 +104,7 @@ static DEFINE_SPINLOCK(cmci_discover_lock); #define CMCI_THRESHOLD 1 -static __cpuinit int cmci_supported(int *banks) +static int cmci_supported(int *banks) { u64 cap; @@ -147,7 +147,7 @@ static void print_update(char *type, int *hdr, int num) * on this CPU. Use the algorithm recommended in the SDM to discover shared * banks. */ -static __cpuinit void cmci_discover(int banks, int boot) +static void cmci_discover(int banks, int boot) { unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); int hdr = 0; @@ -192,7 +192,7 @@ static __cpuinit void cmci_discover(int banks, int boot) * Just in case we missed an event during initialization check * all the CMCI owned banks. */ -__cpuinit void cmci_recheck(void) +void cmci_recheck(void) { unsigned long flags; int banks; @@ -208,7 +208,7 @@ __cpuinit void cmci_recheck(void) * Disable CMCI on this CPU for all banks it owns when it goes down. * This allows other CPUs to claim the banks on rediscovery. */ -void __cpuexit cmci_clear(void) +void cmci_clear(void) { int i; int banks; @@ -233,7 +233,7 @@ void __cpuexit cmci_clear(void) * After a CPU went down cycle through all the others and rediscover * Must run in process context. */ -void __cpuexit cmci_rediscover(int dying) +void cmci_rediscover(int dying) { int banks; int cpu; -- cgit v1.2.3 From 5ca8681ca10f671427710f4954644359856581a3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:37 +0100 Subject: x86, mce, cmci: disable CMCI on rebooting Impact: Avoids confusing other OSes. Disable the CMCI vector on reboot to avoid confusing other OS. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 570f36e44e5..648676f0b50 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -868,6 +868,14 @@ void clear_local_APIC(void) apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); } #endif +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 6) { + v = apic_read(APIC_LVTCMCI); + if (!(v & APIC_LVT_MASKED)) + apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED); + } +#endif + /* * Clean APIC state for other OSs: */ -- cgit v1.2.3 From be71b8553d0522aba535a815baaebb1f0bb9a9ec Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:38 +0100 Subject: x86, mce, cmci: recheck CMCI banks after APIC has been enabled on CPU #0 Impact: Fix marginal race condition One the first CPU the machine checks are enabled early before the local APIC is enabled. This could in theory lead to some lost CMCI events very early during boot because CMCIs cannot be delivered with disabled LAPIC. The poller also doesn't recover from this because it doesn't check CMCI banks. Add an explicit CMCI banks check after the LAPIC is enabled. This is only done for CPU #0, the other CPUs only initialize machine checks after the LAPIC is on. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 648676f0b50..57b53774d98 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -1270,6 +1271,12 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_LVT1, value); preempt_enable(); + +#ifdef CONFIG_X86_MCE_INTEL + /* Recheck CMCI information after local APIC is up on CPU #0 */ + if (smp_processor_id() == 0) + cmci_recheck(); +#endif } void __cpuinit end_local_APIC_setup(void) -- cgit v1.2.3 From 24ff954233ecfd45801383f831626f88937ebe6f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 Feb 2009 10:38:10 +0900 Subject: x86, percpu: fix minor bugs in setup_percpu.c Recent changes in setup_percpu.c made a now meaningless DBG() statement fail to compile and introduced a comparison-of-different-types warning. Fix them. Compile failure is reported by Ingo Molnar. Signed-off-by: Tejun Heo Reported-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 2d946a8f78b..c29f301d388 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -270,7 +270,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) /* allocate and copy */ pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); - pcpue_unit_size = max(pcpue_unit_size, PCPU_MIN_UNIT_SIZE); + pcpue_unit_size = max_t(size_t, pcpue_unit_size, PCPU_MIN_UNIT_SIZE); pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, PAGE_SIZE); if (!pcpue_ptr) @@ -438,8 +438,6 @@ void __init setup_per_cpu_areas(void) */ if (cpu == boot_cpu_id) switch_to_new_gdt(cpu); - - DBG("PERCPU: cpu %4d %p\n", cpu, ptr); } /* indicate the early static arrays will soon be gone */ -- cgit v1.2.3 From 73af76dfd1f998dba71d8e8e785cbe77a990bf17 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 4 Mar 2009 11:47:17 +0100 Subject: x86, mce: fix build failure in arch/x86/kernel/cpu/mcheck/threshold.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: build fix The APIC code rewrite in the x86 tree broke the x86/mce branch: arch/x86/kernel/cpu/mcheck/threshold.c: In function ‘mce_threshold_interrupt’: arch/x86/kernel/cpu/mcheck/threshold.c:24: error: implicit declaration of function ‘ack_APIC_irq’ Also tidy up the file a bit while at it. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/threshold.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index e4b8a3833fc..23ee9e730f7 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -1,9 +1,13 @@ -/* Common corrected MCE threshold handler code */ -#include +/* + * Common corrected MCE threshold handler code: + */ #include -#include +#include + #include +#include #include +#include static void default_threshold_interrupt(void) { -- cgit v1.2.3 From ff0c0874905fb312ca1491bbdac2653b0b48c20b Mon Sep 17 00:00:00 2001 From: Brian Maly Date: Tue, 3 Mar 2009 21:55:31 -0500 Subject: x86: fix DMI on EFI Impact: reactivate DMI quirks on EFI hardware DMI tables are loaded by EFI, so the dmi calls must happen after efi_init() and not before. Currently Apple hardware uses DMI to determine the framebuffer mappings for efifb. Without DMI working you also have no video on MacBook Pro. This patch resolves the DMI issue for EFI hardware (DMI is now properly detected at boot), and additionally efifb now loads on Apple hardware (i.e. video works). Signed-off-by: Brian Maly Acked-by: Yinghai Lu Cc: ying.huang@intel.com LKML-Reference: <49ADEDA3.1030406@redhat.com> Signed-off-by: Ingo Molnar arch/x86/kernel/setup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) --- arch/x86/kernel/setup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c461f6d6907..6a8811a6932 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -770,6 +770,9 @@ void __init setup_arch(char **cmdline_p) finish_e820_parsing(); + if (efi_enabled) + efi_init(); + dmi_scan_machine(); dmi_check_system(bad_bios_dmi_table); @@ -789,8 +792,6 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); - if (efi_enabled) - efi_init(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { -- cgit v1.2.3 From dd39ecf522ba86c70809715af46e6557f6491131 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 4 Mar 2009 10:58:33 +0800 Subject: x86: EFI: Back efi_ioremap with init_memory_mapping instead of FIX_MAP Impact: Fix boot failure on EFI system with large runtime memory range Brian Maly reported that some EFI system with large runtime memory range can not boot. Because the FIX_MAP used to map runtime memory range is smaller than run time memory range. This patch fixes this issue by re-implement efi_ioremap() with init_memory_mapping(). Reported-and-tested-by: Brian Maly Signed-off-by: Huang Ying Cc: Brian Maly Cc: Yinghai Lu LKML-Reference: <1236135513.6204.306.camel@yhuang-dev.sh.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/efi.c | 7 +++++-- arch/x86/kernel/efi_64.c | 21 ++++----------------- 2 files changed, 9 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 1119d247fe1..eb1ef3b67dd 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -467,7 +467,7 @@ void __init efi_enter_virtual_mode(void) efi_memory_desc_t *md; efi_status_t status; unsigned long size; - u64 end, systab, addr, npages; + u64 end, systab, addr, npages, end_pfn; void *p, *va; efi.systab = NULL; @@ -479,7 +479,10 @@ void __init efi_enter_virtual_mode(void) size = md->num_pages << EFI_PAGE_SHIFT; end = md->phys_addr + size; - if (PFN_UP(end) <= max_low_pfn_mapped) + end_pfn = PFN_UP(end); + if (end_pfn <= max_low_pfn_mapped + || (end_pfn > (1UL << (32 - PAGE_SHIFT)) + && end_pfn <= max_pfn_mapped)) va = __va(md->phys_addr); else va = efi_ioremap(md->phys_addr, size); diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index 652c5287215..cb783b92c50 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -99,24 +99,11 @@ void __init efi_call_phys_epilog(void) void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) { - static unsigned pages_mapped __initdata; - unsigned i, pages; - unsigned long offset; + unsigned long last_map_pfn; - pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr); - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - - if (pages_mapped + pages > MAX_EFI_IO_PAGES) + last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); + if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) return NULL; - for (i = 0; i < pages; i++) { - __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped, - phys_addr, PAGE_KERNEL); - phys_addr += PAGE_SIZE; - pages_mapped++; - } - - return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \ - (pages_mapped - pages)) + offset; + return (void __iomem *)__va(phys_addr); } -- cgit v1.2.3 From acaabe795a62bba089c185917af86b44654313dc Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 4 Mar 2009 12:56:05 -0600 Subject: x86: UV, SGI RTC: add generic system vector This patch allocates a system interrupt vector for various platform specific uses. Signed-off-by: Dimitri Sivanich Cc: Andrew Morton Cc: john stultz LKML-Reference: <20090304185605.GA24419@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 ++ arch/x86/kernel/irq.c | 34 ++++++++++++++++++++++++++++++++++ arch/x86/kernel/irqinit_32.c | 3 +++ arch/x86/kernel/irqinit_64.c | 3 +++ 4 files changed, 42 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 83d1836b946..7ba4621c0df 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -984,6 +984,8 @@ apicinterrupt UV_BAU_MESSAGE \ #endif apicinterrupt LOCAL_TIMER_VECTOR \ apic_timer_interrupt smp_apic_timer_interrupt +apicinterrupt GENERIC_INTERRUPT_VECTOR \ + generic_interrupt smp_generic_interrupt #ifdef CONFIG_SMP apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index f13ca1650aa..b864341dcc4 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -15,6 +15,9 @@ atomic_t irq_err_count; +/* Function pointer for generic interrupt vector handling */ +void (*generic_interrupt_extension)(void) = NULL; + /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. @@ -56,6 +59,12 @@ static int show_other_interrupts(struct seq_file *p) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); seq_printf(p, " Local timer interrupts\n"); #endif + if (generic_interrupt_extension) { + seq_printf(p, "PLT: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); + seq_printf(p, " Platform interrupts\n"); + } #ifdef CONFIG_SMP seq_printf(p, "RES: "); for_each_online_cpu(j) @@ -163,6 +172,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; #endif + if (generic_interrupt_extension) + sum += irq_stats(cpu)->generic_irqs; #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; sum += irq_stats(cpu)->irq_call_count; @@ -226,4 +237,27 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) return 1; } +/* + * Handler for GENERIC_INTERRUPT_VECTOR. + */ +void smp_generic_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + ack_APIC_irq(); + + exit_idle(); + + irq_enter(); + + inc_irq_stat(generic_irqs); + + if (generic_interrupt_extension) + generic_interrupt_extension(); + + irq_exit(); + + set_irq_regs(old_regs); +} + EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 50b8c3a3006..bc132610544 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -175,6 +175,9 @@ void __init native_init_IRQ(void) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + /* generic IPI for platform specific use */ + alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); + /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index da481a1e3f3..c7a49e0ffbf 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -147,6 +147,9 @@ static void __init apic_intr_init(void) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + /* generic IPI for platform specific use */ + alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); + /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); -- cgit v1.2.3 From 5ab5ab34498f94d60884c4ccea890601e429042e Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 4 Mar 2009 12:59:18 -0600 Subject: x86: UV, SGI RTC: add UV RTC clocksource/clockevents This patch provides a high resolution clock/timer source using the SGI UV system-wide synchronized RTC clock/timer hardware. Signed-off-by: Dimitri Sivanich Cc: Andrew Morton Cc: john stultz LKML-Reference: <20090304185918.GC24419@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/uv_time.c | 391 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 392 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/uv_time.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 95f216bbfaf..339ce35648e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -111,7 +111,7 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64 ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) - obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o + obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c new file mode 100644 index 00000000000..6f8e3256ab2 --- /dev/null +++ b/arch/x86/kernel/uv_time.c @@ -0,0 +1,391 @@ +/* + * SGI RTC clock/timer routines. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Dimitri Sivanich + */ +#include + +#include +#include +#include +#include + +#define RTC_NAME "sgi_rtc" + +static cycle_t uv_read_rtc(void); +static int uv_rtc_next_event(unsigned long, struct clock_event_device *); +static void uv_rtc_timer_setup(enum clock_event_mode, + struct clock_event_device *); + +static struct clocksource clocksource_uv = { + .name = RTC_NAME, + .rating = 400, + .read = uv_read_rtc, + .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, + .shift = 10, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static struct clock_event_device clock_event_device_uv = { + .name = RTC_NAME, + .features = CLOCK_EVT_FEAT_ONESHOT, + .shift = 20, + .rating = 400, + .irq = -1, + .set_next_event = uv_rtc_next_event, + .set_mode = uv_rtc_timer_setup, + .event_handler = NULL, +}; + +static DEFINE_PER_CPU(struct clock_event_device, cpu_ced); + +/* There is one of these allocated per node */ +struct uv_rtc_timer_head { + spinlock_t lock; + /* next cpu waiting for timer, local node relative: */ + int next_cpu; + /* number of cpus on this node: */ + int ncpus; + struct { + int lcpu; /* systemwide logical cpu number */ + u64 expires; /* next timer expiration for this cpu */ + } cpu[1]; +}; + +/* + * Access to uv_rtc_timer_head via blade id. + */ +static struct uv_rtc_timer_head **blade_info __read_mostly; + +static int uv_rtc_enable; + +/* + * Hardware interface routines + */ + +/* Send IPIs to another node */ +static void uv_rtc_send_IPI(int cpu) +{ + unsigned long apicid, val; + int pnode; + + apicid = per_cpu(x86_cpu_to_apicid, cpu); + pnode = uv_apicid_to_pnode(apicid); + val = (1UL << UVH_IPI_INT_SEND_SHFT) | + (apicid << UVH_IPI_INT_APIC_ID_SHFT) | + (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); + + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); +} + +/* Check for an RTC interrupt pending */ +static int uv_intr_pending(int pnode) +{ + return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & + UVH_EVENT_OCCURRED0_RTC1_MASK; +} + +/* Setup interrupt and return non-zero if early expiration occurred. */ +static int uv_setup_intr(int cpu, u64 expires) +{ + u64 val; + int pnode = uv_cpu_to_pnode(cpu); + + uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, + UVH_RTC1_INT_CONFIG_M_MASK); + uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L); + + uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, + UVH_EVENT_OCCURRED0_RTC1_MASK); + + val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | + ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); + + /* Set configuration */ + uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val); + /* Initialize comparator value */ + uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); + + return (expires < uv_read_rtc() && !uv_intr_pending(pnode)); +} + +/* + * Per-cpu timer tracking routines + */ + +static __init void uv_rtc_deallocate_timers(void) +{ + int bid; + + for_each_possible_blade(bid) { + kfree(blade_info[bid]); + } + kfree(blade_info); +} + +/* Allocate per-node list of cpu timer expiration times. */ +static __init int uv_rtc_allocate_timers(void) +{ + int cpu; + + blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL); + if (!blade_info) + return -ENOMEM; + memset(blade_info, 0, uv_possible_blades * sizeof(void *)); + + for_each_present_cpu(cpu) { + int nid = cpu_to_node(cpu); + int bid = uv_cpu_to_blade_id(cpu); + int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + struct uv_rtc_timer_head *head = blade_info[bid]; + + if (!head) { + head = kmalloc_node(sizeof(struct uv_rtc_timer_head) + + (uv_blade_nr_possible_cpus(bid) * + 2 * sizeof(u64)), + GFP_KERNEL, nid); + if (!head) { + uv_rtc_deallocate_timers(); + return -ENOMEM; + } + spin_lock_init(&head->lock); + head->ncpus = uv_blade_nr_possible_cpus(bid); + head->next_cpu = -1; + blade_info[bid] = head; + } + + head->cpu[bcpu].lcpu = cpu; + head->cpu[bcpu].expires = ULLONG_MAX; + } + + return 0; +} + +/* Find and set the next expiring timer. */ +static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode) +{ + u64 lowest = ULLONG_MAX; + int c, bcpu = -1; + + head->next_cpu = -1; + for (c = 0; c < head->ncpus; c++) { + u64 exp = head->cpu[c].expires; + if (exp < lowest) { + bcpu = c; + lowest = exp; + } + } + if (bcpu >= 0) { + head->next_cpu = bcpu; + c = head->cpu[bcpu].lcpu; + if (uv_setup_intr(c, lowest)) + /* If we didn't set it up in time, trigger */ + uv_rtc_send_IPI(c); + } else { + uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, + UVH_RTC1_INT_CONFIG_M_MASK); + } +} + +/* + * Set expiration time for current cpu. + * + * Returns 1 if we missed the expiration time. + */ +static int uv_rtc_set_timer(int cpu, u64 expires) +{ + int pnode = uv_cpu_to_pnode(cpu); + int bid = uv_cpu_to_blade_id(cpu); + struct uv_rtc_timer_head *head = blade_info[bid]; + int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + u64 *t = &head->cpu[bcpu].expires; + unsigned long flags; + int next_cpu; + + spin_lock_irqsave(&head->lock, flags); + + next_cpu = head->next_cpu; + *t = expires; + /* Will this one be next to go off? */ + if (next_cpu < 0 || bcpu == next_cpu || + expires < head->cpu[next_cpu].expires) { + head->next_cpu = bcpu; + if (uv_setup_intr(cpu, expires)) { + *t = ULLONG_MAX; + uv_rtc_find_next_timer(head, pnode); + spin_unlock_irqrestore(&head->lock, flags); + return 1; + } + } + + spin_unlock_irqrestore(&head->lock, flags); + return 0; +} + +/* + * Unset expiration time for current cpu. + * + * Returns 1 if this timer was pending. + */ +static int uv_rtc_unset_timer(int cpu) +{ + int pnode = uv_cpu_to_pnode(cpu); + int bid = uv_cpu_to_blade_id(cpu); + struct uv_rtc_timer_head *head = blade_info[bid]; + int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + u64 *t = &head->cpu[bcpu].expires; + unsigned long flags; + int rc = 0; + + spin_lock_irqsave(&head->lock, flags); + + if (head->next_cpu == bcpu && uv_read_rtc() >= *t) + rc = 1; + + *t = ULLONG_MAX; + + /* Was the hardware setup for this timer? */ + if (head->next_cpu == bcpu) + uv_rtc_find_next_timer(head, pnode); + + spin_unlock_irqrestore(&head->lock, flags); + + return rc; +} + + +/* + * Kernel interface routines. + */ + +/* + * Read the RTC. + */ +static cycle_t uv_read_rtc(void) +{ + return (cycle_t)uv_read_local_mmr(UVH_RTC); +} + +/* + * Program the next event, relative to now + */ +static int uv_rtc_next_event(unsigned long delta, + struct clock_event_device *ced) +{ + int ced_cpu = cpumask_first(ced->cpumask); + + return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc()); +} + +/* + * Setup the RTC timer in oneshot mode + */ +static void uv_rtc_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + int ced_cpu = cpumask_first(evt->cpumask); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here yet */ + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + uv_rtc_unset_timer(ced_cpu); + break; + } +} + +static void uv_rtc_interrupt(void) +{ + struct clock_event_device *ced = &__get_cpu_var(cpu_ced); + int cpu = smp_processor_id(); + + if (!ced || !ced->event_handler) + return; + + if (uv_rtc_unset_timer(cpu) != 1) + return; + + ced->event_handler(ced); +} + +static int __init uv_enable_rtc(char *str) +{ + uv_rtc_enable = 1; + + return 1; +} +__setup("uvrtc", uv_enable_rtc); + +static __init void uv_rtc_register_clockevents(struct work_struct *dummy) +{ + struct clock_event_device *ced = &__get_cpu_var(cpu_ced); + + *ced = clock_event_device_uv; + ced->cpumask = cpumask_of(smp_processor_id()); + clockevents_register_device(ced); +} + +static __init int uv_rtc_setup_clock(void) +{ + int rc; + + if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) + return -ENODEV; + + generic_interrupt_extension = uv_rtc_interrupt; + + clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, + clocksource_uv.shift); + + rc = clocksource_register(&clocksource_uv); + if (rc) { + generic_interrupt_extension = NULL; + return rc; + } + + /* Setup and register clockevents */ + rc = uv_rtc_allocate_timers(); + if (rc) { + clocksource_unregister(&clocksource_uv); + generic_interrupt_extension = NULL; + return rc; + } + + clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, + NSEC_PER_SEC, clock_event_device_uv.shift); + + clock_event_device_uv.min_delta_ns = NSEC_PER_SEC / + sn_rtc_cycles_per_second; + + clock_event_device_uv.max_delta_ns = clocksource_uv.mask * + (NSEC_PER_SEC / sn_rtc_cycles_per_second); + + rc = schedule_on_each_cpu(uv_rtc_register_clockevents); + if (rc) { + clocksource_unregister(&clocksource_uv); + generic_interrupt_extension = NULL; + uv_rtc_deallocate_timers(); + } + + return rc; +} +arch_initcall(uv_rtc_setup_clock); -- cgit v1.2.3 From ab9e18587f4cdb5f3fb3854c732f27a36f98e8f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Gl=C3=B6ckner?= Date: Wed, 4 Mar 2009 19:42:27 +0100 Subject: x86, math-emu: fix init_fpu for task != current MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: fix math-emu related crash while using GDB/ptrace init_fpu() calls finit to initialize a task's xstate, while finit always works on the current task. If we use PTRACE_GETFPREGS on another process and both processes did not already use floating point, we get a null pointer exception in finit. This patch creates a new function finit_task that takes a task_struct parameter. finit becomes a wrapper that simply calls finit_task with current. On the plus side this avoids many calls to get_current which would each resolve to an inline assembler mov instruction. An empty finit_task has been added to i387.h to avoid linker errors in case the compiler still emits the call in init_fpu when CONFIG_MATH_EMULATION is not defined. The declaration of finit in i387.h has been removed as the remaining code using this function gets its prototype from fpu_proto.h. Signed-off-by: Daniel Glöckner Cc: Suresh Siddha Cc: "Pallipadi Venkatesh" Cc: Arjan van de Ven Cc: Bill Metzenthen LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/i387.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index b0f61f0dcd0..f2f8540a7f3 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk) #ifdef CONFIG_X86_32 if (!HAVE_HWFP) { memset(tsk->thread.xstate, 0, xstate_size); - finit(); + finit_task(tsk); set_stopped_child_used_math(tsk); return 0; } -- cgit v1.2.3 From 8d4dd919b46ed982da6ef6bf6fcec454cd7a5b1b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 4 Mar 2009 01:25:21 -0800 Subject: x86: ioremap mptable Impact: fix boot with mptable above max_low_mapped Try to use early_ioremap() to map MPC to make sure it works even it is at the end of ram. Signed-off-by: Yinghai Lu Cc: Andrew Morton LKML-Reference: <49AE4901.3090801@kernel.org> Signed-off-by: Ingo Molnar Reported-and-tested-by: Kevin O'Connor --- arch/x86/kernel/mpparse.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 37cb1bda1ba..ae9060cb448 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -558,6 +558,19 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) static struct mpf_intel *mpf_found; +static unsigned long __init get_mpc_size(unsigned long physptr) +{ + struct mpc_table *mpc; + unsigned long size; + + mpc = early_ioremap(physptr, PAGE_SIZE); + size = mpc->length; + early_iounmap(mpc, PAGE_SIZE); + apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); + + return size; +} + /* * Scan the memory blocks for an SMP configuration block. */ @@ -611,12 +624,16 @@ static void __init __get_smp_config(unsigned int early) construct_default_ISA_mptable(mpf->feature1); } else if (mpf->physptr) { + struct mpc_table *mpc; + unsigned long size; + size = get_mpc_size(mpf->physptr); + mpc = early_ioremap(mpf->physptr, size); /* * Read the physical hardware table. Anything here will * override the defaults. */ - if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) { + if (!smp_read_mpc(mpc, early)) { #ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 0; #endif @@ -624,8 +641,10 @@ static void __init __get_smp_config(unsigned int early) "BIOS bug, MP table errors detected!...\n"); printk(KERN_ERR "... disabling SMP support. " "(tell your hw vendor)\n"); + early_iounmap(mpc, size); return; } + early_iounmap(mpc, size); if (early) return; -- cgit v1.2.3 From f62432395ec54e93f113091bcb2e2017eeed7683 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 4 Mar 2009 01:25:54 -0800 Subject: x86: reserve exact size of mptable Impact: save a bit of RAM Get the exact size for the reserve_bootmem() call. Signed-off-by: Yinghai Lu Cc: Andrew Morton LKML-Reference: <49AE4922.605@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index ae9060cb448..e8192401da4 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -716,10 +716,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, if (!reserve) return 1; - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, + reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf), BOOTMEM_DEFAULT); if (mpf->physptr) { - unsigned long size = PAGE_SIZE; + unsigned long size = get_mpc_size(mpf->physptr); #ifdef CONFIG_X86_32 /* * We cannot access to MPC table to compute -- cgit v1.2.3 From dd4124a8a06bca89c077a16437edac010f0bb993 Mon Sep 17 00:00:00 2001 From: Leann Ogasawara Date: Wed, 4 Mar 2009 11:53:00 -0800 Subject: x86: add Dell XPS710 reboot quirk Dell XPS710 will hang on reboot. This is resolved by adding a quirk to set bios reboot. Signed-off-by: Leann Ogasawara Signed-off-by: Tim Gardner Cc: "manoj.iyer" Cc: LKML-Reference: <1236196380.3231.89.camel@emiko> Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 2b46eb41643..4526b3a75ed 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -217,6 +217,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), }, }, + { /* Handle problems with rebooting on Dell XPS710 */ + .callback = set_bios_reboot, + .ident = "Dell XPS710", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), + }, + }, { } }; -- cgit v1.2.3 From 1400b3faab8fedfffde5a7fe47098e2732d4aa76 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 4 Mar 2009 16:02:46 -0600 Subject: x86: UV, SGI RTC: fix uv_time.c for UP Fix non-smp build of uv_time.c. Signed-off-by: Dimitri Sivanich LKML-Reference: <20090304220246.GC6288@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/uv_time.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index 6f8e3256ab2..2ffb6c53326 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #define RTC_NAME "sgi_rtc" @@ -84,7 +86,7 @@ static void uv_rtc_send_IPI(int cpu) unsigned long apicid, val; int pnode; - apicid = per_cpu(x86_cpu_to_apicid, cpu); + apicid = cpu_physical_id(cpu); pnode = uv_apicid_to_pnode(apicid); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (apicid << UVH_IPI_INT_APIC_ID_SHFT) | -- cgit v1.2.3 From ed26dbe5ae045e5bf95c6dc27497397a3fde52e1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 4 Mar 2009 16:16:51 -0800 Subject: x86: pre-initialize boot_cpu_data.x86_phys_bits to avoid system_state tests Impact: cleanup, micro-optimization Pre-initialize boot_cpu_data.x86_phys_bits to a reasonable default to remove the use of system_state tests in __virt_addr_valid() and __phys_addr(). Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b746deb9ebc..f28c56e6bf9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -202,7 +202,9 @@ struct ist_info ist_info; #endif #else -struct cpuinfo_x86 boot_cpu_data __read_mostly; +struct cpuinfo_x86 boot_cpu_data __read_mostly = { + .x86_phys_bits = MAX_PHYSMEM_BITS, +}; EXPORT_SYMBOL(boot_cpu_data); #endif -- cgit v1.2.3 From cafe8816b217b98dc3f268d3b77445da498beb4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: percpu: use negative for auto for pcpu_setup_first_chunk() arguments Impact: argument semantic cleanup In pcpu_setup_first_chunk(), zero @unit_size and @dyn_size meant auto-sizing. It's okay for @unit_size as 0 doesn't make sense but 0 dynamic reserve size is valid. Alos, if arch @dyn_size is calculated from other parameters, it might end up passing in 0 @dyn_size and malfunction when the size is automatically adjusted. This patch makes both @unit_size and @dyn_size ssize_t and use -1 for auto sizing. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index c29f301d388..ef3a2cd3fe6 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -344,7 +344,7 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", pcpu4k_nr_static_pages, static_size); - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL, + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, -1, -1, NULL, pcpu4k_populate_pte); goto out_free_ar; -- cgit v1.2.3 From 9a4f8a878b68d5a5d9ee60908a52cf6a55e1b823 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: x86: make embedding percpu allocator return excessive free space Impact: reduce unnecessary memory usage on certain configurations Embedding percpu allocator allocates unit_size * smp_num_possible_cpus() bytes consecutively and use it for the first chunk. However, if the static area is small, this can result in excessive prellocated free space in the first chunk due to PCPU_MIN_UNIT_SIZE restriction. This patch makes embedding percpu allocator preallocate only what's necessary as described by PERPCU_DYNAMIC_RESERVE and return the leftover to the bootmem allocator. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 44 +++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index ef3a2cd3fe6..38e2b2a470a 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -241,24 +241,31 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) * Embedding allocator * * The first chunk is sized to just contain the static area plus - * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using - * bootmem allocator and used as-is without being mapped into vmalloc - * area. This enables the first chunk to piggy back on the linear - * physical PMD mapping and doesn't add any additional pressure to - * TLB. + * module and dynamic reserves, and allocated as a contiguous area + * using bootmem allocator and used as-is without being mapped into + * vmalloc area. This enables the first chunk to piggy back on the + * linear physical PMD mapping and doesn't add any additional pressure + * to TLB. Note that if the needed size is smaller than the minimum + * unit size, the leftover is returned to the bootmem allocator. */ static void *pcpue_ptr __initdata; +static size_t pcpue_size __initdata; static size_t pcpue_unit_size __initdata; static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) { - return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size - + ((size_t)pageno << PAGE_SHIFT)); + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpue_size) + return NULL; + + return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); } static ssize_t __init setup_pcpu_embed(size_t static_size) { unsigned int cpu; + size_t dyn_size; /* * If large page isn't supported, there's no benefit in doing @@ -269,25 +276,30 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) return -EINVAL; /* allocate and copy */ - pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); - pcpue_unit_size = max_t(size_t, pcpue_unit_size, PCPU_MIN_UNIT_SIZE); + pcpue_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); + dyn_size = pcpue_size - static_size; + pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, PAGE_SIZE); if (!pcpue_ptr) return -ENOMEM; - for_each_possible_cpu(cpu) - memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load, - static_size); + for_each_possible_cpu(cpu) { + void *ptr = pcpue_ptr + cpu * pcpue_unit_size; + + free_bootmem(__pa(ptr + pcpue_size), + pcpue_unit_size - pcpue_size); + memcpy(ptr, __per_cpu_load, static_size); + } /* we're ready, commit */ pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", - pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size); + pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); return pcpu_setup_first_chunk(pcpue_get_page, static_size, - pcpue_unit_size, - pcpue_unit_size - static_size, pcpue_ptr, - NULL); + pcpue_unit_size, dyn_size, + pcpue_ptr, NULL); } /* -- cgit v1.2.3 From edcb463997ed7b2ffa3bac76e3e75957318f2e01 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: percpu, module: implement reserved allocation and use it for module percpu variables Impact: add reserved allocation functionality and use it for module percpu variables This patch implements reserved allocation from the first chunk. When setting up the first chunk, arch can ask to set aside certain number of bytes right after the core static area which is available only through a separate reserved allocator. This will be used primarily for module static percpu variables on architectures with limited relocation range to ensure that the module perpcu symbols are inside the relocatable range. If reserved area is requested, the first chunk becomes reserved and isn't available for regular allocation. If the first chunk also includes piggy-back dynamic allocation area, a separate chunk mapping the same region is created to serve dynamic allocation. The first one is called static first chunk and the second dynamic first chunk. Although they share the page map, their different area map initializations guarantee they serve disjoint areas according to their purposes. If arch doesn't setup reserved area, reserved allocation is handled like any other allocation. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 38e2b2a470a..dd4eabc747c 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -217,7 +217,7 @@ proceed: pr_info("PERCPU: Remapped at %p with large pages, static data " "%zu bytes\n", vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE, + ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 0, PMD_SIZE, pcpur_size - static_size, vm.addr, NULL); goto out_free_ar; @@ -297,7 +297,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); - return pcpu_setup_first_chunk(pcpue_get_page, static_size, + return pcpu_setup_first_chunk(pcpue_get_page, static_size, 0, pcpue_unit_size, dyn_size, pcpue_ptr, NULL); } @@ -356,8 +356,8 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", pcpu4k_nr_static_pages, static_size); - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, -1, -1, NULL, - pcpu4k_populate_pte); + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, -1, -1, + NULL, pcpu4k_populate_pte); goto out_free_ar; enomem: -- cgit v1.2.3 From 6b19b0c2400437a3c10059ede0e59b517092e1bd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: x86, percpu: setup reserved percpu area for x86_64 Impact: fix relocation overflow during module load x86_64 uses 32bit relocations for symbol access and static percpu symbols whether in core or modules must be inside 2GB of the percpu segement base which the dynamic percpu allocator doesn't guarantee. This patch makes x86_64 reserve PERCPU_MODULE_RESERVE bytes in the first chunk so that module percpu areas are always allocated from the first chunk which is always inside the relocatable range. This problem exists for any percpu allocator but is easily triggered when using the embedding allocator because the second chunk is located beyond 2GB on it. This patch also changes the meaning of PERCPU_DYNAMIC_RESERVE such that it only indicates the size of the area to reserve for dynamic allocation as static and dynamic areas can be separate. New PERCPU_DYNAMIC_RESERVED is increased by 4k for both 32 and 64bits as the reserved area separation eats away some allocatable space and having slightly more headroom (currently between 4 and 8k after minimal boot sans module area) makes sense for common case performance. x86_32 can address anywhere from anywhere and doesn't need reserving. Mike Galbraith first reported the problem first and bisected it to the embedding percpu allocator commit. Signed-off-by: Tejun Heo Reported-by: Mike Galbraith Reported-by: Jaswinder Singh Rajput --- arch/x86/kernel/setup_percpu.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index dd4eabc747c..efa615f2bf4 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -42,6 +42,19 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +/* + * On x86_64 symbols referenced from code should be reachable using + * 32bit relocations. Reserve space for static percpu variables in + * modules so that they are always served from the first chunk which + * is located at the percpu segment base. On x86_32, anything can + * address anywhere. No need to reserve space in the first chunk. + */ +#ifdef CONFIG_X86_64 +#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE +#else +#define PERCPU_FIRST_CHUNK_RESERVE 0 +#endif + /** * pcpu_need_numa - determine percpu allocation needs to consider NUMA * @@ -141,7 +154,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) { static struct vm_struct vm; pg_data_t *last; - size_t ptrs_size; + size_t ptrs_size, dyn_size; unsigned int cpu; ssize_t ret; @@ -169,12 +182,14 @@ proceed: * Currently supports only single page. Supporting multiple * pages won't be too difficult if it ever becomes necessary. */ - pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE); if (pcpur_size > PMD_SIZE) { pr_warning("PERCPU: static data is larger than large page, " "can't use large page\n"); return -EINVAL; } + dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; /* allocate pointer array and alloc large pages */ ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); @@ -217,8 +232,9 @@ proceed: pr_info("PERCPU: Remapped at %p with large pages, static data " "%zu bytes\n", vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 0, PMD_SIZE, - pcpur_size - static_size, vm.addr, NULL); + ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, + PERCPU_FIRST_CHUNK_RESERVE, + PMD_SIZE, dyn_size, vm.addr, NULL); goto out_free_ar; enomem: @@ -276,9 +292,10 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) return -EINVAL; /* allocate and copy */ - pcpue_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE); pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - dyn_size = pcpue_size - static_size; + dyn_size = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, PAGE_SIZE); @@ -297,7 +314,8 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); - return pcpu_setup_first_chunk(pcpue_get_page, static_size, 0, + return pcpu_setup_first_chunk(pcpue_get_page, static_size, + PERCPU_FIRST_CHUNK_RESERVE, pcpue_unit_size, dyn_size, pcpue_ptr, NULL); } @@ -356,8 +374,9 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", pcpu4k_nr_static_pages, static_size); - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, -1, -1, - NULL, pcpu4k_populate_pte); + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, + PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL, + pcpu4k_populate_pte); goto out_free_ar; enomem: -- cgit v1.2.3 From 9ca0791dcaa666e8e8f4b4ca028b65b4bde9cb28 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Thu, 5 Mar 2009 08:49:54 +0100 Subject: x86, bts: remove bad warning In case a ptraced task is reaped (while the tracer is still attached), ds_exit_thread() is called before ptrace_exit(). The latter will release the bts_tracer and remove the thread's ds_ctx. The former will WARN() if the context is not NULL. Oleg Nesterov submitted patches that move ptrace_exit() before exit_thread() and thus reverse the order of the above calls. Remove the bad warning. I will add it again when Oleg's changes are in. Signed-off-by: Markus Metzger LKML-Reference: <20090305084954.A22000@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 169a120587b..de7cdbda1c3 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -1029,5 +1029,4 @@ void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) void ds_exit_thread(struct task_struct *tsk) { - WARN_ON(tsk->thread.ds_ctx); } -- cgit v1.2.3 From 73bf1b62f561fc8ecb00e2810efe4fe769f4933e Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Thu, 5 Mar 2009 08:57:21 +0100 Subject: x86, pebs: correct qualifier passed to ds_write_config() from ds_request_pebs() ds_write_config() can write the BTS as well as the PEBS part of the DS config. ds_request_pebs() passes the wrong qualifier, which results in the wrong configuration to be written. Reported-by: Stephane Eranian Signed-off-by: Markus Metzger LKML-Reference: <20090305085721.A22550@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index de7cdbda1c3..87b67e3a765 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -729,7 +729,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, spin_unlock_irqrestore(&ds_lock, irq); - ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); ds_resume_pebs(tracer); return tracer; -- cgit v1.2.3 From 3a450de1365d20afde406f0d9b2931a5e4a4fd6a Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Fri, 6 Mar 2009 17:30:56 -0600 Subject: x86: UV: remove uv_flush_tlb_others() WARN_ON In uv_flush_tlb_others() (arch/x86/kernel/tlb_uv.c), the "WARN_ON(!in_atomic())" fails if CONFIG_PREEMPT is not enabled. And CONFIG_PREEMPT is not enabled by default in the distribution that most UV owners will use. We could #ifdef CONFIG_PREEMPT the warning, but that is not good form. And there seems to be no suitable fix to in_atomic() when CONFIG_PREMPT is not on. As Ingo commented: > and we have no proper primitive to test for atomicity. (mainly > because we dont know about atomicity on a non-preempt kernel) So we drop the WARN_ON. Signed-off-by: Cliff Wickman Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index f04549afcfe..d038b9c45cf 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -314,8 +314,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, int locals = 0; struct bau_desc *bau_desc; - WARN_ON(!in_atomic()); - cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); uv_cpu = uv_blade_processor_id(); -- cgit v1.2.3 From 1f442d70c84aa798e243e721eba728a98434cd86 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 7 Mar 2009 23:46:26 -0800 Subject: x86: remove smp_apply_quirks()/smp_checks() Impact: cleanup and code size reduction on 64-bit This code is only applied to Intel Pentium and AMD K7 32-bit cpus. Move those checks to intel_init()/amd_init() for 32-bit so 64-bit will not build this code. Also change to use cpu_index check to see if we need to emit warning. Signed-off-by: Yinghai Lu LKML-Reference: <49B377D2.8030108@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 52 ++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/intel.c | 25 +++++++++++++++ arch/x86/kernel/smpboot.c | 78 --------------------------------------------- 3 files changed, 77 insertions(+), 78 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 25423a5b80e..f47df59016c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -5,6 +5,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 # include @@ -141,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) } } +static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + /* calling is from identify_secondary_cpu() ? */ + if (c->cpu_index == boot_cpu_id) + return; + + /* + * Certain Athlons might work (for various values of 'work') in SMP + * but they are not certified as MP capable. + */ + /* Athlon 660/661 is valid. */ + if ((c->x86_model == 6) && ((c->x86_mask == 0) || + (c->x86_mask == 1))) + goto valid_k7; + + /* Duron 670 is valid */ + if ((c->x86_model == 7) && (c->x86_mask == 0)) + goto valid_k7; + + /* + * Athlon 662, Duron 671, and Athlon >model 7 have capability + * bit. It's worth noting that the A5 stepping (662) of some + * Athlon XP's have the MP bit set. + * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for + * more. + */ + if (((c->x86_model == 6) && (c->x86_mask >= 2)) || + ((c->x86_model == 7) && (c->x86_mask >= 1)) || + (c->x86_model > 7)) + if (cpu_has_mp) + goto valid_k7; + + /* If we get here, not a certified SMP capable AMD system. */ + + /* + * Don't taint if we are running SMP kernel on a single non-MP + * approved Athlon + */ + WARN_ONCE(1, "WARNING: This combination of AMD" + "processors is not suitable for SMP.\n"); + if (!test_taint(TAINT_UNSAFE_SMP)) + add_taint(TAINT_UNSAFE_SMP); + +valid_k7: + ; +#endif +} + static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) { u32 l, h; @@ -175,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) } set_cpu_cap(c, X86_FEATURE_K7); + + amd_k7_smp_check(c); } #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 25c559ba8d5..191117f1ad5 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include @@ -110,6 +111,28 @@ static void __cpuinit trap_init_f00f_bug(void) } #endif +static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + /* calling is from identify_secondary_cpu() ? */ + if (c->cpu_index == boot_cpu_id) + return; + + /* + * Mask B, Pentium, but not Pentium MMX + */ + if (c->x86 == 5 && + c->x86_mask >= 1 && c->x86_mask <= 4 && + c->x86_model <= 3) { + /* + * Remember we have B step Pentia with bugs + */ + WARN_ONCE(1, "WARNING: SMP operation may be unreliable" + "with B stepping processors.\n"); + } +#endif +} + static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) { unsigned long lo, hi; @@ -186,6 +209,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_NUMAQ numaq_tsc_disable(); #endif + + intel_smp_check(c); } #else static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 249334f5080..ef7d10170c3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -114,10 +114,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); atomic_t init_deasserted; - -/* Set if we find a B stepping CPU */ -static int __cpuinitdata smp_b_stepping; - #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) /* which logical CPUs are on which nodes */ @@ -271,8 +267,6 @@ static void __cpuinit smp_callin(void) cpumask_set_cpu(cpuid, cpu_callin_mask); } -static int __cpuinitdata unsafe_smp; - /* * Activate a secondary processor. */ @@ -340,76 +334,6 @@ notrace static void __cpuinit start_secondary(void *unused) cpu_idle(); } -static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) -{ - /* - * Mask B, Pentium, but not Pentium MMX - */ - if (c->x86_vendor == X86_VENDOR_INTEL && - c->x86 == 5 && - c->x86_mask >= 1 && c->x86_mask <= 4 && - c->x86_model <= 3) - /* - * Remember we have B step Pentia with bugs - */ - smp_b_stepping = 1; - - /* - * Certain Athlons might work (for various values of 'work') in SMP - * but they are not certified as MP capable. - */ - if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) { - - if (num_possible_cpus() == 1) - goto valid_k7; - - /* Athlon 660/661 is valid. */ - if ((c->x86_model == 6) && ((c->x86_mask == 0) || - (c->x86_mask == 1))) - goto valid_k7; - - /* Duron 670 is valid */ - if ((c->x86_model == 7) && (c->x86_mask == 0)) - goto valid_k7; - - /* - * Athlon 662, Duron 671, and Athlon >model 7 have capability - * bit. It's worth noting that the A5 stepping (662) of some - * Athlon XP's have the MP bit set. - * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for - * more. - */ - if (((c->x86_model == 6) && (c->x86_mask >= 2)) || - ((c->x86_model == 7) && (c->x86_mask >= 1)) || - (c->x86_model > 7)) - if (cpu_has_mp) - goto valid_k7; - - /* If we get here, not a certified SMP capable AMD system. */ - unsafe_smp = 1; - } - -valid_k7: - ; -} - -static void __cpuinit smp_checks(void) -{ - if (smp_b_stepping) - printk(KERN_WARNING "WARNING: SMP operation may be unreliable" - "with B stepping processors.\n"); - - /* - * Don't taint if we are running SMP kernel on a single non-MP - * approved Athlon - */ - if (unsafe_smp && num_online_cpus() > 1) { - printk(KERN_INFO "WARNING: This combination of AMD" - "processors is not suitable for SMP.\n"); - add_taint(TAINT_UNSAFE_SMP); - } -} - /* * The bootstrap kernel entry code has set these up. Save them for * a given CPU @@ -423,7 +347,6 @@ void __cpuinit smp_store_cpu_info(int id) c->cpu_index = id; if (id != 0) identify_secondary_cpu(c); - smp_apply_quirks(c); } @@ -1193,7 +1116,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) pr_debug("Boot done.\n"); impress_friends(); - smp_checks(); #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif -- cgit v1.2.3 From 129f8ae9b1b5be94517da76009ea956e89104ce8 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 9 Mar 2009 15:07:33 -0400 Subject: Revert "[CPUFREQ] Disable sysfs ui for p4-clockmod." This reverts commit e088e4c9cdb618675874becb91b2fd581ee707e6. Removing the sysfs interface for p4-clockmod was flagged as a regression in bug 12826. Course of action: - Find out the remaining causes of overheating, and fix them if possible. ACPI should be doing the right thing automatically. If it isn't, we need to fix that. - mark p4-clockmod ui as deprecated - try again with the removal in six months. It's not really feasible to printk about the deprecation, because it needs to happen at all the sysfs entry points, which means adding a lot of strcmp("p4-clockmod".. calls to the core, which.. bleuch. Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index b585e04cbc9..3178c3acd97 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -277,7 +277,6 @@ static struct cpufreq_driver p4clockmod_driver = { .name = "p4-clockmod", .owner = THIS_MODULE, .attr = p4clockmod_attr, - .hide_interface = 1, }; -- cgit v1.2.3 From 6074d5b0a319fe8400ff079a3c289406ca024321 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Mar 2009 16:27:48 +0900 Subject: percpu: more flexibility for @dyn_size of pcpu_setup_first_chunk() Impact: cleanup, more flexibility for first chunk init Non-negative @dyn_size used to be allowed iff @unit_size wasn't auto. This restriction stemmed from implementation detail and made things a bit less intuitive. This patch allows @dyn_size to be specified regardless of @unit_size and swaps the positions of @dyn_size and @unit_size so that the parameter order makes more sense (static, reserved and dyn sizes followed by enclosing unit_size). While at it, add @unit_size >= PCPU_MIN_UNIT_SIZE sanity check. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index efa615f2bf4..e41c51f6ada 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -233,8 +233,8 @@ proceed: "%zu bytes\n", vm.addr, static_size); ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, - PMD_SIZE, dyn_size, vm.addr, NULL); + PERCPU_FIRST_CHUNK_RESERVE, dyn_size, + PMD_SIZE, vm.addr, NULL); goto out_free_ar; enomem: @@ -315,9 +315,8 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); return pcpu_setup_first_chunk(pcpue_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, - pcpue_unit_size, dyn_size, - pcpue_ptr, NULL); + PERCPU_FIRST_CHUNK_RESERVE, dyn_size, + pcpue_unit_size, pcpue_ptr, NULL); } /* @@ -375,8 +374,8 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pcpu4k_nr_static_pages, static_size); ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL, - pcpu4k_populate_pte); + PERCPU_FIRST_CHUNK_RESERVE, -1, + -1, NULL, pcpu4k_populate_pte); goto out_free_ar; enomem: -- cgit v1.2.3 From 66c3a75772247c31feabefb724e082220a1ab060 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Mar 2009 16:27:48 +0900 Subject: percpu: generalize embedding first chunk setup helper Impact: code reorganization Separate out embedding first chunk setup helper from x86 embedding first chunk allocator and put it in mm/percpu.c. This will be used by the default percpu first chunk allocator and possibly by other archs. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 54 +++++------------------------------------- 1 file changed, 6 insertions(+), 48 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index e41c51f6ada..400331b50a5 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -257,31 +257,13 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) * Embedding allocator * * The first chunk is sized to just contain the static area plus - * module and dynamic reserves, and allocated as a contiguous area - * using bootmem allocator and used as-is without being mapped into - * vmalloc area. This enables the first chunk to piggy back on the - * linear physical PMD mapping and doesn't add any additional pressure - * to TLB. Note that if the needed size is smaller than the minimum - * unit size, the leftover is returned to the bootmem allocator. + * module and dynamic reserves and embedded into linear physical + * mapping so that it can use PMD mapping without additional TLB + * pressure. */ -static void *pcpue_ptr __initdata; -static size_t pcpue_size __initdata; -static size_t pcpue_unit_size __initdata; - -static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) -{ - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpue_size) - return NULL; - - return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); -} - static ssize_t __init setup_pcpu_embed(size_t static_size) { - unsigned int cpu; - size_t dyn_size; + size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; /* * If large page isn't supported, there's no benefit in doing @@ -291,32 +273,8 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) if (!cpu_has_pse || pcpu_need_numa()) return -EINVAL; - /* allocate and copy */ - pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - dyn_size = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; - - pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, - PAGE_SIZE); - if (!pcpue_ptr) - return -ENOMEM; - - for_each_possible_cpu(cpu) { - void *ptr = pcpue_ptr + cpu * pcpue_unit_size; - - free_bootmem(__pa(ptr + pcpue_size), - pcpue_unit_size - pcpue_size); - memcpy(ptr, __per_cpu_load, static_size); - } - - /* we're ready, commit */ - pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", - pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); - - return pcpu_setup_first_chunk(pcpue_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - pcpue_unit_size, pcpue_ptr, NULL); + return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + reserve - PERCPU_FIRST_CHUNK_RESERVE, -1); } /* -- cgit v1.2.3 From 8c5dfd25519bf302ba43daa59976c4d675a594a7 Mon Sep 17 00:00:00 2001 From: Stoyan Gaydarov Date: Tue, 10 Mar 2009 00:10:32 -0500 Subject: x86: BUG to BUG_ON changes Impact: cleanup Signed-off-by: Stoyan Gaydarov LKML-Reference: <1236661850-8237-8-git-send-email-stoyboyker@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 6 ++---- arch/x86/kernel/quirks.c | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 826d5c87627..f8869978bbb 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1078,8 +1078,7 @@ void __cpuinit cpu_init(void) atomic_inc(&init_mm.mm_count); me->active_mm = &init_mm; - if (me->mm) - BUG(); + BUG_ON(me->mm); enter_lazy_tlb(&init_mm, me); load_sp0(t, ¤t->thread); @@ -1145,8 +1144,7 @@ void __cpuinit cpu_init(void) */ atomic_inc(&init_mm.mm_count); curr->active_mm = &init_mm; - if (curr->mm) - BUG(); + BUG_ON(curr->mm); enter_lazy_tlb(&init_mm, curr); load_sp0(t, thread); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 309949e9e1c..6a5a2970f4c 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -74,8 +74,7 @@ static void ich_force_hpet_resume(void) if (!force_hpet_address) return; - if (rcba_base == NULL) - BUG(); + BUG_ON(rcba_base == NULL); /* read the Function Disable register, dword mode only */ val = readl(rcba_base + 0x3404); -- cgit v1.2.3 From 9b779edf4b97798d037bb44fca2719ac184d0f14 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Tue, 10 Mar 2009 15:37:51 +0530 Subject: x86: cpu architecture debug code Introduce: cat /sys/kernel/debug/x86/cpu/* for Intel and AMD processors to view / debug the state of each CPU. By using this we can debug whole range of registers and other cpu information for debugging purpose and monitor how things are changing. This can be useful for developers as well as for users. Signed-off-by: Jaswinder Singh Rajput LKML-Reference: <1236701373.3387.4.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 2 + arch/x86/kernel/cpu/cpu_debug.c | 784 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 786 insertions(+) create mode 100755 arch/x86/kernel/cpu/cpu_debug.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 82db7f45e2d..d4356f8b752 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -14,6 +14,8 @@ obj-y += vmware.o hypervisor.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o +obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o + obj-$(CONFIG_CPU_SUP_INTEL) += intel.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c new file mode 100755 index 00000000000..0bdf4daba20 --- /dev/null +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -0,0 +1,784 @@ +/* + * CPU x86 architecture debug code + * + * Copyright(C) 2009 Jaswinder Singh Rajput + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); +static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); +static DEFINE_PER_CPU(unsigned, cpu_modelflag); +static DEFINE_PER_CPU(int, cpu_priv_count); +static DEFINE_PER_CPU(unsigned, cpu_model); + +static DEFINE_MUTEX(cpu_debug_lock); + +static struct dentry *cpu_debugfs_dir; + +static struct cpu_debug_base cpu_base[] = { + { "mc", CPU_MC }, /* Machine Check */ + { "monitor", CPU_MONITOR }, /* Monitor */ + { "time", CPU_TIME }, /* Time */ + { "pmc", CPU_PMC }, /* Performance Monitor */ + { "platform", CPU_PLATFORM }, /* Platform */ + { "apic", CPU_APIC }, /* APIC */ + { "poweron", CPU_POWERON }, /* Power-on */ + { "control", CPU_CONTROL }, /* Control */ + { "features", CPU_FEATURES }, /* Features control */ + { "lastbranch", CPU_LBRANCH }, /* Last Branch */ + { "bios", CPU_BIOS }, /* BIOS */ + { "freq", CPU_FREQ }, /* Frequency */ + { "mtrr", CPU_MTRR }, /* MTRR */ + { "perf", CPU_PERF }, /* Performance */ + { "cache", CPU_CACHE }, /* Cache */ + { "sysenter", CPU_SYSENTER }, /* Sysenter */ + { "therm", CPU_THERM }, /* Thermal */ + { "misc", CPU_MISC }, /* Miscellaneous */ + { "debug", CPU_DEBUG }, /* Debug */ + { "pat", CPU_PAT }, /* PAT */ + { "vmx", CPU_VMX }, /* VMX */ + { "call", CPU_CALL }, /* System Call */ + { "base", CPU_BASE }, /* BASE Address */ + { "smm", CPU_SMM }, /* System mgmt mode */ + { "svm", CPU_SVM }, /*Secure Virtial Machine*/ + { "osvm", CPU_OSVM }, /* OS-Visible Workaround*/ + { "tss", CPU_TSS }, /* Task Stack Segment */ + { "cr", CPU_CR }, /* Control Registers */ + { "dt", CPU_DT }, /* Descriptor Table */ + { "registers", CPU_REG_ALL }, /* Select all Registers */ +}; + +static struct cpu_file_base cpu_file[] = { + { "index", CPU_REG_ALL }, /* index */ + { "value", CPU_REG_ALL }, /* value */ +}; + +/* Intel Registers Range */ +static struct cpu_debug_range cpu_intel_range[] = { + { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL }, + { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE }, + { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL }, + { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM }, + { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE }, + { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE }, + + { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE }, + { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON }, + { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON }, + { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE }, + + { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE }, + { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT }, + { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT }, + { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM }, + + { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE }, + { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 }, + { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE }, + { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON }, + + { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT }, + { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT }, + { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT }, + { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE }, + + { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 }, + { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 }, + { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX }, + { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 }, + { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT }, + + { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE }, + { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE }, + { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE }, + { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT }, + { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE }, + { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE }, + { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE }, + { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE }, + + { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT }, + { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON }, + { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE }, + { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON }, + { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE }, + { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 }, + { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE }, + { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 }, + + { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE }, + { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE }, + { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE }, + { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE }, + { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE }, + { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE }, + + { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON }, + { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE }, + { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON }, + { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT }, + { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON }, + { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT }, + { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON }, + { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON }, + { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON }, + { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON }, + { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE }, + { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON }, + + { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE }, + { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON }, + { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE }, + { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON }, + { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE }, + { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON }, + { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE }, + { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON }, + { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE }, + { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE }, + { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE }, + + { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE }, + { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON }, + { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON }, + + { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP }, + + { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON }, + { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON }, + { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON }, + { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON }, +}; + +/* AMD Registers Range */ +static struct cpu_debug_range cpu_amd_range[] = { + { 0x00000010, 0x00000010, CPU_TIME, CPU_ALL, }, + { 0x0000001B, 0x0000001B, CPU_APIC, CPU_ALL, }, + { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_ALL, }, + + { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_ALL, }, + { 0x00000179, 0x0000017A, CPU_MC, CPU_ALL, }, + { 0x0000017B, 0x0000017B, CPU_MC, CPU_ALL, }, + { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_ALL, }, + { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_ALL, }, + + { 0x00000200, 0x0000020F, CPU_MTRR, CPU_ALL, }, + { 0x00000250, 0x00000250, CPU_MTRR, CPU_ALL, }, + { 0x00000258, 0x00000259, CPU_MTRR, CPU_ALL, }, + { 0x00000268, 0x0000026F, CPU_MTRR, CPU_ALL, }, + { 0x00000277, 0x00000277, CPU_PAT, CPU_ALL, }, + { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_ALL, }, + + { 0x00000400, 0x00000417, CPU_MC, CPU_ALL, }, + + { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_ALL, }, + { 0xC0000081, 0xC0000084, CPU_CALL, CPU_ALL, }, + { 0xC0000100, 0xC0000102, CPU_BASE, CPU_ALL, }, + { 0xC0000103, 0xC0000103, CPU_TIME, CPU_ALL, }, + + { 0xC0000408, 0xC000040A, CPU_MC, CPU_ALL, }, + + { 0xc0010000, 0xc0010007, CPU_PMC, CPU_ALL, }, + { 0xc0010010, 0xc0010010, CPU_MTRR, CPU_ALL, }, + { 0xc0010016, 0xc001001A, CPU_MTRR, CPU_ALL, }, + { 0xc001001D, 0xc001001D, CPU_MTRR, CPU_ALL, }, + { 0xc0010030, 0xc0010035, CPU_BIOS, CPU_ALL, }, + { 0xc0010056, 0xc0010056, CPU_SMM, CPU_ALL, }, + { 0xc0010061, 0xc0010063, CPU_SMM, CPU_ALL, }, + { 0xc0010074, 0xc0010074, CPU_MC, CPU_ALL, }, + { 0xc0010111, 0xc0010113, CPU_SMM, CPU_ALL, }, + { 0xc0010114, 0xc0010118, CPU_SVM, CPU_ALL, }, + { 0xc0010119, 0xc001011A, CPU_SMM, CPU_ALL, }, + { 0xc0010140, 0xc0010141, CPU_OSVM, CPU_ALL, }, + { 0xc0010156, 0xc0010156, CPU_SMM, CPU_ALL, }, +}; + + +static int get_cpu_modelflag(unsigned cpu) +{ + int flag; + + switch (per_cpu(cpu_model, cpu)) { + /* Intel */ + case 0x0501: + case 0x0502: + case 0x0504: + flag = CPU_INTEL_PENTIUM; + break; + case 0x0601: + case 0x0603: + case 0x0605: + case 0x0607: + case 0x0608: + case 0x060A: + case 0x060B: + flag = CPU_INTEL_P6; + break; + case 0x0609: + case 0x060D: + flag = CPU_INTEL_PENTIUM_M; + break; + case 0x060E: + flag = CPU_INTEL_CORE; + break; + case 0x060F: + case 0x0617: + flag = CPU_INTEL_CORE2; + break; + case 0x061C: + flag = CPU_INTEL_ATOM; + break; + case 0x0F00: + case 0x0F01: + case 0x0F02: + case 0x0F03: + case 0x0F04: + flag = CPU_INTEL_XEON_P4; + break; + case 0x0F06: + flag = CPU_INTEL_XEON_MP; + break; + default: + flag = CPU_NONE; + break; + } + + return flag; +} + +static int get_cpu_range_count(unsigned cpu) +{ + int index; + + switch (per_cpu(cpu_model, cpu) >> 16) { + case X86_VENDOR_INTEL: + index = ARRAY_SIZE(cpu_intel_range); + break; + case X86_VENDOR_AMD: + index = ARRAY_SIZE(cpu_amd_range); + break; + default: + index = 0; + break; + } + + return index; +} + +static int is_typeflag_valid(unsigned cpu, unsigned flag) +{ + unsigned vendor, modelflag; + int i, index; + + /* Standard Registers should be always valid */ + if (flag >= CPU_TSS) + return 1; + + modelflag = per_cpu(cpu_modelflag, cpu); + vendor = per_cpu(cpu_model, cpu) >> 16; + index = get_cpu_range_count(cpu); + + for (i = 0; i < index; i++) { + switch (vendor) { + case X86_VENDOR_INTEL: + if ((cpu_intel_range[i].model & modelflag) && + (cpu_intel_range[i].flag & flag)) + return 1; + break; + case X86_VENDOR_AMD: + if (cpu_amd_range[i].flag & flag) + return 1; + break; + } + } + + /* Invalid */ + return 0; +} + +static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, + int index, unsigned flag) +{ + unsigned modelflag; + + modelflag = per_cpu(cpu_modelflag, cpu); + *max = 0; + switch (per_cpu(cpu_model, cpu) >> 16) { + case X86_VENDOR_INTEL: + if ((cpu_intel_range[index].model & modelflag) && + (cpu_intel_range[index].flag & flag)) { + *min = cpu_intel_range[index].min; + *max = cpu_intel_range[index].max; + } + break; + case X86_VENDOR_AMD: + if (cpu_amd_range[index].flag & flag) { + *min = cpu_amd_range[index].min; + *max = cpu_amd_range[index].max; + } + break; + } + + return *max; +} + +/* This function can also be called with seq = NULL for printk */ +static void print_cpu_data(struct seq_file *seq, unsigned type, + u32 low, u32 high) +{ + struct cpu_private *priv; + u64 val = high; + + if (seq) { + priv = seq->private; + if (priv->file) { + val = (val << 32) | low; + seq_printf(seq, "0x%llx\n", val); + } else + seq_printf(seq, " %08x: %08x_%08x\n", + type, high, low); + } else + printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low); +} + +/* This function can also be called with seq = NULL for printk */ +static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) +{ + unsigned msr, msr_min, msr_max; + struct cpu_private *priv; + u32 low, high; + int i, range; + + if (seq) { + priv = seq->private; + if (priv->file) { + if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg, + &low, &high)) + print_cpu_data(seq, priv->reg, low, high); + return; + } + } + + range = get_cpu_range_count(cpu); + + for (i = 0; i < range; i++) { + if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) + continue; + + for (msr = msr_min; msr <= msr_max; msr++) { + if (rdmsr_safe_on_cpu(cpu, msr, &low, &high)) + continue; + print_cpu_data(seq, msr, low, high); + } + } +} + +static void print_tss(void *arg) +{ + struct pt_regs *regs = task_pt_regs(current); + struct seq_file *seq = arg; + unsigned int seg; + + seq_printf(seq, " RAX\t: %016lx\n", regs->ax); + seq_printf(seq, " RBX\t: %016lx\n", regs->bx); + seq_printf(seq, " RCX\t: %016lx\n", regs->cx); + seq_printf(seq, " RDX\t: %016lx\n", regs->dx); + + seq_printf(seq, " RSI\t: %016lx\n", regs->si); + seq_printf(seq, " RDI\t: %016lx\n", regs->di); + seq_printf(seq, " RBP\t: %016lx\n", regs->bp); + seq_printf(seq, " ESP\t: %016lx\n", regs->sp); + +#ifdef CONFIG_X86_64 + seq_printf(seq, " R08\t: %016lx\n", regs->r8); + seq_printf(seq, " R09\t: %016lx\n", regs->r9); + seq_printf(seq, " R10\t: %016lx\n", regs->r10); + seq_printf(seq, " R11\t: %016lx\n", regs->r11); + seq_printf(seq, " R12\t: %016lx\n", regs->r12); + seq_printf(seq, " R13\t: %016lx\n", regs->r13); + seq_printf(seq, " R14\t: %016lx\n", regs->r14); + seq_printf(seq, " R15\t: %016lx\n", regs->r15); +#endif + + asm("movl %%cs,%0" : "=r" (seg)); + seq_printf(seq, " CS\t: %04x\n", seg); + asm("movl %%ds,%0" : "=r" (seg)); + seq_printf(seq, " DS\t: %04x\n", seg); + seq_printf(seq, " SS\t: %04lx\n", regs->ss); + asm("movl %%es,%0" : "=r" (seg)); + seq_printf(seq, " ES\t: %04x\n", seg); + asm("movl %%fs,%0" : "=r" (seg)); + seq_printf(seq, " FS\t: %04x\n", seg); + asm("movl %%gs,%0" : "=r" (seg)); + seq_printf(seq, " GS\t: %04x\n", seg); + + seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags); + + seq_printf(seq, " EIP\t: %016lx\n", regs->ip); +} + +static void print_cr(void *arg) +{ + struct seq_file *seq = arg; + + seq_printf(seq, " cr0\t: %016lx\n", read_cr0()); + seq_printf(seq, " cr2\t: %016lx\n", read_cr2()); + seq_printf(seq, " cr3\t: %016lx\n", read_cr3()); + seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe()); +#ifdef CONFIG_X86_64 + seq_printf(seq, " cr8\t: %016lx\n", read_cr8()); +#endif +} + +static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt) +{ + seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size)); +} + +static void print_dt(void *seq) +{ + struct desc_ptr dt; + unsigned long ldt; + + /* IDT */ + store_idt((struct desc_ptr *)&dt); + print_desc_ptr("IDT", seq, dt); + + /* GDT */ + store_gdt((struct desc_ptr *)&dt); + print_desc_ptr("GDT", seq, dt); + + /* LDT */ + store_ldt(ldt); + seq_printf(seq, " LDT\t: %016lx\n", ldt); + + /* TR */ + store_tr(ldt); + seq_printf(seq, " TR\t: %016lx\n", ldt); +} + +static void print_dr(void *arg) +{ + struct seq_file *seq = arg; + unsigned long dr; + int i; + + for (i = 0; i < 8; i++) { + /* Ignore db4, db5 */ + if ((i == 4) || (i == 5)) + continue; + get_debugreg(dr, i); + seq_printf(seq, " dr%d\t: %016lx\n", i, dr); + } + + seq_printf(seq, "\n MSR\t:\n"); +} + +static void print_apic(void *arg) +{ + struct seq_file *seq = arg; + +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(seq, " LAPIC\t:\n"); + seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24); + seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR)); + seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI)); + seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI)); + seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI)); + seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR)); + seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR)); + seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV)); + seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR)); + seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR)); + seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR)); + seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2)); + seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT)); + seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR)); + seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC)); + seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0)); + seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1)); + seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR)); + seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); + seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); + seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); +#endif /* CONFIG_X86_LOCAL_APIC */ + + seq_printf(seq, "\n MSR\t:\n"); +} + +static int cpu_seq_show(struct seq_file *seq, void *v) +{ + struct cpu_private *priv = seq->private; + + if (priv == NULL) + return -EINVAL; + + switch (cpu_base[priv->type].flag) { + case CPU_TSS: + smp_call_function_single(priv->cpu, print_tss, seq, 1); + break; + case CPU_CR: + smp_call_function_single(priv->cpu, print_cr, seq, 1); + break; + case CPU_DT: + smp_call_function_single(priv->cpu, print_dt, seq, 1); + break; + case CPU_DEBUG: + if (priv->file == CPU_INDEX_BIT) + smp_call_function_single(priv->cpu, print_dr, seq, 1); + print_msr(seq, priv->cpu, cpu_base[priv->type].flag); + break; + case CPU_APIC: + if (priv->file == CPU_INDEX_BIT) + smp_call_function_single(priv->cpu, print_apic, seq, 1); + print_msr(seq, priv->cpu, cpu_base[priv->type].flag); + break; + + default: + print_msr(seq, priv->cpu, cpu_base[priv->type].flag); + break; + } + seq_printf(seq, "\n"); + + return 0; +} + +static void *cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos == 0) /* One time is enough ;-) */ + return seq; + + return NULL; +} + +static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + (*pos)++; + + return cpu_seq_start(seq, pos); +} + +static void cpu_seq_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations cpu_seq_ops = { + .start = cpu_seq_start, + .next = cpu_seq_next, + .stop = cpu_seq_stop, + .show = cpu_seq_show, +}; + +static int cpu_seq_open(struct inode *inode, struct file *file) +{ + struct cpu_private *priv = inode->i_private; + struct seq_file *seq; + int err; + + err = seq_open(file, &cpu_seq_ops); + if (!err) { + seq = file->private_data; + seq->private = priv; + } + + return err; +} + +static const struct file_operations cpu_fops = { + .open = cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg, + unsigned file, struct dentry *dentry) +{ + struct cpu_private *priv = NULL; + + /* Already intialized */ + if (file == CPU_INDEX_BIT) + if (per_cpu(cpu_arr[type].init, cpu)) + return 0; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv == NULL) + return -ENOMEM; + + priv->cpu = cpu; + priv->type = type; + priv->reg = reg; + priv->file = file; + mutex_lock(&cpu_debug_lock); + per_cpu(priv_arr[type], cpu) = priv; + per_cpu(cpu_priv_count, cpu)++; + mutex_unlock(&cpu_debug_lock); + + if (file) + debugfs_create_file(cpu_file[file].name, S_IRUGO, + dentry, (void *)priv, &cpu_fops); + else { + debugfs_create_file(cpu_base[type].name, S_IRUGO, + per_cpu(cpu_arr[type].dentry, cpu), + (void *)priv, &cpu_fops); + mutex_lock(&cpu_debug_lock); + per_cpu(cpu_arr[type].init, cpu) = 1; + mutex_unlock(&cpu_debug_lock); + } + + return 0; +} + +static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg, + struct dentry *dentry) +{ + unsigned file; + int err = 0; + + for (file = 0; file < ARRAY_SIZE(cpu_file); file++) { + err = cpu_create_file(cpu, type, reg, file, dentry); + if (err) + return err; + } + + return err; +} + +static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry) +{ + struct dentry *cpu_dentry = NULL; + unsigned reg, reg_min, reg_max; + int i, range, err = 0; + char reg_dir[12]; + u32 low, high; + + range = get_cpu_range_count(cpu); + + for (i = 0; i < range; i++) { + if (!get_cpu_range(cpu, ®_min, ®_max, i, + cpu_base[type].flag)) + continue; + + for (reg = reg_min; reg <= reg_max; reg++) { + if (rdmsr_safe_on_cpu(cpu, reg, &low, &high)) + continue; + + sprintf(reg_dir, "0x%x", reg); + cpu_dentry = debugfs_create_dir(reg_dir, dentry); + err = cpu_init_regfiles(cpu, type, reg, cpu_dentry); + if (err) + return err; + } + } + + return err; +} + +static int cpu_init_allreg(unsigned cpu, struct dentry *dentry) +{ + struct dentry *cpu_dentry = NULL; + unsigned type; + int err = 0; + + for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) { + if (!is_typeflag_valid(cpu, cpu_base[type].flag)) + continue; + cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry); + per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry; + + if (type < CPU_TSS_BIT) + err = cpu_init_msr(cpu, type, cpu_dentry); + else + err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT, + cpu_dentry); + if (err) + return err; + } + + return err; +} + +static int cpu_init_cpu(void) +{ + struct dentry *cpu_dentry = NULL; + struct cpuinfo_x86 *cpui; + char cpu_dir[12]; + unsigned cpu; + int err = 0; + + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { + cpui = &cpu_data(cpu); + if (!cpu_has(cpui, X86_FEATURE_MSR)) + continue; + per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) | + (cpui->x86 << 8) | + (cpui->x86_model)); + per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu); + + sprintf(cpu_dir, "cpu%d", cpu); + cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); + err = cpu_init_allreg(cpu, cpu_dentry); + + pr_info("cpu%d(%d) debug files %d\n", + cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu)); + if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) { + pr_err("Register files count %d exceeds limit %d\n", + per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES); + per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES; + err = -ENFILE; + } + if (err) + return err; + } + + return err; +} + +static int __init cpu_debug_init(void) +{ + cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir); + + return cpu_init_cpu(); +} + +static void __exit cpu_debug_exit(void) +{ + int i, cpu; + + if (cpu_debugfs_dir) + debugfs_remove_recursive(cpu_debugfs_dir); + + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++) + kfree(per_cpu(priv_arr[i], cpu)); +} + +module_init(cpu_debug_init); +module_exit(cpu_debug_exit); + +MODULE_AUTHOR("Jaswinder Singh Rajput"); +MODULE_DESCRIPTION("CPU Debug module"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From fef3a7a17418814733ebde0b40d8e32747677c8f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 10 Mar 2009 10:56:57 +0800 Subject: x86, kexec: fix kexec x86 coding style Impact: Cleanup Fix some coding style issue for kexec x86. Signed-off-by: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/kernel/machine_kexec_32.c | 17 ++++++++++------- arch/x86/kernel/machine_kexec_64.c | 15 ++++++++------- arch/x86/kernel/relocate_kernel_32.S | 24 ++++++++++++++++-------- arch/x86/kernel/relocate_kernel_64.S | 24 ++++++++++++++++-------- 4 files changed, 50 insertions(+), 30 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index f5fc8c781a6..e7368c1da01 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -14,12 +14,12 @@ #include #include #include +#include #include #include #include #include -#include #include #include #include @@ -63,7 +63,7 @@ static void load_segments(void) "\tmovl %%eax,%%fs\n" "\tmovl %%eax,%%gs\n" "\tmovl %%eax,%%ss\n" - ::: "eax", "memory"); + : : : "eax", "memory"); #undef STR #undef __STR } @@ -205,7 +205,8 @@ void machine_kexec(struct kimage *image) if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC - /* We need to put APICs in legacy mode so that we can + /* + * We need to put APICs in legacy mode so that we can * get timer interrupts in second kernel. kexec/kdump * paths already have calls to disable_IO_APIC() in * one form or other. kexec jump path also need @@ -227,7 +228,8 @@ void machine_kexec(struct kimage *image) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); - /* The segment registers are funny things, they have both a + /* + * The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is * set to a specific selector, the invisible part is loaded * with from a table in memory. At no other time is the @@ -237,11 +239,12 @@ void machine_kexec(struct kimage *image) * segments, before I zap the gdt with an invalid value. */ load_segments(); - /* The gdt & idt are now invalid. + /* + * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - set_gdt(phys_to_virt(0),0); - set_idt(phys_to_virt(0),0); + set_gdt(phys_to_virt(0), 0); + set_idt(phys_to_virt(0), 0); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 6993d51b7fd..f8c796fffa0 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -12,11 +12,11 @@ #include #include #include +#include #include #include #include -#include static void init_level2_page(pmd_t *level2p, unsigned long addr) { @@ -83,9 +83,8 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p, } level3p = (pud_t *)page_address(page); result = init_level3_page(image, level3p, addr, last_addr); - if (result) { + if (result) goto out; - } set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); addr += PGDIR_SIZE; } @@ -242,7 +241,8 @@ void machine_kexec(struct kimage *image) page_list[PA_TABLE_PAGE] = (unsigned long)__pa(page_address(image->control_code_page)); - /* The segment registers are funny things, they have both a + /* + * The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is * set to a specific selector, the invisible part is loaded * with from a table in memory. At no other time is the @@ -252,11 +252,12 @@ void machine_kexec(struct kimage *image) * segments, before I zap the gdt with an invalid value. */ load_segments(); - /* The gdt & idt are now invalid. + /* + * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - set_gdt(phys_to_virt(0),0); - set_idt(phys_to_virt(0),0); + set_gdt(phys_to_virt(0), 0); + set_idt(phys_to_virt(0), 0); /* now call it */ relocate_kernel((unsigned long)image->head, (unsigned long)page_list, diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 2064d0aa8d2..41235531b11 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -17,7 +17,8 @@ #define PTR(x) (x << 2) -/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE +/* + * control_page + KEXEC_CONTROL_CODE_MAX_SIZE * ~ control_page + PAGE_SIZE are used as data storage and stack for * jumping back */ @@ -76,8 +77,10 @@ relocate_kernel: movl %eax, CP_PA_SWAP_PAGE(%edi) movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) - /* get physical address of control page now */ - /* this is impossible after page table switch */ + /* + * get physical address of control page now + * this is impossible after page table switch + */ movl PTR(PA_CONTROL_PAGE)(%ebp), %edi /* switch to new set of page tables */ @@ -97,7 +100,8 @@ identity_mapped: /* store the start address on the stack */ pushl %edx - /* Set cr0 to a known state: + /* + * Set cr0 to a known state: * - Paging disabled * - Alignment check disabled * - Write protect disabled @@ -113,7 +117,8 @@ identity_mapped: /* clear cr4 if applicable */ testl %ecx, %ecx jz 1f - /* Set cr4 to a known state: + /* + * Set cr4 to a known state: * Setting everything to zero seems safe. */ xorl %eax, %eax @@ -132,15 +137,18 @@ identity_mapped: call swap_pages addl $8, %esp - /* To be certain of avoiding problems with self-modifying code + /* + * To be certain of avoiding problems with self-modifying code * I need to execute a serializing instruction here. * So I flush the TLB, it's handy, and not processor dependent. */ xorl %eax, %eax movl %eax, %cr3 - /* set all of the registers to known values */ - /* leave %esp alone */ + /* + * set all of the registers to known values + * leave %esp alone + */ testl %esi, %esi jnz 1f diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index d32cfb27a47..cfc0d24003d 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -24,7 +24,8 @@ .code64 .globl relocate_kernel relocate_kernel: - /* %rdi indirection_page + /* + * %rdi indirection_page * %rsi page_list * %rdx start address */ @@ -33,8 +34,10 @@ relocate_kernel: pushq $0 popfq - /* get physical address of control page now */ - /* this is impossible after page table switch */ + /* + * get physical address of control page now + * this is impossible after page table switch + */ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 /* get physical address of page table now too */ @@ -55,7 +58,8 @@ identity_mapped: /* store the start address on the stack */ pushq %rdx - /* Set cr0 to a known state: + /* + * Set cr0 to a known state: * - Paging enabled * - Alignment check disabled * - Write protect disabled @@ -68,7 +72,8 @@ identity_mapped: orl $(X86_CR0_PG | X86_CR0_PE), %eax movq %rax, %cr0 - /* Set cr4 to a known state: + /* + * Set cr4 to a known state: * - physical address extension enabled */ movq $X86_CR4_PAE, %rax @@ -117,7 +122,8 @@ identity_mapped: jmp 0b 3: - /* To be certain of avoiding problems with self-modifying code + /* + * To be certain of avoiding problems with self-modifying code * I need to execute a serializing instruction here. * So I flush the TLB by reloading %cr3 here, it's handy, * and not processor dependent. @@ -125,8 +131,10 @@ identity_mapped: movq %cr3, %rax movq %rax, %cr3 - /* set all of the registers to known values */ - /* leave %rsp alone */ + /* + * set all of the registers to known values + * leave %rsp alone + */ xorq %rax, %rax xorq %rbx, %rbx -- cgit v1.2.3 From 5359454701ce51a4626b1ef6eb7b16ec35bd458d Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 10 Mar 2009 10:57:04 +0800 Subject: x86, kexec: x86_64: add identity map for pages at image->start Impact: Fix corner case that cannot yet occur image->start may be outside of 0 ~ max_pfn, for example when jumping back to original kernel from kexeced kenrel. This patch add identity map for pages at image->start. Signed-off-by: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/kernel/machine_kexec_64.c | 42 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index f8c796fffa0..7cc5d3d0148 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -18,6 +18,41 @@ #include #include +static int init_one_level2_page(struct kimage *image, pgd_t *pgd, + unsigned long addr) +{ + pud_t *pud; + pmd_t *pmd; + struct page *page; + int result = -ENOMEM; + + addr &= PMD_MASK; + pgd += pgd_index(addr); + if (!pgd_present(*pgd)) { + page = kimage_alloc_control_pages(image, 0); + if (!page) + goto out; + pud = (pud_t *)page_address(page); + memset(pud, 0, PAGE_SIZE); + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) { + page = kimage_alloc_control_pages(image, 0); + if (!page) + goto out; + pmd = (pmd_t *)page_address(page); + memset(pmd, 0, PAGE_SIZE); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + result = 0; +out: + return result; +} + static void init_level2_page(pmd_t *level2p, unsigned long addr) { unsigned long end_addr; @@ -153,6 +188,13 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) int result; level4p = (pgd_t *)__va(start_pgtable); result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); + if (result) + return result; + /* + * image->start may be outside 0 ~ max_pfn, for example when + * jump back to original kernel from kexeced kernel + */ + result = init_one_level2_page(image, level4p, image->start); if (result) return result; return init_transition_pgtable(image, level4p); -- cgit v1.2.3 From fee7b0d84cc8c7bc5dc212901c79e93eaf83a5b5 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 10 Mar 2009 10:57:16 +0800 Subject: x86, kexec: x86_64: add kexec jump support for x86_64 Impact: New major feature This patch add kexec jump support for x86_64. More information about kexec jump can be found in corresponding x86_32 support patch. Signed-off-by: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/kernel/machine_kexec_64.c | 42 ++++++++- arch/x86/kernel/relocate_kernel_64.S | 177 ++++++++++++++++++++++++++++------- arch/x86/kernel/vmlinux_64.lds.S | 7 ++ 3 files changed, 189 insertions(+), 37 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 7cc5d3d0148..89cea4d4467 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -270,19 +271,43 @@ void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; void *control_page; + int save_ftrace_enabled; - tracer_disable(); +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) + save_processor_state(); +#endif + + save_ftrace_enabled = __ftrace_enabled_save(); /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + if (image->preserve_context) { +#ifdef CONFIG_X86_IO_APIC + /* + * We need to put APICs in legacy mode so that we can + * get timer interrupts in second kernel. kexec/kdump + * paths already have calls to disable_IO_APIC() in + * one form or other. kexec jump path also need + * one. + */ + disable_IO_APIC(); +#endif + } + control_page = page_address(image->control_code_page) + PAGE_SIZE; - memcpy(control_page, relocate_kernel, PAGE_SIZE); + memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; page_list[PA_TABLE_PAGE] = (unsigned long)__pa(page_address(image->control_code_page)); + if (image->type == KEXEC_TYPE_DEFAULT) + page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) + << PAGE_SHIFT); + /* * The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is @@ -302,8 +327,17 @@ void machine_kexec(struct kimage *image) set_idt(phys_to_virt(0), 0); /* now call it */ - relocate_kernel((unsigned long)image->head, (unsigned long)page_list, - image->start); + image->start = relocate_kernel((unsigned long)image->head, + (unsigned long)page_list, + image->start, + image->preserve_context); + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) + restore_processor_state(); +#endif + + __ftrace_enabled_restore(save_ftrace_enabled); } void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index cfc0d24003d..4de8f5b3d47 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -19,6 +19,24 @@ #define PTR(x) (x << 3) #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +/* + * control_page + KEXEC_CONTROL_CODE_MAX_SIZE + * ~ control_page + PAGE_SIZE are used as data storage and stack for + * jumping back + */ +#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) + +/* Minimal CPU state */ +#define RSP DATA(0x0) +#define CR0 DATA(0x8) +#define CR3 DATA(0x10) +#define CR4 DATA(0x18) + +/* other data */ +#define CP_PA_TABLE_PAGE DATA(0x20) +#define CP_PA_SWAP_PAGE DATA(0x28) +#define CP_PA_BACKUP_PAGES_MAP DATA(0x30) + .text .align PAGE_SIZE .code64 @@ -28,8 +46,27 @@ relocate_kernel: * %rdi indirection_page * %rsi page_list * %rdx start address + * %rcx preserve_context */ + /* Save the CPU context, used for jumping back */ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushf + + movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 + movq %rsp, RSP(%r11) + movq %cr0, %rax + movq %rax, CR0(%r11) + movq %cr3, %rax + movq %rax, CR3(%r11) + movq %cr4, %rax + movq %rax, CR4(%r11) + /* zero out flags, and disable interrupts */ pushq $0 popfq @@ -41,10 +78,18 @@ relocate_kernel: movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 /* get physical address of page table now too */ - movq PTR(PA_TABLE_PAGE)(%rsi), %rcx + movq PTR(PA_TABLE_PAGE)(%rsi), %r9 + + /* get physical address of swap page now */ + movq PTR(PA_SWAP_PAGE)(%rsi), %r10 + + /* save some information for jumping back */ + movq %r9, CP_PA_TABLE_PAGE(%r11) + movq %r10, CP_PA_SWAP_PAGE(%r11) + movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) /* Switch to the identity mapped page tables */ - movq %rcx, %cr3 + movq %r9, %cr3 /* setup a new stack at the end of the physical control page */ lea PAGE_SIZE(%r8), %rsp @@ -83,9 +128,87 @@ identity_mapped: 1: /* Flush the TLB (needed?) */ - movq %rcx, %cr3 + movq %r9, %cr3 + + movq %rcx, %r11 + call swap_pages + + /* + * To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB by reloading %cr3 here, it's handy, + * and not processor dependent. + */ + movq %cr3, %rax + movq %rax, %cr3 + + /* + * set all of the registers to known values + * leave %rsp alone + */ + + testq %r11, %r11 + jnz 1f + xorq %rax, %rax + xorq %rbx, %rbx + xorq %rcx, %rcx + xorq %rdx, %rdx + xorq %rsi, %rsi + xorq %rdi, %rdi + xorq %rbp, %rbp + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r9 + xorq %r11, %r11 + xorq %r12, %r12 + xorq %r13, %r13 + xorq %r14, %r14 + xorq %r15, %r15 + + ret + +1: + popq %rdx + leaq PAGE_SIZE(%r10), %rsp + call *%rdx + + /* get the re-entry point of the peer system */ + movq 0(%rsp), %rbp + call 1f +1: + popq %r8 + subq $(1b - relocate_kernel), %r8 + movq CP_PA_SWAP_PAGE(%r8), %r10 + movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi + movq CP_PA_TABLE_PAGE(%r8), %rax + movq %rax, %cr3 + lea PAGE_SIZE(%r8), %rsp + call swap_pages + movq $virtual_mapped, %rax + pushq %rax + ret + +virtual_mapped: + movq RSP(%r8), %rsp + movq CR4(%r8), %rax + movq %rax, %cr4 + movq CR3(%r8), %rax + movq CR0(%r8), %r8 + movq %rax, %cr3 + movq %r8, %cr0 + movq %rbp, %rax + + popf + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret /* Do the copies */ +swap_pages: movq %rdi, %rcx /* Put the page_list in %rcx */ xorq %rdi, %rdi xorq %rsi, %rsi @@ -117,39 +240,27 @@ identity_mapped: movq %rcx, %rsi /* For ever source page do a copy */ andq $0xfffffffffffff000, %rsi + movq %rdi, %rdx + movq %rsi, %rax + + movq %r10, %rdi movq $512, %rcx rep ; movsq - jmp 0b -3: - /* - * To be certain of avoiding problems with self-modifying code - * I need to execute a serializing instruction here. - * So I flush the TLB by reloading %cr3 here, it's handy, - * and not processor dependent. - */ - movq %cr3, %rax - movq %rax, %cr3 - - /* - * set all of the registers to known values - * leave %rsp alone - */ + movq %rax, %rdi + movq %rdx, %rsi + movq $512, %rcx + rep ; movsq - xorq %rax, %rax - xorq %rbx, %rbx - xorq %rcx, %rcx - xorq %rdx, %rdx - xorq %rsi, %rsi - xorq %rdi, %rdi - xorq %rbp, %rbp - xorq %r8, %r8 - xorq %r9, %r9 - xorq %r10, %r9 - xorq %r11, %r11 - xorq %r12, %r12 - xorq %r13, %r13 - xorq %r14, %r14 - xorq %r15, %r15 + movq %rdx, %rdi + movq %r10, %rsi + movq $512, %rcx + rep ; movsq + lea PAGE_SIZE(%rax), %rsi + jmp 0b +3: ret + + .globl kexec_control_code_size +.set kexec_control_code_size, . - relocate_kernel diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index fbfced6f680..5bf54e40c6e 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -275,3 +275,10 @@ ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), ASSERT((per_cpu__irq_stack_union == 0), "irq_stack_union is not at start of per-cpu area"); #endif + +#ifdef CONFIG_KEXEC +#include + +ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big") +#endif -- cgit v1.2.3 From 5490fa96735ce0e2af270c0868987d644b9a38ec Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 11 Mar 2009 10:14:26 +0900 Subject: x86, mce: use round_jiffies() instead round_jiffies_relative() Impact: saving power _very_ little round_jiffies() round up absolute jiffies to full second. round_jiffies_relative() round up relative jiffies to full second. The "t->expires" is absolute jiffies. Then, round_jiffies() should be used instead round_jiffies_relative(). Signed-off-by: KOSAKI Motohiro Cc: Andi Kleen Cc: H. Peter Anvin Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index bfbd5323a63..ca14604611e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -639,7 +639,7 @@ static void mce_init_timer(void) if (!next_interval) return; setup_timer(t, mcheck_timer, smp_processor_id()); - t->expires = round_jiffies_relative(jiffies + next_interval); + t->expires = round_jiffies(jiffies + next_interval); add_timer(t); } @@ -1110,7 +1110,7 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - t->expires = round_jiffies_relative(jiffies + next_interval); + t->expires = round_jiffies(jiffies + next_interval); add_timer_on(t, cpu); smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; -- cgit v1.2.3 From 8229d754383e8cd905c38b56bd7365c7fc10dfc1 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 11 Mar 2009 19:13:49 +0530 Subject: x86: cpu architecture debug code, build fix, cleanup move store_ldt outside the CONFIG_PARAVIRT section and also clean up the code a bit. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpu_debug.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 0bdf4daba20..9abbcbd933c 100755 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -427,7 +428,7 @@ static void print_tss(void *arg) seq_printf(seq, " CS\t: %04x\n", seg); asm("movl %%ds,%0" : "=r" (seg)); seq_printf(seq, " DS\t: %04x\n", seg); - seq_printf(seq, " SS\t: %04lx\n", regs->ss); + seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff); asm("movl %%es,%0" : "=r" (seg)); seq_printf(seq, " ES\t: %04x\n", seg); asm("movl %%fs,%0" : "=r" (seg)); -- cgit v1.2.3 From 9fa7266c2f0cdfcb09eacba2d3624bec7df78641 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 10:34:45 +0000 Subject: x86: remove leftover unwind annotations Impact: cleanup These got left in needlessly when ret_from_fork got simplified. Signed-off-by: Jan Beulich LKML-Reference: <49B8F355.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 7ba4621c0df..54866bb5d38 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -416,7 +416,6 @@ ENTRY(ret_from_fork) GET_THREAD_INFO(%rcx) - CFI_REMEMBER_STATE RESTORE_REST testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? @@ -428,7 +427,6 @@ ENTRY(ret_from_fork) RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET jmp ret_from_sys_call # go to the SYSRET fastpath - CFI_RESTORE_STATE CFI_ENDPROC END(ret_from_fork) -- cgit v1.2.3 From c2810188c1b810c68139608a207befae0a4f1e69 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 10:38:55 +0000 Subject: x86-64: move save_paranoid into .kprobes.text Impact: mark save_paranoid as non-kprobe-able code This appears to be necessary as the function gets called from kprobes-unsafe exception handling stubs (i.e. which themselves live in .kprobes.text). Signed-off-by: Jan Beulich LKML-Reference: <49B8F44F.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 54866bb5d38..a331ec38af9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -368,6 +368,7 @@ ENTRY(save_rest) END(save_rest) /* save complete stack frame */ + .pushsection .kprobes.text, "ax" ENTRY(save_paranoid) XCPT_FRAME 1 RDI+8 cld @@ -396,6 +397,7 @@ ENTRY(save_paranoid) 1: ret CFI_ENDPROC END(save_paranoid) + .popsection /* * A newly forked process directly context switches into this address. -- cgit v1.2.3 From 02dde8b45c5460794b9052d7c12939fe3eb63c2c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 12:08:49 +0000 Subject: x86: move various CPU initialization objects into .cpuinit.rodata Impact: debuggability and micro-optimization Putting whatever is possible into the (final) .rodata section increases the likelihood of catching memory corruption bugs early, and reduces false cache line sharing. Signed-off-by: Jan Beulich LKML-Reference: <49B90961.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/addon_cpuid_features.c | 2 +- arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/centaur.c | 2 +- arch/x86/kernel/cpu/centaur_64.c | 2 +- arch/x86/kernel/cpu/common.c | 20 ++++++++++---------- arch/x86/kernel/cpu/cpu.h | 11 ++++++----- arch/x86/kernel/cpu/cyrix.c | 16 ++++++++-------- arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/kernel/cpu/intel_cacheinfo.c | 8 ++++---- arch/x86/kernel/cpu/transmeta.c | 2 +- arch/x86/kernel/cpu/umc.c | 2 +- arch/x86/kernel/mmconf-fam10h_64.c | 2 +- 12 files changed, 36 insertions(+), 35 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 6882a735d9c..8220ae69849 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -29,7 +29,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) u32 regs[4]; const struct cpuid_bit *cb; - static const struct cpuid_bit cpuid_bits[] = { + static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, { 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f47df59016c..7e4a459daa6 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -502,7 +502,7 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int } #endif -static struct cpu_dev amd_cpu_dev __cpuinitdata = { +static const struct cpu_dev __cpuinitconst amd_cpu_dev = { .c_vendor = "AMD", .c_ident = { "AuthenticAMD" }, #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 89bfdd9cacc..983e0830f0d 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -468,7 +468,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) return size; } -static struct cpu_dev centaur_cpu_dev __cpuinitdata = { +static const struct cpu_dev __cpuinitconst centaur_cpu_dev = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, .c_early_init = early_init_centaur, diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c index a1625f5a1e7..51b09c48c9c 100644 --- a/arch/x86/kernel/cpu/centaur_64.c +++ b/arch/x86/kernel/cpu/centaur_64.c @@ -25,7 +25,7 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); } -static struct cpu_dev centaur_cpu_dev __cpuinitdata = { +static const struct cpu_dev centaur_cpu_dev __cpuinitconst = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, .c_early_init = early_init_centaur, diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f8869978bbb..54cbe7690f9 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -70,7 +70,7 @@ cpumask_t cpu_sibling_setup_map; #endif /* CONFIG_X86_32 */ -static struct cpu_dev *this_cpu __cpuinitdata; +static const struct cpu_dev *this_cpu __cpuinitdata; DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 @@ -274,9 +274,9 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) */ /* Look up CPU names by table lookup. */ -static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) +static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c) { - struct cpu_model_info *info; + const struct cpu_model_info *info; if (c->x86_model >= 16) return NULL; /* Range check */ @@ -321,7 +321,7 @@ void switch_to_new_gdt(int cpu) load_percpu_segment(cpu); } -static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; +static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; static void __cpuinit default_init(struct cpuinfo_x86 *c) { @@ -340,7 +340,7 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c) #endif } -static struct cpu_dev __cpuinitdata default_cpu = { +static const struct cpu_dev __cpuinitconst default_cpu = { .c_init = default_init, .c_vendor = "Unknown", .c_x86_vendor = X86_VENDOR_UNKNOWN, @@ -634,12 +634,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) void __init early_cpu_init(void) { - struct cpu_dev **cdev; + const struct cpu_dev *const *cdev; int count = 0; printk("KERNEL supported cpus:\n"); for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { - struct cpu_dev *cpudev = *cdev; + const struct cpu_dev *cpudev = *cdev; unsigned int j; if (count >= X86_VENDOR_NUM) @@ -768,7 +768,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* If the model name is still unset, do table lookup. */ if (!c->x86_model_id[0]) { - char *p; + const char *p; p = table_lookup_model(c); if (p) strcpy(c->x86_model_id, p); @@ -847,7 +847,7 @@ struct msr_range { unsigned max; }; -static struct msr_range msr_range_array[] __cpuinitdata = { +static const struct msr_range msr_range_array[] __cpuinitconst = { { 0x00000000, 0x00000418}, { 0xc0000000, 0xc000040b}, { 0xc0010000, 0xc0010142}, @@ -894,7 +894,7 @@ __setup("noclflush", setup_noclflush); void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { - char *vendor = NULL; + const char *vendor = NULL; if (c->x86_vendor < X86_VENDOR_NUM) vendor = this_cpu->c_vendor; diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index de4094a3921..9469ecb5aeb 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -5,15 +5,15 @@ struct cpu_model_info { int vendor; int family; - char *model_names[16]; + const char *model_names[16]; }; /* attempt to consolidate cpu attributes */ struct cpu_dev { - char * c_vendor; + const char * c_vendor; /* some have two possibilities for cpuid string */ - char * c_ident[2]; + const char * c_ident[2]; struct cpu_model_info c_models[4]; @@ -25,11 +25,12 @@ struct cpu_dev { }; #define cpu_dev_register(cpu_devX) \ - static struct cpu_dev *__cpu_dev_##cpu_devX __used \ + static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ __attribute__((__section__(".x86_cpu_dev.init"))) = \ &cpu_devX; -extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[]; +extern const struct cpu_dev *const __x86_cpu_dev_start[], + *const __x86_cpu_dev_end[]; extern void display_cacheinfo(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index ffd0f5ed071..593171e967e 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -61,23 +61,23 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) */ static unsigned char Cx86_dir0_msb __cpuinitdata = 0; -static char Cx86_model[][9] __cpuinitdata = { +static const char __cpuinitconst Cx86_model[][9] = { "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ", "M II ", "Unknown" }; -static char Cx486_name[][5] __cpuinitdata = { +static const char __cpuinitconst Cx486_name[][5] = { "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx", "SRx2", "DRx2" }; -static char Cx486S_name[][4] __cpuinitdata = { +static const char __cpuinitconst Cx486S_name[][4] = { "S", "S2", "Se", "S2e" }; -static char Cx486D_name[][4] __cpuinitdata = { +static const char __cpuinitconst Cx486D_name[][4] = { "DX", "DX2", "?", "?", "?", "DX4" }; static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock"; -static char cyrix_model_mult1[] __cpuinitdata = "12??43"; -static char cyrix_model_mult2[] __cpuinitdata = "12233445"; +static const char __cpuinitconst cyrix_model_mult1[] = "12??43"; +static const char __cpuinitconst cyrix_model_mult2[] = "12233445"; /* * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old @@ -435,7 +435,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) } } -static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { +static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = { .c_vendor = "Cyrix", .c_ident = { "CyrixInstead" }, .c_early_init = early_init_cyrix, @@ -446,7 +446,7 @@ static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { cpu_dev_register(cyrix_cpu_dev); -static struct cpu_dev nsc_cpu_dev __cpuinitdata = { +static const struct cpu_dev __cpuinitconst nsc_cpu_dev = { .c_vendor = "NSC", .c_ident = { "Geode by NSC" }, .c_init = init_nsc, diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 191117f1ad5..968f15129ed 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -410,7 +410,7 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i } #endif -static struct cpu_dev intel_cpu_dev __cpuinitdata = { +static const struct cpu_dev __cpuinitconst intel_cpu_dev = { .c_vendor = "Intel", .c_ident = { "GenuineIntel" }, #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 7293508d8f5..c471eb1a389 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -32,7 +32,7 @@ struct _cache_table }; /* all the cache descriptor types we care about (no TLB or trace cache entries) */ -static struct _cache_table cache_table[] __cpuinitdata = +static const struct _cache_table __cpuinitconst cache_table[] = { { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ @@ -206,15 +206,15 @@ union l3_cache { unsigned val; }; -static unsigned short assocs[] __cpuinitdata = { +static const unsigned short __cpuinitconst assocs[] = { [1] = 1, [2] = 2, [4] = 4, [6] = 8, [8] = 16, [0xa] = 32, [0xb] = 48, [0xc] = 64, [0xf] = 0xffff // ?? }; -static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 }; -static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 }; +static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; +static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 }; static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 52b3fefbd5a..bb62b3e5caa 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -98,7 +98,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) #endif } -static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { +static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = { .c_vendor = "Transmeta", .c_ident = { "GenuineTMx86", "TransmetaCPU" }, .c_early_init = early_init_transmeta, diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c index e777f79e096..fd2c37bf7ac 100644 --- a/arch/x86/kernel/cpu/umc.c +++ b/arch/x86/kernel/cpu/umc.c @@ -8,7 +8,7 @@ * so no special init takes place. */ -static struct cpu_dev umc_cpu_dev __cpuinitdata = { +static const struct cpu_dev __cpuinitconst umc_cpu_dev = { .c_vendor = "UMC", .c_ident = { "UMC UMC UMC" }, .c_models = { diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index 666e43df51f..712d15fdc41 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -226,7 +226,7 @@ static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d) return 0; } -static struct dmi_system_id __devinitdata mmconf_dmi_table[] = { +static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = { { .callback = set_check_enable_amd_mmconf, .ident = "Sun Microsystems Machine", -- cgit v1.2.3 From 7a81d9a7da03d2f27840d659f97ef140d032f609 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 12:45:15 +0000 Subject: x86: smarten /proc/interrupts output Impact: change /proc/interrupts output ABI With the number of interrupts on large systems growing, assumptions on the width an interrupt number requires when converted to a decimal string turn invalid. Therefore, calculate the maximum number of digits dynamically. Signed-off-by: Jan Beulich LKML-Reference: <49B911EB.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index b864341dcc4..b8ac3b6cf77 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -45,16 +45,16 @@ void ack_bad_irq(unsigned int irq) /* * /proc/interrupts printing: */ -static int show_other_interrupts(struct seq_file *p) +static int show_other_interrupts(struct seq_file *p, int prec) { int j; - seq_printf(p, "NMI: "); + seq_printf(p, "%*s: ", prec, "NMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); seq_printf(p, " Non-maskable interrupts\n"); #ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "LOC: "); + seq_printf(p, "%*s: ", prec, "LOC"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); seq_printf(p, " Local timer interrupts\n"); @@ -66,40 +66,40 @@ static int show_other_interrupts(struct seq_file *p) seq_printf(p, " Platform interrupts\n"); } #ifdef CONFIG_SMP - seq_printf(p, "RES: "); + seq_printf(p, "%*s: ", prec, "RES"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); seq_printf(p, " Rescheduling interrupts\n"); - seq_printf(p, "CAL: "); + seq_printf(p, "%*s: ", prec, "CAL"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); seq_printf(p, " Function call interrupts\n"); - seq_printf(p, "TLB: "); + seq_printf(p, "%*s: ", prec, "TLB"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); seq_printf(p, " TLB shootdowns\n"); #endif #ifdef CONFIG_X86_MCE - seq_printf(p, "TRM: "); + seq_printf(p, "%*s: ", prec, "TRM"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); seq_printf(p, " Thermal event interrupts\n"); # ifdef CONFIG_X86_64 - seq_printf(p, "THR: "); + seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); # endif #endif #ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "SPU: "); + seq_printf(p, "%*s: ", prec, "SPU"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); #endif - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); + seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); + seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); #endif return 0; } @@ -107,19 +107,22 @@ static int show_other_interrupts(struct seq_file *p) int show_interrupts(struct seq_file *p, void *v) { unsigned long flags, any_count = 0; - int i = *(loff_t *) v, j; + int i = *(loff_t *) v, j, prec; struct irqaction *action; struct irq_desc *desc; if (i > nr_irqs) return 0; + for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) + j *= 10; + if (i == nr_irqs) - return show_other_interrupts(p); + return show_other_interrupts(p, prec); /* print header */ if (i == 0) { - seq_printf(p, " "); + seq_printf(p, "%*s", prec + 8, ""); for_each_online_cpu(j) seq_printf(p, "CPU%-8d", j); seq_putc(p, '\n'); @@ -140,7 +143,7 @@ int show_interrupts(struct seq_file *p, void *v) if (!action && !any_count) goto out; - seq_printf(p, "%3d: ", i); + seq_printf(p, "%*d: ", prec, i); #ifndef CONFIG_SMP seq_printf(p, "%10u ", kstat_irqs(i)); #else -- cgit v1.2.3 From 13c6c53282d99c82e79b02477efd2c1e30a991ef Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 12:37:34 +0000 Subject: x86, 32-bit: also use cpuinfo_x86's x86_{phys,virt}_bits members Impact: 32/64-bit consolidation In a first step, this allows fixing phys_addr_valid() for PAE (which until now reported all addresses to be valid). Subsequently, this will also allow simplifying some MTRR handling code. Signed-off-by: Jan Beulich LKML-Reference: <49B9101E.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 12 +++++++++++- arch/x86/kernel/cpu/intel.c | 5 +++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 826d5c87627..a95e9480bb9 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -549,13 +549,15 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) } } -#ifdef CONFIG_X86_64 if (c->extended_cpuid_level >= 0x80000008) { u32 eax = cpuid_eax(0x80000008); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; } +#ifdef CONFIG_X86_32 + else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) + c->x86_phys_bits = 36; #endif if (c->extended_cpuid_level >= 0x80000007) @@ -602,8 +604,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; #else c->x86_clflush_size = 32; + c->x86_phys_bits = 32; + c->x86_virt_bits = 32; #endif c->x86_cache_alignment = c->x86_clflush_size; @@ -726,9 +732,13 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86_coreid_bits = 0; #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; #else c->cpuid_level = -1; /* CPUID not detected */ c->x86_clflush_size = 32; + c->x86_phys_bits = 32; + c->x86_virt_bits = 32; #endif c->x86_cache_alignment = c->x86_clflush_size; memset(&c->x86_capability, 0, sizeof c->x86_capability); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 191117f1ad5..ae769471042 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -54,6 +54,11 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) c->x86_cache_alignment = 128; #endif + /* CPUID workaround for 0F33/0F34 CPU */ + if (c->x86 == 0xF && c->x86_model == 0x3 + && (c->x86_mask == 0x3 || c->x86_mask == 0x4)) + c->x86_phys_bits = 36; + /* * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate * with P/T states and does not stop in deep C-states -- cgit v1.2.3 From 9a50156a1c7bfa65315facaffdfbed6513fcfd3b Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 12:41:23 +0000 Subject: x86: properly __init-annotate recent early_printk additions Impact: cleanup, save memory Don't keep code resident that's only needed during startup. Signed-off-by: Jan Beulich LKML-Reference: <49B91103.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/early_printk.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 639ad98238a..335f049d110 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -250,7 +250,7 @@ static int dbgp_wait_until_complete(void) return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); } -static void dbgp_mdelay(int ms) +static void __init dbgp_mdelay(int ms) { int i; @@ -311,7 +311,7 @@ static void dbgp_set_data(const void *buf, int size) writel(hi, &ehci_debug->data47); } -static void dbgp_get_data(void *buf, int size) +static void __init dbgp_get_data(void *buf, int size) { unsigned char *bytes = buf; u32 lo, hi; @@ -355,7 +355,7 @@ static int dbgp_bulk_write(unsigned devnum, unsigned endpoint, return ret; } -static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, +static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, int size) { u32 pids, addr, ctrl; @@ -386,8 +386,8 @@ static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, return ret; } -static int dbgp_control_msg(unsigned devnum, int requesttype, int request, - int value, int index, void *data, int size) +static int __init dbgp_control_msg(unsigned devnum, int requesttype, + int request, int value, int index, void *data, int size) { u32 pids, addr, ctrl; struct usb_ctrlrequest req; @@ -489,7 +489,7 @@ static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc) return 0; } -static int ehci_reset_port(int port) +static int __init ehci_reset_port(int port) { u32 portsc; u32 delay_time, delay; @@ -532,7 +532,7 @@ static int ehci_reset_port(int port) return -EBUSY; } -static int ehci_wait_for_port(int port) +static int __init ehci_wait_for_port(int port) { u32 status; int ret, reps; @@ -557,13 +557,13 @@ static inline void dbgp_printk(const char *fmt, ...) { } typedef void (*set_debug_port_t)(int port); -static void default_set_debug_port(int port) +static void __init default_set_debug_port(int port) { } -static set_debug_port_t set_debug_port = default_set_debug_port; +static set_debug_port_t __initdata set_debug_port = default_set_debug_port; -static void nvidia_set_debug_port(int port) +static void __init nvidia_set_debug_port(int port) { u32 dword; dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, -- cgit v1.2.3 From 82034d6f59b4772f4233bbb61c670290803a9960 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 12:57:10 +0000 Subject: x86: clean up output resulting from update_mptable option Impact: cleanup Without apic=verbose, using the update_mptable option would result in garbled and confusing output due to the inconsistent use of printk() vs apic_printk(). Signed-off-by: Jan Beulich LKML-Reference: <49B914B6.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index e8192401da4..47673e02ae5 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -890,12 +890,12 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, #ifdef CONFIG_X86_IO_APIC struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; - printk(KERN_INFO "OLD "); + apic_printk(APIC_VERBOSE, "OLD "); print_MP_intsrc_info(m); i = get_MP_intsrc_index(m); if (i > 0) { assign_to_mpc_intsrc(&mp_irqs[i], m); - printk(KERN_INFO "NEW "); + apic_printk(APIC_VERBOSE, "NEW "); print_mp_irq_info(&mp_irqs[i]); } else if (!i) { /* legacy, do nothing */ @@ -943,7 +943,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, continue; if (nr_m_spare > 0) { - printk(KERN_INFO "*NEW* found "); + apic_printk(APIC_VERBOSE, "*NEW* found\n"); nr_m_spare--; assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); m_spare[nr_m_spare] = NULL; -- cgit v1.2.3 From 5c0e6f035df983210e4d22213aed624ced502d3d Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 13:07:23 +0000 Subject: x86: fix code paths used by update_mptable Impact: fix crashes under Xen due to unrobust e820 code find_e820_area_size() must return a properly distinguishable and out-of-bounds value when it fails, and -1UL does not meet that criteria on i386/PAE. Additionally, callers of the function must check against that value. early_reserve_e820() should be prepared for the region found to be outside of the addressable range on 32-bits. e820_update_range_map() should not blindly update e820, but should do all it work on the map it got a pointer passed for (which in 50% of the cases is &e820_saved). It must also not call e820_add_region(), as that again acts on e820 unconditionally. The issues were found when trying to make this option work in our Xen kernel (i.e. where some of the silent assumptions made in the code would not hold). Signed-off-by: Jan Beulich LKML-Reference: <49B9171B.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/check.c | 2 +- arch/x86/kernel/e820.c | 32 +++++++++++++++++++++++++------- 2 files changed, 26 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 2ac0ab71412..b617b1164f1 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -83,7 +83,7 @@ void __init setup_bios_corruption_check(void) u64 size; addr = find_e820_area_size(addr, &size, PAGE_SIZE); - if (addr == 0) + if (!(addr + 1)) break; if ((addr + size) > corruption_check_size) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 508bec1cee2..3cf6681ac80 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -421,7 +421,7 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, u64 size, unsigned old_type, unsigned new_type) { - int i; + unsigned int i, x; u64 real_updated_size = 0; BUG_ON(old_type == new_type); @@ -429,7 +429,7 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, if (size > (ULLONG_MAX - start)) size = ULLONG_MAX - start; - for (i = 0; i < e820.nr_map; i++) { + for (i = 0; i < e820x->nr_map; i++) { struct e820entry *ei = &e820x->map[i]; u64 final_start, final_end; if (ei->type != old_type) @@ -446,14 +446,23 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, final_end = min(start + size, ei->addr + ei->size); if (final_start >= final_end) continue; - e820_add_region(final_start, final_end - final_start, - new_type); + + x = e820x->nr_map; + if (x == ARRAY_SIZE(e820x->map)) { + printk(KERN_ERR "Too many memory map entries!\n"); + break; + } + e820x->map[x].addr = final_start; + e820x->map[x].size = final_end - final_start; + e820x->map[x].type = new_type; + e820x->nr_map++; + real_updated_size += final_end - final_start; - ei->size -= final_end - final_start; if (ei->addr < final_start) continue; ei->addr = final_end; + ei->size -= final_end - final_start; } return real_updated_size; } @@ -1020,8 +1029,8 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) continue; return addr; } - return -1UL; + return -1ULL; } /* @@ -1034,13 +1043,22 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) u64 start; start = startt; - while (size < sizet) + while (size < sizet && (start + 1)) start = find_e820_area_size(start, &size, align); if (size < sizet) return 0; +#ifdef CONFIG_X86_32 + if (start >= MAXMEM) + return 0; + if (start + size > MAXMEM) + size = MAXMEM - start; +#endif + addr = round_down(start + size - sizet, align); + if (addr < start) + return 0; e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); printk(KERN_INFO "update e820 for early_reserve_e820\n"); -- cgit v1.2.3 From 8ad9790588ee2e69118b2b294ddab6f3f0379ad9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 12 Mar 2009 18:43:54 -0700 Subject: x86: more MTRR debug printouts Impact: improve MTRR debugging messages There's still inefficiencies suspected with the MTRR sanitizing code, so make sure we get all the info we need from a dmesg. - Remove unneeded mtrr_show (It will only printout one time by first cpu, so it is no big deal.) - Also print out directly from get_mtrr, because it doesn't update mtrr_state. Signed-off-by: Yinghai Lu LKML-Reference: <49B9BA5A.40108@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 95 ++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 44 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0c0a455fe95..96440352010 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -33,14 +33,6 @@ u64 mtrr_tom2; struct mtrr_state_type mtrr_state = {}; EXPORT_SYMBOL_GPL(mtrr_state); -static int __initdata mtrr_show; -static int __init mtrr_debug(char *opt) -{ - mtrr_show = 1; - return 0; -} -early_param("mtrr.show", mtrr_debug); - /* * Returns the effective MTRR type for the region * Error returns: @@ -193,13 +185,51 @@ static void print_fixed(unsigned base, unsigned step, const mtrr_type*types) unsigned i; for (i = 0; i < 8; ++i, ++types, base += step) - printk(KERN_INFO "MTRR %05X-%05X %s\n", + printk(KERN_INFO " %05X-%05X %s\n", base, base + step - 1, mtrr_attrib_to_str(*types)); } static void prepare_set(void); static void post_set(void); +static void __init print_mtrr_state(void) +{ + unsigned int i; + int high_width; + + printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type)); + if (mtrr_state.have_fixed) { + printk(KERN_INFO "MTRR fixed ranges %sabled:\n", + mtrr_state.enabled & 1 ? "en" : "dis"); + print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); + for (i = 0; i < 2; ++i) + print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); + for (i = 0; i < 8; ++i) + print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); + } + printk(KERN_INFO "MTRR variable ranges %sabled:\n", + mtrr_state.enabled & 2 ? "en" : "dis"); + high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4; + for (i = 0; i < num_var_ranges; ++i) { + if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) + printk(KERN_INFO " %u base %0*X%05X000 mask %0*X%05X000 %s\n", + i, + high_width, + mtrr_state.var_ranges[i].base_hi, + mtrr_state.var_ranges[i].base_lo >> 12, + high_width, + mtrr_state.var_ranges[i].mask_hi, + mtrr_state.var_ranges[i].mask_lo >> 12, + mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); + else + printk(KERN_INFO " %u disabled\n", i); + } + if (mtrr_tom2) { + printk(KERN_INFO "TOM2: %016llx aka %lldM\n", + mtrr_tom2, mtrr_tom2>>20); + } +} + /* Grab all of the MTRR state for this CPU into *state */ void __init get_mtrr_state(void) { @@ -231,41 +261,9 @@ void __init get_mtrr_state(void) mtrr_tom2 |= low; mtrr_tom2 &= 0xffffff800000ULL; } - if (mtrr_show) { - int high_width; - - printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type)); - if (mtrr_state.have_fixed) { - printk(KERN_INFO "MTRR fixed ranges %sabled:\n", - mtrr_state.enabled & 1 ? "en" : "dis"); - print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); - for (i = 0; i < 2; ++i) - print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); - for (i = 0; i < 8; ++i) - print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); - } - printk(KERN_INFO "MTRR variable ranges %sabled:\n", - mtrr_state.enabled & 2 ? "en" : "dis"); - high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4; - for (i = 0; i < num_var_ranges; ++i) { - if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) - printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n", - i, - high_width, - mtrr_state.var_ranges[i].base_hi, - mtrr_state.var_ranges[i].base_lo >> 12, - high_width, - mtrr_state.var_ranges[i].mask_hi, - mtrr_state.var_ranges[i].mask_lo >> 12, - mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); - else - printk(KERN_INFO "MTRR %u disabled\n", i); - } - if (mtrr_tom2) { - printk(KERN_INFO "TOM2: %016llx aka %lldM\n", - mtrr_tom2, mtrr_tom2>>20); - } - } + + print_mtrr_state(); + mtrr_state_set = 1; /* PAT setup for BP. We need to go through sync steps here */ @@ -377,7 +375,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, unsigned int mask_lo, mask_hi, base_lo, base_hi; unsigned int tmp, hi; + /* + * get_mtrr doesn't need to update mtrr_state, also it could be called + * from any cpu, so try to print it out directly. + */ rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); + if ((mask_lo & 0x800) == 0) { /* Invalid (i.e. free) range */ *base = 0; @@ -407,6 +410,10 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, *size = -mask_lo; *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; *type = base_lo & 0xff; + + printk(KERN_DEBUG " get_mtrr: cpu%d reg%02d base=%010lx size=%010lx %s\n", + smp_processor_id(), reg, *base, *size, + mtrr_attrib_to_str(*type & 0xff)); } /** -- cgit v1.2.3 From c1ab7e93c6ddf8a068719b97b7e26c3d8eba7c32 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 Mar 2009 20:05:46 -0700 Subject: x86: print out mtrr_range_state when user specify size Impact: print more debug info Keep it consistent with autodetect version. Signed-off-by: Yinghai Lu LKML-Reference: <49B87C0A.4010105@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 236a401b825..311a4734609 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -1424,6 +1424,8 @@ static int __init mtrr_cleanup(unsigned address_bits) if (!result[i].bad) { set_var_mtrr_all(address_bits); + printk(KERN_DEBUG "New variable MTRRs\n"); + print_out_mtrr_range_state(); return 1; } printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " -- cgit v1.2.3 From 0d890355bff25e1dc03a577a90ed80741489ca54 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 Mar 2009 20:07:39 -0700 Subject: x86: separate mtrr cleanup/mtrr_e820 trim to separate file Impact: cleanup mtrr main.c is too big, seperate mtrr cleanup and mtrr e820 trim code to another file. Signed-off-by: Yinghai Lu LKML-Reference: <49B87C7B.80809@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/Makefile | 2 +- arch/x86/kernel/cpu/mtrr/cleanup.c | 1089 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/mtrr/main.c | 1055 +--------------------------------- arch/x86/kernel/cpu/mtrr/mtrr.h | 3 + 4 files changed, 1094 insertions(+), 1055 deletions(-) create mode 100644 arch/x86/kernel/cpu/mtrr/cleanup.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile index 191fc053364..f4361b56f8e 100644 --- a/arch/x86/kernel/cpu/mtrr/Makefile +++ b/arch/x86/kernel/cpu/mtrr/Makefile @@ -1,3 +1,3 @@ -obj-y := main.o if.o generic.o state.o +obj-y := main.o if.o generic.o state.o cleanup.o obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c new file mode 100644 index 00000000000..58b58bbf7eb --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -0,0 +1,1089 @@ +/* MTRR (Memory Type Range Register) cleanup + + Copyright (C) 2009 Yinghai Lu + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with this library; if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "mtrr.h" + +/* should be related to MTRR_VAR_RANGES nums */ +#define RANGE_NUM 256 + +struct res_range { + unsigned long start; + unsigned long end; +}; + +static int __init +add_range(struct res_range *range, int nr_range, unsigned long start, + unsigned long end) +{ + /* out of slots */ + if (nr_range >= RANGE_NUM) + return nr_range; + + range[nr_range].start = start; + range[nr_range].end = end; + + nr_range++; + + return nr_range; +} + +static int __init +add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, + unsigned long end) +{ + int i; + + /* try to merge it with old one */ + for (i = 0; i < nr_range; i++) { + unsigned long final_start, final_end; + unsigned long common_start, common_end; + + if (!range[i].end) + continue; + + common_start = max(range[i].start, start); + common_end = min(range[i].end, end); + if (common_start > common_end + 1) + continue; + + final_start = min(range[i].start, start); + final_end = max(range[i].end, end); + + range[i].start = final_start; + range[i].end = final_end; + return nr_range; + } + + /* need to add that */ + return add_range(range, nr_range, start, end); +} + +static void __init +subtract_range(struct res_range *range, unsigned long start, unsigned long end) +{ + int i, j; + + for (j = 0; j < RANGE_NUM; j++) { + if (!range[j].end) + continue; + + if (start <= range[j].start && end >= range[j].end) { + range[j].start = 0; + range[j].end = 0; + continue; + } + + if (start <= range[j].start && end < range[j].end && + range[j].start < end + 1) { + range[j].start = end + 1; + continue; + } + + + if (start > range[j].start && end >= range[j].end && + range[j].end > start - 1) { + range[j].end = start - 1; + continue; + } + + if (start > range[j].start && end < range[j].end) { + /* find the new spare */ + for (i = 0; i < RANGE_NUM; i++) { + if (range[i].end == 0) + break; + } + if (i < RANGE_NUM) { + range[i].end = range[j].end; + range[i].start = end + 1; + } else { + printk(KERN_ERR "run of slot in ranges\n"); + } + range[j].end = start - 1; + continue; + } + } +} + +static int __init cmp_range(const void *x1, const void *x2) +{ + const struct res_range *r1 = x1; + const struct res_range *r2 = x2; + long start1, start2; + + start1 = r1->start; + start2 = r2->start; + + return start1 - start2; +} + +struct var_mtrr_range_state { + unsigned long base_pfn; + unsigned long size_pfn; + mtrr_type type; +}; + +static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; +static int __initdata debug_print; + +static int __init +x86_get_mtrr_mem_range(struct res_range *range, int nr_range, + unsigned long extra_remove_base, + unsigned long extra_remove_size) +{ + unsigned long i, base, size; + mtrr_type type; + + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type != MTRR_TYPE_WRBACK) + continue; + base = range_state[i].base_pfn; + size = range_state[i].size_pfn; + nr_range = add_range_with_merge(range, nr_range, base, + base + size - 1); + } + if (debug_print) { + printk(KERN_DEBUG "After WB checking\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + range[i].start, range[i].end + 1); + } + + /* take out UC ranges */ + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type != MTRR_TYPE_UNCACHABLE && + type != MTRR_TYPE_WRPROT) + continue; + size = range_state[i].size_pfn; + if (!size) + continue; + base = range_state[i].base_pfn; + subtract_range(range, base, base + size - 1); + } + if (extra_remove_size) + subtract_range(range, extra_remove_base, + extra_remove_base + extra_remove_size - 1); + + /* get new range num */ + nr_range = 0; + for (i = 0; i < RANGE_NUM; i++) { + if (!range[i].end) + continue; + nr_range++; + } + if (debug_print) { + printk(KERN_DEBUG "After UC checking\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + range[i].start, range[i].end + 1); + } + + /* sort the ranges */ + sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + if (debug_print) { + printk(KERN_DEBUG "After sorting\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + range[i].start, range[i].end + 1); + } + + /* clear those is not used */ + for (i = nr_range; i < RANGE_NUM; i++) + memset(&range[i], 0, sizeof(range[i])); + + return nr_range; +} + +static struct res_range __initdata range[RANGE_NUM]; +static int __initdata nr_range; + +#ifdef CONFIG_MTRR_SANITIZER + +static unsigned long __init sum_ranges(struct res_range *range, int nr_range) +{ + unsigned long sum; + int i; + + sum = 0; + for (i = 0; i < nr_range; i++) + sum += range[i].end + 1 - range[i].start; + + return sum; +} + +static int enable_mtrr_cleanup __initdata = + CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT; + +static int __init disable_mtrr_cleanup_setup(char *str) +{ + enable_mtrr_cleanup = 0; + return 0; +} +early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); + +static int __init enable_mtrr_cleanup_setup(char *str) +{ + enable_mtrr_cleanup = 1; + return 0; +} +early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); + +static int __init mtrr_cleanup_debug_setup(char *str) +{ + debug_print = 1; + return 0; +} +early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); + +struct var_mtrr_state { + unsigned long range_startk; + unsigned long range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + unsigned int reg; +}; + +static void __init +set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, + unsigned char type, unsigned int address_bits) +{ + u32 base_lo, base_hi, mask_lo, mask_hi; + u64 base, mask; + + if (!sizek) { + fill_mtrr_var_range(reg, 0, 0, 0, 0); + return; + } + + mask = (1ULL << address_bits) - 1; + mask &= ~((((u64)sizek) << 10) - 1); + + base = ((u64)basek) << 10; + + base |= type; + mask |= 0x800; + + base_lo = base & ((1ULL<<32) - 1); + base_hi = base >> 32; + + mask_lo = mask & ((1ULL<<32) - 1); + mask_hi = mask >> 32; + + fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi); +} + +static void __init +save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, + unsigned char type) +{ + range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); + range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); + range_state[reg].type = type; +} + +static void __init +set_var_mtrr_all(unsigned int address_bits) +{ + unsigned long basek, sizek; + unsigned char type; + unsigned int reg; + + for (reg = 0; reg < num_var_ranges; reg++) { + basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10); + sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10); + type = range_state[reg].type; + + set_var_mtrr(reg, basek, sizek, type, address_bits); + } +} + +static unsigned long to_size_factor(unsigned long sizek, char *factorp) +{ + char factor; + unsigned long base = sizek; + + if (base & ((1<<10) - 1)) { + /* not MB alignment */ + factor = 'K'; + } else if (base & ((1<<20) - 1)) { + factor = 'M'; + base >>= 10; + } else { + factor = 'G'; + base >>= 20; + } + + *factorp = factor; + + return base; +} + +static unsigned int __init +range_to_mtrr(unsigned int reg, unsigned long range_startk, + unsigned long range_sizek, unsigned char type) +{ + if (!range_sizek || (reg >= num_var_ranges)) + return reg; + + while (range_sizek) { + unsigned long max_align, align; + unsigned long sizek; + + /* Compute the maximum size I can make a range */ + if (range_startk) + max_align = ffs(range_startk) - 1; + else + max_align = 32; + align = fls(range_sizek) - 1; + if (align > max_align) + align = max_align; + + sizek = 1 << align; + if (debug_print) { + char start_factor = 'K', size_factor = 'K'; + unsigned long start_base, size_base; + + start_base = to_size_factor(range_startk, + &start_factor), + size_base = to_size_factor(sizek, &size_factor), + + printk(KERN_DEBUG "Setting variable MTRR %d, " + "base: %ld%cB, range: %ld%cB, type %s\n", + reg, start_base, start_factor, + size_base, size_factor, + (type == MTRR_TYPE_UNCACHABLE) ? "UC" : + ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other") + ); + } + save_var_mtrr(reg++, range_startk, sizek, type); + range_startk += sizek; + range_sizek -= sizek; + if (reg >= num_var_ranges) + break; + } + return reg; +} + +static unsigned __init +range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, + unsigned long sizek) +{ + unsigned long hole_basek, hole_sizek; + unsigned long second_basek, second_sizek; + unsigned long range0_basek, range0_sizek; + unsigned long range_basek, range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + + hole_basek = 0; + hole_sizek = 0; + second_basek = 0; + second_sizek = 0; + chunk_sizek = state->chunk_sizek; + gran_sizek = state->gran_sizek; + + /* align with gran size, prevent small block used up MTRRs */ + range_basek = ALIGN(state->range_startk, gran_sizek); + if ((range_basek > basek) && basek) + return second_sizek; + state->range_sizek -= (range_basek - state->range_startk); + range_sizek = ALIGN(state->range_sizek, gran_sizek); + + while (range_sizek > state->range_sizek) { + range_sizek -= gran_sizek; + if (!range_sizek) + return 0; + } + state->range_sizek = range_sizek; + + /* try to append some small hole */ + range0_basek = state->range_startk; + range0_sizek = ALIGN(state->range_sizek, chunk_sizek); + + /* no increase */ + if (range0_sizek == state->range_sizek) { + if (debug_print) + printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + state->range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range0_basek, + state->range_sizek, MTRR_TYPE_WRBACK); + return 0; + } + + /* only cut back, when it is not the last */ + if (sizek) { + while (range0_basek + range0_sizek > (basek + sizek)) { + if (range0_sizek >= chunk_sizek) + range0_sizek -= chunk_sizek; + else + range0_sizek = 0; + + if (!range0_sizek) + break; + } + } + +second_try: + range_basek = range0_basek + range0_sizek; + + /* one hole in the middle */ + if (range_basek > basek && range_basek <= (basek + sizek)) + second_sizek = range_basek - basek; + + if (range0_sizek > state->range_sizek) { + + /* one hole in middle or at end */ + hole_sizek = range0_sizek - state->range_sizek - second_sizek; + + /* hole size should be less than half of range0 size */ + if (hole_sizek >= (range0_sizek >> 1) && + range0_sizek >= chunk_sizek) { + range0_sizek -= chunk_sizek; + second_sizek = 0; + hole_sizek = 0; + + goto second_try; + } + } + + if (range0_sizek) { + if (debug_print) + printk(KERN_DEBUG "range0: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + range0_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range0_basek, + range0_sizek, MTRR_TYPE_WRBACK); + } + + if (range0_sizek < state->range_sizek) { + /* need to handle left over */ + range_sizek = state->range_sizek - range0_sizek; + + if (debug_print) + printk(KERN_DEBUG "range: %016lx - %016lx\n", + range_basek<<10, + (range_basek + range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range_basek, + range_sizek, MTRR_TYPE_WRBACK); + } + + if (hole_sizek) { + hole_basek = range_basek - hole_sizek - second_sizek; + if (debug_print) + printk(KERN_DEBUG "hole: %016lx - %016lx\n", + hole_basek<<10, + (hole_basek + hole_sizek)<<10); + state->reg = range_to_mtrr(state->reg, hole_basek, + hole_sizek, MTRR_TYPE_UNCACHABLE); + } + + return second_sizek; +} + +static void __init +set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, + unsigned long size_pfn) +{ + unsigned long basek, sizek; + unsigned long second_sizek = 0; + + if (state->reg >= num_var_ranges) + return; + + basek = base_pfn << (PAGE_SHIFT - 10); + sizek = size_pfn << (PAGE_SHIFT - 10); + + /* See if I can merge with the last range */ + if ((basek <= 1024) || + (state->range_startk + state->range_sizek == basek)) { + unsigned long endk = basek + sizek; + state->range_sizek = endk - state->range_startk; + return; + } + /* Write the range mtrrs */ + if (state->range_sizek != 0) + second_sizek = range_to_mtrr_with_hole(state, basek, sizek); + + /* Allocate an msr */ + state->range_startk = basek + second_sizek; + state->range_sizek = sizek - second_sizek; +} + +/* mininum size of mtrr block that can take hole */ +static u64 mtrr_chunk_size __initdata = (256ULL<<20); + +static int __init parse_mtrr_chunk_size_opt(char *p) +{ + if (!p) + return -EINVAL; + mtrr_chunk_size = memparse(p, &p); + return 0; +} +early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); + +/* granity of mtrr of block */ +static u64 mtrr_gran_size __initdata; + +static int __init parse_mtrr_gran_size_opt(char *p) +{ + if (!p) + return -EINVAL; + mtrr_gran_size = memparse(p, &p); + return 0; +} +early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); + +static int nr_mtrr_spare_reg __initdata = + CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; + +static int __init parse_mtrr_spare_reg(char *arg) +{ + if (arg) + nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); + return 0; +} + +early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); + +static int __init +x86_setup_var_mtrrs(struct res_range *range, int nr_range, + u64 chunk_size, u64 gran_size) +{ + struct var_mtrr_state var_state; + int i; + int num_reg; + + var_state.range_startk = 0; + var_state.range_sizek = 0; + var_state.reg = 0; + var_state.chunk_sizek = chunk_size >> 10; + var_state.gran_sizek = gran_size >> 10; + + memset(range_state, 0, sizeof(range_state)); + + /* Write the range etc */ + for (i = 0; i < nr_range; i++) + set_var_mtrr_range(&var_state, range[i].start, + range[i].end - range[i].start + 1); + + /* Write the last range */ + if (var_state.range_sizek != 0) + range_to_mtrr_with_hole(&var_state, 0, 0); + + num_reg = var_state.reg; + /* Clear out the extra MTRR's */ + while (var_state.reg < num_var_ranges) { + save_var_mtrr(var_state.reg, 0, 0, 0); + var_state.reg++; + } + + return num_reg; +} + +struct mtrr_cleanup_result { + unsigned long gran_sizek; + unsigned long chunk_sizek; + unsigned long lose_cover_sizek; + unsigned int num_reg; + int bad; +}; + +/* + * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G + * chunk size: gran_size, ..., 2G + * so we need (1+16)*8 + */ +#define NUM_RESULT 136 +#define PSHIFT (PAGE_SHIFT - 10) + +static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; +static unsigned long __initdata min_loss_pfn[RANGE_NUM]; + +static void __init print_out_mtrr_range_state(void) +{ + int i; + char start_factor = 'K', size_factor = 'K'; + unsigned long start_base, size_base; + mtrr_type type; + + for (i = 0; i < num_var_ranges; i++) { + + size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); + if (!size_base) + continue; + + size_base = to_size_factor(size_base, &size_factor), + start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); + start_base = to_size_factor(start_base, &start_factor), + type = range_state[i].type; + + printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", + i, start_base, start_factor, + size_base, size_factor, + (type == MTRR_TYPE_UNCACHABLE) ? "UC" : + ((type == MTRR_TYPE_WRPROT) ? "WP" : + ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")) + ); + } +} + +static int __init mtrr_need_cleanup(void) +{ + int i; + mtrr_type type; + unsigned long size; + /* extra one for all 0 */ + int num[MTRR_NUM_TYPES + 1]; + + /* check entries number */ + memset(num, 0, sizeof(num)); + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + size = range_state[i].size_pfn; + if (type >= MTRR_NUM_TYPES) + continue; + if (!size) + type = MTRR_NUM_TYPES; + if (type == MTRR_TYPE_WRPROT) + type = MTRR_TYPE_UNCACHABLE; + num[type]++; + } + + /* check if we got UC entries */ + if (!num[MTRR_TYPE_UNCACHABLE]) + return 0; + + /* check if we only had WB and UC */ + if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != + num_var_ranges - num[MTRR_NUM_TYPES]) + return 0; + + return 1; +} + +static unsigned long __initdata range_sums; +static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size, + unsigned long extra_remove_base, + unsigned long extra_remove_size, + int i) +{ + int num_reg; + static struct res_range range_new[RANGE_NUM]; + static int nr_range_new; + unsigned long range_sums_new; + + /* convert ranges to var ranges state */ + num_reg = x86_setup_var_mtrrs(range, nr_range, + chunk_size, gran_size); + + /* we got new setting in range_state, check it */ + memset(range_new, 0, sizeof(range_new)); + nr_range_new = x86_get_mtrr_mem_range(range_new, 0, + extra_remove_base, extra_remove_size); + range_sums_new = sum_ranges(range_new, nr_range_new); + + result[i].chunk_sizek = chunk_size >> 10; + result[i].gran_sizek = gran_size >> 10; + result[i].num_reg = num_reg; + if (range_sums < range_sums_new) { + result[i].lose_cover_sizek = + (range_sums_new - range_sums) << PSHIFT; + result[i].bad = 1; + } else + result[i].lose_cover_sizek = + (range_sums - range_sums_new) << PSHIFT; + + /* double check it */ + if (!result[i].bad && !result[i].lose_cover_sizek) { + if (nr_range_new != nr_range || + memcmp(range, range_new, sizeof(range))) + result[i].bad = 1; + } + + if (!result[i].bad && (range_sums - range_sums_new < + min_loss_pfn[num_reg])) { + min_loss_pfn[num_reg] = + range_sums - range_sums_new; + } +} + +static void __init mtrr_print_out_one_result(int i) +{ + char gran_factor, chunk_factor, lose_factor; + unsigned long gran_base, chunk_base, lose_base; + + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad ? "*BAD*" : " ", + gran_base, gran_factor, chunk_base, chunk_factor); + printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", + result[i].num_reg, result[i].bad ? "-" : "", + lose_base, lose_factor); +} + +static int __init mtrr_search_optimal_index(void) +{ + int i; + int num_reg_good; + int index_good; + + if (nr_mtrr_spare_reg >= num_var_ranges) + nr_mtrr_spare_reg = num_var_ranges - 1; + num_reg_good = -1; + for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { + if (!min_loss_pfn[i]) + num_reg_good = i; + } + + index_good = -1; + if (num_reg_good != -1) { + for (i = 0; i < NUM_RESULT; i++) { + if (!result[i].bad && + result[i].num_reg == num_reg_good && + !result[i].lose_cover_sizek) { + index_good = i; + break; + } + } + } + + return index_good; +} + + +int __init mtrr_cleanup(unsigned address_bits) +{ + unsigned long extra_remove_base, extra_remove_size; + unsigned long base, size, def, dummy; + mtrr_type type; + u64 chunk_size, gran_size; + int index_good; + int i; + + if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) + return 0; + rdmsr(MTRRdefType_MSR, def, dummy); + def &= 0xff; + if (def != MTRR_TYPE_UNCACHABLE) + return 0; + + /* get it and store it aside */ + memset(range_state, 0, sizeof(range_state)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + range_state[i].base_pfn = base; + range_state[i].size_pfn = size; + range_state[i].type = type; + } + + /* check if we need handle it and can handle it */ + if (!mtrr_need_cleanup()) + return 0; + + /* print original var MTRRs at first, for debugging: */ + printk(KERN_DEBUG "original variable MTRRs\n"); + print_out_mtrr_range_state(); + + memset(range, 0, sizeof(range)); + extra_remove_size = 0; + extra_remove_base = 1 << (32 - PAGE_SHIFT); + if (mtrr_tom2) + extra_remove_size = + (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; + nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, + extra_remove_size); + /* + * [0, 1M) should always be coverred by var mtrr with WB + * and fixed mtrrs should take effective before var mtrr for it + */ + nr_range = add_range_with_merge(range, nr_range, 0, + (1ULL<<(20 - PAGE_SHIFT)) - 1); + /* sort the ranges */ + sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + + range_sums = sum_ranges(range, nr_range); + printk(KERN_INFO "total RAM coverred: %ldM\n", + range_sums >> (20 - PAGE_SHIFT)); + + if (mtrr_chunk_size && mtrr_gran_size) { + i = 0; + mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, + extra_remove_base, extra_remove_size, i); + + mtrr_print_out_one_result(i); + + if (!result[i].bad) { + set_var_mtrr_all(address_bits); + printk(KERN_DEBUG "New variable MTRRs\n"); + print_out_mtrr_range_state(); + return 1; + } + printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " + "will find optimal one\n"); + } + + i = 0; + memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); + memset(result, 0, sizeof(result)); + for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { + + for (chunk_size = gran_size; chunk_size < (1ULL<<32); + chunk_size <<= 1) { + + if (i >= NUM_RESULT) + continue; + + mtrr_calc_range_state(chunk_size, gran_size, + extra_remove_base, extra_remove_size, i); + if (debug_print) { + mtrr_print_out_one_result(i); + printk(KERN_INFO "\n"); + } + + i++; + } + } + + /* try to find the optimal index */ + index_good = mtrr_search_optimal_index(); + + if (index_good != -1) { + printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); + i = index_good; + mtrr_print_out_one_result(i); + + /* convert ranges to var ranges state */ + chunk_size = result[i].chunk_sizek; + chunk_size <<= 10; + gran_size = result[i].gran_sizek; + gran_size <<= 10; + x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); + set_var_mtrr_all(address_bits); + printk(KERN_DEBUG "New variable MTRRs\n"); + print_out_mtrr_range_state(); + return 1; + } else { + /* print out all */ + for (i = 0; i < NUM_RESULT; i++) + mtrr_print_out_one_result(i); + } + + printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); + printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n"); + + return 0; +} +#else +int __init mtrr_cleanup(unsigned address_bits) +{ + return 0; +} +#endif + +static int disable_mtrr_trim; + +static int __init disable_mtrr_trim_setup(char *str) +{ + disable_mtrr_trim = 1; + return 0; +} +early_param("disable_mtrr_trim", disable_mtrr_trim_setup); + +/* + * Newer AMD K8s and later CPUs have a special magic MSR way to force WB + * for memory >4GB. Check for that here. + * Note this won't check if the MTRRs < 4GB where the magic bit doesn't + * apply to are wrong, but so far we don't know of any such case in the wild. + */ +#define Tom2Enabled (1U << 21) +#define Tom2ForceMemTypeWB (1U << 22) + +int __init amd_special_default_mtrr(void) +{ + u32 l, h; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return 0; + if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) + return 0; + /* In case some hypervisor doesn't pass SYSCFG through */ + if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) + return 0; + /* + * Memory between 4GB and top of mem is forced WB by this magic bit. + * Reserved before K8RevF, but should be zero there. + */ + if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) == + (Tom2Enabled | Tom2ForceMemTypeWB)) + return 1; + return 0; +} + +static u64 __init real_trim_memory(unsigned long start_pfn, + unsigned long limit_pfn) +{ + u64 trim_start, trim_size; + trim_start = start_pfn; + trim_start <<= PAGE_SHIFT; + trim_size = limit_pfn; + trim_size <<= PAGE_SHIFT; + trim_size -= trim_start; + + return e820_update_range(trim_start, trim_size, E820_RAM, + E820_RESERVED); +} +/** + * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs + * @end_pfn: ending page frame number + * + * Some buggy BIOSes don't setup the MTRRs properly for systems with certain + * memory configurations. This routine checks that the highest MTRR matches + * the end of memory, to make sure the MTRRs having a write back type cover + * all of the memory the kernel is intending to use. If not, it'll trim any + * memory off the end by adjusting end_pfn, removing it from the kernel's + * allocation pools, warning the user with an obnoxious message. + */ +int __init mtrr_trim_uncached_memory(unsigned long end_pfn) +{ + unsigned long i, base, size, highest_pfn = 0, def, dummy; + mtrr_type type; + u64 total_trim_size; + + /* extra one for all 0 */ + int num[MTRR_NUM_TYPES + 1]; + /* + * Make sure we only trim uncachable memory on machines that + * support the Intel MTRR architecture: + */ + if (!is_cpu(INTEL) || disable_mtrr_trim) + return 0; + rdmsr(MTRRdefType_MSR, def, dummy); + def &= 0xff; + if (def != MTRR_TYPE_UNCACHABLE) + return 0; + + /* get it and store it aside */ + memset(range_state, 0, sizeof(range_state)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + range_state[i].base_pfn = base; + range_state[i].size_pfn = size; + range_state[i].type = type; + } + + /* Find highest cached pfn */ + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type != MTRR_TYPE_WRBACK) + continue; + base = range_state[i].base_pfn; + size = range_state[i].size_pfn; + if (highest_pfn < base + size) + highest_pfn = base + size; + } + + /* kvm/qemu doesn't have mtrr set right, don't trim them all */ + if (!highest_pfn) { + printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); + return 0; + } + + /* check entries number */ + memset(num, 0, sizeof(num)); + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type >= MTRR_NUM_TYPES) + continue; + size = range_state[i].size_pfn; + if (!size) + type = MTRR_NUM_TYPES; + num[type]++; + } + + /* no entry for WB? */ + if (!num[MTRR_TYPE_WRBACK]) + return 0; + + /* check if we only had WB and UC */ + if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != + num_var_ranges - num[MTRR_NUM_TYPES]) + return 0; + + memset(range, 0, sizeof(range)); + nr_range = 0; + if (mtrr_tom2) { + range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); + range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; + if (highest_pfn < range[nr_range].end + 1) + highest_pfn = range[nr_range].end + 1; + nr_range++; + } + nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); + + total_trim_size = 0; + /* check the head */ + if (range[0].start) + total_trim_size += real_trim_memory(0, range[0].start); + /* check the holes */ + for (i = 0; i < nr_range - 1; i++) { + if (range[i].end + 1 < range[i+1].start) + total_trim_size += real_trim_memory(range[i].end + 1, + range[i+1].start); + } + /* check the top */ + i = nr_range - 1; + if (range[i].end + 1 < end_pfn) + total_trim_size += real_trim_memory(range[i].end + 1, + end_pfn); + + if (total_trim_size) { + printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" + " all of memory, losing %lluMB of RAM.\n", + total_trim_size >> 20); + + if (!changed_by_mtrr_cleanup) + WARN_ON(1); + + printk(KERN_INFO "update e820 for mtrr\n"); + update_e820(); + + return 1; + } + + return 0; +} + diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 311a4734609..5c2e266f41d 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -610,1060 +610,7 @@ static struct sysdev_driver mtrr_sysdev_driver = { .resume = mtrr_restore, }; -/* should be related to MTRR_VAR_RANGES nums */ -#define RANGE_NUM 256 - -struct res_range { - unsigned long start; - unsigned long end; -}; - -static int __init -add_range(struct res_range *range, int nr_range, unsigned long start, - unsigned long end) -{ - /* out of slots */ - if (nr_range >= RANGE_NUM) - return nr_range; - - range[nr_range].start = start; - range[nr_range].end = end; - - nr_range++; - - return nr_range; -} - -static int __init -add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, - unsigned long end) -{ - int i; - - /* try to merge it with old one */ - for (i = 0; i < nr_range; i++) { - unsigned long final_start, final_end; - unsigned long common_start, common_end; - - if (!range[i].end) - continue; - - common_start = max(range[i].start, start); - common_end = min(range[i].end, end); - if (common_start > common_end + 1) - continue; - - final_start = min(range[i].start, start); - final_end = max(range[i].end, end); - - range[i].start = final_start; - range[i].end = final_end; - return nr_range; - } - - /* need to add that */ - return add_range(range, nr_range, start, end); -} - -static void __init -subtract_range(struct res_range *range, unsigned long start, unsigned long end) -{ - int i, j; - - for (j = 0; j < RANGE_NUM; j++) { - if (!range[j].end) - continue; - - if (start <= range[j].start && end >= range[j].end) { - range[j].start = 0; - range[j].end = 0; - continue; - } - - if (start <= range[j].start && end < range[j].end && - range[j].start < end + 1) { - range[j].start = end + 1; - continue; - } - - - if (start > range[j].start && end >= range[j].end && - range[j].end > start - 1) { - range[j].end = start - 1; - continue; - } - - if (start > range[j].start && end < range[j].end) { - /* find the new spare */ - for (i = 0; i < RANGE_NUM; i++) { - if (range[i].end == 0) - break; - } - if (i < RANGE_NUM) { - range[i].end = range[j].end; - range[i].start = end + 1; - } else { - printk(KERN_ERR "run of slot in ranges\n"); - } - range[j].end = start - 1; - continue; - } - } -} - -static int __init cmp_range(const void *x1, const void *x2) -{ - const struct res_range *r1 = x1; - const struct res_range *r2 = x2; - long start1, start2; - - start1 = r1->start; - start2 = r2->start; - - return start1 - start2; -} - -struct var_mtrr_range_state { - unsigned long base_pfn; - unsigned long size_pfn; - mtrr_type type; -}; - -static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; -static int __initdata debug_print; - -static int __init -x86_get_mtrr_mem_range(struct res_range *range, int nr_range, - unsigned long extra_remove_base, - unsigned long extra_remove_size) -{ - unsigned long i, base, size; - mtrr_type type; - - for (i = 0; i < num_var_ranges; i++) { - type = range_state[i].type; - if (type != MTRR_TYPE_WRBACK) - continue; - base = range_state[i].base_pfn; - size = range_state[i].size_pfn; - nr_range = add_range_with_merge(range, nr_range, base, - base + size - 1); - } - if (debug_print) { - printk(KERN_DEBUG "After WB checking\n"); - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", - range[i].start, range[i].end + 1); - } - - /* take out UC ranges */ - for (i = 0; i < num_var_ranges; i++) { - type = range_state[i].type; - if (type != MTRR_TYPE_UNCACHABLE && - type != MTRR_TYPE_WRPROT) - continue; - size = range_state[i].size_pfn; - if (!size) - continue; - base = range_state[i].base_pfn; - subtract_range(range, base, base + size - 1); - } - if (extra_remove_size) - subtract_range(range, extra_remove_base, - extra_remove_base + extra_remove_size - 1); - - /* get new range num */ - nr_range = 0; - for (i = 0; i < RANGE_NUM; i++) { - if (!range[i].end) - continue; - nr_range++; - } - if (debug_print) { - printk(KERN_DEBUG "After UC checking\n"); - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", - range[i].start, range[i].end + 1); - } - - /* sort the ranges */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); - if (debug_print) { - printk(KERN_DEBUG "After sorting\n"); - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", - range[i].start, range[i].end + 1); - } - - /* clear those is not used */ - for (i = nr_range; i < RANGE_NUM; i++) - memset(&range[i], 0, sizeof(range[i])); - - return nr_range; -} - -static struct res_range __initdata range[RANGE_NUM]; -static int __initdata nr_range; - -#ifdef CONFIG_MTRR_SANITIZER - -static unsigned long __init sum_ranges(struct res_range *range, int nr_range) -{ - unsigned long sum; - int i; - - sum = 0; - for (i = 0; i < nr_range; i++) - sum += range[i].end + 1 - range[i].start; - - return sum; -} - -static int enable_mtrr_cleanup __initdata = - CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT; - -static int __init disable_mtrr_cleanup_setup(char *str) -{ - enable_mtrr_cleanup = 0; - return 0; -} -early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); - -static int __init enable_mtrr_cleanup_setup(char *str) -{ - enable_mtrr_cleanup = 1; - return 0; -} -early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); - -static int __init mtrr_cleanup_debug_setup(char *str) -{ - debug_print = 1; - return 0; -} -early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); - -struct var_mtrr_state { - unsigned long range_startk; - unsigned long range_sizek; - unsigned long chunk_sizek; - unsigned long gran_sizek; - unsigned int reg; -}; - -static void __init -set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type, unsigned int address_bits) -{ - u32 base_lo, base_hi, mask_lo, mask_hi; - u64 base, mask; - - if (!sizek) { - fill_mtrr_var_range(reg, 0, 0, 0, 0); - return; - } - - mask = (1ULL << address_bits) - 1; - mask &= ~((((u64)sizek) << 10) - 1); - - base = ((u64)basek) << 10; - - base |= type; - mask |= 0x800; - - base_lo = base & ((1ULL<<32) - 1); - base_hi = base >> 32; - - mask_lo = mask & ((1ULL<<32) - 1); - mask_hi = mask >> 32; - - fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi); -} - -static void __init -save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type) -{ - range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); - range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); - range_state[reg].type = type; -} - -static void __init -set_var_mtrr_all(unsigned int address_bits) -{ - unsigned long basek, sizek; - unsigned char type; - unsigned int reg; - - for (reg = 0; reg < num_var_ranges; reg++) { - basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10); - sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10); - type = range_state[reg].type; - - set_var_mtrr(reg, basek, sizek, type, address_bits); - } -} - -static unsigned long to_size_factor(unsigned long sizek, char *factorp) -{ - char factor; - unsigned long base = sizek; - - if (base & ((1<<10) - 1)) { - /* not MB alignment */ - factor = 'K'; - } else if (base & ((1<<20) - 1)){ - factor = 'M'; - base >>= 10; - } else { - factor = 'G'; - base >>= 20; - } - - *factorp = factor; - - return base; -} - -static unsigned int __init -range_to_mtrr(unsigned int reg, unsigned long range_startk, - unsigned long range_sizek, unsigned char type) -{ - if (!range_sizek || (reg >= num_var_ranges)) - return reg; - - while (range_sizek) { - unsigned long max_align, align; - unsigned long sizek; - - /* Compute the maximum size I can make a range */ - if (range_startk) - max_align = ffs(range_startk) - 1; - else - max_align = 32; - align = fls(range_sizek) - 1; - if (align > max_align) - align = max_align; - - sizek = 1 << align; - if (debug_print) { - char start_factor = 'K', size_factor = 'K'; - unsigned long start_base, size_base; - - start_base = to_size_factor(range_startk, &start_factor), - size_base = to_size_factor(sizek, &size_factor), - - printk(KERN_DEBUG "Setting variable MTRR %d, " - "base: %ld%cB, range: %ld%cB, type %s\n", - reg, start_base, start_factor, - size_base, size_factor, - (type == MTRR_TYPE_UNCACHABLE)?"UC": - ((type == MTRR_TYPE_WRBACK)?"WB":"Other") - ); - } - save_var_mtrr(reg++, range_startk, sizek, type); - range_startk += sizek; - range_sizek -= sizek; - if (reg >= num_var_ranges) - break; - } - return reg; -} - -static unsigned __init -range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, - unsigned long sizek) -{ - unsigned long hole_basek, hole_sizek; - unsigned long second_basek, second_sizek; - unsigned long range0_basek, range0_sizek; - unsigned long range_basek, range_sizek; - unsigned long chunk_sizek; - unsigned long gran_sizek; - - hole_basek = 0; - hole_sizek = 0; - second_basek = 0; - second_sizek = 0; - chunk_sizek = state->chunk_sizek; - gran_sizek = state->gran_sizek; - - /* align with gran size, prevent small block used up MTRRs */ - range_basek = ALIGN(state->range_startk, gran_sizek); - if ((range_basek > basek) && basek) - return second_sizek; - state->range_sizek -= (range_basek - state->range_startk); - range_sizek = ALIGN(state->range_sizek, gran_sizek); - - while (range_sizek > state->range_sizek) { - range_sizek -= gran_sizek; - if (!range_sizek) - return 0; - } - state->range_sizek = range_sizek; - - /* try to append some small hole */ - range0_basek = state->range_startk; - range0_sizek = ALIGN(state->range_sizek, chunk_sizek); - - /* no increase */ - if (range0_sizek == state->range_sizek) { - if (debug_print) - printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", - range0_basek<<10, - (range0_basek + state->range_sizek)<<10); - state->reg = range_to_mtrr(state->reg, range0_basek, - state->range_sizek, MTRR_TYPE_WRBACK); - return 0; - } - - /* only cut back, when it is not the last */ - if (sizek) { - while (range0_basek + range0_sizek > (basek + sizek)) { - if (range0_sizek >= chunk_sizek) - range0_sizek -= chunk_sizek; - else - range0_sizek = 0; - - if (!range0_sizek) - break; - } - } - -second_try: - range_basek = range0_basek + range0_sizek; - - /* one hole in the middle */ - if (range_basek > basek && range_basek <= (basek + sizek)) - second_sizek = range_basek - basek; - - if (range0_sizek > state->range_sizek) { - - /* one hole in middle or at end */ - hole_sizek = range0_sizek - state->range_sizek - second_sizek; - - /* hole size should be less than half of range0 size */ - if (hole_sizek >= (range0_sizek >> 1) && - range0_sizek >= chunk_sizek) { - range0_sizek -= chunk_sizek; - second_sizek = 0; - hole_sizek = 0; - - goto second_try; - } - } - - if (range0_sizek) { - if (debug_print) - printk(KERN_DEBUG "range0: %016lx - %016lx\n", - range0_basek<<10, - (range0_basek + range0_sizek)<<10); - state->reg = range_to_mtrr(state->reg, range0_basek, - range0_sizek, MTRR_TYPE_WRBACK); - } - - if (range0_sizek < state->range_sizek) { - /* need to handle left over */ - range_sizek = state->range_sizek - range0_sizek; - - if (debug_print) - printk(KERN_DEBUG "range: %016lx - %016lx\n", - range_basek<<10, - (range_basek + range_sizek)<<10); - state->reg = range_to_mtrr(state->reg, range_basek, - range_sizek, MTRR_TYPE_WRBACK); - } - - if (hole_sizek) { - hole_basek = range_basek - hole_sizek - second_sizek; - if (debug_print) - printk(KERN_DEBUG "hole: %016lx - %016lx\n", - hole_basek<<10, - (hole_basek + hole_sizek)<<10); - state->reg = range_to_mtrr(state->reg, hole_basek, - hole_sizek, MTRR_TYPE_UNCACHABLE); - } - - return second_sizek; -} - -static void __init -set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, - unsigned long size_pfn) -{ - unsigned long basek, sizek; - unsigned long second_sizek = 0; - - if (state->reg >= num_var_ranges) - return; - - basek = base_pfn << (PAGE_SHIFT - 10); - sizek = size_pfn << (PAGE_SHIFT - 10); - - /* See if I can merge with the last range */ - if ((basek <= 1024) || - (state->range_startk + state->range_sizek == basek)) { - unsigned long endk = basek + sizek; - state->range_sizek = endk - state->range_startk; - return; - } - /* Write the range mtrrs */ - if (state->range_sizek != 0) - second_sizek = range_to_mtrr_with_hole(state, basek, sizek); - - /* Allocate an msr */ - state->range_startk = basek + second_sizek; - state->range_sizek = sizek - second_sizek; -} - -/* mininum size of mtrr block that can take hole */ -static u64 mtrr_chunk_size __initdata = (256ULL<<20); - -static int __init parse_mtrr_chunk_size_opt(char *p) -{ - if (!p) - return -EINVAL; - mtrr_chunk_size = memparse(p, &p); - return 0; -} -early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); - -/* granity of mtrr of block */ -static u64 mtrr_gran_size __initdata; - -static int __init parse_mtrr_gran_size_opt(char *p) -{ - if (!p) - return -EINVAL; - mtrr_gran_size = memparse(p, &p); - return 0; -} -early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); - -static int nr_mtrr_spare_reg __initdata = - CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; - -static int __init parse_mtrr_spare_reg(char *arg) -{ - if (arg) - nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); - return 0; -} - -early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); - -static int __init -x86_setup_var_mtrrs(struct res_range *range, int nr_range, - u64 chunk_size, u64 gran_size) -{ - struct var_mtrr_state var_state; - int i; - int num_reg; - - var_state.range_startk = 0; - var_state.range_sizek = 0; - var_state.reg = 0; - var_state.chunk_sizek = chunk_size >> 10; - var_state.gran_sizek = gran_size >> 10; - - memset(range_state, 0, sizeof(range_state)); - - /* Write the range etc */ - for (i = 0; i < nr_range; i++) - set_var_mtrr_range(&var_state, range[i].start, - range[i].end - range[i].start + 1); - - /* Write the last range */ - if (var_state.range_sizek != 0) - range_to_mtrr_with_hole(&var_state, 0, 0); - - num_reg = var_state.reg; - /* Clear out the extra MTRR's */ - while (var_state.reg < num_var_ranges) { - save_var_mtrr(var_state.reg, 0, 0, 0); - var_state.reg++; - } - - return num_reg; -} - -struct mtrr_cleanup_result { - unsigned long gran_sizek; - unsigned long chunk_sizek; - unsigned long lose_cover_sizek; - unsigned int num_reg; - int bad; -}; - -/* - * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G - * chunk size: gran_size, ..., 2G - * so we need (1+16)*8 - */ -#define NUM_RESULT 136 -#define PSHIFT (PAGE_SHIFT - 10) - -static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; -static unsigned long __initdata min_loss_pfn[RANGE_NUM]; - -static void __init print_out_mtrr_range_state(void) -{ - int i; - char start_factor = 'K', size_factor = 'K'; - unsigned long start_base, size_base; - mtrr_type type; - - for (i = 0; i < num_var_ranges; i++) { - - size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); - if (!size_base) - continue; - - size_base = to_size_factor(size_base, &size_factor), - start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); - start_base = to_size_factor(start_base, &start_factor), - type = range_state[i].type; - - printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", - i, start_base, start_factor, - size_base, size_factor, - (type == MTRR_TYPE_UNCACHABLE) ? "UC" : - ((type == MTRR_TYPE_WRPROT) ? "WP" : - ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")) - ); - } -} - -static int __init mtrr_need_cleanup(void) -{ - int i; - mtrr_type type; - unsigned long size; - /* extra one for all 0 */ - int num[MTRR_NUM_TYPES + 1]; - - /* check entries number */ - memset(num, 0, sizeof(num)); - for (i = 0; i < num_var_ranges; i++) { - type = range_state[i].type; - size = range_state[i].size_pfn; - if (type >= MTRR_NUM_TYPES) - continue; - if (!size) - type = MTRR_NUM_TYPES; - if (type == MTRR_TYPE_WRPROT) - type = MTRR_TYPE_UNCACHABLE; - num[type]++; - } - - /* check if we got UC entries */ - if (!num[MTRR_TYPE_UNCACHABLE]) - return 0; - - /* check if we only had WB and UC */ - if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != - num_var_ranges - num[MTRR_NUM_TYPES]) - return 0; - - return 1; -} - -static unsigned long __initdata range_sums; -static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size, - unsigned long extra_remove_base, - unsigned long extra_remove_size, - int i) -{ - int num_reg; - static struct res_range range_new[RANGE_NUM]; - static int nr_range_new; - unsigned long range_sums_new; - - /* convert ranges to var ranges state */ - num_reg = x86_setup_var_mtrrs(range, nr_range, - chunk_size, gran_size); - - /* we got new setting in range_state, check it */ - memset(range_new, 0, sizeof(range_new)); - nr_range_new = x86_get_mtrr_mem_range(range_new, 0, - extra_remove_base, extra_remove_size); - range_sums_new = sum_ranges(range_new, nr_range_new); - - result[i].chunk_sizek = chunk_size >> 10; - result[i].gran_sizek = gran_size >> 10; - result[i].num_reg = num_reg; - if (range_sums < range_sums_new) { - result[i].lose_cover_sizek = - (range_sums_new - range_sums) << PSHIFT; - result[i].bad = 1; - } else - result[i].lose_cover_sizek = - (range_sums - range_sums_new) << PSHIFT; - - /* double check it */ - if (!result[i].bad && !result[i].lose_cover_sizek) { - if (nr_range_new != nr_range || - memcmp(range, range_new, sizeof(range))) - result[i].bad = 1; - } - - if (!result[i].bad && (range_sums - range_sums_new < - min_loss_pfn[num_reg])) { - min_loss_pfn[num_reg] = - range_sums - range_sums_new; - } -} - -static void __init mtrr_print_out_one_result(int i) -{ - char gran_factor, chunk_factor, lose_factor; - unsigned long gran_base, chunk_base, lose_base; - - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), - printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", - result[i].bad ? "*BAD*" : " ", - gran_base, gran_factor, chunk_base, chunk_factor); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", - result[i].num_reg, result[i].bad ? "-" : "", - lose_base, lose_factor); -} - -static int __init mtrr_search_optimal_index(void) -{ - int i; - int num_reg_good; - int index_good; - - if (nr_mtrr_spare_reg >= num_var_ranges) - nr_mtrr_spare_reg = num_var_ranges - 1; - num_reg_good = -1; - for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { - if (!min_loss_pfn[i]) - num_reg_good = i; - } - - index_good = -1; - if (num_reg_good != -1) { - for (i = 0; i < NUM_RESULT; i++) { - if (!result[i].bad && - result[i].num_reg == num_reg_good && - !result[i].lose_cover_sizek) { - index_good = i; - break; - } - } - } - - return index_good; -} - - -static int __init mtrr_cleanup(unsigned address_bits) -{ - unsigned long extra_remove_base, extra_remove_size; - unsigned long base, size, def, dummy; - mtrr_type type; - u64 chunk_size, gran_size; - int index_good; - int i; - - if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) - return 0; - rdmsr(MTRRdefType_MSR, def, dummy); - def &= 0xff; - if (def != MTRR_TYPE_UNCACHABLE) - return 0; - - /* get it and store it aside */ - memset(range_state, 0, sizeof(range_state)); - for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, &base, &size, &type); - range_state[i].base_pfn = base; - range_state[i].size_pfn = size; - range_state[i].type = type; - } - - /* check if we need handle it and can handle it */ - if (!mtrr_need_cleanup()) - return 0; - - /* print original var MTRRs at first, for debugging: */ - printk(KERN_DEBUG "original variable MTRRs\n"); - print_out_mtrr_range_state(); - - memset(range, 0, sizeof(range)); - extra_remove_size = 0; - extra_remove_base = 1 << (32 - PAGE_SHIFT); - if (mtrr_tom2) - extra_remove_size = - (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; - nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, - extra_remove_size); - /* - * [0, 1M) should always be coverred by var mtrr with WB - * and fixed mtrrs should take effective before var mtrr for it - */ - nr_range = add_range_with_merge(range, nr_range, 0, - (1ULL<<(20 - PAGE_SHIFT)) - 1); - /* sort the ranges */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); - - range_sums = sum_ranges(range, nr_range); - printk(KERN_INFO "total RAM coverred: %ldM\n", - range_sums >> (20 - PAGE_SHIFT)); - - if (mtrr_chunk_size && mtrr_gran_size) { - i = 0; - mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, - extra_remove_base, extra_remove_size, i); - - mtrr_print_out_one_result(i); - - if (!result[i].bad) { - set_var_mtrr_all(address_bits); - printk(KERN_DEBUG "New variable MTRRs\n"); - print_out_mtrr_range_state(); - return 1; - } - printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " - "will find optimal one\n"); - } - - i = 0; - memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); - memset(result, 0, sizeof(result)); - for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { - - for (chunk_size = gran_size; chunk_size < (1ULL<<32); - chunk_size <<= 1) { - - if (i >= NUM_RESULT) - continue; - - mtrr_calc_range_state(chunk_size, gran_size, - extra_remove_base, extra_remove_size, i); - if (debug_print) { - mtrr_print_out_one_result(i); - printk(KERN_INFO "\n"); - } - - i++; - } - } - - /* try to find the optimal index */ - index_good = mtrr_search_optimal_index(); - - if (index_good != -1) { - printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); - i = index_good; - mtrr_print_out_one_result(i); - - /* convert ranges to var ranges state */ - chunk_size = result[i].chunk_sizek; - chunk_size <<= 10; - gran_size = result[i].gran_sizek; - gran_size <<= 10; - x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); - set_var_mtrr_all(address_bits); - printk(KERN_DEBUG "New variable MTRRs\n"); - print_out_mtrr_range_state(); - return 1; - } else { - /* print out all */ - for (i = 0; i < NUM_RESULT; i++) - mtrr_print_out_one_result(i); - } - - printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); - printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n"); - - return 0; -} -#else -static int __init mtrr_cleanup(unsigned address_bits) -{ - return 0; -} -#endif - -static int __initdata changed_by_mtrr_cleanup; - -static int disable_mtrr_trim; - -static int __init disable_mtrr_trim_setup(char *str) -{ - disable_mtrr_trim = 1; - return 0; -} -early_param("disable_mtrr_trim", disable_mtrr_trim_setup); - -/* - * Newer AMD K8s and later CPUs have a special magic MSR way to force WB - * for memory >4GB. Check for that here. - * Note this won't check if the MTRRs < 4GB where the magic bit doesn't - * apply to are wrong, but so far we don't know of any such case in the wild. - */ -#define Tom2Enabled (1U << 21) -#define Tom2ForceMemTypeWB (1U << 22) - -int __init amd_special_default_mtrr(void) -{ - u32 l, h; - - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) - return 0; - if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) - return 0; - /* In case some hypervisor doesn't pass SYSCFG through */ - if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) - return 0; - /* - * Memory between 4GB and top of mem is forced WB by this magic bit. - * Reserved before K8RevF, but should be zero there. - */ - if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) == - (Tom2Enabled | Tom2ForceMemTypeWB)) - return 1; - return 0; -} - -static u64 __init real_trim_memory(unsigned long start_pfn, - unsigned long limit_pfn) -{ - u64 trim_start, trim_size; - trim_start = start_pfn; - trim_start <<= PAGE_SHIFT; - trim_size = limit_pfn; - trim_size <<= PAGE_SHIFT; - trim_size -= trim_start; - - return e820_update_range(trim_start, trim_size, E820_RAM, - E820_RESERVED); -} -/** - * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs - * @end_pfn: ending page frame number - * - * Some buggy BIOSes don't setup the MTRRs properly for systems with certain - * memory configurations. This routine checks that the highest MTRR matches - * the end of memory, to make sure the MTRRs having a write back type cover - * all of the memory the kernel is intending to use. If not, it'll trim any - * memory off the end by adjusting end_pfn, removing it from the kernel's - * allocation pools, warning the user with an obnoxious message. - */ -int __init mtrr_trim_uncached_memory(unsigned long end_pfn) -{ - unsigned long i, base, size, highest_pfn = 0, def, dummy; - mtrr_type type; - u64 total_trim_size; - - /* extra one for all 0 */ - int num[MTRR_NUM_TYPES + 1]; - /* - * Make sure we only trim uncachable memory on machines that - * support the Intel MTRR architecture: - */ - if (!is_cpu(INTEL) || disable_mtrr_trim) - return 0; - rdmsr(MTRRdefType_MSR, def, dummy); - def &= 0xff; - if (def != MTRR_TYPE_UNCACHABLE) - return 0; - - /* get it and store it aside */ - memset(range_state, 0, sizeof(range_state)); - for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, &base, &size, &type); - range_state[i].base_pfn = base; - range_state[i].size_pfn = size; - range_state[i].type = type; - } - - /* Find highest cached pfn */ - for (i = 0; i < num_var_ranges; i++) { - type = range_state[i].type; - if (type != MTRR_TYPE_WRBACK) - continue; - base = range_state[i].base_pfn; - size = range_state[i].size_pfn; - if (highest_pfn < base + size) - highest_pfn = base + size; - } - - /* kvm/qemu doesn't have mtrr set right, don't trim them all */ - if (!highest_pfn) { - printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); - return 0; - } - - /* check entries number */ - memset(num, 0, sizeof(num)); - for (i = 0; i < num_var_ranges; i++) { - type = range_state[i].type; - if (type >= MTRR_NUM_TYPES) - continue; - size = range_state[i].size_pfn; - if (!size) - type = MTRR_NUM_TYPES; - num[type]++; - } - - /* no entry for WB? */ - if (!num[MTRR_TYPE_WRBACK]) - return 0; - - /* check if we only had WB and UC */ - if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != - num_var_ranges - num[MTRR_NUM_TYPES]) - return 0; - - memset(range, 0, sizeof(range)); - nr_range = 0; - if (mtrr_tom2) { - range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); - range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; - if (highest_pfn < range[nr_range].end + 1) - highest_pfn = range[nr_range].end + 1; - nr_range++; - } - nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); - - total_trim_size = 0; - /* check the head */ - if (range[0].start) - total_trim_size += real_trim_memory(0, range[0].start); - /* check the holes */ - for (i = 0; i < nr_range - 1; i++) { - if (range[i].end + 1 < range[i+1].start) - total_trim_size += real_trim_memory(range[i].end + 1, - range[i+1].start); - } - /* check the top */ - i = nr_range - 1; - if (range[i].end + 1 < end_pfn) - total_trim_size += real_trim_memory(range[i].end + 1, - end_pfn); - - if (total_trim_size) { - printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" - " all of memory, losing %lluMB of RAM.\n", - total_trim_size >> 20); - - if (!changed_by_mtrr_cleanup) - WARN_ON(1); - - printk(KERN_INFO "update e820 for mtrr\n"); - update_e820(); - - return 1; - } - - return 0; -} +int __initdata changed_by_mtrr_cleanup; /** * mtrr_bp_init - initialize mtrrs on the boot CPU diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index ffd60409cc6..6710e93021a 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -88,3 +88,6 @@ void mtrr_wrmsr(unsigned, unsigned, unsigned); int amd_init_mtrr(void); int cyrix_init_mtrr(void); int centaur_init_mtrr(void); + +extern int changed_by_mtrr_cleanup; +extern int mtrr_cleanup(unsigned address_bits); -- cgit v1.2.3 From 91219bcbdcccc1686b0ecce09e28825c93619c07 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 12 Mar 2009 02:37:00 +0530 Subject: x86: cpu_debug add write support for MSRs Supported write flag for registers. currently write is enabled only for PMC MSR. [root@ht]# cat /sys/kernel/debug/x86/cpu/cpu1/pmc/0x300/value 0x0 [root@ht]# echo 1234 > /sys/kernel/debug/x86/cpu/cpu1/pmc/0x300/value [root@ht]# cat /sys/kernel/debug/x86/cpu/cpu1/pmc/0x300/value 0x4d2 [root@ht]# echo 0x1234 > /sys/kernel/debug/x86/cpu/cpu1/pmc/0x300/value [root@ht]# cat /sys/kernel/debug/x86/cpu/cpu1/pmc/0x300/value 0x1234 Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpu_debug.c | 118 +++++++++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 32 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 9abbcbd933c..21c0cf8ced1 100755 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -40,41 +41,41 @@ static DEFINE_MUTEX(cpu_debug_lock); static struct dentry *cpu_debugfs_dir; static struct cpu_debug_base cpu_base[] = { - { "mc", CPU_MC }, /* Machine Check */ - { "monitor", CPU_MONITOR }, /* Monitor */ - { "time", CPU_TIME }, /* Time */ - { "pmc", CPU_PMC }, /* Performance Monitor */ - { "platform", CPU_PLATFORM }, /* Platform */ - { "apic", CPU_APIC }, /* APIC */ - { "poweron", CPU_POWERON }, /* Power-on */ - { "control", CPU_CONTROL }, /* Control */ - { "features", CPU_FEATURES }, /* Features control */ - { "lastbranch", CPU_LBRANCH }, /* Last Branch */ - { "bios", CPU_BIOS }, /* BIOS */ - { "freq", CPU_FREQ }, /* Frequency */ - { "mtrr", CPU_MTRR }, /* MTRR */ - { "perf", CPU_PERF }, /* Performance */ - { "cache", CPU_CACHE }, /* Cache */ - { "sysenter", CPU_SYSENTER }, /* Sysenter */ - { "therm", CPU_THERM }, /* Thermal */ - { "misc", CPU_MISC }, /* Miscellaneous */ - { "debug", CPU_DEBUG }, /* Debug */ - { "pat", CPU_PAT }, /* PAT */ - { "vmx", CPU_VMX }, /* VMX */ - { "call", CPU_CALL }, /* System Call */ - { "base", CPU_BASE }, /* BASE Address */ - { "smm", CPU_SMM }, /* System mgmt mode */ - { "svm", CPU_SVM }, /*Secure Virtial Machine*/ - { "osvm", CPU_OSVM }, /* OS-Visible Workaround*/ - { "tss", CPU_TSS }, /* Task Stack Segment */ - { "cr", CPU_CR }, /* Control Registers */ - { "dt", CPU_DT }, /* Descriptor Table */ - { "registers", CPU_REG_ALL }, /* Select all Registers */ + { "mc", CPU_MC, 0 }, + { "monitor", CPU_MONITOR, 0 }, + { "time", CPU_TIME, 0 }, + { "pmc", CPU_PMC, 1 }, + { "platform", CPU_PLATFORM, 0 }, + { "apic", CPU_APIC, 0 }, + { "poweron", CPU_POWERON, 0 }, + { "control", CPU_CONTROL, 0 }, + { "features", CPU_FEATURES, 0 }, + { "lastbranch", CPU_LBRANCH, 0 }, + { "bios", CPU_BIOS, 0 }, + { "freq", CPU_FREQ, 0 }, + { "mtrr", CPU_MTRR, 0 }, + { "perf", CPU_PERF, 0 }, + { "cache", CPU_CACHE, 0 }, + { "sysenter", CPU_SYSENTER, 0 }, + { "therm", CPU_THERM, 0 }, + { "misc", CPU_MISC, 0 }, + { "debug", CPU_DEBUG, 0 }, + { "pat", CPU_PAT, 0 }, + { "vmx", CPU_VMX, 0 }, + { "call", CPU_CALL, 0 }, + { "base", CPU_BASE, 0 }, + { "smm", CPU_SMM, 0 }, + { "svm", CPU_SVM, 0 }, + { "osvm", CPU_OSVM, 0 }, + { "tss", CPU_TSS, 0 }, + { "cr", CPU_CR, 0 }, + { "dt", CPU_DT, 0 }, + { "registers", CPU_REG_ALL, 0 }, }; static struct cpu_file_base cpu_file[] = { - { "index", CPU_REG_ALL }, /* index */ - { "value", CPU_REG_ALL }, /* value */ + { "index", CPU_REG_ALL, 0 }, + { "value", CPU_REG_ALL, 1 }, }; /* Intel Registers Range */ @@ -608,9 +609,62 @@ static int cpu_seq_open(struct inode *inode, struct file *file) return err; } +static int write_msr(struct cpu_private *priv, u64 val) +{ + u32 low, high; + + high = (val >> 32) & 0xffffffff; + low = val & 0xffffffff; + + if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high)) + return 0; + + return -EPERM; +} + +static int write_cpu_register(struct cpu_private *priv, const char *buf) +{ + int ret = -EPERM; + u64 val; + + ret = strict_strtoull(buf, 0, &val); + if (ret < 0) + return ret; + + /* Supporting only MSRs */ + if (priv->type < CPU_TSS_BIT) + return write_msr(priv, val); + + return ret; +} + +static ssize_t cpu_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct cpu_private *priv = seq->private; + char buf[19]; + + if ((priv == NULL) || (count >= sizeof(buf))) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, count)) + return -EFAULT; + + buf[count] = 0; + + if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write)) + if (!write_cpu_register(priv, buf)) + return count; + + return -EACCES; +} + static const struct file_operations cpu_fops = { + .owner = THIS_MODULE, .open = cpu_seq_open, .read = seq_read, + .write = cpu_write, .llseek = seq_lseek, .release = seq_release, }; -- cgit v1.2.3 From 773e673de27297d07d852e7e9bfd1a695cae1da2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 12 Mar 2009 21:35:18 -0700 Subject: x86: fix e820_update_range() Impact: fix left range size on head | commit 5c0e6f035df983210e4d22213aed624ced502d3d | x86: fix code paths used by update_mptable | Impact: fix crashes under Xen due to unrobust e820 code fixes one e820 bug, but introduces another bug. Need to update size for left range at first in case it is header. also add __e820_add_region take more parameter. Signed-off-by: Yinghai Lu Cc: jbeulich@novell.com LKML-Reference: <49B9E286.502@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 3cf6681ac80..95b81c18b6b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -110,19 +110,25 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) /* * Add a memory region to the kernel e820 map. */ -void __init e820_add_region(u64 start, u64 size, int type) +static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, + int type) { - int x = e820.nr_map; + int x = e820x->nr_map; - if (x == ARRAY_SIZE(e820.map)) { + if (x == ARRAY_SIZE(e820x->map)) { printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); return; } - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; + e820x->map[x].addr = start; + e820x->map[x].size = size; + e820x->map[x].type = type; + e820x->nr_map++; +} + +void __init e820_add_region(u64 start, u64 size, int type) +{ + __e820_add_region(&e820, start, size, type); } void __init e820_print_map(char *who) @@ -417,11 +423,11 @@ static int __init append_e820_map(struct e820entry *biosmap, int nr_map) return __append_e820_map(biosmap, nr_map); } -static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, +static u64 __init __e820_update_range(struct e820map *e820x, u64 start, u64 size, unsigned old_type, unsigned new_type) { - unsigned int i, x; + unsigned int i; u64 real_updated_size = 0; BUG_ON(old_type == new_type); @@ -447,22 +453,19 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, if (final_start >= final_end) continue; - x = e820x->nr_map; - if (x == ARRAY_SIZE(e820x->map)) { - printk(KERN_ERR "Too many memory map entries!\n"); - break; - } - e820x->map[x].addr = final_start; - e820x->map[x].size = final_end - final_start; - e820x->map[x].type = new_type; - e820x->nr_map++; + __e820_add_region(e820x, final_start, final_end - final_start, + new_type); real_updated_size += final_end - final_start; + /* + * left range could be head or tail, so need to update + * size at first. + */ + ei->size -= final_end - final_start; if (ei->addr < final_start) continue; ei->addr = final_end; - ei->size -= final_end - final_start; } return real_updated_size; } @@ -470,13 +473,13 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, unsigned new_type) { - return e820_update_range_map(&e820, start, size, old_type, new_type); + return __e820_update_range(&e820, start, size, old_type, new_type); } static u64 __init e820_update_range_saved(u64 start, u64 size, unsigned old_type, unsigned new_type) { - return e820_update_range_map(&e820_saved, start, size, old_type, + return __e820_update_range(&e820_saved, start, size, old_type, new_type); } -- cgit v1.2.3 From 3ff42da5048649503e343a32be37b14a6a4e8aaf Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 12 Mar 2009 17:39:37 +0100 Subject: x86: mtrr: don't modify RdDram/WrDram bits of fixed MTRRs Impact: bug fix + BIOS workaround BIOS is expected to clear the SYSCFG[MtrrFixDramModEn] on AMD CPUs after fixed MTRRs are configured. Some BIOSes do not clear SYSCFG[MtrrFixDramModEn] on BP (and on APs). This can lead to obfuscation in Linux when this bit is not cleared on BP but cleared on APs. A consequence of this is that the saved fixed-MTRR state (from BP) differs from the fixed-MTRRs of APs -- because RdDram/WrDram bits are read as zero when SYSCFG[MtrrFixDramModEn] is cleared -- and Linux tries to sync fixed-MTRR state from BP to AP. This implies that Linux sets SYSCFG[MtrrFixDramEn] and activates those bits. More important is that (some) systems change these bits in SMM when ACPI is enabled. Hence it is racy if Linux modifies RdMem/WrMem bits, too. (1) The patch modifies an old fix from Bernhard Kaindl to get suspend/resume working on some Acer Laptops. Bernhard's patch tried to sync RdMem/WrMem bits of fixed MTRR registers and that helped on those old Laptops. (Don't ask me why -- can't test it myself). But this old problem was not the motivation for the patch. (See http://lkml.org/lkml/2007/4/3/110) (2) The more important effect is to fix issues on some more current systems. On those systems Linux panics or just freezes, see http://bugzilla.kernel.org/show_bug.cgi?id=11541 (and also duplicates of this bug: http://bugzilla.kernel.org/show_bug.cgi?id=11737 http://bugzilla.kernel.org/show_bug.cgi?id=11714) The affected systems boot only using acpi=ht, acpi=off or when the kernel is built with CONFIG_MTRR=n. The acpi options prevent full enablement of ACPI. Obviously when ACPI is enabled the BIOS/SMM modfies RdMem/WrMem bits. When CONFIG_MTRR=y Linux also accesses and modifies those bits when it needs to sync fixed-MTRRs across cores (Bernhard's fix, see (1)). How do you synchronize that? You can't. As a consequence Linux shouldn't touch those bits at all (Rationale are AMD's BKDGs which recommend to clear the bit that makes RdMem/WrMem accessible). This is the purpose of this patch. And (so far) this suffices to fix (1) and (2). I suggest not to touch RdDram/WrDram bits of fixed-MTRRs and SYSCFG[MtrrFixDramEn] and to clear SYSCFG[MtrrFixDramModEn] as suggested by AMD K8, and AMD family 10h/11h BKDGs. BIOS is expected to do this anyway. This should avoid that Linux and SMM tread on each other's toes ... Signed-off-by: Andreas Herrmann Cc: trenn@suse.de Cc: Yinghai Lu LKML-Reference: <20090312163937.GH20716@alberich.amd.com> Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 51 ++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 96440352010..950c434f793 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -33,6 +33,32 @@ u64 mtrr_tom2; struct mtrr_state_type mtrr_state = {}; EXPORT_SYMBOL_GPL(mtrr_state); +/** + * BIOS is expected to clear MtrrFixDramModEn bit, see for example + * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD + * Opteron Processors" (26094 Rev. 3.30 February 2006), section + * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set + * to 1 during BIOS initalization of the fixed MTRRs, then cleared to + * 0 for operation." + */ +static inline void k8_check_syscfg_dram_mod_en(void) +{ + u32 lo, hi; + + if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && + (boot_cpu_data.x86 >= 0x0f))) + return; + + rdmsr(MSR_K8_SYSCFG, lo, hi); + if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { + printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" + " not cleared by BIOS, clearing this bit\n", + smp_processor_id()); + lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY; + mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi); + } +} + /* * Returns the effective MTRR type for the region * Error returns: @@ -166,6 +192,8 @@ get_fixed_ranges(mtrr_type * frs) unsigned int *p = (unsigned int *) frs; int i; + k8_check_syscfg_dram_mod_en(); + rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); for (i = 0; i < 2; i++) @@ -305,28 +333,11 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) smp_processor_id(), msr, a, b); } -/** - * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs - * see AMD publication no. 24593, chapter 3.2.1 for more information - */ -static inline void k8_enable_fixed_iorrs(void) -{ - unsigned lo, hi; - - rdmsr(MSR_K8_SYSCFG, lo, hi); - mtrr_wrmsr(MSR_K8_SYSCFG, lo - | K8_MTRRFIXRANGE_DRAM_ENABLE - | K8_MTRRFIXRANGE_DRAM_MODIFY, hi); -} - /** * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have * @msr: MSR address of the MTTR which should be checked and updated * @changed: pointer which indicates whether the MTRR needed to be changed * @msrwords: pointer to the MSR values which the MSR should have - * - * If K8 extentions are wanted, update the K8 SYSCFG MSR also. - * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information. */ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) { @@ -335,10 +346,6 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) rdmsr(msr, lo, hi); if (lo != msrwords[0] || hi != msrwords[1]) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) && - ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK)) - k8_enable_fixed_iorrs(); mtrr_wrmsr(msr, msrwords[0], msrwords[1]); *changed = true; } @@ -426,6 +433,8 @@ static int set_fixed_ranges(mtrr_type * frs) bool changed = false; int block=-1, range; + k8_check_syscfg_dram_mod_en(); + while (fixed_range_blocks[++block].ranges) for (range=0; range < fixed_range_blocks[block].ranges; range++) set_fixed_range(fixed_range_blocks[block].base_msr + range, -- cgit v1.2.3 From 5a8ac9d28dae5330c70562c7d7785f5104059c17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9rico=20Wang?= Date: Fri, 13 Mar 2009 15:56:58 +0800 Subject: x86: ptrace, bts: fix an unreachable statement Commit c2724775ce57c98b8af9694857b941dc61056516 put a statement after return, which makes that statement unreachable. Move that statement before return. Signed-off-by: WANG Cong Cc: Roland McGrath Cc: Markus Metzger LKML-Reference: <20090313075622.GB8933@hack> Cc: # .29 only Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 3d9672e59c1..19378715f41 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -685,9 +685,8 @@ static int ptrace_bts_config(struct task_struct *child, if (!cfg.signal) return -EINVAL; - return -EOPNOTSUPP; - child->thread.bts_ovfl_signal = cfg.signal; + return -EOPNOTSUPP; } if ((cfg.flags & PTRACE_BTS_O_ALLOC) && -- cgit v1.2.3 From 88200bc28da38bcda1cb1bd218216e83b426d8a8 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 14 Mar 2009 12:08:13 +0530 Subject: x86: entry_32.S fix compile warnings - fix work mask bit width Fix: arch/x86/kernel/entry_32.S:446: Warning: 00000000080001d1 shortened to 00000000000001d1 arch/x86/kernel/entry_32.S:457: Warning: 000000000800feff shortened to 000000000000feff arch/x86/kernel/entry_32.S:527: Warning: 00000000080001d1 shortened to 00000000000001d1 arch/x86/kernel/entry_32.S:541: Warning: 000000000800feff shortened to 000000000000feff arch/x86/kernel/entry_32.S:676: Warning: 0000000008000091 shortened to 0000000000000091 TIF_SYSCALL_FTRACE is 0x08000000 and until now we checked the first 16 bits of the work mask - bit 27 falls outside of that. Update the entry_32.S code to check the full 32-bit mask. [ %cx => %ecx fix from Cyrill Gorcunov ] Signed-off-by: Jaswinder Singh Rajput Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: "H. Peter Anvin" LKML-Reference: <1237012693.18733.3.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 899e8938e79..c929add475c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -442,8 +442,7 @@ sysenter_past_esp: GET_THREAD_INFO(%ebp) - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz sysenter_audit sysenter_do_call: cmpl $(nr_syscalls), %eax @@ -454,7 +453,7 @@ sysenter_do_call: DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx - testw $_TIF_ALLWORK_MASK, %cx + testl $_TIF_ALLWORK_MASK, %ecx jne sysexit_audit sysenter_exit: /* if something modifies registers it must also disable sysexit */ @@ -468,7 +467,7 @@ sysenter_exit: #ifdef CONFIG_AUDITSYSCALL sysenter_audit: - testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) jnz syscall_trace_entry addl $4,%esp CFI_ADJUST_CFA_OFFSET -4 @@ -485,7 +484,7 @@ sysenter_audit: jmp sysenter_do_call sysexit_audit: - testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx jne syscall_exit_work TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) @@ -498,7 +497,7 @@ sysexit_audit: DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx - testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx jne syscall_exit_work movl PT_EAX(%esp),%eax /* reload syscall return value */ jmp sysenter_exit @@ -523,8 +522,7 @@ ENTRY(system_call) SAVE_ALL GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -538,7 +536,7 @@ syscall_exit: # between sampling and the iret TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx - testw $_TIF_ALLWORK_MASK, %cx # current->work + testl $_TIF_ALLWORK_MASK, %ecx # current->work jne syscall_exit_work restore_all: @@ -673,7 +671,7 @@ END(syscall_trace_entry) # perform syscall exit tracing ALIGN syscall_exit_work: - testb $_TIF_WORK_SYSCALL_EXIT, %cl + testl $_TIF_WORK_SYSCALL_EXIT, %ecx jz work_pending TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call -- cgit v1.2.3 From 78a8b35bc7abf8b8333d6f625e08c0f7cc1c3742 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 12 Mar 2009 22:36:01 -0700 Subject: x86: make e820_update_range() handle small range update Impact: enhance e820 code to handle more cases Try to handle new range which could be covered by one entry. Signed-off-by: Yinghai Lu Cc: jbeulich@novell.com LKML-Reference: <49B9F0C1.10402@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 95b81c18b6b..0c34ff49ff4 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -427,6 +427,7 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, u64 size, unsigned old_type, unsigned new_type) { + u64 end; unsigned int i; u64 real_updated_size = 0; @@ -435,21 +436,35 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, if (size > (ULLONG_MAX - start)) size = ULLONG_MAX - start; + end = start + size; for (i = 0; i < e820x->nr_map; i++) { struct e820entry *ei = &e820x->map[i]; u64 final_start, final_end; + u64 ei_end; + if (ei->type != old_type) continue; - /* totally covered? */ - if (ei->addr >= start && - (ei->addr + ei->size) <= (start + size)) { + + ei_end = ei->addr + ei->size; + /* totally covered by new range? */ + if (ei->addr >= start && ei_end <= end) { ei->type = new_type; real_updated_size += ei->size; continue; } + + /* new range is totally covered? */ + if (ei->addr < start && ei_end > end) { + __e820_add_region(e820x, start, size, new_type); + __e820_add_region(e820x, end, ei_end - end, ei->type); + ei->size = start - ei->addr; + real_updated_size += size; + continue; + } + /* partially covered */ final_start = max(start, ei->addr); - final_end = min(start + size, ei->addr + ei->size); + final_end = min(end, ei_end); if (final_start >= final_end) continue; -- cgit v1.2.3 From 63516ef6d6a8379ee5c32463efc6a75f9da8a309 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 13 Mar 2009 12:46:07 -0700 Subject: x86: fix get_mtrr() warning about smp_processor_id() with CONFIG_PREEMPT=y Impact: fix debug warning Jaswinder noticed that there is a warning about smp_processor_id() in get_mtrr(). Fix it by wrapping the printout into a get/put_cpu() pair. Reported-by: Jaswinder Singh Rajput Signed-off-by: Yinghai Lu Cc: Andrew Morton LKML-Reference: <49BAB7FF.4030107@kernel.org> [ changed to get/put_cpu(), cleaned up surrounding code a it. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 950c434f793..641ee3507d0 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -381,11 +381,14 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, { unsigned int mask_lo, mask_hi, base_lo, base_hi; unsigned int tmp, hi; + int cpu; /* * get_mtrr doesn't need to update mtrr_state, also it could be called * from any cpu, so try to print it out directly. */ + cpu = get_cpu(); + rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); if ((mask_lo & 0x800) == 0) { @@ -393,15 +396,16 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, *base = 0; *size = 0; *type = 0; - return; + goto out_put_cpu; } rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); - /* Work out the shifted address mask. */ + /* Work out the shifted address mask: */ tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; mask_lo = size_or_mask | tmp; - /* Expand tmp with high bits to all 1s*/ + + /* Expand tmp with high bits to all 1s: */ hi = fls(tmp); if (hi > 0) { tmp |= ~((1<<(hi - 1)) - 1); @@ -412,15 +416,19 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, } } - /* This works correctly if size is a power of two, i.e. a - contiguous range. */ + /* + * This works correctly if size is a power of two, i.e. a + * contiguous range: + */ *size = -mask_lo; *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; *type = base_lo & 0xff; printk(KERN_DEBUG " get_mtrr: cpu%d reg%02d base=%010lx size=%010lx %s\n", - smp_processor_id(), reg, *base, *size, + cpu, reg, *base, *size, mtrr_attrib_to_str(*type & 0xff)); +out_put_cpu: + put_cpu(); } /** -- cgit v1.2.3 From d4c90e37a21154c1910b2646e9544bdd32d5bc3a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 13 Mar 2009 14:08:49 -0700 Subject: x86: print the continous part of fixed mtrrs together Impact: print out fewer lines 1. print continuous range with same type together 2. change _INFO to _DEBUG Signed-off-by: Yinghai Lu LKML-Reference: <49BACB61.8000302@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 58 +++++++++++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 641ee3507d0..37f28fc7cf9 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -208,13 +208,47 @@ void mtrr_save_fixed_ranges(void *info) get_fixed_ranges(mtrr_state.fixed_ranges); } -static void print_fixed(unsigned base, unsigned step, const mtrr_type*types) +static unsigned __initdata last_fixed_start; +static unsigned __initdata last_fixed_end; +static mtrr_type __initdata last_fixed_type; + +static void __init print_fixed_last(void) +{ + if (!last_fixed_end) + return; + + printk(KERN_DEBUG " %05X-%05X %s\n", last_fixed_start, + last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); + + last_fixed_end = 0; +} + +static void __init update_fixed_last(unsigned base, unsigned end, + mtrr_type type) +{ + last_fixed_start = base; + last_fixed_end = end; + last_fixed_type = type; +} + +static void __init print_fixed(unsigned base, unsigned step, + const mtrr_type *types) { unsigned i; - for (i = 0; i < 8; ++i, ++types, base += step) - printk(KERN_INFO " %05X-%05X %s\n", - base, base + step - 1, mtrr_attrib_to_str(*types)); + for (i = 0; i < 8; ++i, ++types, base += step) { + if (last_fixed_end == 0) { + update_fixed_last(base, base + step, *types); + continue; + } + if (last_fixed_end == base && last_fixed_type == *types) { + last_fixed_end = base + step; + continue; + } + /* new segments: gap or different type */ + print_fixed_last(); + update_fixed_last(base, base + step, *types); + } } static void prepare_set(void); @@ -225,22 +259,26 @@ static void __init print_mtrr_state(void) unsigned int i; int high_width; - printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type)); + printk(KERN_DEBUG "MTRR default type: %s\n", + mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { - printk(KERN_INFO "MTRR fixed ranges %sabled:\n", + printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n", mtrr_state.enabled & 1 ? "en" : "dis"); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); for (i = 0; i < 8; ++i) print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); + + /* tail */ + print_fixed_last(); } - printk(KERN_INFO "MTRR variable ranges %sabled:\n", + printk(KERN_DEBUG "MTRR variable ranges %sabled:\n", mtrr_state.enabled & 2 ? "en" : "dis"); high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) - printk(KERN_INFO " %u base %0*X%05X000 mask %0*X%05X000 %s\n", + printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n", i, high_width, mtrr_state.var_ranges[i].base_hi, @@ -250,10 +288,10 @@ static void __init print_mtrr_state(void) mtrr_state.var_ranges[i].mask_lo >> 12, mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); else - printk(KERN_INFO " %u disabled\n", i); + printk(KERN_DEBUG " %u disabled\n", i); } if (mtrr_tom2) { - printk(KERN_INFO "TOM2: %016llx aka %lldM\n", + printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); } } -- cgit v1.2.3 From 48f4c485c275e9550fa1a1191768689cc3ae0037 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sat, 14 Mar 2009 12:24:02 +0100 Subject: x86/centaur: merge 32 & 64 bit version there should be no difference, except: * the 64bit variant now also initializes the padlock unit. * ->c_early_init() is executed again from ->c_init() * the 64bit fixups made into 32bit path. Signed-off-by: Sebastian Andrzej Siewior Cc: herbert@gondor.apana.org.au LKML-Reference: <1237029843-28076-2-git-send-email-sebastian@breakpoint.cc> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 3 +-- arch/x86/kernel/cpu/centaur.c | 34 +++++++++++++++++++++++++++------- arch/x86/kernel/cpu/centaur_64.c | 37 ------------------------------------- 3 files changed, 28 insertions(+), 46 deletions(-) delete mode 100644 arch/x86/kernel/cpu/centaur_64.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index d4356f8b752..4e242f9a06e 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -19,8 +19,7 @@ obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o obj-$(CONFIG_CPU_SUP_INTEL) += intel.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o -obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o -obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o +obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 983e0830f0d..c95e831bb09 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,11 +1,11 @@ +#include #include #include -#include #include -#include #include #include +#include #include "cpu.h" @@ -276,7 +276,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c) */ c->x86_capability[5] = cpuid_edx(0xC0000001); } - +#ifdef CONFIG_X86_32 /* Cyrix III family needs CX8 & PGE explicitly enabled. */ if (c->x86_model >= 6 && c->x86_model <= 9) { rdmsr(MSR_VIA_FCR, lo, hi); @@ -288,6 +288,11 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c) /* Before Nehemiah, the C3's had 3dNOW! */ if (c->x86_model >= 6 && c->x86_model < 9) set_cpu_cap(c, X86_FEATURE_3DNOW); +#endif + if (c->x86 == 0x6 && c->x86_model >= 0xf) { + c->x86_cache_alignment = c->x86_clflush_size * 2; + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + } display_cacheinfo(c); } @@ -316,16 +321,25 @@ enum { static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) { switch (c->x86) { +#ifdef CONFIG_X86_32 case 5: /* Emulate MTRRs using Centaur's MCR. */ set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); break; +#endif + case 6: + if (c->x86_model >= 0xf) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + break; } +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSENTER32); +#endif } static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { - +#ifdef CONFIG_X86_32 char *name; u32 fcr_set = 0; u32 fcr_clr = 0; @@ -337,8 +351,10 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c) * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ clear_cpu_cap(c, 0*32+31); - +#endif + early_init_centaur(c); switch (c->x86) { +#ifdef CONFIG_X86_32 case 5: switch (c->x86_model) { case 4: @@ -442,16 +458,20 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c) } sprintf(c->x86_model_id, "WinChip %s", name); break; - +#endif case 6: init_c3(c); break; } +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); +#endif } static unsigned int __cpuinit centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) { +#ifdef CONFIG_X86_32 /* VIA C3 CPUs (670-68F) need further shifting. */ if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8))) size >>= 8; @@ -464,7 +484,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) if ((c->x86 == 6) && (c->x86_model == 9) && (c->x86_mask == 1) && (size == 65)) size -= 1; - +#endif return size; } diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c deleted file mode 100644 index 51b09c48c9c..00000000000 --- a/arch/x86/kernel/cpu/centaur_64.c +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include - -#include -#include - -#include "cpu.h" - -static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) -{ - if (c->x86 == 0x6 && c->x86_model >= 0xf) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - - set_cpu_cap(c, X86_FEATURE_SYSENTER32); -} - -static void __cpuinit init_centaur(struct cpuinfo_x86 *c) -{ - early_init_centaur(c); - - if (c->x86 == 0x6 && c->x86_model >= 0xf) { - c->x86_cache_alignment = c->x86_clflush_size * 2; - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - } - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); -} - -static const struct cpu_dev centaur_cpu_dev __cpuinitconst = { - .c_vendor = "Centaur", - .c_ident = { "CentaurHauls" }, - .c_early_init = early_init_centaur, - .c_init = init_centaur, - .c_x86_vendor = X86_VENDOR_CENTAUR, -}; - -cpu_dev_register(centaur_cpu_dev); - -- cgit v1.2.3 From f4c3c4cdb1de232ff37cf4339eb2f36c84e20da6 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 13 Mar 2009 19:59:26 +0530 Subject: x86: cpu_debug add support for various AMD CPUs Impact: Added AMD CPUs support Added flags for various AMD CPUs. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpu_debug.c | 150 ++++++++++++++++++++++++++++------------ 1 file changed, 106 insertions(+), 44 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 21c0cf8ced1..46e29ab96c6 100755 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -64,6 +64,8 @@ static struct cpu_debug_base cpu_base[] = { { "vmx", CPU_VMX, 0 }, { "call", CPU_CALL, 0 }, { "base", CPU_BASE, 0 }, + { "ver", CPU_VER, 0 }, + { "conf", CPU_CONF, 0 }, { "smm", CPU_SMM, 0 }, { "svm", CPU_SVM, 0 }, { "osvm", CPU_OSVM, 0 }, @@ -177,54 +179,59 @@ static struct cpu_debug_range cpu_intel_range[] = { /* AMD Registers Range */ static struct cpu_debug_range cpu_amd_range[] = { - { 0x00000010, 0x00000010, CPU_TIME, CPU_ALL, }, - { 0x0000001B, 0x0000001B, CPU_APIC, CPU_ALL, }, - { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_ALL, }, - - { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_ALL, }, - { 0x00000179, 0x0000017A, CPU_MC, CPU_ALL, }, - { 0x0000017B, 0x0000017B, CPU_MC, CPU_ALL, }, - { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_ALL, }, - { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_ALL, }, - - { 0x00000200, 0x0000020F, CPU_MTRR, CPU_ALL, }, - { 0x00000250, 0x00000250, CPU_MTRR, CPU_ALL, }, - { 0x00000258, 0x00000259, CPU_MTRR, CPU_ALL, }, - { 0x00000268, 0x0000026F, CPU_MTRR, CPU_ALL, }, - { 0x00000277, 0x00000277, CPU_PAT, CPU_ALL, }, - { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_ALL, }, - - { 0x00000400, 0x00000417, CPU_MC, CPU_ALL, }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_ALL, }, - { 0xC0000081, 0xC0000084, CPU_CALL, CPU_ALL, }, - { 0xC0000100, 0xC0000102, CPU_BASE, CPU_ALL, }, - { 0xC0000103, 0xC0000103, CPU_TIME, CPU_ALL, }, - - { 0xC0000408, 0xC000040A, CPU_MC, CPU_ALL, }, - - { 0xc0010000, 0xc0010007, CPU_PMC, CPU_ALL, }, - { 0xc0010010, 0xc0010010, CPU_MTRR, CPU_ALL, }, - { 0xc0010016, 0xc001001A, CPU_MTRR, CPU_ALL, }, - { 0xc001001D, 0xc001001D, CPU_MTRR, CPU_ALL, }, - { 0xc0010030, 0xc0010035, CPU_BIOS, CPU_ALL, }, - { 0xc0010056, 0xc0010056, CPU_SMM, CPU_ALL, }, - { 0xc0010061, 0xc0010063, CPU_SMM, CPU_ALL, }, - { 0xc0010074, 0xc0010074, CPU_MC, CPU_ALL, }, - { 0xc0010111, 0xc0010113, CPU_SMM, CPU_ALL, }, - { 0xc0010114, 0xc0010118, CPU_SVM, CPU_ALL, }, - { 0xc0010119, 0xc001011A, CPU_SMM, CPU_ALL, }, - { 0xc0010140, 0xc0010141, CPU_OSVM, CPU_ALL, }, - { 0xc0010156, 0xc0010156, CPU_SMM, CPU_ALL, }, + { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, }, + { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, }, + { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, }, + { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS }, + { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS }, + { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, }, + + { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, }, + { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, }, + { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, }, + { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, }, + + { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, }, + { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, }, + { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, }, + { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, }, + { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, }, + { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, }, + + { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, }, + + { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, }, + { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, }, + { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, }, + { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, }, + + { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, }, + { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, }, + { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, }, + { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, }, + { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, }, + { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, }, + { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, }, + { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, }, + { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, }, + { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, }, + { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, }, + { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, }, + { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, }, + { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, }, + { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, }, + { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, }, + { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, }, + { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, }, }; -static int get_cpu_modelflag(unsigned cpu) +/* Intel */ +static int get_intel_modelflag(unsigned model) { int flag; - switch (per_cpu(cpu_model, cpu)) { - /* Intel */ + switch (model) { case 0x0501: case 0x0502: case 0x0504: @@ -271,6 +278,59 @@ static int get_cpu_modelflag(unsigned cpu) return flag; } +/* AMD */ +static int get_amd_modelflag(unsigned model) +{ + int flag; + + switch (model >> 8) { + case 0x6: + flag = CPU_AMD_K6; + break; + case 0x7: + flag = CPU_AMD_K7; + break; + case 0x8: + flag = CPU_AMD_K8; + break; + case 0xf: + flag = CPU_AMD_0F; + break; + case 0x10: + flag = CPU_AMD_10; + break; + case 0x11: + flag = CPU_AMD_11; + break; + default: + flag = CPU_NONE; + break; + } + + return flag; +} + +static int get_cpu_modelflag(unsigned cpu) +{ + int flag; + + flag = per_cpu(cpu_model, cpu); + + switch (flag >> 16) { + case X86_VENDOR_INTEL: + flag = get_intel_modelflag(flag); + break; + case X86_VENDOR_AMD: + flag = get_amd_modelflag(flag & 0xffff); + break; + default: + flag = CPU_NONE; + break; + } + + return flag; +} + static int get_cpu_range_count(unsigned cpu) { int index; @@ -311,7 +371,8 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag) return 1; break; case X86_VENDOR_AMD: - if (cpu_amd_range[i].flag & flag) + if ((cpu_amd_range[i].model & modelflag) && + (cpu_amd_range[i].flag & flag)) return 1; break; } @@ -337,7 +398,8 @@ static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, } break; case X86_VENDOR_AMD: - if (cpu_amd_range[index].flag & flag) { + if ((cpu_amd_range[index].model & modelflag) && + (cpu_amd_range[index].flag & flag)) { *min = cpu_amd_range[index].min; *max = cpu_amd_range[index].max; } -- cgit v1.2.3 From b9719a4d9cfa5d46fa6a1eb3e3fc2238e7981e4d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 10 Mar 2009 11:19:18 -0700 Subject: x86: make section delimiter symbols part of their section Impact: cleanup Move the symbols delimiting a section part of the section (section relative) rather than absolute. This avoids any unexpected gaps between the section-start symbol and the first data in the section, which could be caused by implicit alignment of the section data. It also makes the general form of vmlinux_64.lds.S consistent with vmlinux_32.lds.S. Signed-off-by: Jeremy Fitzhardinge Cc: Yinghai Lu Cc: "Eric W. Biederman" Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmlinux_64.lds.S | 85 +++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 40 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 5bf54e40c6e..fe5d21ce724 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -29,8 +29,8 @@ SECTIONS { . = __START_KERNEL; phys_startup_64 = startup_64 - LOAD_OFFSET; - _text = .; /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { + _text = .; /* Text and read-only data */ /* First the code that has to be first for bootstrapping */ *(.text.head) _stext = .; @@ -61,13 +61,13 @@ SECTIONS .data : AT(ADDR(.data) - LOAD_OFFSET) { DATA_DATA CONSTRUCTORS + _edata = .; /* End of data section */ } :data - _edata = .; /* End of data section */ - . = ALIGN(PAGE_SIZE); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); *(.data.cacheline_aligned) } . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); @@ -125,29 +125,29 @@ SECTIONS #undef VVIRT_OFFSET #undef VVIRT - . = ALIGN(THREAD_SIZE); /* init_task */ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + . = ALIGN(THREAD_SIZE); /* init_task */ *(.data.init_task) }:data.init - . = ALIGN(PAGE_SIZE); .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); *(.data.page_aligned) } - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - __smp_alt_begin = .; - __smp_locks = .; .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + /* might get freed after init */ + . = ALIGN(PAGE_SIZE); + __smp_alt_begin = .; + __smp_locks = .; *(.smp_locks) + __smp_locks_end = .; + . = ALIGN(PAGE_SIZE); + __smp_alt_end = .; } - __smp_locks_end = .; - . = ALIGN(PAGE_SIZE); - __smp_alt_end = .; . = ALIGN(PAGE_SIZE); /* Init code and data */ - __init_begin = .; + __init_begin = .; /* paired with __init_end */ .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { _sinittext = .; INIT_TEXT @@ -159,40 +159,42 @@ SECTIONS __initdata_end = .; } - . = ALIGN(16); - __setup_start = .; - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } - __setup_end = .; - __initcall_start = .; + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + . = ALIGN(16); + __setup_start = .; + *(.init.setup) + __setup_end = .; + } .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; INITCALLS + __initcall_end = .; } - __initcall_end = .; - __con_initcall_start = .; .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; *(.con_initcall.init) + __con_initcall_end = .; } - __con_initcall_end = .; - __x86_cpu_dev_start = .; .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; } - __x86_cpu_dev_end = .; SECURITY_INIT . = ALIGN(8); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; + __parainstructions = .; *(.parainstructions) - __parainstructions_end = .; + __parainstructions_end = .; } - . = ALIGN(8); - __alt_instructions = .; .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + . = ALIGN(8); + __alt_instructions = .; *(.altinstructions) + __alt_instructions_end = .; } - __alt_instructions_end = .; .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { *(.altinstr_replacement) } @@ -207,9 +209,11 @@ SECTIONS #ifdef CONFIG_BLK_DEV_INITRD . = ALIGN(PAGE_SIZE); - __initramfs_start = .; - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } - __initramfs_end = .; + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } #endif #ifdef CONFIG_SMP @@ -229,20 +233,21 @@ SECTIONS . = ALIGN(PAGE_SIZE); __init_end = .; - . = ALIGN(PAGE_SIZE); - __nosave_begin = .; .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */ - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - __bss_start = .; /* BSS */ .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __bss_start = .; /* BSS */ *(.bss.page_aligned) *(.bss) - } - __bss_stop = .; + __bss_stop = .; + } _end = . ; -- cgit v1.2.3 From 93dbda7cbcd70a0bd1a99f39f44a9ccde8ab9040 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Feb 2009 17:35:44 -0800 Subject: x86: add brk allocation for very, very early allocations Impact: new interface Add a brk()-like allocator which effectively extends the bss in order to allow very early code to do dynamic allocations. This is better than using statically allocated arrays for data in subsystems which may never get used. The space for brk allocations is in the bss ELF segment, so that the space is mapped properly by the code which maps the kernel, and so that bootloaders keep the space free rather than putting a ramdisk or something into it. The bss itself, delimited by __bss_stop, ends before the brk area (__brk_base to __brk_limit). The kernel text, data and bss is reserved up to __bss_stop. Any brk-allocated data is reserved separately just before the kernel pagetable is built, as that code allocates from unreserved spaces in the e820 map, potentially allocating from any unused brk memory. Ultimately any unused memory in the brk area is used in the general kernel memory pool. Initially the brk space is set to 1MB, which is probably much larger than any user needs (the largest current user is i386 head_32.S's code to build the pagetables to map the kernel, which can get fairly large with a big kernel image and no PSE support). So long as the system has sufficient memory for the bootloader to reserve the kernel+1MB brk, there are no bad effects resulting from an over-large brk. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head32.c | 2 +- arch/x86/kernel/head64.c | 2 +- arch/x86/kernel/setup.c | 39 ++++++++++++++++++++++++++++++++++----- arch/x86/kernel/vmlinux_32.lds.S | 7 +++++++ arch/x86/kernel/vmlinux_64.lds.S | 5 +++++ 5 files changed, 48 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index ac108d1fe18..29f1095b084 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -18,7 +18,7 @@ void __init i386_start_kernel(void) { reserve_trampoline_memory(); - reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); + reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index f5b27224769..70eaa852c73 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -100,7 +100,7 @@ void __init x86_64_start_reservations(char *real_mode_data) reserve_trampoline_memory(); - reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); + reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f28c56e6bf9..e6b742d2a3f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -114,6 +114,9 @@ unsigned int boot_cpu_id __read_mostly; +static __initdata unsigned long _brk_start = (unsigned long)__brk_base; +unsigned long _brk_end = (unsigned long)__brk_base; + #ifdef CONFIG_X86_64 int default_cpu_present_to_apicid(int mps_cpu) { @@ -337,6 +340,34 @@ static void __init relocate_initrd(void) } #endif +void * __init extend_brk(size_t size, size_t align) +{ + size_t mask = align - 1; + void *ret; + + BUG_ON(_brk_start == 0); + BUG_ON(align & mask); + + _brk_end = (_brk_end + mask) & ~mask; + BUG_ON((char *)(_brk_end + size) > __brk_limit); + + ret = (void *)_brk_end; + _brk_end += size; + + memset(ret, 0, size); + + return ret; +} + +static void __init reserve_brk(void) +{ + if (_brk_end > _brk_start) + reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK"); + + /* Mark brk area as locked down and no longer taking any new allocations */ + _brk_start = 0; +} + static void __init reserve_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; @@ -717,11 +748,7 @@ void __init setup_arch(char **cmdline_p) init_mm.start_code = (unsigned long) _text; init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; -#ifdef CONFIG_X86_32 - init_mm.brk = init_pg_tables_end + PAGE_OFFSET; -#else - init_mm.brk = (unsigned long) &_end; -#endif + init_mm.brk = _brk_end; code_resource.start = virt_to_phys(_text); code_resource.end = virt_to_phys(_etext)-1; @@ -842,6 +869,8 @@ void __init setup_arch(char **cmdline_p) setup_bios_corruption_check(); #endif + reserve_brk(); + /* max_pfn_mapped is updated here */ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< Date: Sat, 14 Mar 2009 17:19:51 -0700 Subject: x86: move brk initialization out of #ifdef CONFIG_BLK_DEV_INITRD Impact: build fix The brk initialization functions were incorrectly located inside an #ifdef CONFIG_VLK_DEV_INITRD block, causing the obvious build failure in minimal configurations. Signed-off-by: H. Peter Anvin Cc: Jeremy Fitzhardinge --- arch/x86/kernel/setup.c | 57 +++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e6b742d2a3f..b8329029c15 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -272,6 +272,35 @@ static inline void copy_edd(void) } #endif +void * __init extend_brk(size_t size, size_t align) +{ + size_t mask = align - 1; + void *ret; + + BUG_ON(_brk_start == 0); + BUG_ON(align & mask); + + _brk_end = (_brk_end + mask) & ~mask; + BUG_ON((char *)(_brk_end + size) > __brk_limit); + + ret = (void *)_brk_end; + _brk_end += size; + + memset(ret, 0, size); + + return ret; +} + +static void __init reserve_brk(void) +{ + if (_brk_end > _brk_start) + reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK"); + + /* Mark brk area as locked down and no longer taking any + new allocations */ + _brk_start = 0; +} + #ifdef CONFIG_BLK_DEV_INITRD #ifdef CONFIG_X86_32 @@ -340,34 +369,6 @@ static void __init relocate_initrd(void) } #endif -void * __init extend_brk(size_t size, size_t align) -{ - size_t mask = align - 1; - void *ret; - - BUG_ON(_brk_start == 0); - BUG_ON(align & mask); - - _brk_end = (_brk_end + mask) & ~mask; - BUG_ON((char *)(_brk_end + size) > __brk_limit); - - ret = (void *)_brk_end; - _brk_end += size; - - memset(ret, 0, size); - - return ret; -} - -static void __init reserve_brk(void) -{ - if (_brk_end > _brk_start) - reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK"); - - /* Mark brk area as locked down and no longer taking any new allocations */ - _brk_start = 0; -} - static void __init reserve_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; -- cgit v1.2.3 From ccf3fe02e35f4abca2589f99022cc25084bbd8ae Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Feb 2009 13:27:38 -0800 Subject: x86-32: use brk segment for allocating initial kernel pagetable Impact: use new interface instead of previous ad hoc implementation Rather than having special purpose init_pg_table_start/end variables to delimit the kernel pagetable built by head_32.S, just use the brk mechanism to extend the bss for the new pagetable. This patch removes init_pg_table_start/end and pg0, defines __brk_base (which is page-aligned and immediately follows _end), initializes the brk region to start there, and uses it for the 32-bit pagetable. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head32.c | 3 --- arch/x86/kernel/head_32.S | 14 +++++++------- arch/x86/kernel/setup.c | 6 ------ arch/x86/kernel/vmlinux_32.lds.S | 4 ---- 4 files changed, 7 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 29f1095b084..3f8579f8d42 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,9 +29,6 @@ void __init i386_start_kernel(void) reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif - reserve_early(init_pg_tables_start, init_pg_tables_end, - "INIT_PG_TABLE"); - reserve_ebda_region(); /* diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index c32ca19d591..db6652710e9 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -167,7 +167,7 @@ num_subarch_entries = (. - subarch_entries) / 4 /* * Initialize page tables. This creates a PDE and a set of page * tables, which are located immediately beyond _end. The variable - * init_pg_tables_end is set up to point to the first "safe" location. + * _brk_end is set up to point to the first "safe" location. * Mappings are created both at virtual address 0 (identity mapping) * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END. * @@ -190,8 +190,7 @@ default_entry: xorl %ebx,%ebx /* %ebx is kept at zero */ - movl $pa(pg0), %edi - movl %edi, pa(init_pg_tables_start) + movl $pa(__brk_base), %edi movl $pa(swapper_pg_pmd), %edx movl $PTE_IDENT_ATTR, %eax 10: @@ -216,7 +215,8 @@ default_entry: cmpl %ebp,%eax jb 10b 1: - movl %edi,pa(init_pg_tables_end) + addl $__PAGE_OFFSET, %edi + movl %edi, pa(_brk_end) shrl $12, %eax movl %eax, pa(max_pfn_mapped) @@ -227,8 +227,7 @@ default_entry: page_pde_offset = (__PAGE_OFFSET >> 20); - movl $pa(pg0), %edi - movl %edi, pa(init_pg_tables_start) + movl $pa(__brk_base), %edi movl $pa(swapper_pg_dir), %edx movl $PTE_IDENT_ATTR, %eax 10: @@ -249,7 +248,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20); leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp cmpl %ebp,%eax jb 10b - movl %edi,pa(init_pg_tables_end) + addl $__PAGE_OFFSET, %edi + movl %edi, pa(_brk_end) shrl $12, %eax movl %eax, pa(max_pfn_mapped) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b8329029c15..3ac2aa7b9ea 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -161,12 +161,6 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 -/* This value is set up by the early boot code to point to the value - immediately after the boot time page tables. It contains a *physical* - address, and must not be in the .bss segment! */ -unsigned long init_pg_tables_start __initdata = ~0UL; -unsigned long init_pg_tables_end __initdata = ~0UL; - static struct resource video_ram_resource = { .name = "Video RAM area", .start = 0xa0000, diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 27e44aa2158..1063fbe93c7 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -196,10 +196,6 @@ SECTIONS __brk_limit = . ; _end = . ; - - /* This is where the kernel creates the early boot page tables */ - . = ALIGN(PAGE_SIZE); - pg0 = . ; } /* Sections to be discarded */ -- cgit v1.2.3 From 6de6cb442e76bbaf2e685150be8ddac0f237a59c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Feb 2009 13:35:45 -0800 Subject: x86: use brk allocation for DMI Impact: use new interface instead of previous ad hoc implementation Use extend_brk() to allocate memory for DMI rather than having an ad-hoc allocator. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3ac2aa7b9ea..e894f36335f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -215,12 +215,6 @@ unsigned long mmu_cr4_features = X86_CR4_PAE; /* Boot loader ID as an integer, for the benefit of proc_dointvec */ int bootloader_type; -/* - * Early DMI memory - */ -int dmi_alloc_index; -char dmi_alloc_data[DMI_MAX_DATA]; - /* * Setup options */ -- cgit v1.2.3 From 7543c1de84ed93c6769c9f20dced08a522af8912 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 12 Mar 2009 16:04:42 -0700 Subject: x86-32: compute initial mapping size more accurately Impact: simplification We only need to map the kernel in head_32.S, not the whole of lowmem. We use 512MB as a reasonable (but arbitrary) limit on the maximum size of the kernel image. Signed-off-by: Yinghai Lu Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index db6652710e9..3ce5456dfbe 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -54,7 +54,7 @@ * * This should be a multiple of a page. */ -LOW_PAGES = 1<<(32-PAGE_SHIFT_asm) +LOW_PAGES = (KERNEL_IMAGE_SIZE + PAGE_SIZE_asm - 1)>>PAGE_SHIFT /* * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate -- cgit v1.2.3 From 796216a57fe45c04adc35bda1f0782efec78a713 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 12 Mar 2009 16:09:49 -0700 Subject: x86: allow extend_brk users to reserve brk space Impact: new interface; remove hard-coded limit Add RESERVE_BRK(name, size) macro to reserve space in the brk area. This should be a conservative (ie, larger) estimate of how much space might possibly be required from the brk area. Any unused space will be freed, so there's no real downside on making the reservation too large (within limits). The name should be unique within a given file, and somewhat descriptive. The C definition of RESERVE_BRK() ends up being more complex than one would expect to work around a cluster of gcc infelicities: The first attempt was to simply try putting __section(.brk_reservation) on a variable. This doesn't work because it ends up making it a @progbits section, which gets actual space allocated in the vmlinux executable. The second attempt was to emit the space into a section using asm, but gcc doesn't allow arguments to be passed to file-level asm() statements, making it hard to pass in the size. The final attempt is to wrap the asm() in a function to allow it to have arguments, and put the function itself into the .discard section, which vmlinux*.lds drops entirely from the emitted vmlinux. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_32.S | 2 ++ arch/x86/kernel/setup.c | 2 ++ arch/x86/kernel/vmlinux_32.lds.S | 4 +++- arch/x86/kernel/vmlinux_64.lds.S | 4 +++- 4 files changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 3ce5456dfbe..9e89f2a14b9 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -75,6 +75,8 @@ ALLOCATOR_SLOP = 4 INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm +RESERVE_BRK(pagetables, PAGE_TABLE_SIZE * PAGE_SIZE) + /* * 32-bit kernel entrypoint; only used by the boot CPU. On entry, * %esi points to the real-mode code as a 32-bit pointer. diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e894f36335f..a0d26237d7c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -112,6 +112,8 @@ #define ARCH_SETUP #endif +RESERVE_BRK(dmi_alloc, 65536); + unsigned int boot_cpu_id __read_mostly; static __initdata unsigned long _brk_start = (unsigned long)__brk_base; diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 1063fbe93c7..a1f28b85fb3 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -192,7 +192,8 @@ SECTIONS . = ALIGN(PAGE_SIZE); __brk_base = . ; - . += 1024 * 1024 ; + . += 64 * 1024 ; /* 64k slop space */ + *(.brk_reservation) /* areas brk users have reserved */ __brk_limit = . ; _end = . ; @@ -201,6 +202,7 @@ SECTIONS /* Sections to be discarded */ /DISCARD/ : { *(.exitcall.exit) + *(.discard) } STABS_DEBUG diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index ff373423138..7996687663a 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -250,7 +250,8 @@ SECTIONS . = ALIGN(PAGE_SIZE); __brk_base = . ; - . += 1024 * 1024 ; + . += 64 * 1024; /* 64k slop space */ + *(.brk_reservation) /* areas brk users have reserved */ __brk_limit = . ; } @@ -260,6 +261,7 @@ SECTIONS /DISCARD/ : { *(.exitcall.exit) *(.eh_frame) + *(.discard) } STABS_DEBUG -- cgit v1.2.3 From 2bd2753ff46346543ab92e80df9d96366e21baa5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 9 Mar 2009 01:15:57 -0700 Subject: x86: put initial_pg_tables into .bss Impact: makes vmlinux section information more useful Don't use ram after _end blindly for pagetables. aka init pages is before _end put those pg table into .bss [Adapted to use brk segment - Jeremy] v2: keep initial page table up to 512M only. v4: put initial page tables just before _end Signed-off-by: Yinghai Lu Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_32.S | 43 +++++++++++++--------------------------- arch/x86/kernel/vmlinux_32.lds.S | 6 ++++++ 2 files changed, 20 insertions(+), 29 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 9e89f2a14b9..c79741cfb07 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -41,41 +41,28 @@ * This is how much memory *in addition to the memory covered up to * and including _end* we need mapped initially. * We need: - * - one bit for each possible page, but only in low memory, which means - * 2^32/4096/8 = 128K worst case (4G/4G split.) - * - enough space to map all low memory, which means - * (2^32/4096) / 1024 pages (worst case, non PAE) - * (2^32/4096) / 512 + 4 pages (worst case for PAE) - * - a few pages for allocator use before the kernel pagetable has - * been set up + * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE) + * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE) * * Modulo rounding, each megabyte assigned here requires a kilobyte of * memory, which is currently unreclaimed. * * This should be a multiple of a page. + * + * KERNEL_IMAGE_SIZE should be greater than pa(_end) + * and small than max_low_pfn, otherwise will waste some page table entries */ LOW_PAGES = (KERNEL_IMAGE_SIZE + PAGE_SIZE_asm - 1)>>PAGE_SHIFT -/* - * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate - * pagetables from above the 16MB DMA limit, so we'll have to set - * up pagetables 16MB more (worst-case): - */ -#ifdef CONFIG_DEBUG_PAGEALLOC -LOW_PAGES = LOW_PAGES + 0x1000000 -#endif - #if PTRS_PER_PMD > 1 PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD #else PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD) #endif -BOOTBITMAP_SIZE = LOW_PAGES / 8 ALLOCATOR_SLOP = 4 -INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm - -RESERVE_BRK(pagetables, PAGE_TABLE_SIZE * PAGE_SIZE) +INIT_MAP_SIZE = (PAGE_TABLE_SIZE + ALLOCATOR_SLOP) * PAGE_SIZE_asm +RESERVE_BRK(pagetables, INIT_MAP_SIZE) /* * 32-bit kernel entrypoint; only used by the boot CPU. On entry, @@ -168,10 +155,10 @@ num_subarch_entries = (. - subarch_entries) / 4 /* * Initialize page tables. This creates a PDE and a set of page - * tables, which are located immediately beyond _end. The variable + * tables, which are located immediately beyond __brk_base. The variable * _brk_end is set up to point to the first "safe" location. * Mappings are created both at virtual address 0 (identity mapping) - * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END. + * and PAGE_OFFSET for up to _end. * * Note that the stack is not yet set up! */ @@ -210,10 +197,9 @@ default_entry: loop 11b /* - * End condition: we must map up to and including INIT_MAP_BEYOND_END - * bytes beyond the end of our own page tables. + * End condition: we must map up to the end. */ - leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp + movl $pa(_end) + PTE_IDENT_ATTR, %ebp cmpl %ebp,%eax jb 10b 1: @@ -243,11 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20); addl $0x1000,%eax loop 11b /* - * End condition: we must map up to and including INIT_MAP_BEYOND_END - * bytes beyond the end of our own page tables; the +0x007 is - * the attribute bits + * End condition: we must map up to end */ - leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp + movl $pa(_end) + PTE_IDENT_ATTR, %ebp cmpl %ebp,%eax jb 10b addl $__PAGE_OFFSET, %edi @@ -638,6 +622,7 @@ swapper_pg_fixmap: .fill 1024,4,0 ENTRY(empty_zero_page) .fill 4096,1,0 + /* * This starts the data section. */ diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index a1f28b85fb3..98424f33e07 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -210,6 +210,12 @@ SECTIONS DWARF_DEBUG } +/* + * Build-time check on the image size: + */ +ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE") + #ifdef CONFIG_KEXEC /* Link time checks */ #include -- cgit v1.2.3 From 6d7942dc2a70a7e74c352107b150265602671588 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 14 Mar 2009 14:32:41 -0700 Subject: x86: fix 64k corruption-check Impact: fix boot crash Need to exit early if the addr is far above 64k. The crash got exposed by: 78a8b35: x86: make e820_update_range() handle small range update Signed-off-by: Yinghai Lu Cc: LKML-Reference: <49BC2279.2030101@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/check.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index b617b1164f1..fc999e6fc46 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -86,12 +86,12 @@ void __init setup_bios_corruption_check(void) if (!(addr + 1)) break; + if (addr >= corruption_check_size) + break; + if ((addr + size) > corruption_check_size) size = corruption_check_size - addr; - if (size == 0) - break; - e820_update_range(addr, size, E820_RAM, E820_RESERVED); scan_areas[num_scan_areas].addr = addr; scan_areas[num_scan_areas].size = size; -- cgit v1.2.3 From c61cf4cfe7c73c7aa62dde3ff82cd475b9c41481 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 15 Mar 2009 00:59:19 -0700 Subject: x86: print out more info in e820_update_range() Impact: help debug e820 bugs Try to print out more info, to catch wrong call parameters. Signed-off-by: Yinghai Lu LKML-Reference: <49BCB557.3030000@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 56 +++++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 0c34ff49ff4..fb638d9ce6d 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -131,6 +131,31 @@ void __init e820_add_region(u64 start, u64 size, int type) __e820_add_region(&e820, start, size, type); } +static void __init e820_print_type(u32 type) +{ + switch (type) { + case E820_RAM: + case E820_RESERVED_KERN: + printk(KERN_CONT "(usable)"); + break; + case E820_RESERVED: + printk(KERN_CONT "(reserved)"); + break; + case E820_ACPI: + printk(KERN_CONT "(ACPI data)"); + break; + case E820_NVS: + printk(KERN_CONT "(ACPI NVS)"); + break; + case E820_UNUSABLE: + printk(KERN_CONT "(unusable)"); + break; + default: + printk(KERN_CONT "type %u", type); + break; + } +} + void __init e820_print_map(char *who) { int i; @@ -140,27 +165,8 @@ void __init e820_print_map(char *who) (unsigned long long) e820.map[i].addr, (unsigned long long) (e820.map[i].addr + e820.map[i].size)); - switch (e820.map[i].type) { - case E820_RAM: - case E820_RESERVED_KERN: - printk(KERN_CONT "(usable)\n"); - break; - case E820_RESERVED: - printk(KERN_CONT "(reserved)\n"); - break; - case E820_ACPI: - printk(KERN_CONT "(ACPI data)\n"); - break; - case E820_NVS: - printk(KERN_CONT "(ACPI NVS)\n"); - break; - case E820_UNUSABLE: - printk("(unusable)\n"); - break; - default: - printk(KERN_CONT "type %u\n", e820.map[i].type); - break; - } + e820_print_type(e820.map[i].type); + printk(KERN_CONT "\n"); } } @@ -437,6 +443,14 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, size = ULLONG_MAX - start; end = start + size; + printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ", + (unsigned long long) start, + (unsigned long long) end); + e820_print_type(old_type); + printk(KERN_CONT " ==> "); + e820_print_type(new_type); + printk(KERN_CONT "\n"); + for (i = 0; i < e820x->nr_map; i++) { struct e820entry *ei = &e820x->map[i]; u64 final_start, final_end; -- cgit v1.2.3 From 514ec49a5f5146a6c0ade1a688787acf13617868 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 16 Mar 2009 17:07:33 +0900 Subject: x86, mce: remove incorrect __cpuinit for intel_init_cmci() Impact: Bug fix on UP Referring commit cc3ca22063784076bd240fda87217387a8f2ae92, Peter removed __cpuinit annotations for mce_cpu_features() and its successor functions, which caused troubles on UP configurations. However the intel_init_cmci() was introduced after that and it also has __cpuinit annotation even though it is called from mce_cpu_features(). Remove the annotation from that function too. Signed-off-by: Hidetoshi Seto Cc: H. Peter Anvin Cc: Andi Kleen Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index aaa7d973093..57df3d38347 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -270,7 +270,7 @@ void cmci_reenable(void) cmci_discover(banks, 0); } -static __cpuinit void intel_init_cmci(void) +static void intel_init_cmci(void) { int banks; -- cgit v1.2.3 From f0348c438c9ea156194d31fadde4a9e75522196f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 16 Mar 2009 16:33:59 -0700 Subject: x86: MTRR workaround for system with stange var MTRRs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: don't trim e820 according to wrong mtrr Ozan reports that his server emits strange warning. it turns out the BIOS sets the MTRRs incorrectly. Ignore those strange ranges, and don't trim e820, just emit one warning about BIOS Reported-by: Ozan Çağlayan Signed-off-by: Yinghai Lu LKML-Reference: <49BEE1E7.7020706@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cleanup.c | 11 +++++++++++ arch/x86/kernel/cpu/mtrr/main.c | 16 ++++++++-------- arch/x86/kernel/cpu/mtrr/mtrr.h | 1 + 3 files changed, 20 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 58b58bbf7eb..f4f89fbc228 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -189,6 +189,17 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, if (!size) continue; base = range_state[i].base_pfn; + if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && + (mtrr_state.enabled & 1)) { + /* Var MTRR contains UC entry below 1M? Skip it: */ + printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d " + "contains strange UC entry under 1M, check " + "with your system vendor!\n", i); + if (base + size <= (1<<(20-PAGE_SHIFT))) + continue; + size -= (1<<(20-PAGE_SHIFT)) - base; + base = 1<<(20-PAGE_SHIFT); + } subtract_range(range, base, base + size - 1); } if (extra_remove_size) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 5c2e266f41d..03cda01f57c 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -574,7 +574,7 @@ struct mtrr_value { unsigned long lsize; }; -static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES]; +static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; static int mtrr_save(struct sys_device * sysdev, pm_message_t state) { @@ -582,9 +582,9 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state) for (i = 0; i < num_var_ranges; i++) { mtrr_if->get(i, - &mtrr_state[i].lbase, - &mtrr_state[i].lsize, - &mtrr_state[i].ltype); + &mtrr_value[i].lbase, + &mtrr_value[i].lsize, + &mtrr_value[i].ltype); } return 0; } @@ -594,11 +594,11 @@ static int mtrr_restore(struct sys_device * sysdev) int i; for (i = 0; i < num_var_ranges; i++) { - if (mtrr_state[i].lsize) + if (mtrr_value[i].lsize) set_mtrr(i, - mtrr_state[i].lbase, - mtrr_state[i].lsize, - mtrr_state[i].ltype); + mtrr_value[i].lbase, + mtrr_value[i].lsize, + mtrr_value[i].ltype); } return 0; } diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 6710e93021a..77f67f7b347 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -79,6 +79,7 @@ extern struct mtrr_ops * mtrr_if; extern unsigned int num_var_ranges; extern u64 mtrr_tom2; +extern struct mtrr_state_type mtrr_state; void mtrr_state_warn(void); const char *mtrr_attrib_to_str(int x); -- cgit v1.2.3 From a6a80e1d8cf82b46a69f88e659da02749231eb36 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 17 Mar 2009 07:58:26 -0700 Subject: Fix potential fast PIT TSC calibration startup glitch During bootup, when we reprogram the PIT (programmable interval timer) to start counting down from 0xffff in order to use it for the fast TSC calibration, we should also make sure to delay a bit afterwards to allow the PIT hardware to actually start counting with the new value. That will happens at the next CLK pulse (1.193182 MHz), so the easiest way to do that is to just wait at least one microsecond after programming the new PIT counter value. We do that by just reading the counter value back once - which will take about 2us on PC hardware. Reported-and-tested-by: john stultz Signed-off-by: Linus Torvalds --- arch/x86/kernel/tsc.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 599e5816863..9e80207c96a 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -315,6 +315,15 @@ static unsigned long quick_pit_calibrate(void) outb(0xff, 0x42); outb(0xff, 0x42); + /* + * The PIT starts counting at the next edge, so we + * need to delay for a microsecond. The easiest way + * to do that is to just read back the 16-bit counter + * once from the PIT. + */ + inb(0x42); + inb(0x42); + if (pit_expect_msb(0xff)) { int i; u64 t1, t2, delta; -- cgit v1.2.3 From 9e8912e04e612b43897b4b722205408b92f423e5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 17 Mar 2009 08:13:17 -0700 Subject: Fast TSC calibration: calculate proper frequency error bounds In order for ntpd to correctly synchronize the clocks, the frequency of the system clock must not be off by more than 500 ppm (or, put another way, 1:2000), or ntpd will end up giving up on trying to synchronize properly, and ends up reseting the clock in jumps instead. The fast TSC PIT calibration sometimes failed this test - it was assuming that the PIT reads always took about one microsecond each (2us for the two reads to get a 16-bit timer), and that calibrating TSC to the PIT over 15ms should thus be sufficient to get much closer than 500ppm (max 2us error on both sides giving 4us over 15ms: a 270 ppm error value). However, that assumption does not always hold: apparently some hardware is either very much slower at reading the PIT registers, or there was other noise causing at least one machine to get 700+ ppm errors. So instead of using a fixed 15ms timing loop, this changes the fast PIT calibration to read the TSC delta over the individual PIT timer reads, and use the result to calculate the error bars on the PIT read timing properly. We then successfully calibrate the TSC only if the maximum error bars fall below 500ppm. In the process, we also relax the timing to allow up to 25ms for the calibration, although it can happen much faster depending on hardware. Reported-and-tested-by: Jesper Krogh Cc: john stultz Cc: Thomas Gleixner Acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- arch/x86/kernel/tsc.c | 101 ++++++++++++++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 45 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 9e80207c96a..d5cebb52d45 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -273,30 +273,43 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) * use the TSC value at the transitions to calculate a pretty * good value for the TSC frequencty. */ -static inline int pit_expect_msb(unsigned char val) +static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) { - int count = 0; + int count; + u64 tsc = 0; for (count = 0; count < 50000; count++) { /* Ignore LSB */ inb(0x42); if (inb(0x42) != val) break; + tsc = get_cycles(); } - return count > 50; + *deltap = get_cycles() - tsc; + *tscp = tsc; + + /* + * We require _some_ success, but the quality control + * will be based on the error terms on the TSC values. + */ + return count > 5; } /* - * How many MSB values do we want to see? We aim for a - * 15ms calibration, which assuming a 2us counter read - * error should give us roughly 150 ppm precision for - * the calibration. + * How many MSB values do we want to see? We aim for + * a maximum error rate of 500ppm (in practice the + * real error is much smaller), but refuse to spend + * more than 25ms on it. */ -#define QUICK_PIT_MS 15 -#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) +#define MAX_QUICK_PIT_MS 25 +#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) static unsigned long quick_pit_calibrate(void) { + int i; + u64 tsc, delta; + unsigned long d1, d2; + /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); @@ -324,45 +337,43 @@ static unsigned long quick_pit_calibrate(void) inb(0x42); inb(0x42); - if (pit_expect_msb(0xff)) { - int i; - u64 t1, t2, delta; - unsigned char expect = 0xfe; - - t1 = get_cycles(); - for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) { - if (!pit_expect_msb(expect)) - goto failed; + if (pit_expect_msb(0xff, &tsc, &d1)) { + for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { + if (!pit_expect_msb(0xff-i, &delta, &d2)) + break; + + /* + * Iterate until the error is less than 500 ppm + */ + delta -= tsc; + if (d1+d2 < delta >> 11) + goto success; } - t2 = get_cycles(); - - /* - * Make sure we can rely on the second TSC timestamp: - */ - if (!pit_expect_msb(expect)) - goto failed; - - /* - * Ok, if we get here, then we've seen the - * MSB of the PIT decrement QUICK_PIT_ITERATIONS - * times, and each MSB had many hits, so we never - * had any sudden jumps. - * - * As a result, we can depend on there not being - * any odd delays anywhere, and the TSC reads are - * reliable. - * - * kHz = ticks / time-in-seconds / 1000; - * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000 - * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000) - */ - delta = (t2 - t1)*PIT_TICK_RATE; - do_div(delta, QUICK_PIT_ITERATIONS*256*1000); - printk("Fast TSC calibration using PIT\n"); - return delta; } -failed: + printk("Fast TSC calibration failed\n"); return 0; + +success: + /* + * Ok, if we get here, then we've seen the + * MSB of the PIT decrement 'i' times, and the + * error has shrunk to less than 500 ppm. + * + * As a result, we can depend on there not being + * any odd delays anywhere, and the TSC reads are + * reliable (within the error). We also adjust the + * delta to the middle of the error bars, just + * because it looks nicer. + * + * kHz = ticks / time-in-seconds / 1000; + * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000 + * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000) + */ + delta += (long)(d2 - d1)/2; + delta *= PIT_TICK_RATE; + do_div(delta, i*256*1000); + printk("Fast TSC calibration using PIT\n"); + return delta; } /** -- cgit v1.2.3 From 30390880debce4a68fd23e87a787f27609e4bf4a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 16 Mar 2009 18:57:22 -0400 Subject: prevent boosting kprobes on exception address Don't boost at the addresses which are listed on exception tables, because major page fault will occur on those addresses. In that case, kprobes can not ensure that when instruction buffer can be freed since some processes will sleep on the buffer. kprobes-ia64 already has same check. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Signed-off-by: Linus Torvalds --- arch/x86/kernel/kprobes.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index e948b28a5a9..4558dd3918c 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -193,6 +193,9 @@ static int __kprobes can_boost(kprobe_opcode_t *opcodes) kprobe_opcode_t opcode; kprobe_opcode_t *orig_opcodes = opcodes; + if (search_exception_tables(opcodes)) + return 0; /* Page fault may occur on this address. */ + retry: if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) return 0; -- cgit v1.2.3 From c090f532db3ab5b7be503f8ac84b0d33a18646c6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Mar 2009 12:07:54 -0700 Subject: x86-32: make sure we map enough to fit linear map pagetables Impact: crash fix head_32.S needs to map the kernel itself, and enough space so that mm/init.c can allocate space from the e820 allocator for the linear map of low memory. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_32.S | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index c79741cfb07..d383a7c0e49 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -38,8 +38,8 @@ #define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id /* - * This is how much memory *in addition to the memory covered up to - * and including _end* we need mapped initially. + * This is how much memory in addition to the memory covered up to + * and including _end we need mapped initially. * We need: * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE) * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE) @@ -52,16 +52,25 @@ * KERNEL_IMAGE_SIZE should be greater than pa(_end) * and small than max_low_pfn, otherwise will waste some page table entries */ -LOW_PAGES = (KERNEL_IMAGE_SIZE + PAGE_SIZE_asm - 1)>>PAGE_SHIFT #if PTRS_PER_PMD > 1 -PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD +#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) #else -PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD) +#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) #endif ALLOCATOR_SLOP = 4 -INIT_MAP_SIZE = (PAGE_TABLE_SIZE + ALLOCATOR_SLOP) * PAGE_SIZE_asm +/* Enough space to fit pagetables for the low memory linear map */ +MAPPING_BEYOND_END = (PAGE_TABLE_SIZE(1 << (32 - PAGE_SHIFT)) * PAGE_SIZE) + +/* + * Worst-case size of the kernel mapping we need to make: + * the worst-case size of the kernel itself, plus the extra we need + * to map for the linear map. + */ +KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT + +INIT_MAP_SIZE = (PAGE_TABLE_SIZE(KERNEL_PAGES) + ALLOCATOR_SLOP) * PAGE_SIZE_asm RESERVE_BRK(pagetables, INIT_MAP_SIZE) /* @@ -197,9 +206,9 @@ default_entry: loop 11b /* - * End condition: we must map up to the end. + * End condition: we must map up to the end + MAPPING_BEYOND_END. */ - movl $pa(_end) + PTE_IDENT_ATTR, %ebp + movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp cmpl %ebp,%eax jb 10b 1: @@ -229,9 +238,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20); addl $0x1000,%eax loop 11b /* - * End condition: we must map up to end + * End condition: we must map up to the end + MAPPING_BEYOND_END. */ - movl $pa(_end) + PTE_IDENT_ATTR, %ebp + movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp cmpl %ebp,%eax jb 10b addl $__PAGE_OFFSET, %edi -- cgit v1.2.3 From b8a22a6273d5aed248db2695ce9bf65eb3e9fbe6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Mar 2009 12:10:07 -0700 Subject: x86-32: remove ALLOCATOR_SLOP from head_32.S Impact: cleanup ALLOCATOR_SLOP is a vestigial remain from when we used the bootmem allocator to allocate the kernel's linear memory mapping. Now we directly reserve pages from the e820 mapping, and no longer require secondary structures to keep track of allocated pages. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_32.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index d383a7c0e49..fc884b90b6d 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -58,7 +58,6 @@ #else #define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) #endif -ALLOCATOR_SLOP = 4 /* Enough space to fit pagetables for the low memory linear map */ MAPPING_BEYOND_END = (PAGE_TABLE_SIZE(1 << (32 - PAGE_SHIFT)) * PAGE_SIZE) @@ -70,7 +69,7 @@ MAPPING_BEYOND_END = (PAGE_TABLE_SIZE(1 << (32 - PAGE_SHIFT)) * PAGE_SIZE) */ KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT -INIT_MAP_SIZE = (PAGE_TABLE_SIZE(KERNEL_PAGES) + ALLOCATOR_SLOP) * PAGE_SIZE_asm +INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm RESERVE_BRK(pagetables, INIT_MAP_SIZE) /* -- cgit v1.2.3 From 60ac98213914cc615c67e84bfb964aa95a080d13 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 17 Mar 2009 11:38:23 -0700 Subject: x86-32: tighten the bound on additional memory to map Impact: Tighten bound to avoid masking errors The definition of MAPPING_BEYOND_END was excessive; this has a nasty tendency to mask bugs. We have learned over time that this kind of bug hiding can cause some very strange errors. Therefore, tighten the bound to only need to map the actual kernel area. Signed-off-by: H. Peter Anvin Cc: Jeremy Fitzhardinge Cc: Yinghai Lu --- arch/x86/kernel/head_32.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index fc884b90b6d..30683883e0c 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -60,7 +60,8 @@ #endif /* Enough space to fit pagetables for the low memory linear map */ -MAPPING_BEYOND_END = (PAGE_TABLE_SIZE(1 << (32 - PAGE_SHIFT)) * PAGE_SIZE) +MAPPING_BEYOND_END = \ + PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT /* * Worst-case size of the kernel mapping we need to make: -- cgit v1.2.3 From 704439ddf9f8ff1fc8c6d8abaac4bc4c95697e39 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 14 Mar 2009 23:20:47 -0700 Subject: x86/brk: put the brk reservations in their own section Impact: disambiguate real .bss variables from .brk storage Add a .brk section after the .bss section. This has no effect on the final vmlinux, but it more clearly distinguishes the space taken by actual .bss symbols, and the variable space reserved by .brk users. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/vmlinux_32.lds.S | 8 +++++--- arch/x86/kernel/vmlinux_64.lds.S | 4 +++- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 98424f33e07..de14973e47f 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -189,16 +189,18 @@ SECTIONS *(.bss) . = ALIGN(4); __bss_stop = .; + } + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { . = ALIGN(PAGE_SIZE); __brk_base = . ; - . += 64 * 1024 ; /* 64k slop space */ + . += 64 * 1024 ; /* 64k alignment slop space */ *(.brk_reservation) /* areas brk users have reserved */ __brk_limit = . ; - - _end = . ; } + _end = . ; + /* Sections to be discarded */ /DISCARD/ : { *(.exitcall.exit) diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 7996687663a..c8742507b03 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -247,10 +247,12 @@ SECTIONS *(.bss.page_aligned) *(.bss) __bss_stop = .; + } + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { . = ALIGN(PAGE_SIZE); __brk_base = . ; - . += 64 * 1024; /* 64k slop space */ + . += 64 * 1024 ; /* 64k alignment slop space */ *(.brk_reservation) /* areas brk users have reserved */ __brk_limit = . ; } -- cgit v1.2.3 From 0a699af8e613a670be50245366fa18cb19ac5172 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 17 Mar 2009 14:14:31 -0700 Subject: x86-32: move _end to a dummy section Impact: build fix with CONFIG_RELOCATABLE Move _end into a dummy section, so that relocs.c will know it is a relocatable symbol. Signed-off-by: H. Peter Anvin Cc: Jeremy Fitzhardinge Cc: Jeremy Fitzhardinge --- arch/x86/kernel/vmlinux_32.lds.S | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index de14973e47f..62ad500d55f 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -199,7 +199,9 @@ SECTIONS __brk_limit = . ; } - _end = . ; + .end : AT(ADDR(.end) - LOAD_OFFSET) { + _end = . ; + } /* Sections to be discarded */ /DISCARD/ : { -- cgit v1.2.3 From 9d783ba042771284fb4ee5013c3d94220755ae7f Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 16 Mar 2009 17:04:55 -0700 Subject: x86, x2apic: enable fault handling for intr-remapping Impact: interface augmentation (not yet used) Enable fault handling flow for intr-remapping aswell. Fault handling code now shared by both dma-remapping and intr-remapping. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 9 +++++++-- arch/x86/kernel/apic/probe_64.c | 9 +++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 00e6071cefc..b18a7734d68 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3294,7 +3294,12 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms } else #endif { - msg->address_hi = MSI_ADDR_BASE_HI; + if (x2apic_enabled()) + msg->address_hi = MSI_ADDR_BASE_HI | + MSI_ADDR_EXT_DEST_ID(dest); + else + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = MSI_ADDR_BASE_LO | ((apic->irq_dest_mode == 0) ? @@ -3528,7 +3533,7 @@ void arch_teardown_msi_irq(unsigned int irq) destroy_irq(irq); } -#ifdef CONFIG_DMAR +#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) #ifdef CONFIG_SMP static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 8d7748efe6a..8297c2b8ed2 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -68,6 +68,15 @@ void __init default_setup_apic_routing(void) apic = &apic_physflat; printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } + +#ifdef CONFIG_X86_X2APIC + /* + * Now that apic routing model is selected, configure the + * fault handling for intr remapping. + */ + if (intr_remapping_enabled) + enable_drhd_fault_handling(); +#endif } /* Same for both flat and physical. */ -- cgit v1.2.3 From 7c6d9f9785d156d0438401ef270322da3d5247db Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 16 Mar 2009 17:04:59 -0700 Subject: x86, x2apic: use virtual wire A mode in disable_IO_APIC() with interrupt-remapping Impact: make kexec work with x2apic disable_IO_APIC() gets called during crashdump aswell, which configures the IO-APIC/LAPIC so that legacy interrupts can be delivered for the kexec'd kernel. In the presence of interrupt-remapping, we need to change the interrupt-remapping configuration aswell as modifying IO-APIC for virtual wire B mode. To keep things simple during the crash, use virtual wire A mode (for which we don't need to touch io-apic and interrupt-remapping tables). Signed-off-by: Suresh Siddha Cc: Eric W. Biederman Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b18a7734d68..4d975d0e358 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2040,8 +2040,13 @@ void disable_IO_APIC(void) * If the i8259 is routed through an IOAPIC * Put that IOAPIC in virtual wire mode * so legacy interrupts can be delivered. + * + * With interrupt-remapping, for now we will use virtual wire A mode, + * as virtual wire B is little complex (need to configure both + * IOAPIC RTE aswell as interrupt-remapping table entry). + * As this gets called during crash dump, keep this simple for now. */ - if (ioapic_i8259.pin != -1) { + if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { struct IO_APIC_route_entry entry; memset(&entry, 0, sizeof(entry)); @@ -2061,7 +2066,10 @@ void disable_IO_APIC(void) ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } - disconnect_bsp_APIC(ioapic_i8259.pin != -1); + /* + * Use virtual wire A mode when interrupt remapping is enabled. + */ + disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1); } #ifdef CONFIG_X86_32 -- cgit v1.2.3 From cf6567fe40c55e9cffca7355cd34e50fb2871e4e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 16 Mar 2009 17:05:00 -0700 Subject: x86, x2apic: fix clear_local_APIC() in the presence of x2apic Impact: cleanup, paranoia We were not clearing the local APIC in clear_local_APIC() in the presence of x2apic. Fix it. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/apic.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 30909a258d0..699f8cf76bb 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -809,7 +809,7 @@ void clear_local_APIC(void) u32 v; /* APIC hasn't been mapped yet */ - if (!apic_phys) + if (!x2apic && !apic_phys) return; maxlvt = lapic_get_maxlvt(); @@ -1523,12 +1523,10 @@ void __init early_init_lapic_mapping(void) */ void __init init_apic_mappings(void) { -#ifdef CONFIG_X86_X2APIC if (x2apic) { boot_cpu_physical_apicid = read_apic_id(); return; } -#endif /* * If no local APIC can be found then set up a fake all @@ -1972,12 +1970,9 @@ static int lapic_resume(struct sys_device *dev) local_irq_save(flags); -#ifdef CONFIG_X86_X2APIC if (x2apic) enable_x2apic(); - else -#endif - { + else { /* * Make sure the APICBASE points to the right address * -- cgit v1.2.3 From 0280f7c416c652a2fd95d166f52b199ae61122c0 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 16 Mar 2009 17:05:01 -0700 Subject: x86, x2apic: cleanup the IO-APIC level migration with interrupt-remapping Impact: simplification In the current code, for level triggered migration, we need to modify the io-apic RTE with the update vector information, along with modifying interrupt remapping table entry(IRTE) with vector and destination. This is to ensure that remote IRR bit inthe IOAPIC RTE gets cleared when the cpu does EOI. With this patch, for level triggered, we eliminate the io-apic RTE modification (with the updated vector information), by using a virtual vector (io-apic pin number). Real vector that is used for interrupting cpu will be coming from the interrupt-remapping table entry. Trigger mode in the IRTE will always be edge, and the actual level or edge trigger will be setup in the IO-APIC RTE. So a level triggered interrupt will appear as an edge to the local apic cpu but still as level to the IO-APIC. With this change, level irq migration can be done by simply modifying the interrupt-remapping table entry with out changing the io-apic RTE. And as the interrupt appears as edge at the cpu, in addition to do the local apic EOI, we need to do IO-APIC directed EOI to clear the remote IRR bit in the IO-APIC RTE. This simplies the irq migration in the presence of interrupt-remapping. Idea-by: Rajesh Sankaran Signed-off-by: Suresh Siddha Cc: Eric W. Biederman Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 156 +++++++++++++++++------------------------ 1 file changed, 65 insertions(+), 91 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4d975d0e358..e074eac5bd3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -389,6 +389,8 @@ struct io_apic { unsigned int index; unsigned int unused[3]; unsigned int data; + unsigned int unused2[11]; + unsigned int eoi; }; static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) @@ -397,6 +399,12 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) + (mp_ioapics[idx].apicaddr & ~PAGE_MASK); } +static inline void io_apic_eoi(unsigned int apic, unsigned int vector) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(vector, &io_apic->eoi); +} + static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) { struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -1478,7 +1486,7 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t int setup_ioapic_entry(int apic_id, int irq, struct IO_APIC_route_entry *entry, unsigned int destination, int trigger, - int polarity, int vector) + int polarity, int vector, int pin) { /* * add it to the IO-APIC irq-routing table: @@ -1504,7 +1512,14 @@ int setup_ioapic_entry(int apic_id, int irq, irte.present = 1; irte.dst_mode = apic->irq_dest_mode; - irte.trigger_mode = trigger; + /* + * Trigger mode in the IRTE will always be edge, and the + * actual level or edge trigger will be setup in the IO-APIC + * RTE. This will help simplify level triggered irq migration. + * For more details, see the comments above explainig IO-APIC + * irq migration in the presence of interrupt-remapping. + */ + irte.trigger_mode = 0; irte.dlvry_mode = apic->irq_delivery_mode; irte.vector = vector; irte.dest_id = IRTE_DEST(destination); @@ -1515,18 +1530,23 @@ int setup_ioapic_entry(int apic_id, int irq, ir_entry->zero = 0; ir_entry->format = 1; ir_entry->index = (index & 0x7fff); + /* + * IO-APIC RTE will be configured with virtual vector. + * irq handler will do the explicit EOI to the io-apic. + */ + ir_entry->vector = pin; } else #endif { entry->delivery_mode = apic->irq_delivery_mode; entry->dest_mode = apic->irq_dest_mode; entry->dest = destination; + entry->vector = vector; } entry->mask = 0; /* enable IRQ */ entry->trigger = trigger; entry->polarity = polarity; - entry->vector = vector; /* Mask level triggered irqs. * Use IRQ_DELAYED_DISABLE for edge triggered irqs. @@ -1561,7 +1581,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry, - dest, trigger, polarity, cfg->vector)) { + dest, trigger, polarity, cfg->vector, pin)) { printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", mp_ioapics[apic_id].apicid, pin); __clear_irq_vector(irq, cfg); @@ -2311,37 +2331,24 @@ static int ioapic_retrigger_irq(unsigned int irq) #ifdef CONFIG_SMP #ifdef CONFIG_INTR_REMAP -static void ir_irq_migration(struct work_struct *work); - -static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); /* * Migrate the IO-APIC irq in the presence of intr-remapping. * - * For edge triggered, irq migration is a simple atomic update(of vector - * and cpu destination) of IRTE and flush the hardware cache. - * - * For level triggered, we need to modify the io-apic RTE aswell with the update - * vector information, along with modifying IRTE with vector and destination. - * So irq migration for level triggered is little bit more complex compared to - * edge triggered migration. But the good news is, we use the same algorithm - * for level triggered migration as we have today, only difference being, - * we now initiate the irq migration from process context instead of the - * interrupt context. + * For both level and edge triggered, irq migration is a simple atomic + * update(of vector and cpu destination) of IRTE and flush the hardware cache. * - * In future, when we do a directed EOI (combined with cpu EOI broadcast - * suppression) to the IO-APIC, level triggered irq migration will also be - * as simple as edge triggered migration and we can do the irq migration - * with a simple atomic update to IO-APIC RTE. + * For level triggered, we eliminate the io-apic RTE modification (with the + * updated vector information), by using a virtual vector (io-apic pin number). + * Real vector that is used for interrupting cpu will be coming from + * the interrupt-remapping table entry. */ static void migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; struct irte irte; - int modify_ioapic_rte; unsigned int dest; - unsigned long flags; unsigned int irq; if (!cpumask_intersects(mask, cpu_online_mask)) @@ -2359,13 +2366,6 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); - modify_ioapic_rte = desc->status & IRQ_LEVEL; - if (modify_ioapic_rte) { - spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); - } - irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@ -2380,73 +2380,12 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) cpumask_copy(desc->affinity, mask); } -static int migrate_irq_remapped_level_desc(struct irq_desc *desc) -{ - int ret = -1; - struct irq_cfg *cfg = desc->chip_data; - - mask_IO_APIC_irq_desc(desc); - - if (io_apic_level_ack_pending(cfg)) { - /* - * Interrupt in progress. Migrating irq now will change the - * vector information in the IO-APIC RTE and that will confuse - * the EOI broadcast performed by cpu. - * So, delay the irq migration to the next instance. - */ - schedule_delayed_work(&ir_migration_work, 1); - goto unmask; - } - - /* everthing is clear. we have right of way */ - migrate_ioapic_irq_desc(desc, desc->pending_mask); - - ret = 0; - desc->status &= ~IRQ_MOVE_PENDING; - cpumask_clear(desc->pending_mask); - -unmask: - unmask_IO_APIC_irq_desc(desc); - - return ret; -} - -static void ir_irq_migration(struct work_struct *work) -{ - unsigned int irq; - struct irq_desc *desc; - - for_each_irq_desc(irq, desc) { - if (desc->status & IRQ_MOVE_PENDING) { - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); - if (!desc->chip->set_affinity || - !(desc->status & IRQ_MOVE_PENDING)) { - desc->status &= ~IRQ_MOVE_PENDING; - spin_unlock_irqrestore(&desc->lock, flags); - continue; - } - - desc->chip->set_affinity(irq, desc->pending_mask); - spin_unlock_irqrestore(&desc->lock, flags); - } - } -} - /* * Migrates the IRQ destination in the process context. */ static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { - if (desc->status & IRQ_LEVEL) { - desc->status |= IRQ_MOVE_PENDING; - cpumask_copy(desc->pending_mask, mask); - migrate_irq_remapped_level_desc(desc); - return; - } - migrate_ioapic_irq_desc(desc, mask); } static void set_ir_ioapic_affinity_irq(unsigned int irq, @@ -2537,9 +2476,44 @@ static inline void irq_complete_move(struct irq_desc **descp) {} #endif #ifdef CONFIG_INTR_REMAP +static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + + entry = cfg->irq_2_pin; + for (;;) { + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; + io_apic_eoi(apic, pin); + entry = entry->next; + } +} + +static void +eoi_ioapic_irq(struct irq_desc *desc) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_irq(irq, cfg); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void ack_x2apic_level(unsigned int irq) { + struct irq_desc *desc = irq_to_desc(irq); ack_x2APIC_irq(); + eoi_ioapic_irq(desc); } static void ack_x2apic_edge(unsigned int irq) -- cgit v1.2.3 From 29b61be65a33c95564fa82e7e8d60d97adb68ea8 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 16 Mar 2009 17:05:02 -0700 Subject: x86, x2apic: cleanup ifdef CONFIG_INTR_REMAP in io_apic code Impact: cleanup Clean up #ifdefs and replace them with helper functions. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 44 ++++++++++------------------------------- arch/x86/kernel/apic/probe_64.c | 2 -- 2 files changed, 10 insertions(+), 36 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e074eac5bd3..cf27795c641 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -554,16 +554,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq apic = entry->apic; pin = entry->pin; -#ifdef CONFIG_INTR_REMAP /* * With interrupt-remapping, destination information comes * from interrupt-remapping table entry. */ if (!irq_remapped(irq)) io_apic_write(apic, 0x11 + pin*2, dest); -#else - io_apic_write(apic, 0x11 + pin*2, dest); -#endif reg = io_apic_read(apic, 0x10 + pin*2); reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; @@ -1419,9 +1415,8 @@ void __setup_vector_irq(int cpu) } static struct irq_chip ioapic_chip; -#ifdef CONFIG_INTR_REMAP static struct irq_chip ir_ioapic_chip; -#endif +static struct irq_chip msi_ir_chip; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 @@ -1460,7 +1455,6 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t else desc->status &= ~IRQ_LEVEL; -#ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { desc->status |= IRQ_MOVE_PCNTXT; if (trigger) @@ -1472,7 +1466,7 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t handle_edge_irq, "edge"); return; } -#endif + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, @@ -1493,7 +1487,6 @@ int setup_ioapic_entry(int apic_id, int irq, */ memset(entry,0,sizeof(*entry)); -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) { struct intel_iommu *iommu = map_ioapic_to_ir(apic_id); struct irte irte; @@ -1535,9 +1528,7 @@ int setup_ioapic_entry(int apic_id, int irq, * irq handler will do the explicit EOI to the io-apic. */ ir_entry->vector = pin; - } else -#endif - { + } else { entry->delivery_mode = apic->irq_delivery_mode; entry->dest_mode = apic->irq_dest_mode; entry->dest = destination; @@ -1662,10 +1653,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin, { struct IO_APIC_route_entry entry; -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) return; -#endif memset(&entry, 0, sizeof(entry)); @@ -2395,6 +2384,11 @@ static void set_ir_ioapic_affinity_irq(unsigned int irq, set_ir_ioapic_affinity_irq_desc(desc, mask); } +#else +static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, + const struct cpumask *mask) +{ +} #endif asmlinkage void smp_irq_move_cleanup_interrupt(void) @@ -2883,10 +2877,8 @@ static inline void __init check_timer(void) * 8259A. */ if (pin1 == -1) { -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) panic("BIOS bug: timer not connected to IO-APIC"); -#endif pin1 = pin2; apic1 = apic2; no_pin1 = 1; @@ -2922,10 +2914,8 @@ static inline void __init check_timer(void) clear_IO_APIC_pin(0, pin1); goto out; } -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) panic("timer doesn't work through Interrupt-remapped IO-APIC"); -#endif local_irq_disable(); clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) @@ -3219,9 +3209,7 @@ void destroy_irq(unsigned int irq) if (desc) desc->chip_data = cfg; -#ifdef CONFIG_INTR_REMAP free_irte(irq); -#endif spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); spin_unlock_irqrestore(&vector_lock, flags); @@ -3247,7 +3235,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); -#ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { struct irte irte; int ir_index; @@ -3273,9 +3260,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms MSI_ADDR_IR_SHV | MSI_ADDR_IR_INDEX1(ir_index) | MSI_ADDR_IR_INDEX2(ir_index); - } else -#endif - { + } else { if (x2apic_enabled()) msg->address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); @@ -3392,6 +3377,7 @@ static struct irq_chip msi_ir_chip = { #endif .retrigger = ioapic_retrigger_irq, }; +#endif /* * Map the PCI dev to the corresponding remapping hardware unit @@ -3419,7 +3405,6 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) } return index; } -#endif static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { @@ -3433,7 +3418,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) set_irq_msi(irq, msidesc); write_msi_msg(irq, &msg); -#ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { struct irq_desc *desc = irq_to_desc(irq); /* @@ -3442,7 +3426,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) desc->status |= IRQ_MOVE_PCNTXT; set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); } else -#endif set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); @@ -3456,11 +3439,8 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int ret, sub_handle; struct msi_desc *msidesc; unsigned int irq_want; - -#ifdef CONFIG_INTR_REMAP struct intel_iommu *iommu = 0; int index = 0; -#endif irq_want = nr_irqs_gsi; sub_handle = 0; @@ -3469,7 +3449,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (irq == 0) return -1; irq_want = irq + 1; -#ifdef CONFIG_INTR_REMAP if (!intr_remapping_enabled) goto no_ir; @@ -3497,7 +3476,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) set_irte_irq(irq, iommu, index, sub_handle); } no_ir: -#endif ret = setup_msi_irq(dev, msidesc, irq); if (ret < 0) goto error; @@ -4032,11 +4010,9 @@ void __init setup_ioapic_dest(void) else mask = apic->target_cpus(); -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) set_ir_ioapic_affinity_irq_desc(desc, mask); else -#endif set_ioapic_affinity_irq_desc(desc, mask); } diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 8297c2b8ed2..1783652bb0e 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -69,14 +69,12 @@ void __init default_setup_apic_routing(void) printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } -#ifdef CONFIG_X86_X2APIC /* * Now that apic routing model is selected, configure the * fault handling for intr remapping. */ if (intr_remapping_enabled) enable_drhd_fault_handling(); -#endif } /* Same for both flat and physical. */ -- cgit v1.2.3 From 05c3dc2c4b60387769cbe73174347de4cf85f0c9 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 16 Mar 2009 17:05:03 -0700 Subject: x86, ioapic: Fix non atomic allocation with interrupts disabled Impact: fix possible race save_mask_IO_APIC_setup() was using non atomic memory allocation while getting called with interrupts disabled. Fix this by splitting this into two different function. Allocation part save_IO_APIC_setup() now happens before disabling interrupts. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/apic.c | 11 ++++++----- arch/x86/kernel/apic/io_apic.c | 34 +++++++++++++++++++++++----------- 2 files changed, 29 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 699f8cf76bb..85eb8e10081 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1334,15 +1334,16 @@ void __init enable_IR_x2apic(void) return; } - local_irq_save(flags); - mask_8259A(); - - ret = save_mask_IO_APIC_setup(); + ret = save_IO_APIC_setup(); if (ret) { pr_info("Saving IO-APIC state failed: %d\n", ret); goto end; } + local_irq_save(flags); + mask_IO_APIC_setup(); + mask_8259A(); + ret = enable_intr_remapping(1); if (ret && x2apic_preenabled) { @@ -1367,10 +1368,10 @@ end_restore: else reinit_intr_remapped_IO_APIC(x2apic_preenabled); -end: unmask_8259A(); local_irq_restore(flags); +end: if (!ret) { if (!x2apic_preenabled) pr_info("Enabled x2apic and interrupt-remapping\n"); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index cf27795c641..ff1759a1128 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -853,9 +853,9 @@ __setup("pirq=", ioapic_pirq_setup); static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; /* - * Saves and masks all the unmasked IO-APIC RTE's + * Saves all the IO-APIC RTE's */ -int save_mask_IO_APIC_setup(void) +int save_IO_APIC_setup(void) { union IO_APIC_reg_01 reg_01; unsigned long flags; @@ -880,16 +880,9 @@ int save_mask_IO_APIC_setup(void) } for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - struct IO_APIC_route_entry entry; - - entry = early_ioapic_entries[apic][pin] = + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + early_ioapic_entries[apic][pin] = ioapic_read_entry(apic, pin); - if (!entry.mask) { - entry.mask = 1; - ioapic_write_entry(apic, pin, entry); - } - } return 0; @@ -902,6 +895,25 @@ nomem: return -ENOMEM; } +void mask_IO_APIC_setup(void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + if (!early_ioapic_entries[apic]) + break; + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + struct IO_APIC_route_entry entry; + + entry = early_ioapic_entries[apic][pin]; + if (!entry.mask) { + entry.mask = 1; + ioapic_write_entry(apic, pin, entry); + } + } + } +} + void restore_IO_APIC_setup(void) { int apic, pin; -- cgit v1.2.3 From 68a8ca593fac82e336a792226272455901fa83df Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 16 Mar 2009 17:05:04 -0700 Subject: x86: fix broken irq migration logic while cleaning up multiple vectors Impact: fix spurious IRQs During irq migration, we send a low priority interrupt to the previous irq destination. This happens in non interrupt-remapping case after interrupt starts arriving at new destination and in interrupt-remapping case after modifying and flushing the interrupt-remapping table entry caches. This low priority irq cleanup handler can cleanup multiple vectors, as multiple irq's can be migrated at almost the same time. While there will be multiple invocations of irq cleanup handler (one cleanup IPI for each irq migration), first invocation of the cleanup handler can potentially cleanup more than one vector (as the first invocation can see the requests for more than vector cleanup). When we cleanup multiple vectors during the first invocation of the smp_irq_move_cleanup_interrupt(), other vectors that are to be cleanedup can still be pending in the local cpu's IRR (as smp_irq_move_cleanup_interrupt() runs with interrupts disabled). When we are ready to unhook a vector corresponding to an irq, check if that vector is registered in the local cpu's IRR. If so skip that cleanup and do a self IPI with the cleanup vector, so that we give a chance to service the pending vector interrupt and then cleanup that vector allocation once we execute the lowest priority handler. This fixes spurious interrupts seen when migrating multiple vectors at the same time. [ This is apparently possible even on conventional xapic, although to the best of our knowledge it has never been seen. The stable maintainers may wish to consider this one for -stable. ] Signed-off-by: Suresh Siddha Cc: Eric W. Biederman Signed-off-by: H. Peter Anvin Cc: stable@kernel.org --- arch/x86/kernel/apic/io_apic.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ff1759a1128..42cdc78427a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2414,6 +2414,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { unsigned int irq; + unsigned int irr; struct irq_desc *desc; struct irq_cfg *cfg; irq = __get_cpu_var(vector_irq)[vector]; @@ -2433,6 +2434,18 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + /* + * Check if the vector that needs to be cleanedup is + * registered at the cpu's IRR. If so, then this is not + * the best time to clean it up. Lets clean it up in the + * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR + * to myself. + */ + if (irr & (1 << (vector % 32))) { + apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); + goto unlock; + } __get_cpu_var(vector_irq)[vector] = -1; cfg->move_cleanup_count--; unlock: -- cgit v1.2.3 From a6b6a14e0c60561f2902b078bd28d0e61defad70 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 18 Mar 2009 10:40:25 +1030 Subject: x86: use smp_call_function_single() in arch/x86/kernel/cpu/mcheck/mce_amd_64.c Attempting to rid us of the problematic work_on_cpu(). Just use smp_call_function_single() here. Signed-off-by: Andrew Morton Signed-off-by: Rusty Russell LKML-Reference: <20090318042217.EF3F1DDF39@ozlabs.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 40 ++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index c5a32f92d07..7d01be86887 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -92,7 +92,8 @@ struct thresh_restart { }; /* must be called with correct cpu affinity */ -static long threshold_restart_bank(void *_tr) +/* Called via smp_call_function_single() */ +static void threshold_restart_bank(void *_tr) { struct thresh_restart *tr = _tr; u32 mci_misc_hi, mci_misc_lo; @@ -119,7 +120,6 @@ static long threshold_restart_bank(void *_tr) mci_misc_hi |= MASK_COUNT_EN_HI; wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); - return 0; } /* cpu init entry point, called from mce.c with preempt off */ @@ -279,7 +279,7 @@ static ssize_t store_interrupt_enable(struct threshold_block *b, tr.b = b; tr.reset = 0; tr.old_limit = 0; - work_on_cpu(b->cpu, threshold_restart_bank, &tr); + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); return end - buf; } @@ -301,23 +301,32 @@ static ssize_t store_threshold_limit(struct threshold_block *b, tr.b = b; tr.reset = 0; - work_on_cpu(b->cpu, threshold_restart_bank, &tr); + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); return end - buf; } -static long local_error_count(void *_b) +struct threshold_block_cross_cpu { + struct threshold_block *tb; + long retval; +}; + +static void local_error_count_handler(void *_tbcc) { - struct threshold_block *b = _b; + struct threshold_block_cross_cpu *tbcc = _tbcc; + struct threshold_block *b = tbcc->tb; u32 low, high; rdmsr(b->address, low, high); - return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); + tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); } static ssize_t show_error_count(struct threshold_block *b, char *buf) { - return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b)); + struct threshold_block_cross_cpu tbcc = { .tb = b, }; + + smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1); + return sprintf(buf, "%lx\n", tbcc.retval); } static ssize_t store_error_count(struct threshold_block *b, @@ -325,7 +334,7 @@ static ssize_t store_error_count(struct threshold_block *b, { struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; - work_on_cpu(b->cpu, threshold_restart_bank, &tr); + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); return 1; } @@ -394,7 +403,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) return 0; - if (rdmsr_safe(address, &low, &high)) + if (rdmsr_safe_on_cpu(cpu, address, &low, &high)) return 0; if (!(high & MASK_VALID_HI)) { @@ -458,12 +467,11 @@ out_free: return err; } -static __cpuinit long local_allocate_threshold_blocks(void *_bank) +static __cpuinit long +local_allocate_threshold_blocks(int cpu, unsigned int bank) { - unsigned int *bank = _bank; - - return allocate_threshold_blocks(smp_processor_id(), *bank, 0, - MSR_IA32_MC0_MISC + *bank * 4); + return allocate_threshold_blocks(cpu, bank, 0, + MSR_IA32_MC0_MISC + bank * 4); } /* symlinks sibling shared banks to first core. first core owns dir/files. */ @@ -526,7 +534,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) per_cpu(threshold_banks, cpu)[bank] = b; - err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank); + err = local_allocate_threshold_blocks(cpu, bank); if (err) goto out_free; -- cgit v1.2.3 From ce4e240c279a31096f74afa6584a62d64a1ba8c8 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 17 Mar 2009 10:16:54 -0800 Subject: x86: add x2apic_wrmsr_fence() to x2apic flush tlb paths Impact: optimize APIC IPI related barriers Uncached MMIO accesses for xapic are inherently serializing and hence we don't need explicit barriers for xapic IPI paths. x2apic MSR writes/reads don't have serializing semantics and hence need a serializing instruction or mfence, to make all the previous memory stores globally visisble before the x2apic msr write for IPI. Add x2apic_wrmsr_fence() in flush tlb path to x2apic specific paths. Signed-off-by: Suresh Siddha Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Jens Axboe Cc: Linus Torvalds Cc: "Paul E. McKenney" Cc: Rusty Russell Cc: Steven Rostedt Cc: "steiner@sgi.com" Cc: Nick Piggin LKML-Reference: <1237313814.27006.203.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_cluster.c | 6 ++++++ arch/x86/kernel/apic/x2apic_phys.c | 6 ++++++ 2 files changed, 12 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 8fb87b6dd63..4a903e2f0d1 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -57,6 +57,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) unsigned long query_cpu; unsigned long flags; + x2apic_wrmsr_fence(); + local_irq_save(flags); for_each_cpu(query_cpu, mask) { __x2apic_send_IPI_dest( @@ -73,6 +75,8 @@ static void unsigned long query_cpu; unsigned long flags; + x2apic_wrmsr_fence(); + local_irq_save(flags); for_each_cpu(query_cpu, mask) { if (query_cpu == this_cpu) @@ -90,6 +94,8 @@ static void x2apic_send_IPI_allbutself(int vector) unsigned long query_cpu; unsigned long flags; + x2apic_wrmsr_fence(); + local_irq_save(flags); for_each_online_cpu(query_cpu) { if (query_cpu == this_cpu) diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 23625b9f98b..a284359627e 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -58,6 +58,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) unsigned long query_cpu; unsigned long flags; + x2apic_wrmsr_fence(); + local_irq_save(flags); for_each_cpu(query_cpu, mask) { __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), @@ -73,6 +75,8 @@ static void unsigned long query_cpu; unsigned long flags; + x2apic_wrmsr_fence(); + local_irq_save(flags); for_each_cpu(query_cpu, mask) { if (query_cpu != this_cpu) @@ -89,6 +93,8 @@ static void x2apic_send_IPI_allbutself(int vector) unsigned long query_cpu; unsigned long flags; + x2apic_wrmsr_fence(); + local_irq_save(flags); for_each_online_cpu(query_cpu) { if (query_cpu == this_cpu) -- cgit v1.2.3 From 2c74d66624ddbda8101d54d1e184cf9229b378bc Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 18 Mar 2009 08:22:30 +1030 Subject: x86, uv: fix cpumask iterator in uv_bau_init() Impact: fix boot crash on UV systems Commit 76ba0ecda0de9accea9a91cb6dbde46782110e1c "cpumask: use cpumask_var_t in uv_flush_tlb_others" used cur_cpu as an iterator; it was supposed to be zero for the code below it. Reported-by: Cliff Wickman Original-From: Cliff Wickman Signed-off-by: Rusty Russell Acked-by: Mike Travis Cc: steiner@sgi.com Cc: LKML-Reference: <200903180822.31196.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index d038b9c45cf..79c07324728 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -750,7 +750,7 @@ static int __init uv_bau_init(void) int node; int nblades; int last_blade; - int cur_cpu = 0; + int cur_cpu; if (!is_uv_system()) return 0; @@ -760,6 +760,7 @@ static int __init uv_bau_init(void) uv_mmask = (1UL << uv_hub_info->n_val) - 1; nblades = 0; last_blade = -1; + cur_cpu = 0; for_each_online_node(node) { blade = uv_node_to_blade_id(node); if (blade == last_blade) -- cgit v1.2.3 From 4e16c888754468a58d4829c2eba2c6aa3a065e2b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 18 Mar 2009 17:36:55 +0530 Subject: x86: cpu/mttr/cleanup.c fix compilation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit arch/x86/kernel/cpu/mtrr/cleanup.c:197: warning: format ‘%d’ expects type ‘int’, but argument 2 has type ‘long unsigned int’ Signed-off-by: Jaswinder Singh Rajput Cc: Yinghai Lu LKML-Reference: <1237378015.13488.1.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cleanup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index f4f89fbc228..ce0fe4b5c04 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -160,8 +160,9 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, unsigned long extra_remove_base, unsigned long extra_remove_size) { - unsigned long i, base, size; + unsigned long base, size; mtrr_type type; + int i; for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; -- cgit v1.2.3