From 1d3381ebf42de1b6f8c118732893cb5bdc37edcd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Mar 2008 16:59:12 -0700 Subject: x86: convert mtrr/generic.c to kernel-doc Convert function comment blocks to kernel-doc notation. Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mtrr/generic.c | 42 +++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 103d61a59b1..3e18db4cefe 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -176,12 +176,13 @@ static inline void k8_enable_fixed_iorrs(void) } /** - * Checks and updates an fixed-range MTRR if it differs from the value it - * should have. If K8 extentions are wanted, update the K8 SYSCFG MSR also. - * see AMD publication no. 24593, chapter 7.8.1, page 233 for more information - * \param msr MSR address of the MTTR which should be checked and updated - * \param changed pointer which indicates whether the MTRR needed to be changed - * \param msrwords pointer to the MSR values which the MSR should have + * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have + * @msr: MSR address of the MTTR which should be checked and updated + * @changed: pointer which indicates whether the MTRR needed to be changed + * @msrwords: pointer to the MSR values which the MSR should have + * + * If K8 extentions are wanted, update the K8 SYSCFG MSR also. + * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information. */ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) { @@ -199,12 +200,15 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) } } +/** + * generic_get_free_region - Get a free MTRR. + * @base: The starting (base) address of the region. + * @size: The size (in bytes) of the region. + * @replace_reg: mtrr index to be replaced; set to invalid value if none. + * + * Returns: The index of the region on success, else negative on error. + */ int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) -/* [SUMMARY] Get a free MTRR. - The starting (base) address of the region. - The size (in bytes) of the region. - [RETURNS] The index of the region on success, else -1 on error. -*/ { int i, max; mtrr_type ltype; @@ -249,8 +253,8 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, } /** - * Checks and updates the fixed-range MTRRs if they differ from the saved set - * \param frs pointer to fixed-range MTRR values, saved by get_fixed_ranges() + * set_fixed_ranges - checks & updates the fixed-range MTRRs if they differ from the saved set + * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges() */ static int set_fixed_ranges(mtrr_type * frs) { @@ -294,13 +298,13 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) static u32 deftype_lo, deftype_hi; +/** + * set_mtrr_state - Set the MTRR state for this CPU. + * + * NOTE: The CPU must already be in a safe state for MTRR changes. + * RETURNS: 0 if no changes made, else a mask indicating what was changed. + */ static unsigned long set_mtrr_state(void) -/* [SUMMARY] Set the MTRR state for this CPU. - The MTRR state information to read. - Some relevant CPU context. - [NOTE] The CPU must already be in a safe state for MTRR changes. 
- [RETURNS] 0 if no changes made, else a mask indication what was changed. -*/ { unsigned int i; unsigned long change_mask = 0; -- cgit v1.2.3 From 3c274c2909e17aa0afeded4cd4520b7357357ca0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 21 Mar 2008 10:06:32 +0100 Subject: x86: add dmi quirk for io_delay reported by mereandor@gmail.com, in: http://bugzilla.kernel.org/show_bug.cgi?id=6307 Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/io_delay.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index c706a306155..5921e5f0a64 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -76,6 +76,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "30B9") } }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion dv6000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B8") + } + }, { .callback = dmi_io_delay_0xed_port, .ident = "HP Pavilion tx1000", -- cgit v1.2.3 From 475613b9e374bf0c15340eb166a962da04aa02e8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Feb 2008 23:23:09 -0800 Subject: x86: fix memoryless node oops during boot fix oops during boot reported in this thread: http://lkml.org/lkml/2008/2/6/65 enable booting on memoryless nodes. Reported-by: Kamalesh Babulal Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 7637dc91c79..f4f7ecfb898 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -801,7 +801,7 @@ static void __cpuinit srat_detect_node(void) /* Don't do the funky fallback heuristics the AMD version employs for now. */ node = apicid_to_node[apicid]; - if (node == NUMA_NO_NODE) + if (node == NUMA_NO_NODE || !node_online(node)) node = first_node(node_online_map); numa_set_node(cpu, node); -- cgit v1.2.3 From c6e8256a7b15033bc5d7797e25c7e053040c4c7c Mon Sep 17 00:00:00 2001 From: Stephan Diestelhorst Date: Mon, 10 Mar 2008 16:05:41 +0100 Subject: x86, cpufreq: fix Speedfreq-SMI call that clobbers ECX I have found that using SMI to change the cpu's frequency on my DELL Latitude L400 clobbers the ECX register in speedstep_set_state, causing unneccessary retries because the "state" variable has changed silently (GCC assumes it is still present in ECX). play safe and avoid gcc caching any register across IO port accesses that trigger SMIs. 
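The underlying rule is general GCC inline-asm hygiene: any register that the asm statement (or the SMI it triggers) may modify has to be listed as an output operand, otherwise GCC is free to assume the register still holds its input value and keep a live variable cached there across the statement. A minimal sketch of the pattern the fix below uses follows; smi_call() and its parameters are illustrative names, not taken from the driver:

/*
 * Sketch of the dummy-output pattern applied by the fix below;
 * smi_call() and its parameters are hypothetical names.
 */
static unsigned int smi_call(unsigned int command, unsigned int function,
			     unsigned int state, unsigned int port)
{
	unsigned int result, dummy;

	__asm__ __volatile__(
		"push %%ebp\n"		/* the SMI handler may clobber ebp */
		"out %%al, (%%dx)\n"	/* port write that triggers the SMI */
		"pop %%ebp"
		/* name every register the handler may touch as an output,
		   so GCC reloads values instead of reusing stale copies */
		: "=D" (result), "=a" (dummy), "=b" (dummy),
		  "=c" (dummy), "=d" (dummy), "=S" (dummy)
		: "a" (command), "b" (function), "c" (state),
		  "d" (port), "D" (0), "S" (0)
		: "memory");

	return result;
}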
Signed-off-by: Stephan Diestelhorst Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 39 ++++++++++++++++++----------- 1 file changed, 24 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c index f2b5a621d27..8a85c93bd62 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c @@ -63,7 +63,7 @@ static struct cpufreq_frequency_table speedstep_freqs[] = { */ static int speedstep_smi_ownership (void) { - u32 command, result, magic; + u32 command, result, magic, dummy; u32 function = GET_SPEEDSTEP_OWNER; unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation"; @@ -73,8 +73,11 @@ static int speedstep_smi_ownership (void) dprintk("trying to obtain ownership with command %x at port %x\n", command, smi_port); __asm__ __volatile__( + "push %%ebp\n" "out %%al, (%%dx)\n" - : "=D" (result) + "pop %%ebp\n" + : "=D" (result), "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy), + "=S" (dummy) : "a" (command), "b" (function), "c" (0), "d" (smi_port), "D" (0), "S" (magic) : "memory" @@ -96,7 +99,7 @@ static int speedstep_smi_ownership (void) */ static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high) { - u32 command, result = 0, edi, high_mhz, low_mhz; + u32 command, result = 0, edi, high_mhz, low_mhz, dummy; u32 state=0; u32 function = GET_SPEEDSTEP_FREQS; @@ -109,10 +112,12 @@ static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high) dprintk("trying to determine frequencies with command %x at port %x\n", command, smi_port); - __asm__ __volatile__("movl $0, %%edi\n" + __asm__ __volatile__( + "push %%ebp\n" "out %%al, (%%dx)\n" - : "=a" (result), "=b" (high_mhz), "=c" (low_mhz), "=d" (state), "=D" (edi) - : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0) + "pop %%ebp" + : "=a" (result), "=b" (high_mhz), "=c" (low_mhz), "=d" (state), "=D" (edi), "=S" (dummy) + : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0), "D" (0) ); dprintk("result %x, low_freq %u, high_freq %u\n", result, low_mhz, high_mhz); @@ -135,16 +140,18 @@ static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high) static int speedstep_get_state (void) { u32 function=GET_SPEEDSTEP_STATE; - u32 result, state, edi, command; + u32 result, state, edi, command, dummy; command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); dprintk("trying to determine current setting with command %x at port %x\n", command, smi_port); - __asm__ __volatile__("movl $0, %%edi\n" + __asm__ __volatile__( + "push %%ebp\n" "out %%al, (%%dx)\n" - : "=a" (result), "=b" (state), "=D" (edi) - : "a" (command), "b" (function), "c" (0), "d" (smi_port), "S" (0) + "pop %%ebp\n" + : "=a" (result), "=b" (state), "=D" (edi), "=c" (dummy), "=d" (dummy), "=S" (dummy) + : "a" (command), "b" (function), "c" (0), "d" (smi_port), "S" (0), "D" (0) ); dprintk("state is %x, result is %x\n", state, result); @@ -160,7 +167,7 @@ static int speedstep_get_state (void) */ static void speedstep_set_state (unsigned int state) { - unsigned int result = 0, command, new_state; + unsigned int result = 0, command, new_state, dummy; unsigned long flags; unsigned int function=SET_SPEEDSTEP_STATE; unsigned int retry = 0; @@ -182,10 +189,12 @@ static void speedstep_set_state (unsigned int state) } retry++; __asm__ __volatile__( - "movl $0, %%edi\n" + "push %%ebp\n" "out %%al, (%%dx)\n" - : "=b" (new_state), "=D" (result) - : "a" (command), "b" 
(function), "c" (state), "d" (smi_port), "S" (0) + "pop %%ebp" + : "=b" (new_state), "=D" (result), "=c" (dummy), "=a" (dummy), + "=d" (dummy), "=S" (dummy) + : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0), "D" (0) ); } while ((new_state != state) && (retry <= SMI_TRIES)); @@ -195,7 +204,7 @@ static void speedstep_set_state (unsigned int state) if (new_state == state) { dprintk("change to %u MHz succeeded after %u tries with result %u\n", (speedstep_freqs[new_state].frequency / 1000), retry, result); } else { - printk(KERN_ERR "cpufreq: change failed with new_state %u and result %u\n", new_state, result); + printk(KERN_ERR "cpufreq: change to state %u failed with new_state %u and result %u\n", state, new_state, result); } return; -- cgit v1.2.3 From 923a0cf82f2b504e316642e2d152d38b6c0be4ba Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Wed, 26 Mar 2008 14:13:01 -0400 Subject: x86: GEODE: add missing module.h include On Wed, 26 Mar 2008 11:56:22 -0600 Jordan Crouse wrote: > On 26/03/08 14:31 +0100, Stefan Pfetzing wrote: > > Hello Jordan, > > > > I just tried to build your geodwdt driver for the geode watchdog. Therefore > > I pulled your repository from http://git.infradead.org/geode.git (or more, > > the git url). > > > > I tried to build the geodewdt driver as a module - which didn't work, and > > it failed with the same problem as earlier mentioned on lkmk [1]. I also > > checked the fix [2], but that seems to be already in your (or linus) tree - > > and so I'm unsure what the problem is. > > > > [1] http://kerneltrap.org/mailarchive/linux-kernel/2008/2/17/884074 > > [2] http://kerneltrap.org/mailarchive/linux-kernel/2008/2/17/884174 > > > > Building directly into the kernel seems to work. > > > > Maybe you have some idea? > > Hmm - that is strange. Exporting the symbols should work. I recommend > starting over with a clean tree. > > CCing Andres - any thoughts? > > Jordan > Er, yeah. The patch below should fix it. This should probably go into 2.6.25. Oops, EXPORT_SYMBOL_GPL wasn't being declared due to this header being missing. Signed-off-by: Andres Salomon Signed-off-by: Ingo Molnar --- arch/x86/kernel/mfgpt_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 027fc067b39..b402c0f3f19 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -30,6 +30,7 @@ #include #include +#include #include static struct mfgpt_timer_t { -- cgit v1.2.3 From 76c324182bbd29dfe4298ca65efb15be18055df1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 23 Mar 2008 00:16:49 -0700 Subject: x86: fix trim mtrr not to setup_memory two times we could call find_max_pfn() directly instead of setup_memory() to get max_pfn needed for mtrr trimming. otherwise setup_memory() is called two times... that is duplicated... [ mingo@elte.hu: both Thomas and me simulated a double call to setup_bootmem_allocator() and can confirm that it is a real bug which can hang in certain configs. It's not been reported yet but that is probably due to the relatively scarce nature of MTRR-trimming systems. 
] Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_32.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index a1d7071a51c..2b3e5d45176 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -406,8 +406,6 @@ static unsigned long __init setup_memory(void) */ min_low_pfn = PFN_UP(init_pg_tables_end); - find_max_pfn(); - max_low_pfn = find_max_low_pfn(); #ifdef CONFIG_HIGHMEM @@ -764,12 +762,13 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled) efi_init(); - max_low_pfn = setup_memory(); - /* update e820 for memory not covered by WB MTRRs */ + find_max_pfn(); mtrr_bp_init(); if (mtrr_trim_uncached_memory(max_pfn)) - max_low_pfn = setup_memory(); + find_max_pfn(); + + max_low_pfn = setup_memory(); #ifdef CONFIG_VMI /* -- cgit v1.2.3 From d8d4f157b8d828bc837f0eb2ee4a2dd40dbdd572 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 4 Mar 2008 15:05:39 -0800 Subject: x86: ptrace.c: fix defined-but-unused warnings arch/x86/kernel/ptrace.c:548: warning: 'ptrace_bts_get_size' defined but not used arch/x86/kernel/ptrace.c:558: warning: 'ptrace_bts_read_record' defined but not used arch/x86/kernel/ptrace.c:607: warning: 'ptrace_bts_clear' defined but not used arch/x86/kernel/ptrace.c:617: warning: 'ptrace_bts_drain' defined but not used arch/x86/kernel/ptrace.c:720: warning: 'ptrace_bts_config' defined but not used arch/x86/kernel/ptrace.c:788: warning: 'ptrace_bts_status' defined but not used Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/ptrace.c | 169 ++++++++++++++++++++++++----------------------- 1 file changed, 85 insertions(+), 84 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index d5904eef1d3..eb92ccbb350 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -600,21 +600,6 @@ static int ptrace_bts_read_record(struct task_struct *child, return sizeof(ret); } -static int ptrace_bts_write_record(struct task_struct *child, - const struct bts_struct *in) -{ - int retval; - - if (!child->thread.ds_area_msr) - return -ENXIO; - - retval = ds_write_bts((void *)child->thread.ds_area_msr, in); - if (retval) - return retval; - - return sizeof(*in); -} - static int ptrace_bts_clear(struct task_struct *child) { if (!child->thread.ds_area_msr) @@ -657,75 +642,6 @@ static int ptrace_bts_drain(struct task_struct *child, return end; } -static int ptrace_bts_realloc(struct task_struct *child, - int size, int reduce_size) -{ - unsigned long rlim, vm; - int ret, old_size; - - if (size < 0) - return -EINVAL; - - old_size = ds_get_bts_size((void *)child->thread.ds_area_msr); - if (old_size < 0) - return old_size; - - ret = ds_free((void **)&child->thread.ds_area_msr); - if (ret < 0) - goto out; - - size >>= PAGE_SHIFT; - old_size >>= PAGE_SHIFT; - - current->mm->total_vm -= old_size; - current->mm->locked_vm -= old_size; - - if (size == 0) - goto out; - - rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = current->mm->total_vm + size; - if (rlim < vm) { - ret = -ENOMEM; - - if (!reduce_size) - goto out; - - size = rlim - current->mm->total_vm; - if (size <= 0) - goto out; - } - - rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = current->mm->locked_vm + size; - if (rlim < vm) { - ret = -ENOMEM; - - if (!reduce_size) - goto out; - - size = 
rlim - current->mm->locked_vm; - if (size <= 0) - goto out; - } - - ret = ds_allocate((void **)&child->thread.ds_area_msr, - size << PAGE_SHIFT); - if (ret < 0) - goto out; - - current->mm->total_vm += size; - current->mm->locked_vm += size; - -out: - if (child->thread.ds_area_msr) - set_tsk_thread_flag(child, TIF_DS_AREA_MSR); - else - clear_tsk_thread_flag(child, TIF_DS_AREA_MSR); - - return ret; -} - static int ptrace_bts_config(struct task_struct *child, long cfg_size, const struct ptrace_bts_config __user *ucfg) @@ -828,6 +744,91 @@ static int ptrace_bts_status(struct task_struct *child, return sizeof(cfg); } + +static int ptrace_bts_write_record(struct task_struct *child, + const struct bts_struct *in) +{ + int retval; + + if (!child->thread.ds_area_msr) + return -ENXIO; + + retval = ds_write_bts((void *)child->thread.ds_area_msr, in); + if (retval) + return retval; + + return sizeof(*in); +} + +static int ptrace_bts_realloc(struct task_struct *child, + int size, int reduce_size) +{ + unsigned long rlim, vm; + int ret, old_size; + + if (size < 0) + return -EINVAL; + + old_size = ds_get_bts_size((void *)child->thread.ds_area_msr); + if (old_size < 0) + return old_size; + + ret = ds_free((void **)&child->thread.ds_area_msr); + if (ret < 0) + goto out; + + size >>= PAGE_SHIFT; + old_size >>= PAGE_SHIFT; + + current->mm->total_vm -= old_size; + current->mm->locked_vm -= old_size; + + if (size == 0) + goto out; + + rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + vm = current->mm->total_vm + size; + if (rlim < vm) { + ret = -ENOMEM; + + if (!reduce_size) + goto out; + + size = rlim - current->mm->total_vm; + if (size <= 0) + goto out; + } + + rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + vm = current->mm->locked_vm + size; + if (rlim < vm) { + ret = -ENOMEM; + + if (!reduce_size) + goto out; + + size = rlim - current->mm->locked_vm; + if (size <= 0) + goto out; + } + + ret = ds_allocate((void **)&child->thread.ds_area_msr, + size << PAGE_SHIFT); + if (ret < 0) + goto out; + + current->mm->total_vm += size; + current->mm->locked_vm += size; + +out: + if (child->thread.ds_area_msr) + set_tsk_thread_flag(child, TIF_DS_AREA_MSR); + else + clear_tsk_thread_flag(child, TIF_DS_AREA_MSR); + + return ret; +} + void ptrace_bts_take_timestamp(struct task_struct *tsk, enum bts_qualifier qualifier) { -- cgit v1.2.3 From 629c8b4cdb354518308663aff2f719e02f69ffbe Mon Sep 17 00:00:00 2001 From: Ken'ichi Ohmichi Date: Wed, 2 Apr 2008 13:04:50 -0700 Subject: vmcoreinfo: add the symbol "phys_base" Fix the problem that makedumpfile sometimes fails on x86_64 machine. This patch adds the symbol "phys_base" to a vmcoreinfo data. The vmcoreinfo data has the minimum debugging information only for dump filtering. makedumpfile (dump filtering command) gets it to distinguish unnecessary pages, and makedumpfile creates a small dumpfile. On x86_64 kernel which compiled with CONFIG_PHYSICAL_START=0x0 and CONFIG_RELOCATABLE=y, makedumpfile fails like the following: # makedumpfile -d31 /proc/vmcore dumpfile The kernel version is not supported. The created dumpfile may be incomplete. _exclude_free_page: Can't get next online node. makedumpfile Failed. # The cause is the lack of the symbol "phys_base" in a vmcoreinfo data. If the symbol "phys_base" does not exist, makedumpfile considers an x86_64 kernel as non relocatable. 
As the result, makedumpfile misunderstands the physical address where the kernel is loaded, and it cannot translate a kernel virtual address to physical address correctly. To fix this problem, this patch adds the symbol "phys_base" to a vmcoreinfo data. Signed-off-by: Ken'ichi Ohmichi Cc: "Eric W. Biederman" Cc: Acked-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 236d2f8f7dd..576a03db451 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -233,6 +233,7 @@ NORET_TYPE void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { + VMCOREINFO_SYMBOL(phys_base); VMCOREINFO_SYMBOL(init_level4_pgt); #ifdef CONFIG_NUMA -- cgit v1.2.3 From 4ba51fd75cc3789be83f0d6f878dabbb0cb19bca Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 3 Apr 2008 14:18:55 -0700 Subject: x86 ptrace: avoid unnecessary wrmsr This avoids using wrmsr on MSR_IA32_DEBUGCTLMSR when it's not needed. No wrmsr ever needs to be done if noone has ever used block stepping. Without this change, using ptrace on 2.6.25 on an x86 KVM guest will tickle KVM's missing support for the MSR and crash the guest kernel. Though host KVM is the buggy one, this makes for a regression in the guest behavior from 2.6.24->2.6.25 that we can easily avoid. I also corrected some bad whitespace. Signed-off-by: Roland McGrath Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- arch/x86/kernel/step.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 9d406cdc847..071ff479823 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -140,6 +140,9 @@ static int enable_single_step(struct task_struct *child) */ static void write_debugctlmsr(struct task_struct *child, unsigned long val) { + if (child->thread.debugctlmsr == val) + return; + child->thread.debugctlmsr = val; if (child != current) @@ -165,11 +168,11 @@ static void enable_step(struct task_struct *child, bool block) write_debugctlmsr(child, child->thread.debugctlmsr | DEBUGCTLMSR_BTF); } else { - write_debugctlmsr(child, - child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); + write_debugctlmsr(child, + child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); } } -- cgit v1.2.3 From 4f14bdef41e599e218d71e3d0abf339d65e9b480 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 27 Mar 2008 23:37:58 +0100 Subject: x86: fix nmi_watchdog=2 on Pentium-D CPUs implement nmi_watchdog=2 on this class of CPUs: cpu family : 15 model : 6 model name : Intel(R) Pentium(R) D CPU 3.00GHz the watchdog's ->setup() method is safe anyway, so if the CPU cannot support it we'll bail out safely. 
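The bail-out path referred to above is the usual probe pattern in perfctr-watchdog.c; a simplified sketch follows (the exact setup()/reserve() return conventions are assumptions here, only the names come from this file):

/*
 * Simplified sketch of lapic_watchdog_init(), showing why selecting
 * wd_ops by CPU family alone is harmless: ->setup() probes the real
 * hardware and fails cleanly when the CPU cannot drive the watchdog.
 */
int lapic_watchdog_init(unsigned nmi_hz)
{
	if (!wd_ops) {
		probe_nmi_watchdog();
		if (!wd_ops)
			return -1;	/* no ops for this CPU at all */
		if (!wd_ops->reserve())
			return -1;	/* perfctr already in use */
	}
	if (!wd_ops->setup(nmi_hz))
		return -1;		/* unsupported: bail out safely */
	return 0;
}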
Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perfctr-watchdog.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 9b838324b81..19a359472ae 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -652,9 +652,6 @@ static void probe_nmi_watchdog(void) wd_ops = &p6_wd_ops; break; case 15: - if (boot_cpu_data.x86_model > 0x4) - return; - wd_ops = &p4_wd_ops; break; default: -- cgit v1.2.3 From 9c9b81f77330ddc003a2de2f35fa6a20410c1a62 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 27 Mar 2008 23:39:42 +0100 Subject: x86: print message if nmi_watchdog=2 cannot be enabled right now if there's no CPU support for nmi_watchdog=2 we'll just refuse it silently. print a useful warning. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perfctr-watchdog.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 19a359472ae..b943e10ad81 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -667,8 +667,10 @@ int lapic_watchdog_init(unsigned nmi_hz) { if (!wd_ops) { probe_nmi_watchdog(); - if (!wd_ops) + if (!wd_ops) { + printk(KERN_INFO "NMI watchdog: CPU not supported\n"); return -1; + } if (!wd_ops->reserve()) { printk(KERN_ERR -- cgit v1.2.3 From 8f59610de2fb244b5bc1a3feafd328a8d4d511d6 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Tue, 1 Apr 2008 14:24:03 +0200 Subject: x86, agpgart: scary messages are fortunately obsolete Fix obsolete printks in aperture-64. We used not to handle missing agpgart, but we handle it okay now. Signed-off-by: Pavel Machek Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index faf3229f8fb..700e4647dd3 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -615,8 +615,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) nommu: /* Should not happen anymore */ - printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" - KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n"); + printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" + KERN_WARNING "falling back to iommu=soft.\n"); return -1; } @@ -692,9 +692,9 @@ void __init gart_iommu_init(void) !gart_iommu_aperture || (no_agp && init_k8_gatt(&info) < 0)) { if (end_pfn > MAX_DMA32_PFN) { - printk(KERN_ERR "WARNING more than 4GB of memory " - "but GART IOMMU not available.\n" - KERN_ERR "WARNING 32bit PCI may malfunction.\n"); + printk(KERN_WARNING "More than 4GB of memory " + "but GART IOMMU not available.\n" + KERN_WARNING "falling back to iommu=soft.\n"); } return; } -- cgit v1.2.3 From 47001d603375f857a7fab0e9c095d964a1ea0039 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Apr 2008 19:45:18 +0200 Subject: x86: tsc prevent time going backwards We already catch most of the TSC problems by sanity checks, but there is a subtle bug which has been in the code for ever. This can cause time jumps in the range of hours. 
This was reported in: http://lkml.org/lkml/2007/8/23/96 and http://lkml.org/lkml/2008/3/31/23 I was able to reproduce the problem with a gettimeofday loop test on a dual core and a quad core machine which both have sychronized TSCs. The TSCs seems not to be perfectly in sync though, but the kernel is not able to detect the slight delta in the sync check. Still there exists an extremly small window where this delta can be observed with a real big time jump. So far I was only able to reproduce this with the vsyscall gettimeofday implementation, but in theory this might be observable with the syscall based version as well. CPU 0 updates the clock source variables under xtime/vyscall lock and CPU1, where the TSC is slighty behind CPU0, is reading the time right after the seqlock was unlocked. The clocksource reference data was updated with the TSC from CPU0 and the value which is read from TSC on CPU1 is less than the reference data. This results in a huge delta value due to the unsigned subtraction of the TSC value and the reference value. This algorithm can not be changed due to the support of wrapping clock sources like pm timer. The huge delta is converted to nanoseconds and added to xtime, which is then observable by the caller. The next gettimeofday call on CPU1 will show the correct time again as now the TSC has advanced above the reference value. To prevent this TSC specific wreckage we need to compare the TSC value against the reference value and return the latter when it is larger than the actual TSC value. I pondered to mark the TSC unstable when the readout is smaller than the reference value, but this would render an otherwise good and fast clocksource unusable without a real good reason. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_32.c | 15 ++++++++++++++- arch/x86/kernel/tsc_64.c | 23 ++++++++++++++++++++--- 2 files changed, 34 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index f14cfd9d1f9..d7498b34c8e 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -287,14 +287,27 @@ core_initcall(cpufreq_tsc); /* clock source code */ static unsigned long current_tsc_khz = 0; +static struct clocksource clocksource_tsc; +/* + * We compare the TSC to the cycle_last value in the clocksource + * structure to avoid a nasty time-warp issue. This can be observed in + * a very small window right after one CPU updated cycle_last under + * xtime lock and the other CPU reads a TSC value which is smaller + * than the cycle_last reference value due to a TSC which is slighty + * behind. This delta is nowhere else observable, but in that case it + * results in a forward time jump in the range of hours due to the + * unsigned delta calculation of the time keeping core code, which is + * necessary to support wrapping clocksources like pm timer. + */ static cycle_t read_tsc(void) { cycle_t ret; rdtscll(ret); - return ret; + return ret >= clocksource_tsc.cycle_last ? 
+ ret : clocksource_tsc.cycle_last; } static struct clocksource clocksource_tsc = { diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index 947554ddabb..01fc9f0c39e 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -11,6 +11,7 @@ #include #include #include +#include static int notsc __initdata = 0; @@ -290,18 +291,34 @@ int __init notsc_setup(char *s) __setup("notsc", notsc_setup); +static struct clocksource clocksource_tsc; -/* clock source code: */ +/* + * We compare the TSC to the cycle_last value in the clocksource + * structure to avoid a nasty time-warp. This can be observed in a + * very small window right after one CPU updated cycle_last under + * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which + * is smaller than the cycle_last reference value due to a TSC which + * is slighty behind. This delta is nowhere else observable, but in + * that case it results in a forward time jump in the range of hours + * due to the unsigned delta calculation of the time keeping core + * code, which is necessary to support wrapping clocksources like pm + * timer. + */ static cycle_t read_tsc(void) { cycle_t ret = (cycle_t)get_cycles(); - return ret; + + return ret >= clocksource_tsc.cycle_last ? + ret : clocksource_tsc.cycle_last; } static cycle_t __vsyscall_fn vread_tsc(void) { cycle_t ret = (cycle_t)vget_cycles(); - return ret; + + return ret >= __vsyscall_gtod_data.clock.cycle_last ? + ret : __vsyscall_gtod_data.clock.cycle_last; } static struct clocksource clocksource_tsc = { -- cgit v1.2.3 From 5761d64b277c287a7520b868c32d656ef03374b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Apr 2008 16:26:10 +0200 Subject: x86: revert assign IRQs to hpet timer The commits: commit 37a47db8d7f0f38dac5acf5a13abbc8f401707fa Author: Balaji Rao Date: Wed Jan 30 13:30:03 2008 +0100 x86: assign IRQs to HPET timers, fix and commit e3f37a54f690d3e64995ea7ecea08c5ab3070faf Author: Balaji Rao Date: Wed Jan 30 13:30:03 2008 +0100 x86: assign IRQs to HPET timers have been identified to cause a regression on some platforms due to the assignement of legacy IRQs which makes the legacy devices connected to those IRQs disfunctional. Revert them. This fixes http://bugzilla.kernel.org/show_bug.cgi?id=10382 Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 235fd6c7750..36652ea1a26 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -133,13 +133,16 @@ static void hpet_reserve_platform_timers(unsigned long id) #ifdef CONFIG_HPET_EMULATE_RTC hpet_reserve_timer(&hd, 1); #endif + hd.hd_irq[0] = HPET_LEGACY_8254; hd.hd_irq[1] = HPET_LEGACY_RTC; - for (i = 2; i < nrtimers; timer++, i++) - hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; + for (i = 2; i < nrtimers; timer++, i++) + hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >> + Tn_INT_ROUTE_CNF_SHIFT; + hpet_alloc(&hd); + } #else static void hpet_reserve_platform_timers(unsigned long id) { } -- cgit v1.2.3 From 64ba4f230d30b089bc89db2e59d02c1efa5ac769 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sun, 6 Apr 2008 17:23:38 +1000 Subject: Fix booting pentium+ with dodgy TSC We handle a broken tsc these days, so no need to panic. We clear the TSC bit when tsc_init decides it's unreliable (eg. under lguest w/ bad host TSC), leading to bogus panic. 
Signed-off-by: Rusty Russell Acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/bugs.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 027e5c003b1..170d2f5523b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -142,14 +142,6 @@ static void __init check_config(void) panic("Kernel requires i486+ for 'invlpg' and other features"); #endif -/* - * If we configured ourselves for a TSC, we'd better have one! - */ -#ifdef CONFIG_X86_TSC - if (!cpu_has_tsc) - panic("Kernel compiled for Pentium+, requires TSC feature!"); -#endif - /* * If we were told we had a good local APIC, check for buggy Pentia, * i.e. all B steppings and the C2 stepping of P54C when using their -- cgit v1.2.3 From 5b13d863573e746739ccfc24ac1a9473cfee8df1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 7 Apr 2008 20:58:08 +0200 Subject: revert "x86: tsc prevent time going backwards" revert: | commit 47001d603375f857a7fab0e9c095d964a1ea0039 | Author: Thomas Gleixner | Date: Tue Apr 1 19:45:18 2008 +0200 | | x86: tsc prevent time going backwards it has been identified to cause suspend regression - and the commit fixes a longstanding bug that existed before 2.6.25 was opened - so it can wait some more until the effects are better understood. Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_32.c | 15 +-------------- arch/x86/kernel/tsc_64.c | 23 +++-------------------- 2 files changed, 4 insertions(+), 34 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index d7498b34c8e..f14cfd9d1f9 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -287,27 +287,14 @@ core_initcall(cpufreq_tsc); /* clock source code */ static unsigned long current_tsc_khz = 0; -static struct clocksource clocksource_tsc; -/* - * We compare the TSC to the cycle_last value in the clocksource - * structure to avoid a nasty time-warp issue. This can be observed in - * a very small window right after one CPU updated cycle_last under - * xtime lock and the other CPU reads a TSC value which is smaller - * than the cycle_last reference value due to a TSC which is slighty - * behind. This delta is nowhere else observable, but in that case it - * results in a forward time jump in the range of hours due to the - * unsigned delta calculation of the time keeping core code, which is - * necessary to support wrapping clocksources like pm timer. - */ static cycle_t read_tsc(void) { cycle_t ret; rdtscll(ret); - return ret >= clocksource_tsc.cycle_last ? - ret : clocksource_tsc.cycle_last; + return ret; } static struct clocksource clocksource_tsc = { diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index 01fc9f0c39e..947554ddabb 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -11,7 +11,6 @@ #include #include #include -#include static int notsc __initdata = 0; @@ -291,34 +290,18 @@ int __init notsc_setup(char *s) __setup("notsc", notsc_setup); -static struct clocksource clocksource_tsc; -/* - * We compare the TSC to the cycle_last value in the clocksource - * structure to avoid a nasty time-warp. This can be observed in a - * very small window right after one CPU updated cycle_last under - * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which - * is smaller than the cycle_last reference value due to a TSC which - * is slighty behind. 
This delta is nowhere else observable, but in - * that case it results in a forward time jump in the range of hours - * due to the unsigned delta calculation of the time keeping core - * code, which is necessary to support wrapping clocksources like pm - * timer. - */ +/* clock source code: */ static cycle_t read_tsc(void) { cycle_t ret = (cycle_t)get_cycles(); - - return ret >= clocksource_tsc.cycle_last ? - ret : clocksource_tsc.cycle_last; + return ret; } static cycle_t __vsyscall_fn vread_tsc(void) { cycle_t ret = (cycle_t)vget_cycles(); - - return ret >= __vsyscall_gtod_data.clock.cycle_last ? - ret : __vsyscall_gtod_data.clock.cycle_last; + return ret; } static struct clocksource clocksource_tsc = { -- cgit v1.2.3 From 4f41c94d5c24e3b3453e9df03c0a80ca1acf00d2 Mon Sep 17 00:00:00 2001 From: Karsten Wiese Date: Mon, 7 Apr 2008 12:14:45 +0200 Subject: x86: fix call to set_cyc2ns_scale() from time_cpufreq_notifier() In time_cpufreq_notifier() the cpu id to act upon is held in freq->cpu. Use it instead of smp_processor_id() in the call to set_cyc2ns_scale(). This makes the preempt_*able() unnecessary and lets set_cyc2ns_scale() update the intended cpu's cyc2ns. Related mail/thread: http://lkml.org/lkml/2007/12/7/130 Signed-off-by: Karsten Wiese Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_32.c | 4 +--- arch/x86/kernel/tsc_64.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index f14cfd9d1f9..c2241e04ea5 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -256,9 +256,7 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { tsc_khz = cpu_khz; - preempt_disable(); - set_cyc2ns_scale(cpu_khz, smp_processor_id()); - preempt_enable(); + set_cyc2ns_scale(cpu_khz, freq->cpu); /* * TSC based sched_clock turns * to junk w/ cpufreq diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index 947554ddabb..d3bebaaad84 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -148,9 +148,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, mark_tsc_unstable("cpufreq changes"); } - preempt_disable(); - set_cyc2ns_scale(tsc_khz_ref, smp_processor_id()); - preempt_enable(); + set_cyc2ns_scale(tsc_khz_ref, freq->cpu); return 0; } -- cgit v1.2.3 From f4be31ec9690cfe6e94fcbed6ae60a6a38b3c3ed Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Apr 2008 19:04:07 -0400 Subject: pop previous section in alternative.c gcc expects all toplevel assembly to return to the original section type. The code in alteranative.c does not do this. This caused some strange bugs in sched-devel where code would end up in the .rodata section and when the kernel sets the NX bit on all .rodata, the kernel would crash when executing this code. This patch adds a .previous marker to return the code back to the original section. Credit goes to Andrew Pinski for telling me it wasn't a gcc bug but a bug in the toplevel asm code in the kernel. 
;-) Signed-off-by: Steven Rostedt Signed-off-by: Linus Torvalds --- arch/x86/kernel/alternative.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 45d79ea890a..5fed98ca0e1 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -65,7 +65,8 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt); get them easily into strings. */ asm("\t.section .rodata, \"a\"\nintelnops: " GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 - GENERIC_NOP7 GENERIC_NOP8); + GENERIC_NOP7 GENERIC_NOP8 + "\t.previous"); extern const unsigned char intelnops[]; static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { NULL, @@ -83,7 +84,8 @@ static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { #ifdef K8_NOP1 asm("\t.section .rodata, \"a\"\nk8nops: " K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 - K8_NOP7 K8_NOP8); + K8_NOP7 K8_NOP8 + "\t.previous"); extern const unsigned char k8nops[]; static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { NULL, @@ -101,7 +103,8 @@ static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { #ifdef K7_NOP1 asm("\t.section .rodata, \"a\"\nk7nops: " K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 - K7_NOP7 K7_NOP8); + K7_NOP7 K7_NOP8 + "\t.previous"); extern const unsigned char k7nops[]; static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { NULL, @@ -119,7 +122,8 @@ static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { #ifdef P6_NOP1 asm("\t.section .rodata, \"a\"\np6nops: " P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 - P6_NOP7 P6_NOP8); + P6_NOP7 P6_NOP8 + "\t.previous"); extern const unsigned char p6nops[]; static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { NULL, -- cgit v1.2.3 From 783e391b7b5b273cd20856d8f6f4878da8ec31b3 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Thu, 10 Apr 2008 09:49:58 -0700 Subject: x86: Simplify cpu_idle_wait This patch also resolves hangs on boot: http://lkml.org/lkml/2008/2/23/263 http://bugzilla.kernel.org/show_bug.cgi?id=10093 The bug was causing once-in-few-reboots 10-15 sec wait during boot on certain laptops. Earlier commit 40d6a146629b98d8e322b6f9332b182c7cbff3df added smp_call_function in cpu_idle_wait() to kick cpus that are in tickless idle. Looking at cpu_idle_wait code at that time, code seemed to be over-engineered for a case which is rarely used (while changing idle handler). Below is a simplified version of cpu_idle_wait, which just makes a dummy smp_call_function to all cpus, to make them come out of old idle handler and start using the new idle handler. It eliminates code in the idle loop to handle cpu_idle_wait. 
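The calling contract this relies on is easy to illustrate; the helper below is hypothetical, not part of the patch:

/* Hypothetical caller - the new handler must be published in pm_idle
 * before cpu_idle_wait() runs, exactly as the patch's comment states. */
static void switch_idle_handler(void (*new_idle)(void))
{
	pm_idle = new_idle;	/* publish the new handler first */
	cpu_idle_wait();	/* old handler unused once this returns */
}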
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Linus Torvalds --- arch/x86/kernel/process_32.c | 47 +++++++++++--------------------------------- arch/x86/kernel/process_64.c | 47 +++++++++++--------------------------------- 2 files changed, 22 insertions(+), 72 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index be3c7a299f0..43930e73f65 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -82,7 +82,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk) */ void (*pm_idle)(void); EXPORT_SYMBOL(pm_idle); -static DEFINE_PER_CPU(unsigned int, cpu_idle_state); void disable_hlt(void) { @@ -190,9 +189,6 @@ void cpu_idle(void) while (!need_resched()) { void (*idle)(void); - if (__get_cpu_var(cpu_idle_state)) - __get_cpu_var(cpu_idle_state) = 0; - check_pgt_cache(); rmb(); idle = pm_idle; @@ -220,40 +216,19 @@ static void do_nothing(void *unused) { } +/* + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of + * pm_idle and update to new pm_idle value. Required while changing pm_idle + * handler on SMP systems. + * + * Caller must have changed pm_idle to the new value before the call. Old + * pm_idle value will not be used by any CPU after the return of this function. + */ void cpu_idle_wait(void) { - unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map, tmp = current->cpus_allowed; - - set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); - put_cpu(); - - cpus_clear(map); - for_each_online_cpu(cpu) { - per_cpu(cpu_idle_state, cpu) = 1; - cpu_set(cpu, map); - } - - __get_cpu_var(cpu_idle_state) = 0; - - wmb(); - do { - ssleep(1); - for_each_online_cpu(cpu) { - if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) - cpu_clear(cpu, map); - } - cpus_and(map, map, cpu_online_map); - /* - * We waited 1 sec, if a CPU still did not call idle - * it may be because it is in idle and not waking up - * because it has nothing to do. - * Give all the remaining CPUS a kick. - */ - smp_call_function_mask(map, do_nothing, NULL, 0); - } while (!cpus_empty(map)); - - set_cpus_allowed(current, tmp); + smp_mb(); + /* kick all the CPUs so that they exit out of pm_idle */ + smp_call_function(do_nothing, NULL, 0, 1); } EXPORT_SYMBOL_GPL(cpu_idle_wait); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3baf9b9f4c8..46c4c546b49 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -63,7 +63,6 @@ EXPORT_SYMBOL(boot_option_idle_override); */ void (*pm_idle)(void); EXPORT_SYMBOL(pm_idle); -static DEFINE_PER_CPU(unsigned int, cpu_idle_state); static ATOMIC_NOTIFIER_HEAD(idle_notifier); @@ -173,9 +172,6 @@ void cpu_idle(void) while (!need_resched()) { void (*idle)(void); - if (__get_cpu_var(cpu_idle_state)) - __get_cpu_var(cpu_idle_state) = 0; - rmb(); idle = pm_idle; if (!idle) @@ -207,40 +203,19 @@ static void do_nothing(void *unused) { } +/* + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of + * pm_idle and update to new pm_idle value. Required while changing pm_idle + * handler on SMP systems. + * + * Caller must have changed pm_idle to the new value before the call. Old + * pm_idle value will not be used by any CPU after the return of this function. 
+ */ void cpu_idle_wait(void) { - unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map, tmp = current->cpus_allowed; - - set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); - put_cpu(); - - cpus_clear(map); - for_each_online_cpu(cpu) { - per_cpu(cpu_idle_state, cpu) = 1; - cpu_set(cpu, map); - } - - __get_cpu_var(cpu_idle_state) = 0; - - wmb(); - do { - ssleep(1); - for_each_online_cpu(cpu) { - if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) - cpu_clear(cpu, map); - } - cpus_and(map, map, cpu_online_map); - /* - * We waited 1 sec, if a CPU still did not call idle - * it may be because it is in idle and not waking up - * because it has nothing to do. - * Give all the remaining CPUS a kick. - */ - smp_call_function_mask(map, do_nothing, 0, 0); - } while (!cpus_empty(map)); - - set_cpus_allowed(current, tmp); + smp_mb(); + /* kick all the CPUs so that they exit out of pm_idle */ + smp_call_function(do_nothing, NULL, 0, 1); } EXPORT_SYMBOL_GPL(cpu_idle_wait); -- cgit v1.2.3 From 54a015104136974262afa4b8ddd943ea70dec8a2 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 10 Apr 2008 15:37:38 -0700 Subject: asmlinkage_protect replaces prevent_tail_call The prevent_tail_call() macro works around the problem of the compiler clobbering argument words on the stack, which for asmlinkage functions is the caller's (user's) struct pt_regs. The tail/sibling-call optimization is not the only way that the compiler can decide to use stack argument words as scratch space, which we have to prevent. Other optimizations can do it too. Until we have new compiler support to make "asmlinkage" binding on the compiler's own use of the stack argument frame, we have work around all the manifestations of this issue that crop up. More cases seem to be prevented by also keeping the incoming argument variables live at the end of the function. This makes their original stack slots attractive places to leave those variables, so the compiler tends not clobber them for something else. It's still no guarantee, but it handles some observed cases that prevent_tail_call() did not. Signed-off-by: Roland McGrath Signed-off-by: Linus Torvalds --- arch/x86/kernel/tls.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 022bcaa3b42..ab6bf375a30 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -92,7 +92,7 @@ int do_set_thread_area(struct task_struct *p, int idx, asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) { int ret = do_set_thread_area(current, -1, u_info, 1); - prevent_tail_call(ret); + asmlinkage_protect(1, ret, u_info); return ret; } @@ -142,7 +142,7 @@ int do_get_thread_area(struct task_struct *p, int idx, asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) { int ret = do_get_thread_area(current, -1, u_info); - prevent_tail_call(ret); + asmlinkage_protect(1, ret, u_info); return ret; } -- cgit v1.2.3
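For reference, the effect asmlinkage_protect() needs to achieve, keeping ret and the incoming arguments live at the end of the function, can be expressed as an empty asm that names them as operands. The macro below is a sketch of that idea and an assumption, not the actual asm-x86/linkage.h definition:

/*
 * Sketch only: an empty asm with ret and each argument as operands
 * forces the compiler to treat their stack slots as live until the
 * function returns, so it cannot recycle the caller's pt_regs words
 * as scratch space.
 */
#define asmlinkage_protect_sketch(ret, arg1)				\
	__asm__ __volatile__ ("" : "=r" (ret) : "0" (ret), "g" (arg1))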