From 64e72e41acae0dab733fb0d5d789b76d115210c0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 14 Nov 2006 16:56:33 +0100 Subject: Revert "[PATCH] MMCONFIG and new Intel motherboards" This reverts 4c6e052adfe285ede5884e4e8c4d33af33932c13 commit. Following Linus' i386 change: revert resource reservation for mmcfg config now. Will be revisited in .20 hopefully. --- arch/x86_64/pci/mmconfig.c | 32 -------------------------------- 1 file changed, 32 deletions(-) (limited to 'arch') diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index e61093b34c2..f8b6b2800a6 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -163,37 +163,6 @@ static __init void unreachable_devices(void) } } -static __init void pci_mmcfg_insert_resources(void) -{ -#define PCI_MMCFG_RESOURCE_NAME_LEN 19 - int i; - struct resource *res; - char *names; - unsigned num_buses; - - res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res), - pci_mmcfg_config_num, GFP_KERNEL); - - if (!res) { - printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n"); - return; - } - - names = (void *)&res[pci_mmcfg_config_num]; - for (i = 0; i < pci_mmcfg_config_num; i++, res++) { - num_buses = pci_mmcfg_config[i].end_bus_number - - pci_mmcfg_config[i].start_bus_number + 1; - res->name = names; - snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u", - pci_mmcfg_config[i].pci_segment_group_number); - res->start = pci_mmcfg_config[i].base_address; - res->end = res->start + (num_buses << 20) - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - insert_resource(&iomem_resource, res); - names += PCI_MMCFG_RESOURCE_NAME_LEN; - } -} - void __init pci_mmcfg_init(int type) { int i; @@ -237,7 +206,6 @@ void __init pci_mmcfg_init(int type) } unreachable_devices(); - pci_mmcfg_insert_resources(); raw_pci_ops = &pci_mmcfg; pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; -- cgit v1.2.3 From 14f448e36192d6d2cd7dfd81cb044977b2f9dd9b Mon Sep 17 00:00:00 2001 From: Aaron Durbin Date: Tue, 14 Nov 2006 16:57:45 +0100 Subject: [PATCH] x86-64: Fix partial page check to ensure unusable memory is not being marked usable. Fix partial page check in e820_register_active_regions to ensure partial pages are not being marked as active in the memory pool. Signed-off-by: Aaron Durbin Signed-off-by: Andi Kleen --- arch/x86_64/kernel/e820.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index a75c829c2b0..855b5611318 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -278,7 +278,7 @@ e820_register_active_regions(int nid, unsigned long start_pfn, >> PAGE_SHIFT; /* Skip map entries smaller than a page */ - if (ei_startpfn > ei_endpfn) + if (ei_startpfn >= ei_endpfn) continue; /* Check if end_pfn_map should be updated */ -- cgit v1.2.3 From 14679eb3c50897889ba62f9a37e3bcd8a205b5e7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 14 Nov 2006 16:57:46 +0100 Subject: [PATCH] x86-64: Fix PTRACE_[SG]ET_THREAD_AREA regression with ia32 emulation. ptrace(PTRACE_[SG]ET_THREAD_AREA) calls from ia32 code should be passed onto the x86_64 implementation. The default case in sys32_ptrace used to call to sys_ptrace(), but is now EINVAL. This patch fixes a regression caused by that changed. Signed-off-by: Mike McCormack Signed-off-by: Andi Kleen --- arch/x86_64/ia32/ptrace32.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c index 3a7561d4703..04566fe5de4 100644 --- a/arch/x86_64/ia32/ptrace32.c +++ b/arch/x86_64/ia32/ptrace32.c @@ -244,6 +244,8 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) case PTRACE_DETACH: case PTRACE_SYSCALL: case PTRACE_SETOPTIONS: + case PTRACE_SET_THREAD_AREA: + case PTRACE_GET_THREAD_AREA: return sys_ptrace(request, pid, addr, data); default: -- cgit v1.2.3 From 51d67a488b53a5cc8401460480c124eaec71e2d4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Nov 2006 16:57:46 +0100 Subject: [PATCH] x86-64: shorten the x86_64 boot setup GDT to what the comment says Stephen Tweedie, Herbert Xu, and myself have been struggling with a very nasty bug in Xen. But it also pointed out a small bug in the x86_64 kernel boot setup. The GDT limit being setup by the initial bzImage code when entering into protected mode is way too big. The comment by the code states that the size of the GDT is 2048, but the actual size being set up is much bigger (32768). This happens simply because of one extra '0'. Instead of setting up a 0x800 size, 0x8000 is set up. On bare metal this is fine because the CPU wont load any segments unless they are explicitly used. But unfortunately, this breaks Xen on vmx FV, since it (for now) blindly loads all the segments into the VMCS if they are less than the gdt limit. Since the real mode segments are around 0x3000, we are getting junk into the VMCS and that later causes an exception. Stephen Tweedie has written up a patch to fix the Xen side and will be submitting that to those folks. But that doesn't excuse the GDT limit being a magnitude too big. AK: changed to compute true gdt size in assembler, fixed comment Signed-off-by: Steven Rostedt Signed-off-by: Andi Kleen --- arch/x86_64/boot/setup.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S index c3bfd223ab4..770940cc010 100644 --- a/arch/x86_64/boot/setup.S +++ b/arch/x86_64/boot/setup.S @@ -836,13 +836,12 @@ gdt: .word 0x9200 # data read/write .word 0x00CF # granularity = 4096, 386 # (+5th nibble of limit) +gdt_end: idt_48: .word 0 # idt limit = 0 .word 0, 0 # idt base = 0L gdt_48: - .word 0x8000 # gdt limit=2048, - # 256 GDT entries - + .word gdt_end-gdt-1 # gdt limit .word 0, 0 # gdt base (filled in later) # Include video setup & detection code -- cgit v1.2.3 From 5e58a02a8f6a7a1c9ae41f39286bcd3aea0d6f24 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 14 Nov 2006 16:57:46 +0100 Subject: [PATCH] x86-64: Handle reserve_bootmem_generic beyond end_pfn This can happen on kexec kernels with some configurations, in particularly on Unisys ES7000 systems. Analysis by Amul Shah Cc: Amul Shah Signed-off-by: Andi Kleen --- arch/x86_64/mm/init.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 971dc1181e6..f1f977aafae 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -655,9 +655,22 @@ void free_initrd_mem(unsigned long start, unsigned long end) void __init reserve_bootmem_generic(unsigned long phys, unsigned len) { - /* Should check here against the e820 map to avoid double free */ #ifdef CONFIG_NUMA int nid = phys_to_nid(phys); +#endif + unsigned long pfn = phys >> PAGE_SHIFT; + if (pfn >= end_pfn) { + /* This can happen with kdump kernels when accessing firmware + tables. */ + if (pfn < end_pfn_map) + return; + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", + phys, len); + return; + } + + /* Should check here against the e820 map to avoid double free */ +#ifdef CONFIG_NUMA reserve_bootmem_node(NODE_DATA(nid), phys, len); #else reserve_bootmem(phys, len); -- cgit v1.2.3 From 15803a43288da434d34d41c4ed650c3c1728d42c Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 14 Nov 2006 16:57:46 +0100 Subject: [PATCH] x86-64: setup saved_max_pfn correctly (kdump) x86_64: setup saved_max_pfn correctly 2.6.19-rc4 has broken CONFIG_CRASH_DUMP support on x86_64. It is impossible to read out the kernel contents from /proc/vmcore because saved_max_pfn is set to zero instead of the max_pfn value before the user map is setup. This happens because saved_max_pfn is initialized at parse_early_param() time, and at this time no active regions have been registered. save_max_pfn is setup from e820_end_of_ram(), more exact find_max_pfn_with_active_regions() which returns 0 because no regions exist. This patch fixes this by registering before and removing after the call to e820_end_of_ram(). Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/x86_64/kernel/e820.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 855b5611318..6fe191c5808 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -594,7 +594,9 @@ static int __init parse_memmap_opt(char *p) * size before original memory map is * reset. */ + e820_register_active_regions(0, 0, -1UL); saved_max_pfn = e820_end_of_ram(); + remove_all_active_ranges(); #endif end_pfn_map = 0; e820.nr_map = 0; -- cgit v1.2.3 From fa18f477d0987c011cce047a7c3cd1284f547a14 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 14 Nov 2006 16:57:46 +0100 Subject: [PATCH] x86: Add acpi_user_timer_override option for Asus boards Timer overrides are normally disabled on Nvidia board because they are commonly wrong, except on new ones with HPET support. Unfortunately there are quite some Asus boards around that don't have HPET, but need a timer override. We don't know yet how to handle this transparently, but at least add a command line option to force the timer override and let them boot. Cc: len.brown@intel.com Signed-off-by: Andi Kleen --- arch/i386/kernel/acpi/boot.c | 8 ++++++++ arch/i386/kernel/acpi/earlyquirk.c | 8 +++++++- arch/x86_64/kernel/early-quirks.c | 8 ++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index 22e4c466e5a..d12fb97a533 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -82,6 +82,7 @@ EXPORT_SYMBOL(acpi_strict); acpi_interrupt_flags acpi_sci_flags __initdata; int acpi_sci_override_gsi __initdata; int acpi_skip_timer_override __initdata; +int acpi_use_timer_override __initdata; #ifdef CONFIG_X86_LOCAL_APIC static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; @@ -1300,6 +1301,13 @@ static int __init parse_acpi_skip_timer_override(char *arg) return 0; } early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override); + +static int __init parse_acpi_use_timer_override(char *arg) +{ + acpi_use_timer_override = 1; + return 0; +} +early_param("acpi_use_timer_override", parse_acpi_use_timer_override); #endif /* CONFIG_X86_IO_APIC */ static int __init setup_acpi_sci(char *s) diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c index fe799b11ac0..c9841692bb7 100644 --- a/arch/i386/kernel/acpi/earlyquirk.c +++ b/arch/i386/kernel/acpi/earlyquirk.c @@ -27,11 +27,17 @@ static int __init check_bridge(int vendor, int device) #ifdef CONFIG_ACPI /* According to Nvidia all timer overrides are bogus unless HPET is enabled. */ - if (vendor == PCI_VENDOR_ID_NVIDIA) { + if (!acpi_use_timer_override && vendor == PCI_VENDOR_ID_NVIDIA) { nvidia_hpet_detected = 0; acpi_table_parse(ACPI_HPET, nvidia_hpet_check); if (nvidia_hpet_detected == 0) { acpi_skip_timer_override = 1; + printk(KERN_INFO "Nvidia board " + "detected. Ignoring ACPI " + "timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } } #endif diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c index 2b1245d8625..68273bff58c 100644 --- a/arch/x86_64/kernel/early-quirks.c +++ b/arch/x86_64/kernel/early-quirks.c @@ -45,7 +45,13 @@ static void nvidia_bugs(void) /* * All timer overrides on Nvidia are * wrong unless HPET is enabled. + * Unfortunately that's not true on many Asus boards. + * We don't know yet how to detect this automatically, but + * at least allow a command line override. */ + if (acpi_use_timer_override) + return; + nvidia_hpet_detected = 0; acpi_table_parse(ACPI_HPET, nvidia_hpet_check); if (nvidia_hpet_detected == 0) { @@ -53,6 +59,8 @@ static void nvidia_bugs(void) printk(KERN_INFO "Nvidia board " "detected. Ignoring ACPI " "timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); } #endif /* RED-PEN skip them on mptables too? */ -- cgit v1.2.3 From 8c131af1db510793f87dc43edbc8950a35370df3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 14 Nov 2006 16:57:46 +0100 Subject: [PATCH] x86-64: Fix vgetcpu when CONFIG_HOTPLUG_CPU is disabled The vgetcpu per CPU initialization previously relied on CPU hotplug events for all CPUs to initialize the per CPU state. That only worked only on kernels with CONFIG_HOTPLUG_CPU enabled. On the others some CPUs didn't get their state initialized properly and vgetcpu wouldn't work. Change the initialization sequence to instead run in a normal initcall (which runs after the normal CPU bootup) and initialize all running CPUs there. Later hotplug CPUs are still handled with an hotplug notifier. This actually simplifies the code somewhat. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/smp.c | 3 +-- arch/x86_64/kernel/time.c | 11 ----------- arch/x86_64/kernel/vsyscall.c | 45 ++++++++++++++++++++++++------------------- 3 files changed, 26 insertions(+), 33 deletions(-) (limited to 'arch') diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 4f67697f503..9f74c883568 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -376,9 +376,8 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info, /* prevent preemption and reschedule on another processor */ int me = get_cpu(); if (cpu == me) { - WARN_ON(1); put_cpu(); - return -EBUSY; + return 0; } spin_lock_bh(&call_lock); __smp_call_function_single(cpu, func, info, nonatomic, wait); diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 88722f11ca1..e3ef544d2cf 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -876,15 +876,6 @@ static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL }; -static int __cpuinit -time_cpu_notifier(struct notifier_block *nb, unsigned long action, void *hcpu) -{ - unsigned cpu = (unsigned long) hcpu; - if (action == CPU_ONLINE) - vsyscall_set_cpu(cpu); - return NOTIFY_DONE; -} - void __init time_init(void) { if (nohpet) @@ -925,8 +916,6 @@ void __init time_init(void) vxtime.last_tsc = get_cycles_sync(); set_cyc2ns_scale(cpu_khz); setup_irq(0, &irq0); - hotcpu_notifier(time_cpu_notifier, 0); - time_cpu_notifier(NULL, CPU_ONLINE, (void *)(long)smp_processor_id()); #ifndef CONFIG_SMP time_init_gtod(); diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index a98b460af6a..a730bacecb0 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -27,6 +27,9 @@ #include #include #include +#include +#include +#include #include #include @@ -243,32 +246,17 @@ static ctl_table kernel_root_table2[] = { #endif -static void __cpuinit write_rdtscp_cb(void *info) -{ - write_rdtscp_aux((unsigned long)info); -} - -void __cpuinit vsyscall_set_cpu(int cpu) +/* Assume __initcall executes before all user space. Hopefully kmod + doesn't violate that. We'll find out if it does. */ +static void __cpuinit vsyscall_set_cpu(int cpu) { unsigned long *d; unsigned long node = 0; #ifdef CONFIG_NUMA node = cpu_to_node[cpu]; #endif - if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) { - void *info = (void *)((node << 12) | cpu); - /* Can happen on preemptive kernel */ - if (get_cpu() == cpu) - write_rdtscp_cb(info); -#ifdef CONFIG_SMP - else { - /* the notifier is unfortunately not executed on the - target CPU */ - smp_call_function_single(cpu,write_rdtscp_cb,info,0,1); - } -#endif - put_cpu(); - } + if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) + write_rdtscp_aux((node << 12) | cpu); /* Store cpu number in limit so that it can be loaded quickly in user space in vgetcpu. @@ -280,6 +268,21 @@ void __cpuinit vsyscall_set_cpu(int cpu) *d |= (node >> 4) << 48; } +static void __cpuinit cpu_vsyscall_init(void *arg) +{ + /* preemption should be already off */ + vsyscall_set_cpu(raw_smp_processor_id()); +} + +static int __cpuinit +cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) +{ + long cpu = (long)arg; + if (action == CPU_ONLINE) + smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1); + return NOTIFY_DONE; +} + static void __init map_vsyscall(void) { extern char __vsyscall_0; @@ -299,6 +302,8 @@ static int __init vsyscall_init(void) #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2, 0); #endif + on_each_cpu(cpu_vsyscall_init, NULL, 0, 1); + hotcpu_notifier(cpu_vsyscall_notifier, 0); return 0; } -- cgit v1.2.3 From 9446868b5383eb87f76b2d4389dea4bb968a6657 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 14 Nov 2006 16:57:46 +0100 Subject: [PATCH] x86-64: Fix race in exit_idle When another interrupt happens in exit_idle the exit idle notifier could be called an incorrect number of times. Add a test_and_clear_bit_pda and use it handle the bit atomically against interrupts to avoid this. Pointed out by Stephane Eranian Signed-off-by: Andi Kleen --- arch/x86_64/kernel/process.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 49f7fac6229..f6226055d53 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -88,9 +88,8 @@ void enter_idle(void) static void __exit_idle(void) { - if (read_pda(isidle) == 0) + if (test_and_clear_bit_pda(0, isidle) == 0) return; - write_pda(isidle, 0); atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); } -- cgit v1.2.3