From 2f01942536d8c686a3f6b3b38f1257caa2fb763e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 3 Feb 2006 21:50:38 +0100 Subject: [PATCH] x86_64: Update defconfig Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/defconfig | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 09a3eb74331..56832929a54 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.15-git12 -# Mon Jan 16 13:09:08 2006 +# Linux kernel version: 2.6.16-rc1-git2 +# Thu Jan 19 10:05:21 2006 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -310,6 +310,11 @@ CONFIG_IPV6=y # SCTP Configuration (EXPERIMENTAL) # # CONFIG_IP_SCTP is not set + +# +# TIPC Configuration (EXPERIMENTAL) +# +# CONFIG_TIPC is not set # CONFIG_ATM is not set # CONFIG_BRIDGE is not set # CONFIG_VLAN_8021Q is not set @@ -319,11 +324,6 @@ CONFIG_IPV6=y # CONFIG_ATALK is not set # CONFIG_X25 is not set # CONFIG_LAPB is not set - -# -# TIPC Configuration (EXPERIMENTAL) -# -# CONFIG_TIPC is not set # CONFIG_NET_DIVERT is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set @@ -1097,6 +1097,12 @@ CONFIG_USB_MON=y # SN Devices # +# +# EDAC - error detection and reporting (RAS) +# +# CONFIG_EDAC is not set +# CONFIG_EDAC_POLL is not set + # # Firmware Drivers # @@ -1290,6 +1296,7 @@ CONFIG_DEBUG_FS=y # CONFIG_DEBUG_VM is not set # CONFIG_FRAME_POINTER is not set # CONFIG_FORCED_INLINING is not set +# CONFIG_UNWIND_INFO is not set # CONFIG_RCU_TORTURE_TEST is not set CONFIG_INIT_DEBUG=y # CONFIG_DEBUG_RODATA is not set -- cgit v1.2.3 From eddb6fb9a54cdc8c7c37e056a2b4bbbc8a128a36 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 3 Feb 2006 21:50:41 +0100 Subject: [PATCH] x86_64: Disallow kprobes on NMI handlers A kprobe executes IRET early and that could cause NMI recursion and stack corruption. 
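Background, not part of the patch: KPROBE_ENTRY and the __kprobes function attribute work by placing code in the .kprobes.text section, and register_kprobe() rejects addresses inside that section. A minimal C sketch of the annotation pattern follows; the handler name is made up purely for illustration:

#include <linux/kprobes.h>

/*
 * __kprobes moves the function into .kprobes.text; since no breakpoint
 * can be planted there, no probe (and thus no early IRET from a probe
 * handler) can fire while code on the NMI path is running.
 */
asmlinkage __kprobes void sample_nmi_path(struct pt_regs *regs)
{
	/* handler body elided */
}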
Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/entry.S | 3 ++- arch/x86_64/kernel/nmi.c | 7 ++++--- arch/x86_64/kernel/traps.c | 21 ++++++++++++--------- 3 files changed, 18 insertions(+), 13 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index dbdba56e8fa..cdf9cb1288c 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -922,7 +922,7 @@ KPROBE_ENTRY(debug) .previous .text /* runs on exception stack */ -ENTRY(nmi) +KPROBE_ENTRY(nmi) INTR_FRAME pushq $-1 CFI_ADJUST_CFA_OFFSET 8 @@ -969,6 +969,7 @@ paranoid_schedule: cli jmp paranoid_userspace CFI_ENDPROC + .previous .text KPROBE_ENTRY(int3) INTR_FRAME diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 5fae6f0cd99..8be407a1f62 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -468,7 +469,7 @@ void touch_nmi_watchdog (void) touch_softlockup_watchdog(); } -void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) +void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; @@ -512,14 +513,14 @@ void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) } } -static int dummy_nmi_callback(struct pt_regs * regs, int cpu) +static __kprobes int dummy_nmi_callback(struct pt_regs * regs, int cpu) { return 0; } static nmi_callback_t nmi_callback = dummy_nmi_callback; -asmlinkage void do_nmi(struct pt_regs * regs, long error_code) +asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) { int cpu = safe_smp_processor_id(); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 8bb0aeda78b..ee1b2da9e5e 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -372,7 +372,7 @@ void out_of_line_bug(void) static DEFINE_SPINLOCK(die_lock); static int die_owner = -1; -unsigned long oops_begin(void) +unsigned __kprobes long oops_begin(void) { int cpu = safe_smp_processor_id(); unsigned long flags; @@ -391,7 +391,7 @@ unsigned long oops_begin(void) return flags; } -void oops_end(unsigned long flags) +void __kprobes oops_end(unsigned long flags) { die_owner = -1; bust_spinlocks(0); @@ -400,7 +400,7 @@ void oops_end(unsigned long flags) panic("Oops"); } -void __die(const char * str, struct pt_regs * regs, long err) +void __kprobes __die(const char * str, struct pt_regs * regs, long err) { static int die_counter; printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); @@ -432,7 +432,7 @@ void die(const char * str, struct pt_regs * regs, long err) do_exit(SIGSEGV); } -void die_nmi(char *str, struct pt_regs *regs) +void __kprobes die_nmi(char *str, struct pt_regs *regs) { unsigned long flags = oops_begin(); @@ -575,7 +575,8 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, } } -static void mem_parity_error(unsigned char reason, struct pt_regs * regs) +static __kprobes void +mem_parity_error(unsigned char reason, struct pt_regs * regs) { printk("Uhhuh. NMI received. 
Dazed and confused, but trying to continue\n"); printk("You probably have a hardware problem with your RAM chips\n"); @@ -585,7 +586,8 @@ static void mem_parity_error(unsigned char reason, struct pt_regs * regs) outb(reason, 0x61); } -static void io_check_error(unsigned char reason, struct pt_regs * regs) +static __kprobes void +io_check_error(unsigned char reason, struct pt_regs * regs) { printk("NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); @@ -598,7 +600,8 @@ static void io_check_error(unsigned char reason, struct pt_regs * regs) outb(reason, 0x61); } -static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) +static __kprobes void +unknown_nmi_error(unsigned char reason, struct pt_regs * regs) { printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); printk("Dazed and confused, but trying to continue\n"); printk("Do you have a strange power saving mode enabled?\n"); @@ -606,7 +609,7 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) /* Runs on IST stack. This code must keep interrupts off all the time. Nested NMIs are prevented by the CPU. */ -asmlinkage void default_do_nmi(struct pt_regs *regs) +asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int cpu; @@ -658,7 +661,7 @@ asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) /* Help handler running on IST stack to switch back to user stack for scheduling or signal handling. The actual stack switch is done in entry.S */ -asmlinkage struct pt_regs *sync_regs(struct pt_regs *eregs) +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; /* Did already sync */ -- cgit v1.2.3 From 73dea47faeb96d54a984b9d7f4de564816966354 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 3 Feb 2006 21:50:50 +0100 Subject: [PATCH] x86_64: Allow to run main time keeping from the local APIC interrupt Another piece from the no-idle-tick patch. This can be enabled with the "apicmaintimer" option. This is mainly useful when the PIT/HPET interrupt is unreliable. Note there are some systems that are known to stop the APIC timer in C3. For those it will never work, but this case should be automatically detected. It also only works with PM timer right now. When HPET is used the way the main timer handler computes the delay doesn't work. It should be a bit more efficient because there is one less regular interrupt to process on the boot processor. Requires earlier bugfix from Venkatesh Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/apic.c | 34 ++++++++++++++++++++++++++++++++-- arch/x86_64/kernel/time.c | 45 ++++++++++++++++++++++++++++++++++++++------- 2 files changed, 70 insertions(+), 9 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 5d3c5b07b8d..14751dda7dc 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -35,8 +35,11 @@ #include #include #include +#include +#include int apic_verbosity; +int apic_runs_main_timer; int disable_apic_timer __initdata; @@ -702,9 +705,17 @@ static void setup_APIC_timer(unsigned int clocks) c2 |= inb_p(0x40) << 8; } while (c2 - c1 < 300); } - __setup_APIC_LVTT(clocks); - + /* Turn off PIT interrupt if we use APIC timer as main timer. + Only works with the PM timer right now + TBD fix it for HPET too. 
*/ + if (vxtime.mode == VXTIME_PMTMR && + smp_processor_id() == boot_cpu_id && + apic_runs_main_timer == 1 && + !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) { + stop_timer_interrupt(); + apic_runs_main_timer++; + } local_irq_restore(flags); } @@ -872,6 +883,8 @@ void smp_local_timer_interrupt(struct pt_regs *regs) #ifdef CONFIG_SMP update_process_times(user_mode(regs)); #endif + if (apic_runs_main_timer > 1 && smp_processor_id() == boot_cpu_id) + main_timer_handler(regs); /* * We take the 'long' return path, and there every subsystem * grabs the appropriate locks (kernel lock/ irq lock). @@ -1081,10 +1094,27 @@ static __init int setup_nolapic(char *str) static __init int setup_noapictimer(char *str) { + if (str[0] != ' ' && str[0] != 0) + return -1; disable_apic_timer = 1; return 0; } +static __init int setup_apicmaintimer(char *str) +{ + apic_runs_main_timer = 1; + nohpet = 1; + return 0; +} +__setup("apicmaintimer", setup_apicmaintimer); + +static __init int setup_noapicmaintimer(char *str) +{ + apic_runs_main_timer = -1; + return 0; +} +__setup("noapicmaintimer", setup_noapicmaintimer); + /* dummy parsing: see setup.c */ __setup("disableapic", setup_disableapic); diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index f8c47c68844..91a448a86eb 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -51,7 +51,7 @@ extern int using_apic_timer; DEFINE_SPINLOCK(rtc_lock); DEFINE_SPINLOCK(i8253_lock); -static int nohpet __initdata = 0; +int nohpet __initdata = 0; static int notsc __initdata = 0; #undef HPET_HACK_ENABLE_DANGEROUS @@ -345,7 +345,7 @@ static noinline void handle_lost_ticks(int lost, struct pt_regs *regs) #endif } -static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) +void main_timer_handler(struct pt_regs *regs) { static unsigned long rtc_update = 0; unsigned long tsc; @@ -458,12 +458,17 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) } write_sequnlock(&xtime_lock); +} +static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + if (apic_runs_main_timer > 1) + return IRQ_HANDLED; + main_timer_handler(regs); #ifdef CONFIG_X86_LOCAL_APIC if (using_apic_timer) smp_send_timer_broadcast_ipi(); #endif - return IRQ_HANDLED; } @@ -843,17 +848,43 @@ static int hpet_reenable(void) return hpet_timer_stop_set_go(hpet_tick); } -void __init pit_init(void) +#define PIT_MODE 0x43 +#define PIT_CH0 0x40 + +static void __init __pit_init(int val, u8 mode) { unsigned long flags; spin_lock_irqsave(&i8253_lock, flags); - outb_p(0x34, 0x43); /* binary, mode 2, LSB/MSB, ch 0 */ - outb_p(LATCH & 0xff, 0x40); /* LSB */ - outb_p(LATCH >> 8, 0x40); /* MSB */ + outb_p(mode, PIT_MODE); + outb_p(val & 0xff, PIT_CH0); /* LSB */ + outb_p(val >> 8, PIT_CH0); /* MSB */ spin_unlock_irqrestore(&i8253_lock, flags); } +void __init pit_init(void) +{ + __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */ +} + +void __init pit_stop_interrupt(void) +{ + __pit_init(0, 0x30); /* mode 0 */ +} + +void __init stop_timer_interrupt(void) +{ + char *name; + if (vxtime.hpet_address) { + name = "HPET"; + hpet_timer_stop_set_go(0); + } else { + name = "PIT"; + pit_stop_interrupt(); + } + printk(KERN_INFO "timer: %s interrupt stopped.\n", name); +} + int __init time_setup(char *str) { report_lost_ticks = 1; -- cgit v1.2.3 From 6f3814cd2fb5ea4d53a7fa5b0635d68fa4036c1b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 3 Feb 2006 21:50:53 +0100 Subject: [PATCH] x86_64: Automatically enable 
apicmaintimer on ATI boards They all have problems with IRQ 0 routing, so just use the APIC on them. Can be overridden with "noapicmaintimer" Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/io_apic.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 1a5060b434b..4282d72b2a2 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -304,6 +304,14 @@ void __init check_ioapic(void) #endif /* RED-PEN skip them on mptables too? */ return; + case PCI_VENDOR_ID_ATI: + if (apic_runs_main_timer != 0) + break; + printk(KERN_INFO + "ATI board detected. Using APIC/PM timer.\n"); + apic_runs_main_timer = 1; + nohpet = 1; + return; } /* No multi-function device? */ -- cgit v1.2.3 From 0dd2ea9af8f0eca43cf6200baa182b3aba307049 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 3 Feb 2006 21:50:56 +0100 Subject: [PATCH] x86_64: timer resume At resume time, the TSC's value (or something similar) might have changed a lot relative to suspend time. This could leave the system with a very large number of lost ticks. See http://bugzilla.kernel.org/show_bug.cgi?id=5825 Signed-off-by: Shaohua Li Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pmtimer.c | 5 +++++ arch/x86_64/kernel/time.c | 12 ++++++++++++ 2 files changed, 17 insertions(+) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c index feb5f108dd2..8b2655ae4e6 100644 --- a/arch/x86_64/kernel/pmtimer.c +++ b/arch/x86_64/kernel/pmtimer.c @@ -80,6 +80,11 @@ int pmtimer_mark_offset(void) return lost - 1; } +void pmtimer_resume(void) +{ + last_pmtmr_tick = inl(pmtmr_ioport); +} + unsigned int do_gettimeoffset_pm(void) { u32 now, offset, delta = 0; diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 91a448a86eb..c1f76055b57 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -1047,9 +1047,21 @@ static int timer_resume(struct sys_device *dev) write_seqlock_irqsave(&xtime_lock,flags); xtime.tv_sec = sec; xtime.tv_nsec = 0; + if (vxtime.mode == VXTIME_HPET) { + if (hpet_use_timer) + vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; + else + vxtime.last = hpet_readl(HPET_COUNTER); +#ifdef CONFIG_X86_PM_TIMER + } else if (vxtime.mode == VXTIME_PMTMR) { + pmtimer_resume(); +#endif + } else + vxtime.last_tsc = get_cycles_sync(); write_sequnlock_irqrestore(&xtime_lock,flags); jiffies += sleep_length; wall_jiffies += sleep_length; + monotonic_base += sleep_length * (NSEC_PER_SEC/HZ); touch_softlockup_watchdog(); return 0; } -- cgit v1.2.3 From 6bca52b544489b626c7d0db801df6b4aa3d5adb5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 3 Feb 2006 21:50:59 +0100 Subject: [PATCH] x86_64: Fix swiotlb dma_alloc_coherent fallback This avoids BUG_ONs in the low level allocator when an illegal GFP mask is added.
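A hedged sketch of the fixed allocation flow, condensed from the one-line change in the diff below; the helpers dev_alloc_pages() and dma_fits_mask() are illustrative names, not real kernel APIs:

again:
	page = dev_alloc_pages(gfp, order);	/* try the current zone */
	if (page && !dma_fits_mask(page, dma_mask)) {
		gfp |= GFP_DMA32;		/* retry in a lower zone */
		goto again;
	}
	/* strip the zone bits accumulated above before delegating */
	gfp &= ~(GFP_DMA32|GFP_DMA);
	if (dma_ops->alloc_coherent)
		return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);

Without the masking step, the zone flags picked up in the retry loop reached the low level allocator behind dma_ops, which BUG_ONs on a GFP mask it considers illegal.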
Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-dma.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index 2f5d8328e2b..4ed391edd47 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -107,6 +107,9 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, goto again; } + /* Let low level make its own zone decisions */ + gfp &= ~(GFP_DMA32|GFP_DMA); + if (dma_ops->alloc_coherent) return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); -- cgit v1.2.3 From 7bcd3f34e262bbebffa954d80eab3a84f053da31 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 3 Feb 2006 21:51:02 +0100 Subject: [PATCH] x86_64: Undo the earlier changes to remove unrolled copy/memset functions They cause quite bad performance regressions on Netburst This is temporary until we can get new optimized functions for these CPUs. This undoes changes that were done in 2.6.15 and in 2.6.16-rc1, essentially bringing the code back to 2.6.14 level. Only change is I renamed the X86_FEATURE_K8_C flag to X86_FEATURE_REP_GOOD and fixed the check for the flag and also fixed some comments. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/setup.c | 6 ++ arch/x86_64/lib/clear_page.S | 38 +++++++ arch/x86_64/lib/copy_page.S | 87 +++++++++++++++ arch/x86_64/lib/copy_user.S | 247 +++++++++++++++++++++++++++++++++++++++---- arch/x86_64/lib/memcpy.S | 93 +++++++++++++++- arch/x86_64/lib/memset.S | 94 ++++++++++++++++ 6 files changed, 542 insertions(+), 23 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 28895c03cb1..506f152c238 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -877,6 +877,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) static int __init init_amd(struct cpuinfo_x86 *c) { int r; + unsigned level; #ifdef CONFIG_SMP unsigned long value; @@ -899,6 +900,11 @@ static int __init init_amd(struct cpuinfo_x86 *c) 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ clear_bit(0*32+31, &c->x86_capability); + /* On C+ stepping K8 rep microcode works well for copy/memset */ + level = cpuid_eax(1); + if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) + set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); + r = get_model_name(c); if (!r) { switch (c->x86) { diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S index 43d9fa13618..1f81b79b796 100644 --- a/arch/x86_64/lib/clear_page.S +++ b/arch/x86_64/lib/clear_page.S @@ -5,8 +5,46 @@ .globl clear_page .p2align 4 clear_page: + xorl %eax,%eax + movl $4096/64,%ecx + .p2align 4 +.Lloop: + decl %ecx +#define PUT(x) movq %rax,x*8(%rdi) + movq %rax,(%rdi) + PUT(1) + PUT(2) + PUT(3) + PUT(4) + PUT(5) + PUT(6) + PUT(7) + leaq 64(%rdi),%rdi + jnz .Lloop + nop + ret +clear_page_end: + + /* Some CPUs run faster using the string instructions. + It is also a lot simpler. 
Use this when possible */ + +#include + + .section .altinstructions,"a" + .align 8 + .quad clear_page + .quad clear_page_c + .byte X86_FEATURE_REP_GOOD + .byte clear_page_end-clear_page + .byte clear_page_c_end-clear_page_c + .previous + + .section .altinstr_replacement,"ax" +clear_page_c: movl $4096/8,%ecx xorl %eax,%eax rep stosq ret +clear_page_c_end: + .previous diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S index 621a1976940..8fa19d96a7e 100644 --- a/arch/x86_64/lib/copy_page.S +++ b/arch/x86_64/lib/copy_page.S @@ -8,7 +8,94 @@ .globl copy_page .p2align 4 copy_page: + subq $3*8,%rsp + movq %rbx,(%rsp) + movq %r12,1*8(%rsp) + movq %r13,2*8(%rsp) + + movl $(4096/64)-5,%ecx + .p2align 4 +.Loop64: + dec %rcx + + movq (%rsi), %rax + movq 8 (%rsi), %rbx + movq 16 (%rsi), %rdx + movq 24 (%rsi), %r8 + movq 32 (%rsi), %r9 + movq 40 (%rsi), %r10 + movq 48 (%rsi), %r11 + movq 56 (%rsi), %r12 + + prefetcht0 5*64(%rsi) + + movq %rax, (%rdi) + movq %rbx, 8 (%rdi) + movq %rdx, 16 (%rdi) + movq %r8, 24 (%rdi) + movq %r9, 32 (%rdi) + movq %r10, 40 (%rdi) + movq %r11, 48 (%rdi) + movq %r12, 56 (%rdi) + + leaq 64 (%rsi), %rsi + leaq 64 (%rdi), %rdi + + jnz .Loop64 + + movl $5,%ecx + .p2align 4 +.Loop2: + decl %ecx + + movq (%rsi), %rax + movq 8 (%rsi), %rbx + movq 16 (%rsi), %rdx + movq 24 (%rsi), %r8 + movq 32 (%rsi), %r9 + movq 40 (%rsi), %r10 + movq 48 (%rsi), %r11 + movq 56 (%rsi), %r12 + + movq %rax, (%rdi) + movq %rbx, 8 (%rdi) + movq %rdx, 16 (%rdi) + movq %r8, 24 (%rdi) + movq %r9, 32 (%rdi) + movq %r10, 40 (%rdi) + movq %r11, 48 (%rdi) + movq %r12, 56 (%rdi) + + leaq 64(%rdi),%rdi + leaq 64(%rsi),%rsi + + jnz .Loop2 + + movq (%rsp),%rbx + movq 1*8(%rsp),%r12 + movq 2*8(%rsp),%r13 + addq $3*8,%rsp + ret + + /* Some CPUs run faster using the string copy instructions. + It is also a lot simpler. Use this when possible */ + +#include + + .section .altinstructions,"a" + .align 8 + .quad copy_page + .quad copy_page_c + .byte X86_FEATURE_REP_GOOD + .byte copy_page_c_end-copy_page_c + .byte copy_page_c_end-copy_page_c + .previous + + .section .altinstr_replacement,"ax" +copy_page_c: movl $4096/8,%ecx rep movsq ret +copy_page_c_end: + .previous diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S index 79422b6559c..f64569b83b5 100644 --- a/arch/x86_64/lib/copy_user.S +++ b/arch/x86_64/lib/copy_user.S @@ -4,9 +4,12 @@ * Functions to copy from and to user space. */ +#define FIX_ALIGNMENT 1 + #include #include #include + #include /* Standard copy_to_user with segment limit checking */ .globl copy_to_user @@ -18,7 +21,23 @@ copy_to_user: jc bad_to_user cmpq threadinfo_addr_limit(%rax),%rcx jae bad_to_user - jmp copy_user_generic +2: + .byte 0xe9 /* 32bit jump */ + .long .Lcug-1f +1: + + .section .altinstr_replacement,"ax" +3: .byte 0xe9 /* replacement jmp with 8 bit immediate */ + .long copy_user_generic_c-1b /* offset */ + .previous + .section .altinstructions,"a" + .align 8 + .quad 2b + .quad 3b + .byte X86_FEATURE_REP_GOOD + .byte 5 + .byte 5 + .previous /* Standard copy_from_user with segment limit checking */ .globl copy_from_user @@ -53,44 +72,230 @@ bad_to_user: * rsi source * rdx count * - * Only 4GB of copy is supported. This shouldn't be a problem - * because the kernel normally only writes from/to page sized chunks - * even if user space passed a longer buffer. - * And more would be dangerous because both Intel and AMD have - * errata with rep movsq > 4GB. If someone feels the need to fix - * this please consider this. 
- * * Output: * eax uncopied bytes or 0 if successful. */ - .globl copy_user_generic + .p2align 4 copy_user_generic: + .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */ + .byte 0x66,0x90 +1: + .section .altinstr_replacement,"ax" +2: .byte 0xe9 /* near jump with 32bit immediate */ + .long copy_user_generic_c-1b /* offset */ + .previous + .section .altinstructions,"a" + .align 8 + .quad copy_user_generic + .quad 2b + .byte X86_FEATURE_REP_GOOD + .byte 5 + .byte 5 + .previous +.Lcug: + pushq %rbx + xorl %eax,%eax /*zero for the exception handler */ + +#ifdef FIX_ALIGNMENT + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jnz .Lbad_alignment +.Lafter_bad_alignment: +#endif + + movq %rdx,%rcx + + movl $64,%ebx + shrq $6,%rdx + decq %rdx + js .Lhandle_tail + + .p2align 4 +.Lloop: +.Ls1: movq (%rsi),%r11 +.Ls2: movq 1*8(%rsi),%r8 +.Ls3: movq 2*8(%rsi),%r9 +.Ls4: movq 3*8(%rsi),%r10 +.Ld1: movq %r11,(%rdi) +.Ld2: movq %r8,1*8(%rdi) +.Ld3: movq %r9,2*8(%rdi) +.Ld4: movq %r10,3*8(%rdi) + +.Ls5: movq 4*8(%rsi),%r11 +.Ls6: movq 5*8(%rsi),%r8 +.Ls7: movq 6*8(%rsi),%r9 +.Ls8: movq 7*8(%rsi),%r10 +.Ld5: movq %r11,4*8(%rdi) +.Ld6: movq %r8,5*8(%rdi) +.Ld7: movq %r9,6*8(%rdi) +.Ld8: movq %r10,7*8(%rdi) + + decq %rdx + + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + + jns .Lloop + + .p2align 4 +.Lhandle_tail: + movl %ecx,%edx + andl $63,%ecx + shrl $3,%ecx + jz .Lhandle_7 + movl $8,%ebx + .p2align 4 +.Lloop_8: +.Ls9: movq (%rsi),%r8 +.Ld9: movq %r8,(%rdi) + decl %ecx + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + jnz .Lloop_8 + +.Lhandle_7: + movl %edx,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: +.Ls10: movb (%rsi),%bl +.Ld10: movb %bl,(%rdi) + incq %rdi + incq %rsi + decl %ecx + jnz .Lloop_1 + +.Lende: + popq %rbx + ret + +#ifdef FIX_ALIGNMENT + /* align destination */ + .p2align 4 +.Lbad_alignment: + movl $8,%r9d + subl %ecx,%r9d + movl %r9d,%ecx + cmpq %r9,%rdx + jz .Lhandle_7 + js .Lhandle_7 +.Lalign_1: +.Ls11: movb (%rsi),%bl +.Ld11: movb %bl,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .Lalign_1 + subq %r9,%rdx + jmp .Lafter_bad_alignment +#endif + + /* table sorted by exception address */ + .section __ex_table,"a" + .align 8 + .quad .Ls1,.Ls1e + .quad .Ls2,.Ls2e + .quad .Ls3,.Ls3e + .quad .Ls4,.Ls4e + .quad .Ld1,.Ls1e + .quad .Ld2,.Ls2e + .quad .Ld3,.Ls3e + .quad .Ld4,.Ls4e + .quad .Ls5,.Ls5e + .quad .Ls6,.Ls6e + .quad .Ls7,.Ls7e + .quad .Ls8,.Ls8e + .quad .Ld5,.Ls5e + .quad .Ld6,.Ls6e + .quad .Ld7,.Ls7e + .quad .Ld8,.Ls8e + .quad .Ls9,.Le_quad + .quad .Ld9,.Le_quad + .quad .Ls10,.Le_byte + .quad .Ld10,.Le_byte +#ifdef FIX_ALIGNMENT + .quad .Ls11,.Lzero_rest + .quad .Ld11,.Lzero_rest +#endif + .quad .Le5,.Le_zero + .previous + + /* compute 64-offset for main loop. 8 bytes accuracy with error on the + pessimistic side. this is gross. it would be better to fix the + interface. 
*/ + /* eax: zero, ebx: 64 */ +.Ls1e: addl $8,%eax +.Ls2e: addl $8,%eax +.Ls3e: addl $8,%eax +.Ls4e: addl $8,%eax +.Ls5e: addl $8,%eax +.Ls6e: addl $8,%eax +.Ls7e: addl $8,%eax +.Ls8e: addl $8,%eax + addq %rbx,%rdi /* +64 */ + subq %rax,%rdi /* correct destination with computed offset */ + + shlq $6,%rdx /* loop counter * 64 (stride length) */ + addq %rax,%rdx /* add offset to loopcnt */ + andl $63,%ecx /* remaining bytes */ + addq %rcx,%rdx /* add them */ + jmp .Lzero_rest + + /* exception on quad word loop in tail handling */ + /* ecx: loopcnt/8, %edx: length, rdi: correct */ +.Le_quad: + shll $3,%ecx + andl $7,%edx + addl %ecx,%edx + /* edx: bytes to zero, rdi: dest, eax:zero */ +.Lzero_rest: + movq %rdx,%rcx +.Le_byte: + xorl %eax,%eax +.Le5: rep + stosb + /* when there is another exception while zeroing the rest just return */ +.Le_zero: + movq %rdx,%rax + jmp .Lende + + /* Some CPUs run faster using the string copy instructions. + This is also a lot simpler. Use them when possible. + Patch in jmps to this code instead of copying it fully + to avoid unwanted aliasing in the exception tables. */ + + /* rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successfull. + * + * Only 4GB of copy is supported. This shouldn't be a problem + * because the kernel normally only writes from/to page sized chunks + * even if user space passed a longer buffer. + * And more would be dangerous because both Intel and AMD have + * errata with rep movsq > 4GB. If someone feels the need to fix + * this please consider this. + */ +copy_user_generic_c: movl %edx,%ecx shrl $3,%ecx andl $7,%edx - jz 5f 1: rep movsq movl %edx,%ecx - xor %eax,%eax 2: rep movsb +4: movl %ecx,%eax ret - /* align here? */ -5: xorl %eax,%eax -6: rep movsq - ret - - .section .fixup,"ax" 3: lea (%rdx,%rcx,8),%rax ret -4: movl %ecx,%eax - ret - .previous .section __ex_table,"a" .quad 1b,3b .quad 2b,4b - .quad 6b,4b .previous diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S index 92dd8054460..5554948b555 100644 --- a/arch/x86_64/lib/memcpy.S +++ b/arch/x86_64/lib/memcpy.S @@ -11,8 +11,6 @@ * * Output: * rax original destination - * - * TODO: check best memcpy for PSC */ .globl __memcpy @@ -20,6 +18,95 @@ .p2align 4 __memcpy: memcpy: + pushq %rbx + movq %rdi,%rax + + movl %edx,%ecx + shrl $6,%ecx + jz .Lhandle_tail + + .p2align 4 +.Lloop_64: + decl %ecx + + movq (%rsi),%r11 + movq 8(%rsi),%r8 + + movq %r11,(%rdi) + movq %r8,1*8(%rdi) + + movq 2*8(%rsi),%r9 + movq 3*8(%rsi),%r10 + + movq %r9,2*8(%rdi) + movq %r10,3*8(%rdi) + + movq 4*8(%rsi),%r11 + movq 5*8(%rsi),%r8 + + movq %r11,4*8(%rdi) + movq %r8,5*8(%rdi) + + movq 6*8(%rsi),%r9 + movq 7*8(%rsi),%r10 + + movq %r9,6*8(%rdi) + movq %r10,7*8(%rdi) + + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + jnz .Lloop_64 + +.Lhandle_tail: + movl %edx,%ecx + andl $63,%ecx + shrl $3,%ecx + jz .Lhandle_7 + .p2align 4 +.Lloop_8: + decl %ecx + movq (%rsi),%r8 + movq %r8,(%rdi) + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + jnz .Lloop_8 + +.Lhandle_7: + movl %edx,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: + movb (%rsi),%r8b + movb %r8b,(%rdi) + incq %rdi + incq %rsi + decl %ecx + jnz .Lloop_1 + +.Lende: + popq %rbx + ret +.Lfinal: + + /* Some CPUs run faster using the string copy instructions. + It is also a lot simpler. 
Use this when possible */ + + .section .altinstructions,"a" + .align 8 + .quad memcpy + .quad memcpy_c + .byte X86_FEATURE_REP_GOOD + .byte .Lfinal-memcpy + .byte memcpy_c_end-memcpy_c + .previous + + .section .altinstr_replacement,"ax" + /* rdi destination + * rsi source + * rdx count + */ +memcpy_c: movq %rdi,%rax movl %edx,%ecx shrl $3,%ecx @@ -30,3 +117,5 @@ memcpy: rep movsb ret +memcpy_c_end: + .previous diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S index 2aa48f24ed1..ad397f2c7de 100644 --- a/arch/x86_64/lib/memset.S +++ b/arch/x86_64/lib/memset.S @@ -13,6 +13,98 @@ .p2align 4 memset: __memset: + movq %rdi,%r10 + movq %rdx,%r11 + + /* expand byte value */ + movzbl %sil,%ecx + movabs $0x0101010101010101,%rax + mul %rcx /* with rax, clobbers rdx */ + + /* align dst */ + movl %edi,%r9d + andl $7,%r9d + jnz .Lbad_alignment +.Lafter_bad_alignment: + + movl %r11d,%ecx + shrl $6,%ecx + jz .Lhandle_tail + + .p2align 4 +.Lloop_64: + decl %ecx + movq %rax,(%rdi) + movq %rax,8(%rdi) + movq %rax,16(%rdi) + movq %rax,24(%rdi) + movq %rax,32(%rdi) + movq %rax,40(%rdi) + movq %rax,48(%rdi) + movq %rax,56(%rdi) + leaq 64(%rdi),%rdi + jnz .Lloop_64 + + /* Handle tail in loops. The loops should be faster than hard + to predict jump tables. */ + .p2align 4 +.Lhandle_tail: + movl %r11d,%ecx + andl $63&(~7),%ecx + jz .Lhandle_7 + shrl $3,%ecx + .p2align 4 +.Lloop_8: + decl %ecx + movq %rax,(%rdi) + leaq 8(%rdi),%rdi + jnz .Lloop_8 + +.Lhandle_7: + movl %r11d,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: + decl %ecx + movb %al,(%rdi) + leaq 1(%rdi),%rdi + jnz .Lloop_1 + +.Lende: + movq %r10,%rax + ret + +.Lbad_alignment: + cmpq $7,%r11 + jbe .Lhandle_7 + movq %rax,(%rdi) /* unaligned store */ + movq $8,%r8 + subq %r9,%r8 + addq %r8,%rdi + subq %r8,%r11 + jmp .Lafter_bad_alignment + + /* Some CPUs run faster using the string instructions. + It is also a lot simpler. Use this when possible */ + +#include + + .section .altinstructions,"a" + .align 8 + .quad memset + .quad memset_c + .byte X86_FEATURE_REP_GOOD + .byte memset_c_end-memset_c + .byte memset_c_end-memset_c + .previous + + .section .altinstr_replacement,"ax" + /* rdi destination + * rsi value + * rdx count + */ +memset_c: movq %rdi,%r9 movl %edx,%r8d andl $7,%r8d @@ -29,3 +121,5 @@ __memset: stosb movq %r9,%rax ret +memset_c_end: + .previous -- cgit v1.2.3 From 10f4dc8b27ac42f930ac55adb8c521264dc997f8 Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Fri, 3 Feb 2006 21:51:05 +0100 Subject: [PATCH] x86_64: Fix the node cpumask of a cpu going down Currently, x86_64 and ia64 arches do not clear the corresponding bits in the node's cpumask when a cpu goes down or cpu bring up is cancelled. This is buggy since there are pieces of common code where the cpumask is checked in the cpu down code path to decide on things (like in the slab down path). PPC does the right thing, but x86_64 and ia64 don't (This was the reason Sonny hit upon a slab bug during cpu offline on ppc and could not reproduce on other arches). This patch fixes it for x86_64. I won't attempt ia64 as I cannot test it. Credit for spotting this should go to Alok. 
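The fix relies on a small helper, clear_node_cpumask(), pulled in via asm/numa.h. A plausible one-line body is shown for illustration only; the real definition lives in the x86-64 headers and may differ:

static inline void clear_node_cpumask(int cpu)
{
	cpu_clear(cpu, node_to_cpumask[cpu_to_node(cpu)]);
}

This is the step that keeps common code - such as the slab CPU-down path, which consults the node cpumask - from treating an offlined CPU as still belonging to its node.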
Signed-off-by: Alok N Kataria Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/smpboot.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index a28756ef7ce..67e4e28f4df 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -59,6 +59,7 @@ #include #include #include +#include /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -890,6 +891,7 @@ do_rest: if (boot_error) { cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ + clear_node_cpumask(cpu); /* was set by numa_add_cpu */ cpu_clear(cpu, cpu_present_map); cpu_clear(cpu, cpu_possible_map); x86_cpu_to_apicid[cpu] = BAD_APICID; @@ -1187,6 +1189,7 @@ void remove_cpu_from_maps(void) cpu_clear(cpu, cpu_callout_map); cpu_clear(cpu, cpu_callin_map); clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ + clear_node_cpumask(cpu); } int __cpu_disable(void) -- cgit v1.2.3 From 7115125057cf72dcc0b9ded1e12128e1abfa8586 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 3 Feb 2006 21:51:08 +0100 Subject: [PATCH] x86_64: Remove CONFIG_INIT_DEBUG It has been enabled by default for some time now and is cheap enough so it doesn't matter anyways. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/Kconfig.debug | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug index fcb06a50fdd..ea31b4c6210 100644 --- a/arch/x86_64/Kconfig.debug +++ b/arch/x86_64/Kconfig.debug @@ -2,13 +2,6 @@ menu "Kernel hacking" source "lib/Kconfig.debug" -config INIT_DEBUG - bool "Debug __init statements" - depends on DEBUG_KERNEL - help - Fill __init and __initdata at the end of boot. This helps debugging - illegal uses of __init and __initdata after initialization. - config DEBUG_RODATA bool "Write protect kernel read-only data structures" depends on DEBUG_KERNEL -- cgit v1.2.3 From 9391a3f9c7f17bdd82adf9a98905450642cc8970 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 3 Feb 2006 21:51:17 +0100 Subject: [PATCH] x86_64: Clear more state when ignoring empty node in SRAT parsing Might fix boot failures on systems with empty PXMs in SRAT Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/mm/srat.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 8b7f85608fa..3bd827f5611 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -26,6 +26,10 @@ static nodemask_t nodes_found __initdata; static struct node nodes[MAX_NUMNODES] __initdata; static u8 pxm2node[256] = { [0 ... 255] = 0xff }; +/* Too small nodes confuse the VM badly. Usually they result + from BIOS bugs. */ +#define NODE_MIN_SIZE (4*1024*1024) + static int node_to_pxm(int n); int pxm_to_node(int pxm) @@ -223,6 +227,16 @@ static int nodes_cover_memory(void) return 1; } +static void unparse_node(int node) +{ + int i; + node_clear(node, nodes_parsed); + for (i = 0; i < MAX_LOCAL_APIC; i++) { + if (apicid_to_node[i] == node) + apicid_to_node[i] = NUMA_NO_NODE; + } +} + void __init acpi_numa_arch_fixup(void) {} /* Use the information discovered above to actually set up the nodes. 
*/ @@ -230,16 +244,16 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) { int i; - if (acpi_numa <= 0) - return -1; - /* First clean up the node list */ - for_each_node_mask(i, nodes_parsed) { + for (i = 0; i < MAX_NUMNODES; i++) { cutoff_node(i, start, end); - if (nodes[i].start == nodes[i].end) - node_clear(i, nodes_parsed); + if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) + unparse_node(i); } + if (acpi_numa <= 0) + return -1; + if (!nodes_cover_memory()) { bad_srat(); return -1; } -- cgit v1.2.3 From 396bd50fed9c91f7aa1ac66f4b2880ca58c332ff Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 3 Feb 2006 21:51:20 +0100 Subject: [PATCH] x86_64: mark two routines as __cpuinit Signed-off-by: Shaohua Li Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/apic.c | 2 +- arch/x86_64/kernel/time.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 14751dda7dc..673a2fe9923 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -937,7 +937,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) * multi-chassis. Use available data to take a good guess. * If in doubt, go HPET. */ -__init int oem_force_hpet_timer(void) +__cpuinit int oem_force_hpet_timer(void) { int i, clusters, zeros; unsigned id; diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index c1f76055b57..c0844bffbf8 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -963,7 +963,7 @@ void __init time_init(void) * Make an educated guess if the TSC is trustworthy and synchronized * over all CPUs. */ -__init int unsynchronized_tsc(void) +__cpuinit int unsynchronized_tsc(void) { #ifdef CONFIG_SMP if (oem_force_hpet_timer()) -- cgit v1.2.3 From 7ded56895c11a656408b6ff21086ae04a6a7cda0 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Fri, 3 Feb 2006 21:51:23 +0100 Subject: [PATCH] x86_64: data/functions wrongly marked as __init with cpu hotplug. The attached patch fixes 2 more cases I found by running the reference_init.pl script. These were easy to spot just from the file names. There is one more in init/main.c that I can't exactly zero in on (partly because I don't know how to interpret the data that's spewed out of the tool). Signed-off-by: Ashok Raj Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 13a2eada6c9..b8b9529fa89 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -380,7 +380,7 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) */ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { - static cpumask_t mce_cpus __initdata = CPU_MASK_NONE; + static cpumask_t mce_cpus = CPU_MASK_NONE; mce_cpu_quirks(c); -- cgit v1.2.3 From d22fe808449cd3b5bacd5c0d447e3675f9ab7619 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 3 Feb 2006 21:51:26 +0100 Subject: [PATCH] x86_64: Do more checking in the SRAT header code - Check if the processor/memory affinity entries are long enough according to the ACPI 3.0 spec. - Ignore memory affinity entries that define a zero length region.
All based on BIOS issues found in the field @) Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/mm/srat.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 3bd827f5611..cd25300726f 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -135,7 +135,12 @@ void __init acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa) { int pxm, node; - if (srat_disabled() || pa->flags.enabled == 0) + if (srat_disabled()) + return; + if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) { bad_srat(); + return; + } + if (pa->flags.enabled == 0) return; pxm = pa->proximity_domain; node = setup_node(pxm); @@ -159,8 +164,16 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) int node, pxm; int i; - if (srat_disabled() || ma->flags.enabled == 0) + if (srat_disabled()) return; + if (ma->header.length != sizeof(struct acpi_table_memory_affinity)) { + bad_srat(); + return; + } + if (ma->flags.enabled == 0) + return; + start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32); + end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); pxm = ma->proximity_domain; node = setup_node(pxm); if (node < 0) { @@ -168,8 +181,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) bad_srat(); return; } - start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32); - end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); /* It is fine to add this area to the nodes data it will be used later*/ if (ma->flags.hot_pluggable == 1) printk(KERN_INFO "SRAT: hot plug zone found %lx - %lx \n", -- cgit v1.2.3 From 1de6bf33bc4601d856c286ad5c7d515468e24bbb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 3 Feb 2006 21:51:29 +0100 Subject: [PATCH] x86_64: Fix zero mcfg entry workaround on x86-64 I broke this earlier when moving the patch from i386 to x86-64. Need to return the virtual address here, not the physical address. This fixes some boot time crashes on x86-64. Cc: gregkh@suse.de Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/pci/mmconfig.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index 00d4ddbf980..b4a3fe4ec24 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -46,7 +46,7 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus) if (pci_mmcfg_config_num == 1 && cfg->pci_segment_group_number == 0 && (cfg->start_bus_number | cfg->end_bus_number) == 0) - return cfg->base_address; + return pci_mmcfg_virt[0].virt; /* Fall back to type 0 */ return 0; -- cgit v1.2.3 From a1002a48e1af5ff8d02bfe79536e6fce3a0ec369 Mon Sep 17 00:00:00 2001 From: Kevin VanMaren Date: Fri, 3 Feb 2006 21:51:32 +0100 Subject: [PATCH] x86_64: When allocation of merged SG lists fails in the IOMMU don't merge [ AK: I redid Kevin's fix to be simpler, but the idea and original analysis of the problem is from Kevin] This avoid allocation failures on some SATA systems like Nvidia CK8 when the IOMMU gets fragmented. Modern SATA devices have quite large queues (128 entries) and the FS with ext2/3 is good enough now that it often passes whole 128 page sg lists down to the driver. These require 512K of continuous free space in the IOMMU aperture to map when merged. When the IOMMU is fragmented this could lead to spurious IO errors due to failing mappings. 
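Spelling out the arithmetic behind the 512K figure: a 128-entry sg list of 4 KiB pages merged into a single mapping needs 128 * 4 KiB = 512 KiB of contiguous IOMMU aperture, which a fragmented aperture often cannot supply even when plenty of total space is free.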
Short term fix is to just try to map the SG list again unmerged page by page - this way fragmentation doesn't matter anymore. The code for that was already there, but it just wasn't enabled for the merge case. According to Kevin at least the Nvidia device doesn't seem to benefit from merging much anyways, so the only slowdown is from trying to do an unnecessary merge attempt. Kevin plans to implement better fragmentation avoidance in the future, but that wouldn't be 2.6.16 material. TBD: should add some statistic counters to count how often that really happens. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-gart.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index c37fc7726ba..9188b25fad2 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -457,9 +457,12 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) error: flush_gart(NULL); gart_unmap_sg(dev, sg, nents, dir); - /* When it was forced try again unforced */ - if (force_iommu) - return dma_map_sg_nonforce(dev, sg, nents, dir); + /* When it was forced or merged try again in a dumb way */ + if (force_iommu || iommu_merge) { + out = dma_map_sg_nonforce(dev, sg, nents, dir); + if (out > 0) + return out; + } if (panic_on_overflow) panic("dma_map_sg: overflow on %lu pages\n", pages); iommu_full(dev, pages << PAGE_SHIFT, dir); -- cgit v1.2.3 From 16acc0cd8f1aac63ddf5228c5bc29cf7d31627f4 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 3 Feb 2006 21:51:35 +0100 Subject: [PATCH] x86_64: align per-cpu section to configured cache bytes Align the start of the per-cpu section to the configured number of bytes in a cache line. This stops a BUG_ON() from triggering in load_module() when DEFINE_PER_CPU() is used in a module and the section isn't cacheline-aligned. Rusty also found this and sent a patch in a while ago (http://lkml.org/lkml/2004/10/19/17), I don't know what came of that. Signed-off-by: Zach Brown Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/vmlinux.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index b0eed1faf74..59316d1c1d8 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -173,7 +173,7 @@ SECTIONS __initramfs_start = .; .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } __initramfs_end = .; - . = ALIGN(32); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); __per_cpu_start = .; .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } __per_cpu_end = .; -- cgit v1.2.3 From 099f318b8d97490e58c595eb1b6d3415da5ccc03 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 3 Feb 2006 21:51:38 +0100 Subject: [PATCH] x86_64: Don't allow kprobes on __switch_to kprobes cannot deal with the funny calling conventions when it runs on a different stack when it returns. If someone wants to instrument context switch they can add a probe to schedule() instead. 
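A hedged sketch of the suggested alternative - planting a probe on schedule() from a module. All identifiers below are made up for illustration and error handling is minimal:

#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/sched.h>

/* Runs just before schedule(). Unlike __switch_to(), schedule()
 * returns on the same stack it was entered on, so the state kprobes
 * saves around the probe stays valid. */
static int sched_pre(struct kprobe *p, struct pt_regs *regs)
{
	return 0;
}

static struct kprobe sched_kp = {
	.addr		= (kprobe_opcode_t *)schedule,
	.pre_handler	= sched_pre,
};

static int __init sched_probe_init(void)
{
	return register_kprobe(&sched_kp);
}

static void __exit sched_probe_exit(void)
{
	unregister_kprobe(&sched_kp);
}

module_init(sched_probe_init);
module_exit(sched_probe_exit);
MODULE_LICENSE("GPL");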
Cc: jkenisto@us.ibm.com, prasanna@in.ibm.com Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/process.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 8ded407e4a9..22a05dec81a 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -516,8 +516,10 @@ out: * This could still be optimized: * - fold all the options into a flag word and test it with a single test. * - could test fs/gs bitsliced + * + * Kprobes not supported here. Set the probe on schedule instead. */ -struct task_struct * +__kprobes struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, -- cgit v1.2.3 From 0c3749c41f5eee0da36bbf92b2793338b4d8574f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 3 Feb 2006 21:51:41 +0100 Subject: [PATCH] x86_64: Calibrate APIC timer using PM timer On some broken motherboards (at least one NForce3 based AMD64 laptop) the PIT timer runs at an incorrect frequency. This patch adds a new option "apicpmtimer" that allows using the APIC timer and calibrating it against the PM timer. It requires the earlier patch that allows running the main timer from the APIC. Specifying apicpmtimer implies apicmaintimer. The option defaults to off for now. I tested it on a few systems and the resulting APIC timer frequencies were usually a bit off, but always <1%, which should be tolerable. TBD: figure out a heuristic to enable this automatically on the affected systems. TBD: perhaps do it on all NForce3s, or detect them using DMI? Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/apic.c | 33 +++++++++++++++++++++++++++------ arch/x86_64/kernel/pmtimer.c | 20 ++++++++++++++++++++ 2 files changed, 47 insertions(+), 6 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 673a2fe9923..c02218b3ae2 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -40,6 +40,7 @@ int apic_verbosity; int apic_runs_main_timer; +int apic_calibrate_pmtmr __initdata; int disable_apic_timer __initdata; @@ -746,14 +747,27 @@ static int __init calibrate_APIC_clock(void) __setup_APIC_LVTT(1000000000); apic_start = apic_read(APIC_TMCCT); - rdtscl(tsc_start); - - do { +#ifdef CONFIG_X86_PM_TIMER + if (apic_calibrate_pmtmr && pmtmr_ioport) { + pmtimer_wait(5000); /* 5ms wait */ apic = apic_read(APIC_TMCCT); - rdtscl(tsc); - } while ((tsc - tsc_start) < TICK_COUNT && (apic - apic_start) < TICK_COUNT); + result = (apic_start - apic) * 1000L / 5; + } else +#endif + { + rdtscl(tsc_start); + + do { + apic = apic_read(APIC_TMCCT); + rdtscl(tsc); + } while ((tsc - tsc_start) < TICK_COUNT && + (apic - apic_start) < TICK_COUNT); + + result = (apic_start - apic) * 1000L * cpu_khz / + (tsc - tsc_start); + } + printk("result %d\n", result); - result = (apic_start - apic) * 1000L * cpu_khz / (tsc - tsc_start); printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", result / 1000 / 1000, result / 1000 % 1000); @@ -1115,6 +1129,13 @@ static __init int setup_noapicmaintimer(char *str) } __setup("noapicmaintimer", setup_noapicmaintimer); +static __init int setup_apicpmtimer(char *s) +{ + apic_calibrate_pmtmr = 1; + return setup_apicmaintimer(NULL); +} +__setup("apicpmtimer", setup_apicpmtimer); + /* dummy parsing: see setup.c */ __setup("disableapic", setup_disableapic); diff --git a/arch/x86_64/kernel/pmtimer.c 
b/arch/x86_64/kernel/pmtimer.c index 8b2655ae4e6..5c51d10408a 100644 --- a/arch/x86_64/kernel/pmtimer.c +++ b/arch/x86_64/kernel/pmtimer.c @@ -80,6 +80,26 @@ int pmtimer_mark_offset(void) return lost - 1; } +static unsigned pmtimer_wait_tick(void) +{ + u32 a, b; + for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK; + a == b; + b = inl(pmtmr_ioport) & ACPI_PM_MASK) + ; + return b; +} + +/* note: wait time is rounded up to one tick */ +void pmtimer_wait(unsigned us) +{ + u32 a, b; + a = pmtimer_wait_tick(); + do { + b = inl(pmtmr_ioport); + } while (cyc2us(b - a) < us); +} + void pmtimer_resume(void) { last_pmtmr_tick = inl(pmtmr_ioport); -- cgit v1.2.3 From 91522a964b295e7bccb173218cc00adadfc77bbf Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 3 Feb 2006 21:51:44 +0100 Subject: [PATCH] x86_64: small fix for CFI annotations Conditionalize two unwind directives to match other similarly conditional code. Signed-off-by: Jan Beulich Cc: Jim Houston Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/entry.S | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index cdf9cb1288c..b150c87a08c 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -499,7 +499,9 @@ ENTRY(stub_rt_sigreturn) movq %gs:pda_irqstackptr,%rax cmoveq %rax,%rsp /*todo This needs CFI annotation! */ pushq %rdi # save old stack +#ifndef CONFIG_DEBUG_INFO CFI_ADJUST_CFA_OFFSET 8 +#endif call \func .endm @@ -509,7 +511,9 @@ ENTRY(common_interrupt) /* 0(%rsp): oldrsp-ARGOFFSET */ ret_from_intr: popq %rdi +#ifndef CONFIG_DEBUG_INFO CFI_ADJUST_CFA_OFFSET -8 +#endif cli decl %gs:pda_irqcount #ifdef CONFIG_DEBUG_INFO -- cgit v1.2.3 From d646bce4c77af72f5a293a96fbf14174c9f2a9c2 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 3 Feb 2006 21:51:47 +0100 Subject: [PATCH] x86_64: minor ordering correction to dump_pagetable() Pointer validity should consistently be checked before the pointer is dereferenced. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/mm/fault.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 26eac194064..2e7c3c8ffe0 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -33,7 +33,6 @@ #include #include #include -#include /* Page fault error code bits */ #define PF_PROT (1<<0) /* or no page found */ @@ -157,8 +156,8 @@ void dump_pagetable(unsigned long address) pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); pgd += pgd_index(address); - printk("PGD %lx ", pgd_val(*pgd)); if (bad_address(pgd)) goto bad; + printk("PGD %lx ", pgd_val(*pgd)); if (!pgd_present(*pgd)) goto ret; pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address); -- cgit v1.2.3 From 3777a95903953c55f2309a89679b73c19ae5535b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 3 Feb 2006 21:51:53 +0100 Subject: [PATCH] i386/x86-64: Don't ack the APIC for bad interrupts when the APIC is not enabled It's bad juju to touch the APIC when it hasn't been enabled. I also moved ack_bad_irq for x86-64 out of line following i386. 
Signed-off-by: Andi Kleen Acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/apic.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index c02218b3ae2..6147770b434 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -72,6 +72,26 @@ int get_maxlvt(void) return maxlvt; } +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk("unexpected IRQ trap at vector %02x\n", irq); + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But don't ack when the APIC is disabled. -AK + */ + if (!disable_apic) + ack_APIC_irq(); +} + void clear_local_APIC(void) { int maxlvt; -- cgit v1.2.3 From 14c3f8558717adb192c364f58b0d63dfc850ecca Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 3 Feb 2006 21:51:56 +0100 Subject: [PATCH] x86_64: Let impossible CPUs point to reference per cpu data Hack for 2.6.16. In 2.6.17 all code that uses NR_CPUS should be audited and changed to only touch possible CPUs. Don't mark the reference per cpu data init data (so it stays around after boot) and point all impossible CPUs to it. This way they reference some valid - although shared - memory. Usually this is only initialization like INIT_LIST_HEADs and there won't be races because these CPUs never run. Still somewhat hackish. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/vmlinux.lds.S | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 59316d1c1d8..74db0062d4a 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -172,13 +172,15 @@ SECTIONS . = ALIGN(4096); __initramfs_start = .; .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } - __initramfs_end = .; + __initramfs_end = .; + /* temporary here to work around NR_CPUS. If you see this comment in 2.6.17+ complain */ + . = ALIGN(4096); + __init_end = .; . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); __per_cpu_start = .; .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } __per_cpu_end = .; . = ALIGN(4096); __nosave_begin = .; -- cgit v1.2.3 From 5b7b644ca9a12396a46fad825a973fb8bed17102 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Fri, 3 Feb 2006 21:51:59 +0100 Subject: [PATCH] x86_64: IOMMU printk cleanup This patch contains a printk reorder to remove the current problem of displaying "PCI-DMA: Disabling IOMMU." and then "PCI-DMA: using GART IOMMU" 20 lines later in dmesg. It also contains a printk reorder in swiotlb to state swiotlb enablement prior to describing the location of the bounce buffers, and a printk reorder to state gart enablement prior to describing the aperture. Also contains a whitespace cleanup in arch/x86_64/kernel/setup.c. Tested (along with patch 2/2) on dual opteron with gart enabled, iommu=soft, and iommu=off. 
Signed-off-by: Jon Mason Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-gart.c | 10 +++++++++- arch/x86_64/kernel/pci-nommu.c | 7 ------- arch/x86_64/kernel/pci-swiotlb.c | 2 +- arch/x86_64/kernel/setup.c | 2 +- 4 files changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index 9188b25fad2..2fe23a6c361 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -645,9 +645,18 @@ static int __init pci_iommu_init(void) (no_agp && init_k8_gatt(&info) < 0)) { no_iommu = 1; no_iommu_init(); + printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n"); + if (end_pfn > MAX_DMA32_PFN) { + printk(KERN_ERR "WARNING more than 4GB of memory " + "but IOMMU not compiled in.\n" + KERN_ERR "WARNING 32bit PCI may malfunction.\n" + KERN_ERR "You might want to enable " + "CONFIG_GART_IOMMU\n"); + } return -1; } + printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); aper_size = info.aper_size * 1024 * 1024; iommu_size = check_iommu_size(info.aper_base, aper_size); iommu_pages = iommu_size >> PAGE_SHIFT; @@ -721,7 +730,6 @@ static int __init pci_iommu_init(void) flush_gart(NULL); - printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); dma_ops = &gart_dma_ops; return 0; diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index e4156497519..44adcc2d5e5 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -88,12 +88,5 @@ void __init no_iommu_init(void) { if (dma_ops) return; - printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n"); dma_ops = &nommu_dma_ops; - if (end_pfn > MAX_DMA32_PFN) { - printk(KERN_ERR - "WARNING more than 4GB of memory but IOMMU not compiled in.\n" - KERN_ERR "WARNING 32bit PCI may malfunction.\n" - KERN_ERR "You might want to enable CONFIG_GART_IOMMU\n"); - } } diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c index 3569a25ad7f..990ed67896f 100644 --- a/arch/x86_64/kernel/pci-swiotlb.c +++ b/arch/x86_64/kernel/pci-swiotlb.c @@ -35,8 +35,8 @@ void pci_swiotlb_init(void) (end_pfn > MAX_DMA32_PFN || force_iommu)) swiotlb = 1; if (swiotlb) { - swiotlb_init(); printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); + swiotlb_init(); dma_ops = &swiotlb_dma_ops; } } diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 506f152c238..363db5a003d 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -741,7 +741,7 @@ void __init setup_arch(char **cmdline_p) e820_setup_gap(); #ifdef CONFIG_GART_IOMMU - iommu_hole_init(); + iommu_hole_init(); #endif #ifdef CONFIG_VT -- cgit v1.2.3 From cef5076987dd545ac74f4efcf1c962be8eac34b0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 5 Feb 2006 10:51:57 -0800 Subject: Revert "[PATCH] x86_64: Fix the node cpumask of a cpu going down" This reverts commit 10f4dc8b27ac42f930ac55adb8c521264dc997f8. Quoth Andi Kleen: "Kiran decided that it makes the problem worse than it was before. Fixing it fully requires more work which is too much for 2.6.16. So please revert that commit for now." 
Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/smpboot.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 67e4e28f4df..a28756ef7ce 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -59,7 +59,6 @@ #include #include #include -#include /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -891,7 +890,6 @@ do_rest: if (boot_error) { cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ - clear_node_cpumask(cpu); /* was set by numa_add_cpu */ cpu_clear(cpu, cpu_present_map); cpu_clear(cpu, cpu_possible_map); x86_cpu_to_apicid[cpu] = BAD_APICID; @@ -1189,7 +1187,6 @@ void remove_cpu_from_maps(void) cpu_clear(cpu, cpu_callout_map); cpu_clear(cpu, cpu_callin_map); clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ - clear_node_cpumask(cpu); } int __cpu_disable(void) -- cgit v1.2.3 From 488fc08d914f2b07b701c9b9c811437cc1c1c518 Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Tue, 7 Feb 2006 12:58:23 -0800 Subject: [PATCH] x86_64: Fix the node cpumask of a cpu going down Currently, x86_64 and ia64 arches do not clear the corresponding bits in the node's cpumask when a cpu goes down or cpu bring up is cancelled. This is buggy since there are pieces of common code where the cpumask is checked in the cpu down code path to decide on things (like in the slab down path). PPC does the right thing, but x86_64 and ia64 don't (This was the reason Sonny hit upon a slab bug during cpu offline on ppc and could not reproduce on other arches). This patch fixes it for x86_64. I won't attempt ia64 as I cannot test it. Credit for spotting this should go to Alok. (akpm: this was applied, then reverted. But it's OK now because we now use for_each_cpu() in the right places). Signed-off-by: Alok N Kataria Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/smpboot.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index a28756ef7ce..67e4e28f4df 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -59,6 +59,7 @@ #include #include #include +#include /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -890,6 +891,7 @@ do_rest: if (boot_error) { cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ + clear_node_cpumask(cpu); /* was set by numa_add_cpu */ cpu_clear(cpu, cpu_present_map); cpu_clear(cpu, cpu_possible_map); x86_cpu_to_apicid[cpu] = BAD_APICID; @@ -1187,6 +1189,7 @@ void remove_cpu_from_maps(void) cpu_clear(cpu, cpu_callout_map); cpu_clear(cpu, cpu_callin_map); clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ + clear_node_cpumask(cpu); } int __cpu_disable(void) -- cgit v1.2.3 From 4fb7d9827e89cc0a4ad2fde32ffa08f77cc0b7fe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 21 Dec 2005 13:24:46 -0500 Subject: [PATCH] drive_info removal outside of arch/i386 drive_info is used only by hd.c and that happens under #ifdef __i386__. 
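The usage pattern being relied on is roughly the following (a hypothetical sketch, not the literal hd.c source):

	#ifdef __i386__
		/* Sketch: hd.c, the sole consumer, reads BIOS drive geometry
		 * out of the 32-byte drive_info blob; no other architecture
		 * touches it, so the x86_64 definition and export removed
		 * below are dead weight. */
		unsigned char *p = (unsigned char *)&drive_info;
		/* ... parse cylinders/heads/sectors from p ... */
	#endif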
Signed-off-by: Al Viro --- arch/x86_64/kernel/setup.c | 2 -- arch/x86_64/kernel/x8664_ksyms.c | 5 ----- 2 files changed, 7 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 363db5a003d..9435ab7d6fb 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -94,7 +94,6 @@ unsigned long saved_video_mode; /* * Setup options */ -struct drive_info_struct { char dummy[32]; } drive_info; struct screen_info screen_info; struct sys_desc_table_struct { unsigned short length; @@ -572,7 +571,6 @@ void __init setup_arch(char **cmdline_p) unsigned long kernel_end; ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); - drive_info = DRIVE_INFO; screen_info = SCREEN_INFO; edid_info = EDID_INFO; saved_video_mode = SAVED_VIDEO_MODE; diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index b614d54d2ae..3496abc8d37 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -39,11 +39,6 @@ extern void __write_lock_failed(rwlock_t *rw); extern void __read_lock_failed(rwlock_t *rw); #endif -#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE) -extern struct drive_info_struct drive_info; -EXPORT_SYMBOL(drive_info); -#endif - /* platform dependent support */ EXPORT_SYMBOL(boot_cpu_data); //EXPORT_SYMBOL(dump_fpu); -- cgit v1.2.3 From dd42b1518666132c21e7348c4b599c501f0021a1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 1 Feb 2006 07:30:33 -0500 Subject: [PATCH] amd64 time.c __iomem annotations Signed-off-by: Al Viro --- arch/x86_64/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index c0844bffbf8..dba7237be5c 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -748,7 +748,7 @@ static __init int late_hpet_init(void) * Timer0 and Timer1 is used by platform. */ hd.hd_phys_address = vxtime.hpet_address; - hd.hd_address = (void *)fix_to_virt(FIX_HPET_BASE); + hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); hd.hd_nirqs = ntimer; hd.hd_flags = HPET_DATA_PLATFORM; hpet_reserve_timer(&hd, 0); -- cgit v1.2.3 From cc59853b4a9973126e15e0e6bdddf0627d4b99c4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Feb 2006 20:28:01 -0500 Subject: [PATCH] arch/x86_64/pci/mmconfig.c NULL noise removal Signed-off-by: Al Viro --- arch/x86_64/pci/mmconfig.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index b4a3fe4ec24..18f371fe37f 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -49,7 +49,7 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus) return pci_mmcfg_virt[0].virt; /* Fall back to type 0 */ - return 0; + return NULL; } static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) -- cgit v1.2.3 From 4b88f09364e94b05b66fb1441131e8460495a2f8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Feb 2006 00:35:50 +0100 Subject: [PATCH] x86-64: Add sys_unshare Add unshare syscall for x86-64. ppoll/pselect are not ready yet, but add reservations.
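For illustration, a 32-bit binary could smoke-test the new entry like this (hypothetical test program; slot 310 comes from the table below, and 0x00020000 is the usual CLONE_NEWNS value - both assumptions for the sketch, not part of this patch):

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#define NR_ia32_unshare	310		/* ia32 table slot added below */
	#define CLONE_NEWNS	0x00020000	/* request a private mount namespace */

	int main(void)
	{
		/* Typically needs root; detaches this process's mount namespace. */
		if (syscall(NR_ia32_unshare, CLONE_NEWNS) != 0) {
			perror("unshare");
			return 1;
		}
		return 0;
	}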
Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/ia32/ia32entry.S | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86_64') diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 067c0f47bd0..ada4535d016 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -685,6 +685,9 @@ ia32_sys_call_table: .quad sys_readlinkat /* 305 */ .quad sys_fchmodat .quad sys_faccessat + .quad sys_ni_syscall /* pselect6 for now */ + .quad sys_ni_syscall /* ppoll for now */ + .quad sys_unshare /* 310 */ ia32_syscall_end: .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8 .quad ni_syscall -- cgit v1.2.3