From e0c1e9bf81badc7ba59e120d6218101903d5d103 Mon Sep 17 00:00:00 2001 From: Kimball Murray Date: Mon, 8 May 2006 15:17:16 +0200 Subject: [PATCH] x86_64: avoid IRQ0 ioapic pin collision The patch addresses a problem with ACPI SCI interrupt entry, which gets re-used, and the IRQ is assigned to another unrelated device. The patch corrects the code such that SCI IRQ is skipped and duplicate entry is avoided. Second issue came up with VIA chipset, the problem was caused by original patch assigning IRQs starting 16 and up. The VIA chipset uses 4-bit IRQ register for internal interrupt routing, and therefore cannot handle IRQ numbers assigned to its devices. The patch corrects this problem by allowing PCI IRQs below 16. Cc: len.brown@intel.com Signed-off by: Natalie Protasevich Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/io_apic.c | 5 +++++ arch/x86_64/kernel/mpparse.c | 12 +++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 77b4c608cca..0de3ea93883 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -1777,6 +1777,8 @@ static inline void unlock_ExtINT_logic(void) spin_unlock_irqrestore(&ioapic_lock, flags); } +int timer_uses_ioapic_pin_0; + /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -1814,6 +1816,9 @@ static inline void check_timer(void) pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; + if (pin1 == 0) + timer_uses_ioapic_pin_0 = 1; + apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", vector, apic1, pin1, apic2, pin2); diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index b17cf3eba35..083da7e606b 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -968,7 +968,17 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) */ int irq = gsi; if (gsi < MAX_GSI_NUM) { - if (gsi > 15) + /* + * Retain the VIA chipset work-around (gsi > 15), but + * avoid a problem where the 8254 timer (IRQ0) is setup + * via an override (so it's not on pin 0 of the ioapic), + * and at the same time, the pin 0 interrupt is a PCI + * type. The gsi > 15 test could cause these two pins + * to be shared as IRQ0, and they are not shareable. + * So test for this condition, and if necessary, avoid + * the pin collision. + */ + if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0)) gsi = pci_irq++; /* * Don't assign IRQ used by ACPI SCI -- cgit v1.2.3 From 5192d84e4c32cd335fd572e5ff0712041f45f7e7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 8 May 2006 15:17:19 +0200 Subject: [PATCH] x86_64: Check for too many northbridges in IOMMU code The IOMMU code can only deal with 8 northbridges. Error out when more are found. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-gart.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index 9d3d76c85ae..2480d3f08a4 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -639,6 +639,14 @@ static int __init pci_iommu_init(void) return -1; } + i = 0; + for_all_nb(dev) + i++; + if (i > MAX_NB) { + printk(KERN_ERR "PCI-GART: Too many northbridges (%ld). Disabled\n", i); + return -1; + } + printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); aper_size = info.aper_size * 1024 * 1024; iommu_size = check_iommu_size(info.aper_base, aper_size); -- cgit v1.2.3 From cdc60a4c8e71c4bcf67e83fac6c0cabd0ff19bfe Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Mon, 8 May 2006 15:17:22 +0200 Subject: [PATCH] x86_64: fix die_lock nesting I noticed this when poking around in this area. The oops_begin() function in x86_64 would only conditionally claim the die_lock if the call is nested, but oops_end() would always release the spinlock. This patch adds a nest count for the die lock so that the release of the lock is only done on the final oops_end(). Signed-off-by: Corey Minyard Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/traps.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 2700b1375c1..0ebb281aa17 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -385,6 +385,7 @@ void out_of_line_bug(void) static DEFINE_SPINLOCK(die_lock); static int die_owner = -1; +static unsigned int die_nest_count; unsigned __kprobes long oops_begin(void) { @@ -399,6 +400,7 @@ unsigned __kprobes long oops_begin(void) else spin_lock(&die_lock); } + die_nest_count++; die_owner = cpu; console_verbose(); bust_spinlocks(1); @@ -409,7 +411,13 @@ void __kprobes oops_end(unsigned long flags) { die_owner = -1; bust_spinlocks(0); - spin_unlock_irqrestore(&die_lock, flags); + die_nest_count--; + if (die_nest_count) + /* We still own the lock */ + local_irq_restore(flags); + else + /* Nest count reaches zero, release the lock. */ + spin_unlock_irqrestore(&die_lock, flags); if (panic_on_oops) panic("Oops"); } -- cgit v1.2.3 From 8b1ffe9550e71224c43d8c754245bd76f4ea9bb8 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Mon, 8 May 2006 15:17:25 +0200 Subject: [PATCH] x86_64: add nmi_exit to die_nmi Playing with NMI watchdog on x86_64, I discovered that it didn't do what I expected. It always panic-ed, even when it didn't happen from interrupt context. This patch solves that problem for me. Also, in this case, do_exit() will be called with interrupts disabled, I believe. Would it be wise to also call local_irq_enable() after nmi_exit()? [Yes I added it -AK] Currently, on x86_64, any NMI watchdog timeout will cause a panic because the irq count will always be set to be in an interrupt when do_exit() is called from die_nmi(). If we add nmi_exit() to the die_nmi() call (since the nmi will never exit "normally") it seems to solve this problem. The following small program can be used to trigger the NMI watchdog to reproduce this: main () { iopl(3); for (;;) asm("cli"); } Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/traps.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 0ebb281aa17..6b87268c5c2 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -472,6 +472,8 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs) panic("nmi watchdog"); printk("console shuts up ...\n"); oops_end(flags); + nmi_exit(); + local_irq_enable(); do_exit(SIGSEGV); } -- cgit v1.2.3 From ac71d12c990526b01ef6cfe50907ef8530a30331 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 8 May 2006 15:17:28 +0200 Subject: [PATCH] x86_64: Avoid EBDA area in early boot allocator Based on analysis&patch from Robert Hentosch Observed on a Dell PE6850 with 16GB The problem occurs very early on, when the kernel allocates space for the temporary memory map called bootmap. The bootmap overlaps the EBDA region. EBDA region is not historically reserved in the e820 mapping. When the bootmap is freed it marks the EBDA region as usable. If you notice in setup.c there is already code to work around the EBDA in reserve_ebda_region(), this check however occurs after the bootmap is allocated and doesn't prevent the bootmap from using this range. AK: I redid the original patch. Thanks also to Jan Beulich for spotting some mistakes. Cc: Robert_Hentosch@dell.com Cc: jbeulich@novell.com Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/e820.c | 6 ++++++ arch/x86_64/kernel/setup.c | 30 ++++++++++++++++++++++-------- 2 files changed, 28 insertions(+), 8 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 62776c07cff..222b5b46d2b 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -76,6 +76,12 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) *addrp = __pa_symbol(&_end); return 1; } + + if (last >= ebda_addr && addr < ebda_addr + ebda_size) { + *addrp = ebda_addr + ebda_size; + return 1; + } + /* XXX ramdisk image here? */ return 0; } diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index ebc3c33b1c6..f0870bef24d 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -571,17 +571,28 @@ static inline void copy_edd(void) #endif #define EBDA_ADDR_POINTER 0x40E -static void __init reserve_ebda_region(void) + +unsigned __initdata ebda_addr; +unsigned __initdata ebda_size; + +static void discover_ebda(void) { - unsigned int addr; - /** + /* * there is a real-mode segmented pointer pointing to the * 4K EBDA area at 0x40E */ - addr = *(unsigned short *)phys_to_virt(EBDA_ADDR_POINTER); - addr <<= 4; - if (addr) - reserve_bootmem_generic(addr, PAGE_SIZE); + ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER; + ebda_addr <<= 4; + + ebda_size = *(unsigned short *)(unsigned long)ebda_addr; + + /* Round EBDA up to pages */ + if (ebda_size == 0) + ebda_size = 1; + ebda_size <<= 10; + ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE); + if (ebda_size > 64*1024) + ebda_size = 64*1024; } void __init setup_arch(char **cmdline_p) @@ -627,6 +638,8 @@ void __init setup_arch(char **cmdline_p) check_efer(); + discover_ebda(); + init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); dmi_scan_machine(); @@ -669,7 +682,8 @@ void __init setup_arch(char **cmdline_p) reserve_bootmem_generic(0, PAGE_SIZE); /* reserve ebda region */ - reserve_ebda_region(); + if (ebda_addr) + reserve_bootmem_generic(ebda_addr, ebda_size); #ifdef CONFIG_SMP /* -- cgit v1.2.3 From f0fdabf8bf187c9aafeb139a828c530ef45cf022 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 15 May 2006 18:19:38 +0200 Subject: [PATCH] x86_64: Don't warn for overflow in nommu case when dma_mask is < 32bit This triggers for b44's 1GB DMA workaround which tries to map first and then bounces. The 32bit heuristic is reasonable because the IOMMU doesn't attempt to handle < 32bit masks anyways. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-nommu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index 44adcc2d5e5..1f6ecc62061 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -12,9 +12,10 @@ static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { if (hwdev && bus + size > *hwdev->dma_mask) { - printk(KERN_ERR - "nommu_%s: overflow %Lx+%lu of device mask %Lx\n", - name, (long long)bus, size, (long long)*hwdev->dma_mask); + if (*hwdev->dma_mask >= 0xffffffffULL) + printk(KERN_ERR + "nommu_%s: overflow %Lx+%lu of device mask %Lx\n", + name, (long long)bus, size, (long long)*hwdev->dma_mask); return 0; } return 1; -- cgit v1.2.3 From 40e59a61669a3cab9e0bd24644e29155d6b00970 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 15 May 2006 18:19:47 +0200 Subject: [PATCH] x86_64: Don't schedule on exception stack on preemptive kernels Extends an earlier patch from John Blackwood to more exception handlers that also run on the exception stacks. Expand the use of preempt_conditional_{sti,cli} to all cases where interrupts are to be re-enabled during exception handling while running on an IST stack. Based on original patch from Jan Beulich. Cc: John Blackwood Cc: jbeulich@novell.com Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/traps.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 6b87268c5c2..cea335e8746 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -102,6 +102,8 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) { if (regs->eflags & X86_EFLAGS_IF) local_irq_disable(); + /* Make sure to not schedule here because we could be running + on an exception stack. */ preempt_enable_no_resched(); } @@ -483,8 +485,6 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, { struct task_struct *tsk = current; - conditional_sti(regs); - tsk->thread.error_code = error_code; tsk->thread.trap_no = trapnr; @@ -521,6 +521,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ + conditional_sti(regs); \ do_trap(trapnr, signr, str, regs, error_code, NULL); \ } @@ -535,6 +536,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ + conditional_sti(regs); \ do_trap(trapnr, signr, str, regs, error_code, &info); \ } @@ -548,7 +550,17 @@ DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) DO_ERROR(18, SIGSEGV, "reserved", reserved) -DO_ERROR(12, SIGBUS, "stack segment", stack_segment) + +/* Runs on IST stack */ +asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) +{ + if (notify_die(DIE_TRAP, "stack segment", regs, error_code, + 12, SIGBUS) == NOTIFY_STOP) + return; + preempt_conditional_sti(regs); + do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); + preempt_conditional_cli(regs); +} asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) { @@ -682,8 +694,9 @@ asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { return; } + preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); - return; + preempt_conditional_cli(regs); } /* Help handler running on IST stack to switch back to user stack -- cgit v1.2.3