From 4ed0d3e6c64cfd9ba4ceb2099b10d1cf8ece4320 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 24 Apr 2009 17:30:20 -0700 Subject: Intel IOMMU Pass Through Support The patch adds kernel parameter intel_iommu=pt to set up pass through mode in context mapping entry. This disables DMAR in linux kernel; but KVM still runs on VT-d and interrupt remapping still works. In this mode, kernel uses swiotlb for DMA API functions but other VT-d functionalities are enabled for KVM. KVM always uses multi level translation page table in VT-d. By default, pass though mode is disabled in kernel. This is useful when people don't want to enable VT-d DMAR in kernel but still want to use KVM and interrupt remapping for reasons like DMAR performance concern or debug purpose. Signed-off-by: Fenghua Yu Acked-by: Weidong Han Signed-off-by: David Woodhouse --- arch/x86/kernel/pci-dma.c | 6 ++++++ arch/x86/kernel/pci-swiotlb.c | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 745579bc825..8cad0d85424 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -160,6 +160,8 @@ again: return page_address(page); } +extern int iommu_pass_through; + /* * See for the iommu kernel parameter * documentation. @@ -209,6 +211,10 @@ static __init int iommu_setup(char *p) #ifdef CONFIG_SWIOTLB if (!strncmp(p, "soft", 4)) swiotlb = 1; + if (!strncmp(p, "pt", 2)) { + iommu_pass_through = 1; + return 1; + } #endif gart_parse_options(p); diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 221a3853e26..3a0c51e0ba6 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -71,7 +71,8 @@ void __init pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 - if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) + if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) || + iommu_pass_through) swiotlb = 1; #endif if (swiotlb_force) -- cgit v1.2.3 From aed5d5f4c5ea5da01a774e42cff08c4b4fa59072 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 30 Apr 2009 17:57:11 -0700 Subject: Fix !CONFIG_DMAR build failure introduced by Intel IOMMU Pass Through Support This updated patch should fix the compiling errors and remove the extern iommu_pass_through from drivers/pci/intel-iommu.c file. Signed-off-by: Fenghua Yu Signed-off-by: David Woodhouse --- arch/x86/kernel/pci-dma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 8cad0d85424..049005e8217 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -32,6 +32,8 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; +int iommu_pass_through; + dma_addr_t bad_dma_address __read_mostly = 0; EXPORT_SYMBOL(bad_dma_address); @@ -160,8 +162,6 @@ again: return page_address(page); } -extern int iommu_pass_through; - /* * See for the iommu kernel parameter * documentation. -- cgit v1.2.3 From ee1ca48fae7e575d5e399d4fdcfe0afc1212a64c Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Thu, 21 May 2009 17:09:10 -0700 Subject: ACPI: Disable ARB_DISABLE on platforms where it is not needed ARB_DISABLE is a NOP on all of the recent Intel platforms. For such platforms, reduce contention on c3_lock by skipping the fake ARB_DISABLE. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Len Brown --- arch/x86/kernel/acpi/cstate.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index bbbe4bbb6f3..8c44c232efc 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -34,12 +34,22 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, flags->bm_check = 1; else if (c->x86_vendor == X86_VENDOR_INTEL) { /* - * Today all CPUs that support C3 share cache. - * TBD: This needs to look at cache shared map, once - * multi-core detection patch makes to the base. + * Today all MP CPUs that support C3 share cache. + * And caches should not be flushed by software while + * entering C3 type state. */ flags->bm_check = 1; } + + /* + * On all recent Intel platforms, ARB_DISABLE is a nop. + * So, set bm_control to zero to indicate that ARB_DISABLE + * is not required while entering C3 type state on + * P4, Core and beyond CPUs + */ + if (c->x86_vendor == X86_VENDOR_INTEL && + (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14))) + flags->bm_control = 0; } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); -- cgit v1.2.3 From c636f753b5b943f08fb3490e7f1a9b47aa5cc7d3 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 19 May 2009 23:47:38 -0400 Subject: ACPI: delete dead acpi_disabled setting code Testing CONFIG_ACPI inside boot.c is a waste of text, since boot.c is built only when CONFIG_ACPI=y Signed-off-by: Len Brown --- arch/x86/kernel/acpi/boot.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 723989d7f80..d4acedf0786 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -43,11 +43,7 @@ static int __initdata acpi_force = 0; u32 acpi_rsdt_forced; -#ifdef CONFIG_ACPI -int acpi_disabled = 0; -#else -int acpi_disabled = 1; -#endif +int acpi_disabled; EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 -- cgit v1.2.3 From d023e49118b9c93bbab9aaf798b25f78f1a5803c Mon Sep 17 00:00:00 2001 From: Olivier Berger Date: Thu, 21 May 2009 16:07:38 +0200 Subject: ACPI: Remove Asus P4B266 from blacklist See http://marc.info/?l=linux-acpi&m=124068823904429&w=2 for discussion Signed-off-by: Olivier Berger Signed-off-by: Len Brown --- arch/x86/kernel/acpi/boot.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index d4acedf0786..817d6a5e115 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1563,14 +1563,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"), }, }, - { - .callback = force_acpi_ht, - .ident = "ASUS P4B266", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - DMI_MATCH(DMI_BOARD_NAME, "P4B266"), - }, - }, { .callback = force_acpi_ht, .ident = "ASUS P2B-DS", -- cgit v1.2.3 From c4bf2f372db09ef8d16a25a60d523bfa1c50f7b5 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 11 Jun 2009 23:53:55 -0400 Subject: ACPI, PCI, x86: move MCFG parsing routine from ACPI to PCI file Move arch/x86/kernel/acpi/boot.c: acpi_parse_mcfg() to arch/x86/pci/mmconfig-shared.c: pci_parse_mcfg() where it is used, and make it static. Move associated globals and helper routine with it. No functional change. This code move is in preparation for SFI support, which will allow the PCI code to find the MCFG table on systems which do not support ACPI. Signed-off-by: Len Brown Acked-by: Jesse Barnes --- arch/x86/kernel/acpi/boot.c | 66 --------------------------------------------- 1 file changed, 66 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 817d6a5e115..f54e0e557cd 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -117,72 +117,6 @@ void __init __acpi_unmap_table(char *map, unsigned long size) early_iounmap(map, size); } -#ifdef CONFIG_PCI_MMCONFIG - -static int acpi_mcfg_64bit_base_addr __initdata = FALSE; - -/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ -struct acpi_mcfg_allocation *pci_mmcfg_config; -int pci_mmcfg_config_num; - -static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) -{ - if (!strcmp(mcfg->header.oem_id, "SGI")) - acpi_mcfg_64bit_base_addr = TRUE; - - return 0; -} - -int __init acpi_parse_mcfg(struct acpi_table_header *header) -{ - struct acpi_table_mcfg *mcfg; - unsigned long i; - int config_size; - - if (!header) - return -EINVAL; - - mcfg = (struct acpi_table_mcfg *)header; - - /* how many config structures do we have */ - pci_mmcfg_config_num = 0; - i = header->length - sizeof(struct acpi_table_mcfg); - while (i >= sizeof(struct acpi_mcfg_allocation)) { - ++pci_mmcfg_config_num; - i -= sizeof(struct acpi_mcfg_allocation); - }; - if (pci_mmcfg_config_num == 0) { - printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); - return -ENODEV; - } - - config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); - pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); - if (!pci_mmcfg_config) { - printk(KERN_WARNING PREFIX - "No memory for MCFG config tables\n"); - return -ENOMEM; - } - - memcpy(pci_mmcfg_config, &mcfg[1], config_size); - - acpi_mcfg_oem_check(mcfg); - - for (i = 0; i < pci_mmcfg_config_num; ++i) { - if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) && - !acpi_mcfg_64bit_base_addr) { - printk(KERN_ERR PREFIX - "MMCONFIG not in low 4GB of memory\n"); - kfree(pci_mmcfg_config); - pci_mmcfg_config_num = 0; - return -ENODEV; - } - } - - return 0; -} -#endif /* CONFIG_PCI_MMCONFIG */ - #ifdef CONFIG_X86_LOCAL_APIC static int __init acpi_parse_madt(struct acpi_table_header *table) { -- cgit v1.2.3 From 4d2be1267fcfb3a4d2198fd696aec5e3dcbce60e Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 11 Jun 2009 15:28:09 +0530 Subject: perf_counter, x86: Check old-AMD performance monitoring support AMD supports performance monitoring start from K7 (i.e. family 6), so disable it for earlier AMD CPUs. Signed-off-by: Jaswinder Singh Rajput Cc: Robert Richter Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: <1244714289.6923.0.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 275bc142cd5..3c37c3930ca 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1459,6 +1459,10 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { + /* Performance-monitoring supported from K7 and later: */ + if (boot_cpu_data.x86 < 6) + return -ENODEV; + x86_pmu = amd_pmu; switch (boot_cpu_data.x86) { -- cgit v1.2.3 From f4db43a38f7387c3b19c9565124c06ab0c5d6e9a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 13 Jun 2009 01:06:21 +0530 Subject: perf_counter, x86: Update AMD hw caching related event table All AMD models share the same hw caching related event table. Also complete the table with more events. Signed-off-by: Jaswinder Singh Rajput Cc: Robert Richter Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: <1244835381.2802.2.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3c37c3930ca..77a59a5566a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -389,23 +389,23 @@ static u64 intel_pmu_raw_event(u64 event) return event & CORE_EVNTSEL_MASK; } -static const u64 amd_0f_hw_cache_event_ids +static const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = { [ C(L1D) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x0042, /* Data Cache Refills from L2 */ [ C(RESULT_MISS) ] = 0, }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ + [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ }, }, [ C(L1I ) ] = { @@ -418,17 +418,17 @@ static const u64 amd_0f_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ [ C(RESULT_MISS) ] = 0, }, }, [ C(LL ) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ + [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ [ C(RESULT_MISS) ] = 0, }, [ C(OP_PREFETCH) ] = { @@ -438,8 +438,8 @@ static const u64 amd_0f_hw_cache_event_ids }, [ C(DTLB) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0, @@ -1465,16 +1465,10 @@ static int amd_pmu_init(void) x86_pmu = amd_pmu; - switch (boot_cpu_data.x86) { - case 0x0f: - case 0x10: - case 0x11: - memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); + /* Events are common for all AMDs */ + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); - pr_cont("AMD Family 0f/10/11 events, "); - break; - } return 0; } -- cgit v1.2.3 From 507fa3a3d80365c595113a5ac3232309e3dbf5d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Jun 2009 17:46:01 +0200 Subject: x86: hpet: Mark per cpu interrupts IRQF_TIMER to prevent resume failure timer interrupts are excluded from being disabled during suspend. The clock events code manages the disabling of clock events on its own because the timer interrupt needs to be functional before the resume code reenables the device interrupts. The hpet per cpu timers request their interrupt without setting the IRQF_TIMER flag so suspend_device_irqs() disables them as well which results in a fatal resume failure on the boot CPU. Adding IRQF_TIMER to the interupt flags when requesting the hpet per cpu timer interrupts solves the problem. Reported-by: Benjamin S. Signed-off-by: Thomas Gleixner Tested-by: Benjamin S. Cc: stable@kernel.org --- arch/x86/kernel/hpet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 81408b93f88..dedc2bddf7a 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -510,7 +510,8 @@ static int hpet_setup_irq(struct hpet_dev *dev) { if (request_irq(dev->irq, hpet_interrupt_handler, - IRQF_DISABLED|IRQF_NOBALANCING, dev->name, dev)) + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + dev->name, dev)) return -1; disable_irq(dev->irq); -- cgit v1.2.3 From 5a6cec3abbdb74244caab68db100825a8c4ac02d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 May 2009 11:25:09 +0200 Subject: perf_counter, x86: Fix call-chain walking Fix the ptregs variant when we hit user-mode tasks. Cc: Frederic Weisbecker Cc: Pekka Enberg Cc: Arjan van de Ven Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 77a59a5566a..09d8cb69c3f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1644,7 +1644,9 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) const void __user *fp; int nr = entry->nr; - regs = (struct pt_regs *)current->thread.sp0 - 1; + if (!user_mode(regs)) + regs = task_pt_regs(current); + fp = (void __user *)regs->bp; callchain_store(entry, regs->ip); @@ -1656,7 +1658,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) if (!copy_stack_frame(fp, &frame)) break; - if ((unsigned long)fp < user_stack_pointer(regs)) + if ((unsigned long)fp < regs->sp) break; callchain_store(entry, frame.return_address); -- cgit v1.2.3 From 038e836e97e70c4ad2b5058b07fc7207f50b59dd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Jun 2009 09:57:59 +0200 Subject: perf_counter, x86: Fix kernel-space call-chains Kernel-space call-chains were trimmed at the first entry because we never processed anything beyond the first stack context. Allow the backtrace to jump from NMI to IRQ stack then to task stack and finally user-space stack. Also calculate the stack and bp variables correctly so that the stack walker does not exit early. We can get deep traces as a result, visible in perf report -D output: 0x32af0 [0xe0]: PERF_EVENT (IP, 5): 15134: 0xffffffff815225fd period: 1 ... chain: u:2, k:22, nr:24 ..... 0: 0xffffffff815225fd ..... 1: 0xffffffff810ac51c ..... 2: 0xffffffff81018e29 ..... 3: 0xffffffff81523939 ..... 4: 0xffffffff81524b8f ..... 5: 0xffffffff81524bd9 ..... 6: 0xffffffff8105e498 ..... 7: 0xffffffff8152315a ..... 8: 0xffffffff81522c3a ..... 9: 0xffffffff810d9b74 ..... 10: 0xffffffff810dbeec ..... 11: 0xffffffff810dc3fb This is a 22-entries kernel-space chain. (We still only record reliable stack entries.) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 09d8cb69c3f..6d5e7cfd97e 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1575,8 +1575,8 @@ static void backtrace_warning(void *data, char *msg) static int backtrace_stack(void *data, char *name) { - /* Don't bother with IRQ stacks for now */ - return -1; + /* Process all stacks: */ + return 0; } static void backtrace_address(void *data, unsigned long addr, int reliable) @@ -1594,6 +1594,8 @@ static const struct stacktrace_ops backtrace_ops = { .address = backtrace_address, }; +#include "../dumpstack.h" + static void perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { @@ -1601,26 +1603,20 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) char *stack; int nr = entry->nr; - callchain_store(entry, instruction_pointer(regs)); + callchain_store(entry, regs->ip); stack = ((char *)regs + sizeof(struct pt_regs)); #ifdef CONFIG_FRAME_POINTER - bp = frame_pointer(regs); + get_bp(bp); #else bp = 0; #endif - dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); + dump_trace(NULL, regs, (void *)&stack, bp, &backtrace_ops, entry); entry->kernel = entry->nr - nr; } - -struct stack_frame { - const void __user *next_fp; - unsigned long return_address; -}; - static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) { int ret; @@ -1652,7 +1648,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, regs->ip); while (entry->nr < MAX_STACK_DEPTH) { - frame.next_fp = NULL; + frame.next_frame = NULL; frame.return_address = 0; if (!copy_stack_frame(fp, &frame)) @@ -1662,7 +1658,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) break; callchain_store(entry, frame.return_address); - fp = frame.next_fp; + fp = frame.next_frame; } entry->user = entry->nr - nr; -- cgit v1.2.3 From 0975904276552c8e201dad0ad31152ba8a21505a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 9 Jun 2009 17:52:27 +0200 Subject: amd-iommu: disable IOMMU hardware on shutdown When the IOMMU stays enabled the BIOS may not be able to finish the machine shutdown properly. So disable the hardware on shutdown. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 5 +++++ arch/x86/kernel/pci-dma.c | 2 ++ 2 files changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 238989ec077..575ca46211b 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1273,6 +1273,11 @@ free: goto out; } +void amd_iommu_shutdown(void) +{ + disable_iommus(); +} + /**************************************************************************** * * Early detect code. This code runs at IOMMU detection time in the DMA diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 745579bc825..328592fb604 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -290,6 +290,8 @@ static int __init pci_iommu_init(void) void pci_iommu_shutdown(void) { gart_iommu_shutdown(); + + amd_iommu_shutdown(); } /* Must execute after PCI subsystem */ fs_initcall(pci_iommu_init); -- cgit v1.2.3 From 61d047be99757fd9b0af900d7abce9a13a337488 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 9 Jun 2009 17:56:09 +0200 Subject: x86: disable IOMMUs on kernel crash If the IOMMUs are still enabled when the kexec kernel boots access to the disk is not possible. This is bad for tools like kdump or anything else which wants to use PCI devices. Signed-off-by: Joerg Roedel --- arch/x86/kernel/crash.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index ff958248e61..5e409dc298a 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -27,6 +27,7 @@ #include #include #include +#include #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) @@ -103,5 +104,10 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #ifdef CONFIG_HPET_TIMER hpet_disable(); #endif + +#ifdef CONFIG_X86_64 + pci_iommu_shutdown(); +#endif + crash_save_cpu(regs, safe_smp_processor_id()); } -- cgit v1.2.3 From 42a49f965a8d24ed92af04f5b564d63f17fd9c56 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Mon, 15 Jun 2009 15:42:00 +0200 Subject: amd-iommu: flush domain tlb when attaching a new device When kexec'ing to a new kernel (for example, when crashing and launching a kdump session), the AMD IOMMU may have cached translations. The kexec'd kernel, during initialization, will invalidate the IOMMU device table entries, but not the domain translations. These stale entries can cause a device's DMA to fail, makes it rough to write a dump to disk when the disk controller can't DMA ;-) Signed-off-by: Chris Wright Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 1c60554537c..9372f0406ad 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -434,6 +434,16 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); } +/* Flush the whole IO/TLB for a given protection domain - including PDE */ +static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) +{ + u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + + INC_STATS_COUNTER(domain_flush_single); + + iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1); +} + /* * This function is used to flush the IO/TLB for a given protection domain * on every IOMMU in the system @@ -1078,7 +1088,13 @@ static void attach_device(struct amd_iommu *iommu, amd_iommu_pd_table[devid] = domain; write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + /* + * We might boot into a crash-kernel here. The crashed kernel + * left the caches in the IOMMU dirty. So we have to flush + * here to evict all dirty stuff. + */ iommu_queue_inv_dev_entry(iommu, devid); + iommu_flush_tlb_pde(iommu, domain->id); } /* -- cgit v1.2.3 From a8c485bb6857811807d42f9fd1fde2f5f89cc5c9 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Mon, 15 Jun 2009 15:53:45 +0200 Subject: amd-iommu: disable cmd buffer and evt logging before reprogramming iommu The IOMMU spec states that IOMMU behavior may be undefined when the IOMMU registers are rewritten while command or event buffer is enabled. Disable them in IOMMU disable path. Signed-off-by: Chris Wright Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 575ca46211b..48a79b9b2f9 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -260,6 +260,14 @@ static void iommu_enable(struct amd_iommu *iommu) static void iommu_disable(struct amd_iommu *iommu) { + /* Disable command buffer */ + iommu_feature_disable(iommu, CONTROL_CMDBUF_EN); + + /* Disable event logging and event interrupts */ + iommu_feature_disable(iommu, CONTROL_EVT_INT_EN); + iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN); + + /* Disable IOMMU hardware itself */ iommu_feature_disable(iommu, CONTROL_IOMMU_EN); } @@ -1042,6 +1050,7 @@ static void enable_iommus(void) struct amd_iommu *iommu; for_each_iommu(iommu) { + iommu_disable(iommu); iommu_set_device_table(iommu); iommu_enable_command_buffer(iommu); iommu_enable_event_buffer(iommu); -- cgit v1.2.3 From 74193ef0ecab92535c8517f082f1f50504526c9b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Jun 2009 13:07:24 +0200 Subject: perf_counter: x86: Fix call-chain support to use NMI-safe methods __copy_from_user_inatomic() isn't NMI safe in that it can trigger the page fault handler which is another trap and its return path invokes IRET which will also close the NMI context. Therefore use a GUP based approach to copy the stack frames over. We tried an alternative solution as well: we used a forward ported version of Mathieu Desnoyers's "NMI safe INT3 and Page Fault" patch that modifies the exception return path to use an open-coded IRET with explicit stack unrolling and TF checking. This didnt work as it interacted with faulting user-space instructions, causing them not to restart properly, which corrupts user-space registers. Solving that would probably involve disassembling those instructions and backtracing the RIP. But even without that, the code was deemed rather complex to the already non-trivial x86 entry assembly code, so instead we went for this GUP based method that does a software-walk of the pagetables. Signed-off-by: Peter Zijlstra Cc: Nick Piggin Cc: Pekka Enberg Cc: Vegard Nossum Cc: Jeremy Fitzhardinge Cc: Mathieu Desnoyers Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 49 ++++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6d5e7cfd97e..e8c68a5091d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -1617,20 +1618,48 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) entry->kernel = entry->nr - nr; } -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +/* + * best effort, GUP based copy_from_user() that assumes IRQ or NMI context + */ +static unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n) { + unsigned long offset, addr = (unsigned long)from; + int type = in_nmi() ? KM_NMI : KM_IRQ0; + unsigned long size, len = 0; + struct page *page; + void *map; int ret; - if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) - return 0; + do { + ret = __get_user_pages_fast(addr, 1, 0, &page); + if (!ret) + break; - ret = 1; - pagefault_disable(); - if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) - ret = 0; - pagefault_enable(); + offset = addr & (PAGE_SIZE - 1); + size = min(PAGE_SIZE - offset, n - len); - return ret; + map = kmap_atomic(page, type); + memcpy(to, map+offset, size); + kunmap_atomic(map, type); + put_page(page); + + len += size; + to += size; + addr += size; + + } while (len < n); + + return len; +} + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + unsigned long bytes; + + bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); + + return bytes == sizeof(*frame); } static void @@ -1643,7 +1672,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) if (!user_mode(regs)) regs = task_pt_regs(current); - fp = (void __user *)regs->bp; + fp = (void __user *)regs->bp; callchain_store(entry, regs->ip); -- cgit v1.2.3 From 09067207f6eacb7f00c8f7f0623c3696083ce042 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 15 Jun 2009 16:06:48 +0200 Subject: amd-iommu: set event buffer head and tail to 0 manually These registers may contain values from previous kernels. So reset them to known values before enable the event buffer again. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 48a79b9b2f9..068a3569f83 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -486,6 +486,10 @@ static void iommu_enable_event_buffer(struct amd_iommu *iommu) memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, &entry, sizeof(entry)); + /* set head and tail to zero manually */ + writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); + writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); + iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); } -- cgit v1.2.3 From 6a047d8b9efc4b7d0c57ca4835f7e519e5c90d3f Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Tue, 16 Jun 2009 03:01:37 -0400 Subject: amd-iommu: resume cleanup Now that enable_iommus() will call iommu_disable() for each iommu, the call to disable_iommus() during resume is redundant. Also, the order for an invalidation is to invalidate device table entries first, then domain translations. Signed-off-by: Chris Wright Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 068a3569f83..10b2accd12e 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1079,12 +1079,6 @@ static void disable_iommus(void) static int amd_iommu_resume(struct sys_device *dev) { - /* - * Disable IOMMUs before reprogramming the hardware registers. - * IOMMU is still enabled from the resume kernel. - */ - disable_iommus(); - /* re-load the hardware */ enable_iommus(); @@ -1092,8 +1086,8 @@ static int amd_iommu_resume(struct sys_device *dev) * we have to flush after the IOMMUs are enabled because a * disabled IOMMU will never execute the commands we send */ - amd_iommu_flush_all_domains(); amd_iommu_flush_all_devices(); + amd_iommu_flush_all_domains(); return 0; } -- cgit v1.2.3 From 184e1fdfea066ab8f12a1e8912f402d2d6556d11 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 15 Jun 2009 15:37:07 +0800 Subject: x86, mce: fix a race condition about mce_callin and no_way_out If one CPU has no_way_out == 1, all other CPUs should have no_way_out == 1. But despite global_nwo is read after mce_callin, global_nwo is updated after mce_callin too. So it is possible that some CPU read global_nwo before some other CPU update global_nwo, so that no_way_out == 1 for some CPU, while no_way_out == 0 for some other CPU. This patch fixes this race condition via moving mce_callin updating after global_nwo updating, with a smp_wmb in between. A smp_rmb is added between their reading too. Signed-off-by: Huang Ying Acked-by: Andi Kleen Acked-by: Hidetoshi Seto --- arch/x86/kernel/cpu/mcheck/mce.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index fabba15e455..19294b8524c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -703,6 +703,11 @@ static int mce_start(int no_way_out, int *order) } atomic_add(no_way_out, &global_nwo); + /* + * global_nwo should be updated before mce_callin + */ + smp_wmb(); + *order = atomic_add_return(1, &mce_callin); /* * Wait for everyone. @@ -716,6 +721,10 @@ static int mce_start(int no_way_out, int *order) ndelay(SPINUNIT); } + /* + * mce_callin should be read before global_nwo + */ + smp_rmb(); /* * Cache the global no_way_out state. */ @@ -862,7 +871,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * Establish sequential order between the CPUs entering the machine * check handler. */ - int order; + int order = -1; /* * If no_way_out gets set, there is no safe way to recover from this @@ -887,7 +896,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (!banks) goto out; - order = atomic_add_return(1, &mce_callin); mce_setup(&m); m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); -- cgit v1.2.3 From 33edbf02a92771fa2a81e41084a44ba874e3a5a5 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:18:45 +0900 Subject: x86, mce: don't init timer if !mce_available In mce_cpu_restart, mce_init_timer is called unconditionally. If !mce_available (e.g. mce is disabled), there are no useful work for timer. Stop running it. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 19294b8524c..dda77215e9e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1617,8 +1617,9 @@ static int mce_resume(struct sys_device *dev) static void mce_cpu_restart(void *data) { del_timer_sync(&__get_cpu_var(mce_timer)); - if (mce_available(¤t_cpu_data)) - mce_init(); + if (!mce_available(¤t_cpu_data)) + return; + mce_init(); mce_init_timer(); } -- cgit v1.2.3 From 7fb06fc9672b947424e05871243a4c8e19ec3bce Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 18:18:43 +0900 Subject: x86, mce: cleanup mce_start() Simplify interface of mce_start(): - no_way_out = mce_start(no_way_out, &order); + order = mce_start(&no_way_out); Now Monarch and Subjects share same exit(return) in usual path. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 66 +++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 35 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index dda77215e9e..739fd7eca0a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -691,23 +691,21 @@ static atomic_t global_nwo; * in the entry order. * TBD double check parallel CPU hotunplug */ -static int mce_start(int no_way_out, int *order) +static int mce_start(int *no_way_out) { - int nwo; + int order; int cpus = num_online_cpus(); u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; - if (!timeout) { - *order = -1; - return no_way_out; - } + if (!timeout) + return -1; - atomic_add(no_way_out, &global_nwo); + atomic_add(*no_way_out, &global_nwo); /* * global_nwo should be updated before mce_callin */ smp_wmb(); - *order = atomic_add_return(1, &mce_callin); + order = atomic_add_return(1, &mce_callin); /* * Wait for everyone. @@ -715,8 +713,7 @@ static int mce_start(int no_way_out, int *order) while (atomic_read(&mce_callin) != cpus) { if (mce_timed_out(&timeout)) { atomic_set(&global_nwo, 0); - *order = -1; - return no_way_out; + return -1; } ndelay(SPINUNIT); } @@ -725,34 +722,34 @@ static int mce_start(int no_way_out, int *order) * mce_callin should be read before global_nwo */ smp_rmb(); - /* - * Cache the global no_way_out state. - */ - nwo = atomic_read(&global_nwo); - /* - * Monarch starts executing now, the others wait. - */ - if (*order == 1) { + if (order == 1) { + /* + * Monarch: Starts executing now, the others wait. + */ atomic_set(&mce_executing, 1); - return nwo; + } else { + /* + * Subject: Now start the scanning loop one by one in + * the original callin order. + * This way when there are any shared banks it will be + * only seen by one CPU before cleared, avoiding duplicates. + */ + while (atomic_read(&mce_executing) < order) { + if (mce_timed_out(&timeout)) { + atomic_set(&global_nwo, 0); + return -1; + } + ndelay(SPINUNIT); + } } /* - * Now start the scanning loop one by one - * in the original callin order. - * This way when there are any shared banks it will - * be only seen by one CPU before cleared, avoiding duplicates. + * Cache the global no_way_out state. */ - while (atomic_read(&mce_executing) < *order) { - if (mce_timed_out(&timeout)) { - atomic_set(&global_nwo, 0); - *order = -1; - return no_way_out; - } - ndelay(SPINUNIT); - } - return nwo; + *no_way_out = atomic_read(&global_nwo); + + return order; } /* @@ -871,8 +868,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * Establish sequential order between the CPUs entering the machine * check handler. */ - int order = -1; - + int order; /* * If no_way_out gets set, there is no safe way to recover from this * MCE. If tolerant is cranked up, we'll try anyway. @@ -917,7 +913,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * This way we don't report duplicated events on shared banks * because the first one to see it will clear it. */ - no_way_out = mce_start(no_way_out, &order); + order = mce_start(&no_way_out); for (i = 0; i < banks; i++) { __clear_bit(i, toclear); if (!bank[i]) -- cgit v1.2.3 From 4e5b3e690dda890523e93af9c545261f5916a3a6 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:20:20 +0900 Subject: x86, mce: add __read_mostly Add __read_mostly to data written during setup. Suggested-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 739fd7eca0a..2d2e0b565a9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -57,7 +57,7 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code) void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; -int mce_disabled; +int mce_disabled __read_mostly; #ifdef CONFIG_X86_NEW_MCE @@ -76,19 +76,19 @@ DEFINE_PER_CPU(unsigned, mce_exception_count); * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors * 3: never panic or SIGBUS, log all errors (for testing only) */ -static int tolerant = 1; -static int banks; -static u64 *bank; -static unsigned long notify_user; -static int rip_msr; -static int mce_bootlog = -1; -static int monarch_timeout = -1; -static int mce_panic_timeout; -static int mce_dont_log_ce; -int mce_cmci_disabled; -int mce_ignore_ce; -int mce_ser; +static int tolerant __read_mostly = 1; +static int banks __read_mostly; +static u64 *bank __read_mostly; +static int rip_msr __read_mostly; +static int mce_bootlog __read_mostly = -1; +static int monarch_timeout __read_mostly = -1; +static int mce_panic_timeout __read_mostly; +static int mce_dont_log_ce __read_mostly; +int mce_cmci_disabled __read_mostly; +int mce_ignore_ce __read_mostly; +int mce_ser __read_mostly; +static unsigned long notify_user; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; -- cgit v1.2.3 From 1020bcbcc7da36001d9226c5d57e999949cb80c5 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:20:57 +0900 Subject: x86, mce: rename static variables around trigger "trigger" is not straight forward name for valiable that holds name of user mode helper program which triggered by machine check events. This patch renames this valiable and kins to more recognizable names. trigger => mce_helper trigger_argv => mce_helper_argv notify_user => mce_need_notify No functional changes. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2d2e0b565a9..e8b893e880e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -88,9 +88,10 @@ int mce_cmci_disabled __read_mostly; int mce_ignore_ce __read_mostly; int mce_ser __read_mostly; -static unsigned long notify_user; -static char trigger[128]; -static char *trigger_argv[2] = { trigger, NULL }; +/* User mode helper program triggered by machine check event */ +static unsigned long mce_need_notify; +static char mce_helper[128]; +static char *mce_helper_argv[2] = { mce_helper, NULL }; static unsigned long dont_init_banks; @@ -180,7 +181,7 @@ void mce_log(struct mce *mce) wmb(); mce->finished = 1; - set_bit(0, ¬ify_user); + set_bit(0, &mce_need_notify); } static void print_mce(struct mce *m) @@ -1122,7 +1123,7 @@ static void mcheck_timer(unsigned long data) static void mce_do_trigger(struct work_struct *work) { - call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); + call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); } static DECLARE_WORK(mce_trigger_work, mce_do_trigger); @@ -1139,7 +1140,7 @@ int mce_notify_irq(void) clear_thread_flag(TIF_MCE_NOTIFY); - if (test_and_clear_bit(0, ¬ify_user)) { + if (test_and_clear_bit(0, &mce_need_notify)) { wake_up_interruptible(&mce_wait); /* @@ -1147,7 +1148,7 @@ int mce_notify_irq(void) * work_pending is always cleared before the function is * executed. */ - if (trigger[0] && !work_pending(&mce_trigger_work)) + if (mce_helper[0] && !work_pending(&mce_trigger_work)) schedule_work(&mce_trigger_work); if (__ratelimit(&ratelimit)) @@ -1664,9 +1665,9 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { - strcpy(buf, trigger); + strcpy(buf, mce_helper); strcat(buf, "\n"); - return strlen(trigger) + 1; + return strlen(mce_helper) + 1; } static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, @@ -1675,10 +1676,10 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *p; int len; - strncpy(trigger, buf, sizeof(trigger)); - trigger[sizeof(trigger)-1] = 0; - len = strlen(trigger); - p = strchr(trigger, '\n'); + strncpy(mce_helper, buf, sizeof(mce_helper)); + mce_helper[sizeof(mce_helper)-1] = 0; + len = strlen(mce_helper); + p = strchr(mce_helper, '\n'); if (*p) *p = 0; -- cgit v1.2.3 From 9af43b54ab4509f1dac49637d6917d57292e6518 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:21:36 +0900 Subject: x86, mce: sysfs entries for new mce options Add sysfs interface for admins who want to tweak these options without rebooting the system. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 84 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e8b893e880e..4b62443b6e7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1626,6 +1626,26 @@ static void mce_restart(void) on_each_cpu(mce_cpu_restart, NULL, 1); } +/* Toggle features for corrected errors */ +static void mce_disable_ce(void *all) +{ + if (!mce_available(¤t_cpu_data)) + return; + if (all) + del_timer_sync(&__get_cpu_var(mce_timer)); + cmci_clear(); +} + +static void mce_enable_ce(void *all) +{ + if (!mce_available(¤t_cpu_data)) + return; + cmci_reenable(); + cmci_recheck(); + if (all) + mce_init_timer(); +} + static struct sysdev_class mce_sysclass = { .suspend = mce_suspend, .shutdown = mce_shutdown, @@ -1687,6 +1707,52 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, return len; } +static ssize_t set_ignore_ce(struct sys_device *s, + struct sysdev_attribute *attr, + const char *buf, size_t size) +{ + u64 new; + + if (strict_strtoull(buf, 0, &new) < 0) + return -EINVAL; + + if (mce_ignore_ce ^ !!new) { + if (new) { + /* disable ce features */ + on_each_cpu(mce_disable_ce, (void *)1, 1); + mce_ignore_ce = 1; + } else { + /* enable ce features */ + mce_ignore_ce = 0; + on_each_cpu(mce_enable_ce, (void *)1, 1); + } + } + return size; +} + +static ssize_t set_cmci_disabled(struct sys_device *s, + struct sysdev_attribute *attr, + const char *buf, size_t size) +{ + u64 new; + + if (strict_strtoull(buf, 0, &new) < 0) + return -EINVAL; + + if (mce_cmci_disabled ^ !!new) { + if (new) { + /* disable cmci */ + on_each_cpu(mce_disable_ce, NULL, 1); + mce_cmci_disabled = 1; + } else { + /* enable cmci */ + mce_cmci_disabled = 0; + on_each_cpu(mce_enable_ce, NULL, 1); + } + } + return size; +} + static ssize_t store_int_with_restart(struct sys_device *s, struct sysdev_attribute *attr, const char *buf, size_t size) @@ -1699,6 +1765,7 @@ static ssize_t store_int_with_restart(struct sys_device *s, static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); +static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); static struct sysdev_ext_attribute attr_check_interval = { _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, @@ -1706,9 +1773,24 @@ static struct sysdev_ext_attribute attr_check_interval = { &check_interval }; +static struct sysdev_ext_attribute attr_ignore_ce = { + _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), + &mce_ignore_ce +}; + +static struct sysdev_ext_attribute attr_cmci_disabled = { + _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_cmci_disabled), + &mce_cmci_disabled +}; + static struct sysdev_attribute *mce_attrs[] = { - &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, + &attr_tolerant.attr, + &attr_check_interval.attr, + &attr_trigger, &attr_monarch_timeout.attr, + &attr_dont_log_ce.attr, + &attr_ignore_ce.attr, + &attr_cmci_disabled.attr, NULL }; -- cgit v1.2.3 From 9e55e44e39798541ba39d57f4b569deb555ae1ce Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:22:15 +0900 Subject: x86, mce: unify mce.h There are 2 headers: arch/x86/include/asm/mce.h arch/x86/kernel/cpu/mcheck/mce.h and in the latter small header: #include This patch move all contents in the latter header into the former, and fix all files using the latter to include the former instead. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/k7.c | 3 +-- arch/x86/kernel/cpu/mcheck/mce.c | 1 - arch/x86/kernel/cpu/mcheck/mce.h | 38 ------------------------------- arch/x86/kernel/cpu/mcheck/mce_intel.c | 3 +-- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 -- arch/x86/kernel/cpu/mcheck/non-fatal.c | 3 +-- arch/x86/kernel/cpu/mcheck/p4.c | 3 +-- arch/x86/kernel/cpu/mcheck/p5.c | 3 +-- arch/x86/kernel/cpu/mcheck/p6.c | 3 +-- arch/x86/kernel/cpu/mcheck/winchip.c | 3 +-- arch/x86/kernel/traps.c | 3 +-- 11 files changed, 8 insertions(+), 57 deletions(-) delete mode 100644 arch/x86/kernel/cpu/mcheck/mce.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index 89e51042415..b945d5dbc60 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c @@ -10,10 +10,9 @@ #include #include +#include #include -#include "mce.h" - /* Machine Check Handler For AMD Athlon/Duron: */ static void k7_machine_check(struct pt_regs *regs, long error_code) { diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 4b62443b6e7..faedd776847 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -44,7 +44,6 @@ #include #include "mce-internal.h" -#include "mce.h" /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct pt_regs *regs, long error_code) diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h deleted file mode 100644 index 84a552b458c..00000000000 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ /dev/null @@ -1,38 +0,0 @@ -#include -#include - -#ifdef CONFIG_X86_OLD_MCE -void amd_mcheck_init(struct cpuinfo_x86 *c); -void intel_p4_mcheck_init(struct cpuinfo_x86 *c); -void intel_p6_mcheck_init(struct cpuinfo_x86 *c); -#endif - -#ifdef CONFIG_X86_ANCIENT_MCE -void intel_p5_mcheck_init(struct cpuinfo_x86 *c); -void winchip_mcheck_init(struct cpuinfo_x86 *c); -extern int mce_p5_enable; -static inline int mce_p5_enabled(void) { return mce_p5_enable; } -static inline void enable_p5_mce(void) { mce_p5_enable = 1; } -#else -static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} -static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} -static inline int mce_p5_enabled(void) { return 0; } -static inline void enable_p5_mce(void) { } -#endif - -/* Call the installed machine check handler for this CPU setup. */ -extern void (*machine_check_vector)(struct pt_regs *, long error_code); - -#ifdef CONFIG_X86_OLD_MCE - -extern int nr_mce_banks; - -void intel_set_thermal_handler(void); - -#else - -static inline void intel_set_thermal_handler(void) { } - -#endif - -void intel_init_thermal(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 2b011d2d857..475478b2088 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -11,10 +11,9 @@ #include #include #include +#include #include -#include "mce.h" - void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index f2ef6952c40..c548111d011 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -16,8 +16,6 @@ #include #include -#include "mce.h" - asmlinkage void smp_thermal_interrupt(void) { __u64 msr_val; diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index 70b710420f7..f5f2d6f71fb 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c @@ -17,10 +17,9 @@ #include #include +#include #include -#include "mce.h" - static int firstbank; #define MCE_RATE (15*HZ) /* timer rate is 15s */ diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index 82cee108a2d..8d3e40edd64 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -12,10 +12,9 @@ #include #include #include +#include #include -#include "mce.h" - /* as supported by the P4/Xeon family */ struct intel_mce_extended_msrs { u32 eax; diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 015f481ab1b..747853f9188 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -10,10 +10,9 @@ #include #include +#include #include -#include "mce.h" - /* By default disabled */ int mce_p5_enable; diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c index 43c24e66745..01e4f817818 100644 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ b/arch/x86/kernel/cpu/mcheck/p6.c @@ -10,10 +10,9 @@ #include #include +#include #include -#include "mce.h" - /* Machine Check Handler For PII/PIII */ static void intel_machine_check(struct pt_regs *regs, long error_code) { diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 81b02487090..54060f56597 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -9,10 +9,9 @@ #include #include +#include #include -#include "mce.h" - /* Machine check handler for WinChip C6: */ static void winchip_machine_check(struct pt_regs *regs, long error_code) { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1e1e27b7d43..71f9c74814d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -53,6 +53,7 @@ #include #include #include +#include #include @@ -64,8 +65,6 @@ #include #include -#include "cpu/mcheck/mce.h" - asmlinkage int system_call(void); /* Do we ignore FPU interrupts ? */ -- cgit v1.2.3 From c697836985e18d9c34897428ba563b13044a6dcd Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:22:49 +0900 Subject: x86, mce: make mce_disabled boolean The mce_disabled on 32bit is a tristate variable [1,0,-1], while 64bit version is boolean [0,1]. This patch makes mce_disabled always boolean, and use mce_p5_enabled to indicate the third state instead. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 8 +++----- arch/x86/kernel/cpu/mcheck/p5.c | 12 +++++------- 2 files changed, 8 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index faedd776847..6095e0296ab 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1286,8 +1286,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) return; switch (c->x86_vendor) { case X86_VENDOR_INTEL: - if (mce_p5_enabled()) - intel_p5_mcheck_init(c); + intel_p5_mcheck_init(c); break; case X86_VENDOR_CENTAUR: winchip_mcheck_init(c); @@ -2002,7 +2001,7 @@ EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ /* This has to be run for each processor */ void mcheck_init(struct cpuinfo_x86 *c) { - if (mce_disabled == 1) + if (mce_disabled) return; switch (c->x86_vendor) { @@ -2032,10 +2031,9 @@ void mcheck_init(struct cpuinfo_x86 *c) static int __init mcheck_enable(char *str) { - mce_disabled = -1; + mce_p5_enabled = 1; return 1; } - __setup("mce", mcheck_enable); #endif /* CONFIG_X86_OLD_MCE */ diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 747853f9188..5c0e6533d9b 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -14,7 +14,7 @@ #include /* By default disabled */ -int mce_p5_enable; +int mce_p5_enabled __read_mostly; /* Machine check handler for Pentium class Intel CPUs: */ static void pentium_machine_check(struct pt_regs *regs, long error_code) @@ -42,15 +42,13 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; - /* Check for MCE support: */ - if (!cpu_has(c, X86_FEATURE_MCE)) + /* Default P5 to off as its often misconnected: */ + if (!mce_p5_enabled) return; -#ifdef CONFIG_X86_OLD_MCE - /* Default P5 to off as its often misconnected: */ - if (mce_disabled != -1) + /* Check for MCE support: */ + if (!cpu_has(c, X86_FEATURE_MCE)) return; -#endif machine_check_vector = pentium_machine_check; /* Make sure the vector pointer is visible before we enable MCEs: */ -- cgit v1.2.3 From 3adacb70d32046ccc9f0333b50bb2ba1582ccdf4 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:23:28 +0900 Subject: x86, mce: unify smp_thermal_interrupt, prepare p4 Remove unused argument regs from handlers, and use inc_irq_stat. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/p4.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index 8d3e40edd64..ddeed6e295a 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -35,7 +35,7 @@ static int mce_num_extended_msrs; #ifdef CONFIG_X86_MCE_P4THERMAL -static void unexpected_thermal_interrupt(struct pt_regs *regs) +static void unexpected_thermal_interrupt(void) { printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", smp_processor_id()); @@ -43,7 +43,7 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs) } /* P4/Xeon Thermal transition interrupt handler: */ -static void intel_thermal_interrupt(struct pt_regs *regs) +static void intel_thermal_interrupt(void) { __u64 msr_val; @@ -54,14 +54,13 @@ static void intel_thermal_interrupt(struct pt_regs *regs) } /* Thermal interrupt handler for this CPU setup: */ -static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = - unexpected_thermal_interrupt; +static void (*vendor_thermal_interrupt)(void) = unexpected_thermal_interrupt; void smp_thermal_interrupt(struct pt_regs *regs) { irq_enter(); - vendor_thermal_interrupt(regs); - __get_cpu_var(irq_stat).irq_thermal_count++; + vendor_thermal_interrupt(); + inc_irq_stat(irq_thermal_count); irq_exit(); } -- cgit v1.2.3 From 5335612a574a45beab14193ec641ed2f45e7a523 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:24:09 +0900 Subject: x86, mce: unify smp_thermal_interrupt, prepare mce_intel_64 Break smp_thermal_interrupt() into two functions. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index c548111d011..a5232b2c4ca 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -16,19 +16,21 @@ #include #include -asmlinkage void smp_thermal_interrupt(void) +static void intel_thermal_interrupt(void) { __u64 msr_val; - ack_APIC_irq(); - - exit_idle(); - irq_enter(); - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) mce_log_therm_throt_event(msr_val); +} +asmlinkage void smp_thermal_interrupt(void) +{ + ack_APIC_irq(); + exit_idle(); + irq_enter(); + intel_thermal_interrupt(); inc_irq_stat(irq_thermal_count); irq_exit(); } -- cgit v1.2.3 From e8ce2c5ee826b3787202493effcd08d4b1e1e639 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:24:40 +0900 Subject: x86, mce: unify smp_thermal_interrupt, prepare Let them in same shape. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 20 ++++++++++++++++++-- arch/x86/kernel/cpu/mcheck/p4.c | 10 ++++++---- 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index a5232b2c4ca..922e3a482f6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -16,6 +16,14 @@ #include #include +static void unexpected_thermal_interrupt(void) +{ + printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", + smp_processor_id()); + add_taint(TAINT_MACHINE_CHECK); +} + +/* P4/Xeon Thermal transition interrupt handler: */ static void intel_thermal_interrupt(void) { __u64 msr_val; @@ -25,14 +33,22 @@ static void intel_thermal_interrupt(void) mce_log_therm_throt_event(msr_val); } +/* Thermal interrupt handler for this CPU setup: */ +static void (*vendor_thermal_interrupt)(void) = unexpected_thermal_interrupt; + asmlinkage void smp_thermal_interrupt(void) { - ack_APIC_irq(); exit_idle(); irq_enter(); - intel_thermal_interrupt(); inc_irq_stat(irq_thermal_count); + intel_thermal_interrupt(); irq_exit(); + ack_APIC_irq(); +} + +void intel_set_thermal_handler(void) +{ + vendor_thermal_interrupt = intel_thermal_interrupt; } /* diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index ddeed6e295a..75313f5b624 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -47,10 +48,9 @@ static void intel_thermal_interrupt(void) { __u64 msr_val; - ack_APIC_irq(); - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - therm_throt_process(msr_val & THERM_STATUS_PROCHOT); + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) + mce_log_therm_throt_event(msr_val); } /* Thermal interrupt handler for this CPU setup: */ @@ -58,10 +58,12 @@ static void (*vendor_thermal_interrupt)(void) = unexpected_thermal_interrupt; void smp_thermal_interrupt(struct pt_regs *regs) { + exit_idle(); irq_enter(); - vendor_thermal_interrupt(); inc_irq_stat(irq_thermal_count); + vendor_thermal_interrupt(); irq_exit(); + ack_APIC_irq(); } void intel_set_thermal_handler(void) -- cgit v1.2.3 From a65c88dd2c83b569dbd13778da689861bdf977f2 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:25:27 +0900 Subject: x86, mce: unify smp_thermal_interrupt Put common functions into therm_throt.c, modify Makefile. unexpected_thermal_interrupt intel_thermal_interrupt smp_thermal_interrupt intel_set_thermal_handler Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 7 ++--- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 38 -------------------------- arch/x86/kernel/cpu/mcheck/p4.c | 45 ------------------------------- arch/x86/kernel/cpu/mcheck/therm_throt.c | 40 ++++++++++++++++++++++++++- 4 files changed, 43 insertions(+), 87 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 45004faf67e..53df57d11c5 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,11 +1,12 @@ -obj-y = mce.o therm_throt.o +obj-y = mce.o obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o -obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o -obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o +obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o + +obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o mce_intel.o diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 922e3a482f6..3b7a0572e2f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -9,48 +9,10 @@ #include #include #include -#include #include #include -#include -#include #include -static void unexpected_thermal_interrupt(void) -{ - printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", - smp_processor_id()); - add_taint(TAINT_MACHINE_CHECK); -} - -/* P4/Xeon Thermal transition interrupt handler: */ -static void intel_thermal_interrupt(void) -{ - __u64 msr_val; - - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) - mce_log_therm_throt_event(msr_val); -} - -/* Thermal interrupt handler for this CPU setup: */ -static void (*vendor_thermal_interrupt)(void) = unexpected_thermal_interrupt; - -asmlinkage void smp_thermal_interrupt(void) -{ - exit_idle(); - irq_enter(); - inc_irq_stat(irq_thermal_count); - intel_thermal_interrupt(); - irq_exit(); - ack_APIC_irq(); -} - -void intel_set_thermal_handler(void) -{ - vendor_thermal_interrupt = intel_thermal_interrupt; -} - /* * Support for Intel Correct Machine Check Interrupts. This allows * the CPU to raise an interrupt when a corrected machine check happened. diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index 75313f5b624..76c5f0305f9 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -1,8 +1,6 @@ /* * P4 specific Machine Check Exception Reporting */ - -#include #include #include #include @@ -10,9 +8,6 @@ #include #include -#include -#include -#include #include #include @@ -33,46 +28,6 @@ struct intel_mce_extended_msrs { static int mce_num_extended_msrs; - -#ifdef CONFIG_X86_MCE_P4THERMAL - -static void unexpected_thermal_interrupt(void) -{ - printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", - smp_processor_id()); - add_taint(TAINT_MACHINE_CHECK); -} - -/* P4/Xeon Thermal transition interrupt handler: */ -static void intel_thermal_interrupt(void) -{ - __u64 msr_val; - - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) - mce_log_therm_throt_event(msr_val); -} - -/* Thermal interrupt handler for this CPU setup: */ -static void (*vendor_thermal_interrupt)(void) = unexpected_thermal_interrupt; - -void smp_thermal_interrupt(struct pt_regs *regs) -{ - exit_idle(); - irq_enter(); - inc_irq_stat(irq_thermal_count); - vendor_thermal_interrupt(); - irq_exit(); - ack_APIC_irq(); -} - -void intel_set_thermal_handler(void) -{ - vendor_thermal_interrupt = intel_thermal_interrupt; -} - -#endif /* CONFIG_X86_MCE_P4THERMAL */ - /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) { diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 7b1ae2e20ba..b3792b19685 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -13,6 +13,7 @@ * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. * Inspired by Ross Biro's and Al Borchers' counter code. */ +#include #include #include #include @@ -20,6 +21,8 @@ #include #include +#include +#include /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) @@ -186,6 +189,41 @@ static __init int thermal_throttle_init_device(void) return 0; } - device_initcall(thermal_throttle_init_device); + #endif /* CONFIG_SYSFS */ + +/* Thermal transition interrupt handler */ +void intel_thermal_interrupt(void) +{ + __u64 msr_val; + + rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) + mce_log_therm_throt_event(msr_val); +} + +static void unexpected_thermal_interrupt(void) +{ + printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", + smp_processor_id()); + add_taint(TAINT_MACHINE_CHECK); +} + +static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; + +asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) +{ + exit_idle(); + irq_enter(); + inc_irq_stat(irq_thermal_count); + smp_thermal_vector(); + irq_exit(); + /* Ack only at the end to avoid potential reentry */ + ack_APIC_irq(); +} + +void intel_set_thermal_handler(void) +{ + smp_thermal_vector = intel_thermal_interrupt; +} -- cgit v1.2.3 From 895287c0a6aa571160c47ee10de11b542166c4f9 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:26:10 +0900 Subject: x86, mce: squash mce_intel.c into therm_throt.c move intel_init_thermal() into therm_throt.c Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 2 +- arch/x86/kernel/cpu/mcheck/mce_intel.c | 73 -------------------------------- arch/x86/kernel/cpu/mcheck/therm_throt.c | 66 +++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 74 deletions(-) delete mode 100644 arch/x86/kernel/cpu/mcheck/mce_intel.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 53df57d11c5..659564e5fc0 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -9,4 +9,4 @@ obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o -obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o mce_intel.o +obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c deleted file mode 100644 index 475478b2088..00000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Common code for Intel machine checks - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -void intel_init_thermal(struct cpuinfo_x86 *c) -{ - unsigned int cpu = smp_processor_id(); - int tm2 = 0; - u32 l, h; - - /* Thermal monitoring depends on ACPI and clock modulation*/ - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) - return; - - /* - * First check if its enabled already, in which case there might - * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already: - */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); - if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG - "CPU%d: Thermal monitoring handled by SMI\n", cpu); - return; - } - - if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) - tm2 = 1; - - /* Check whether a vector already exists */ - if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG - "CPU%d: Thermal LVT vector (%#x) already installed\n", - cpu, (h & APIC_VECTOR_MASK)); - return; - } - - /* We'll mask the thermal vector in the lapic till we're ready: */ - h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; - apic_write(APIC_LVTTHMR, h); - - rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); - - intel_set_thermal_handler(); - - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - - /* Unmask the thermal vector: */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - - printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? "TM2" : "TM1"); - - /* enable thermal throttle processing */ - atomic_set(&therm_throt_en, 1); -} diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index b3792b19685..7a508aaafce 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -16,13 +16,21 @@ #include #include #include +#include #include #include +#include +#include +#include #include #include +#include +#include +#include #include #include +#include /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) @@ -227,3 +235,61 @@ void intel_set_thermal_handler(void) { smp_thermal_vector = intel_thermal_interrupt; } + +void intel_init_thermal(struct cpuinfo_x86 *c) +{ + unsigned int cpu = smp_processor_id(); + int tm2 = 0; + u32 l, h; + + /* Thermal monitoring depends on ACPI and clock modulation*/ + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + return; + + /* + * First check if its enabled already, in which case there might + * be some SMM goo which handles it, so we can't even put a handler + * since it might be delivered via SMI already: + */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = apic_read(APIC_LVTTHMR); + if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { + printk(KERN_DEBUG + "CPU%d: Thermal monitoring handled by SMI\n", cpu); + return; + } + + if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) + tm2 = 1; + + /* Check whether a vector already exists */ + if (h & APIC_VECTOR_MASK) { + printk(KERN_DEBUG + "CPU%d: Thermal LVT vector (%#x) already installed\n", + cpu, (h & APIC_VECTOR_MASK)); + return; + } + + /* We'll mask the thermal vector in the lapic till we're ready: */ + h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; + apic_write(APIC_LVTTHMR, h); + + rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + + intel_set_thermal_handler(); + + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); + + /* Unmask the thermal vector: */ + l = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + + printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", + cpu, tm2 ? "TM2" : "TM1"); + + /* enable thermal throttle processing */ + atomic_set(&therm_throt_en, 1); +} -- cgit v1.2.3 From 8363fc82d36c0886292e33925391dca93f03bd50 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:26:36 +0900 Subject: x86, mce: remove intel_set_thermal_handler() and make intel_thermal_interrupt() static. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 7a508aaafce..7c7944cc515 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -202,7 +202,7 @@ device_initcall(thermal_throttle_init_device); #endif /* CONFIG_SYSFS */ /* Thermal transition interrupt handler */ -void intel_thermal_interrupt(void) +static void intel_thermal_interrupt(void) { __u64 msr_val; @@ -231,11 +231,6 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) ack_APIC_irq(); } -void intel_set_thermal_handler(void) -{ - smp_thermal_vector = intel_thermal_interrupt; -} - void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); @@ -278,7 +273,7 @@ void intel_init_thermal(struct cpuinfo_x86 *c) wrmsr(MSR_IA32_THERM_INTERRUPT, l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); - intel_set_thermal_handler(); + smp_thermal_vector = intel_thermal_interrupt; rdmsr(MSR_IA32_MISC_ENABLE, l, h); wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); -- cgit v1.2.3 From 1149e7264528723b8c3b075a90386ceb2e07070a Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:27:13 +0900 Subject: x86, mce: remove therm_throt.h Now all symbols in the header are static. Remove the header. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 1 - arch/x86/kernel/cpu/mcheck/p4.c | 1 - arch/x86/kernel/cpu/mcheck/therm_throt.c | 5 ++--- 3 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 3b7a0572e2f..663a88e8b3c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -11,7 +11,6 @@ #include #include #include -#include /* * Support for Intel Correct Machine Check Interrupts. This allows diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index 76c5f0305f9..4482aea9aa2 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -6,7 +6,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 7c7944cc515..bff8dd191dd 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include @@ -38,7 +37,7 @@ static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); -atomic_t therm_throt_en = ATOMIC_INIT(0); +static atomic_t therm_throt_en = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS #define define_therm_throt_sysdev_one_ro(_name) \ @@ -93,7 +92,7 @@ static struct attribute_group thermal_throttle_attr_group = { * 1 : Event should be logged further, and a message has been * printed to the syslog. */ -int therm_throt_process(int curr) +static int therm_throt_process(int curr) { unsigned int cpu = smp_processor_id(); __u64 tmp_jiffs = get_jiffies_64(); -- cgit v1.2.3 From 1af0815f9639752409a9c15ba6a3dc0f322fd114 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:28:38 +0900 Subject: x86, mce: rename _64.c files which are no longer 64-bit-specific Rename files that are no longer 64bit specific: mce_amd_64.c => mce_amd.c mce_intel_64.c => mce_intel.c Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 4 +- arch/x86/kernel/cpu/mcheck/mce_amd.c | 703 ++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 703 ------------------------------ arch/x86/kernel/cpu/mcheck/mce_intel.c | 225 ++++++++++ arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 225 ---------- 5 files changed, 930 insertions(+), 930 deletions(-) create mode 100644 arch/x86/kernel/cpu/mcheck/mce_amd.c delete mode 100644 arch/x86/kernel/cpu/mcheck/mce_amd_64.c create mode 100644 arch/x86/kernel/cpu/mcheck/mce_intel.c delete mode 100644 arch/x86/kernel/cpu/mcheck/mce_intel_64.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 659564e5fc0..188a1ca5ad2 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -3,8 +3,8 @@ obj-y = mce.o obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o -obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o -obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o +obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o +obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c new file mode 100644 index 00000000000..ddae21620bd --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -0,0 +1,703 @@ +/* + * (c) 2005, 2006 Advanced Micro Devices, Inc. + * Your use of this code is subject to the terms and conditions of the + * GNU general public license version 2. See "COPYING" or + * http://www.gnu.org/licenses/gpl.html + * + * Written by Jacob Shin - AMD, Inc. + * + * Support : jacob.shin@amd.com + * + * April 2006 + * - added support for AMD Family 0x10 processors + * + * All MC4_MISCi registers are shared between multi-cores + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define PFX "mce_threshold: " +#define VERSION "version 1.1.1" +#define NR_BANKS 6 +#define NR_BLOCKS 9 +#define THRESHOLD_MAX 0xFFF +#define INT_TYPE_APIC 0x00020000 +#define MASK_VALID_HI 0x80000000 +#define MASK_CNTP_HI 0x40000000 +#define MASK_LOCKED_HI 0x20000000 +#define MASK_LVTOFF_HI 0x00F00000 +#define MASK_COUNT_EN_HI 0x00080000 +#define MASK_INT_TYPE_HI 0x00060000 +#define MASK_OVERFLOW_HI 0x00010000 +#define MASK_ERR_COUNT_HI 0x00000FFF +#define MASK_BLKPTR_LO 0xFF000000 +#define MCG_XBLK_ADDR 0xC0000400 + +struct threshold_block { + unsigned int block; + unsigned int bank; + unsigned int cpu; + u32 address; + u16 interrupt_enable; + u16 threshold_limit; + struct kobject kobj; + struct list_head miscj; +}; + +/* defaults used early on boot */ +static struct threshold_block threshold_defaults = { + .interrupt_enable = 0, + .threshold_limit = THRESHOLD_MAX, +}; + +struct threshold_bank { + struct kobject *kobj; + struct threshold_block *blocks; + cpumask_var_t cpus; +}; +static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); + +#ifdef CONFIG_SMP +static unsigned char shared_bank[NR_BANKS] = { + 0, 0, 0, 0, 1 +}; +#endif + +static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ + +static void amd_threshold_interrupt(void); + +/* + * CPU Initialization + */ + +struct thresh_restart { + struct threshold_block *b; + int reset; + u16 old_limit; +}; + +/* must be called with correct cpu affinity */ +/* Called via smp_call_function_single() */ +static void threshold_restart_bank(void *_tr) +{ + struct thresh_restart *tr = _tr; + u32 mci_misc_hi, mci_misc_lo; + + rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + + if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) + tr->reset = 1; /* limit cannot be lower than err count */ + + if (tr->reset) { /* reset err count and overflow bit */ + mci_misc_hi = + (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | + (THRESHOLD_MAX - tr->b->threshold_limit); + } else if (tr->old_limit) { /* change limit w/o reset */ + int new_count = (mci_misc_hi & THRESHOLD_MAX) + + (tr->old_limit - tr->b->threshold_limit); + + mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | + (new_count & THRESHOLD_MAX); + } + + tr->b->interrupt_enable ? + (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : + (mci_misc_hi &= ~MASK_INT_TYPE_HI); + + mci_misc_hi |= MASK_COUNT_EN_HI; + wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); +} + +/* cpu init entry point, called from mce.c with preempt off */ +void mce_amd_feature_init(struct cpuinfo_x86 *c) +{ + unsigned int cpu = smp_processor_id(); + u32 low = 0, high = 0, address = 0; + unsigned int bank, block; + struct thresh_restart tr; + u8 lvt_off; + + for (bank = 0; bank < NR_BANKS; ++bank) { + for (block = 0; block < NR_BLOCKS; ++block) { + if (block == 0) + address = MSR_IA32_MC0_MISC + bank * 4; + else if (block == 1) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + break; + address += MCG_XBLK_ADDR; + } else + ++address; + + if (rdmsr_safe(address, &low, &high)) + break; + + if (!(high & MASK_VALID_HI)) { + if (block) + continue; + else + break; + } + + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) + continue; + + if (!block) + per_cpu(bank_map, cpu) |= (1 << bank); +#ifdef CONFIG_SMP + if (shared_bank[bank] && c->cpu_core_id) + break; +#endif + lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR, + APIC_EILVT_MSG_FIX, 0); + + high &= ~MASK_LVTOFF_HI; + high |= lvt_off << 20; + wrmsr(address, low, high); + + threshold_defaults.address = address; + tr.b = &threshold_defaults; + tr.reset = 0; + tr.old_limit = 0; + threshold_restart_bank(&tr); + + mce_threshold_vector = amd_threshold_interrupt; + } + } +} + +/* + * APIC Interrupt Handler + */ + +/* + * threshold interrupt handler will service THRESHOLD_APIC_VECTOR. + * the interrupt goes off when error_count reaches threshold_limit. + * the handler will simply log mcelog w/ software defined bank number. + */ +static void amd_threshold_interrupt(void) +{ + u32 low = 0, high = 0, address = 0; + unsigned int bank, block; + struct mce m; + + mce_setup(&m); + + /* assume first bank caused it */ + for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) + continue; + for (block = 0; block < NR_BLOCKS; ++block) { + if (block == 0) { + address = MSR_IA32_MC0_MISC + bank * 4; + } else if (block == 1) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + break; + address += MCG_XBLK_ADDR; + } else { + ++address; + } + + if (rdmsr_safe(address, &low, &high)) + break; + + if (!(high & MASK_VALID_HI)) { + if (block) + continue; + else + break; + } + + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) + continue; + + /* + * Log the machine check that caused the threshold + * event. + */ + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); + + if (high & MASK_OVERFLOW_HI) { + rdmsrl(address, m.misc); + rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, + m.status); + m.bank = K8_MCE_THRESHOLD_BASE + + bank * NR_BLOCKS + + block; + mce_log(&m); + return; + } + } + } +} + +/* + * Sysfs Interface + */ + +struct threshold_attr { + struct attribute attr; + ssize_t (*show) (struct threshold_block *, char *); + ssize_t (*store) (struct threshold_block *, const char *, size_t count); +}; + +#define SHOW_FIELDS(name) \ +static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ +{ \ + return sprintf(buf, "%lx\n", (unsigned long) b->name); \ +} +SHOW_FIELDS(interrupt_enable) +SHOW_FIELDS(threshold_limit) + +static ssize_t +store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) +{ + struct thresh_restart tr; + unsigned long new; + + if (strict_strtoul(buf, 0, &new) < 0) + return -EINVAL; + + b->interrupt_enable = !!new; + + tr.b = b; + tr.reset = 0; + tr.old_limit = 0; + + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); + + return size; +} + +static ssize_t +store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) +{ + struct thresh_restart tr; + unsigned long new; + + if (strict_strtoul(buf, 0, &new) < 0) + return -EINVAL; + + if (new > THRESHOLD_MAX) + new = THRESHOLD_MAX; + if (new < 1) + new = 1; + + tr.old_limit = b->threshold_limit; + b->threshold_limit = new; + tr.b = b; + tr.reset = 0; + + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); + + return size; +} + +struct threshold_block_cross_cpu { + struct threshold_block *tb; + long retval; +}; + +static void local_error_count_handler(void *_tbcc) +{ + struct threshold_block_cross_cpu *tbcc = _tbcc; + struct threshold_block *b = tbcc->tb; + u32 low, high; + + rdmsr(b->address, low, high); + tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); +} + +static ssize_t show_error_count(struct threshold_block *b, char *buf) +{ + struct threshold_block_cross_cpu tbcc = { .tb = b, }; + + smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1); + return sprintf(buf, "%lx\n", tbcc.retval); +} + +static ssize_t store_error_count(struct threshold_block *b, + const char *buf, size_t count) +{ + struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; + + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); + return 1; +} + +#define RW_ATTR(val) \ +static struct threshold_attr val = { \ + .attr = {.name = __stringify(val), .mode = 0644 }, \ + .show = show_## val, \ + .store = store_## val, \ +}; + +RW_ATTR(interrupt_enable); +RW_ATTR(threshold_limit); +RW_ATTR(error_count); + +static struct attribute *default_attrs[] = { + &interrupt_enable.attr, + &threshold_limit.attr, + &error_count.attr, + NULL +}; + +#define to_block(k) container_of(k, struct threshold_block, kobj) +#define to_attr(a) container_of(a, struct threshold_attr, attr) + +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct threshold_block *b = to_block(kobj); + struct threshold_attr *a = to_attr(attr); + ssize_t ret; + + ret = a->show ? a->show(b, buf) : -EIO; + + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct threshold_block *b = to_block(kobj); + struct threshold_attr *a = to_attr(attr); + ssize_t ret; + + ret = a->store ? a->store(b, buf, count) : -EIO; + + return ret; +} + +static struct sysfs_ops threshold_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type threshold_ktype = { + .sysfs_ops = &threshold_ops, + .default_attrs = default_attrs, +}; + +static __cpuinit int allocate_threshold_blocks(unsigned int cpu, + unsigned int bank, + unsigned int block, + u32 address) +{ + struct threshold_block *b = NULL; + u32 low, high; + int err; + + if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) + return 0; + + if (rdmsr_safe_on_cpu(cpu, address, &low, &high)) + return 0; + + if (!(high & MASK_VALID_HI)) { + if (block) + goto recurse; + else + return 0; + } + + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) + goto recurse; + + b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); + if (!b) + return -ENOMEM; + + b->block = block; + b->bank = bank; + b->cpu = cpu; + b->address = address; + b->interrupt_enable = 0; + b->threshold_limit = THRESHOLD_MAX; + + INIT_LIST_HEAD(&b->miscj); + + if (per_cpu(threshold_banks, cpu)[bank]->blocks) { + list_add(&b->miscj, + &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); + } else { + per_cpu(threshold_banks, cpu)[bank]->blocks = b; + } + + err = kobject_init_and_add(&b->kobj, &threshold_ktype, + per_cpu(threshold_banks, cpu)[bank]->kobj, + "misc%i", block); + if (err) + goto out_free; +recurse: + if (!block) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + return 0; + address += MCG_XBLK_ADDR; + } else { + ++address; + } + + err = allocate_threshold_blocks(cpu, bank, ++block, address); + if (err) + goto out_free; + + if (b) + kobject_uevent(&b->kobj, KOBJ_ADD); + + return err; + +out_free: + if (b) { + kobject_put(&b->kobj); + kfree(b); + } + return err; +} + +static __cpuinit long +local_allocate_threshold_blocks(int cpu, unsigned int bank) +{ + return allocate_threshold_blocks(cpu, bank, 0, + MSR_IA32_MC0_MISC + bank * 4); +} + +/* symlinks sibling shared banks to first core. first core owns dir/files. */ +static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) +{ + int i, err = 0; + struct threshold_bank *b = NULL; + char name[32]; + + sprintf(name, "threshold_bank%i", bank); + +#ifdef CONFIG_SMP + if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ + i = cpumask_first(cpu_core_mask(cpu)); + + /* first core not up yet */ + if (cpu_data(i).cpu_core_id) + goto out; + + /* already linked */ + if (per_cpu(threshold_banks, cpu)[bank]) + goto out; + + b = per_cpu(threshold_banks, i)[bank]; + + if (!b) + goto out; + + err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj, + b->kobj, name); + if (err) + goto out; + + cpumask_copy(b->cpus, cpu_core_mask(cpu)); + per_cpu(threshold_banks, cpu)[bank] = b; + + goto out; + } +#endif + + b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); + if (!b) { + err = -ENOMEM; + goto out; + } + if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) { + kfree(b); + err = -ENOMEM; + goto out; + } + + b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj); + if (!b->kobj) + goto out_free; + +#ifndef CONFIG_SMP + cpumask_setall(b->cpus); +#else + cpumask_copy(b->cpus, cpu_core_mask(cpu)); +#endif + + per_cpu(threshold_banks, cpu)[bank] = b; + + err = local_allocate_threshold_blocks(cpu, bank); + if (err) + goto out_free; + + for_each_cpu(i, b->cpus) { + if (i == cpu) + continue; + + err = sysfs_create_link(&per_cpu(mce_dev, i).kobj, + b->kobj, name); + if (err) + goto out; + + per_cpu(threshold_banks, i)[bank] = b; + } + + goto out; + +out_free: + per_cpu(threshold_banks, cpu)[bank] = NULL; + free_cpumask_var(b->cpus); + kfree(b); +out: + return err; +} + +/* create dir/files for all valid threshold banks */ +static __cpuinit int threshold_create_device(unsigned int cpu) +{ + unsigned int bank; + int err = 0; + + for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, cpu) & (1 << bank))) + continue; + err = threshold_create_bank(cpu, bank); + if (err) + goto out; + } +out: + return err; +} + +/* + * let's be hotplug friendly. + * in case of multiple core processors, the first core always takes ownership + * of shared sysfs dir/files, and rest of the cores will be symlinked to it. + */ + +static void deallocate_threshold_block(unsigned int cpu, + unsigned int bank) +{ + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank]; + + if (!head) + return; + + list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { + kobject_put(&pos->kobj); + list_del(&pos->miscj); + kfree(pos); + } + + kfree(per_cpu(threshold_banks, cpu)[bank]->blocks); + per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; +} + +static void threshold_remove_bank(unsigned int cpu, int bank) +{ + struct threshold_bank *b; + char name[32]; + int i = 0; + + b = per_cpu(threshold_banks, cpu)[bank]; + if (!b) + return; + if (!b->blocks) + goto free_out; + + sprintf(name, "threshold_bank%i", bank); + +#ifdef CONFIG_SMP + /* sibling symlink */ + if (shared_bank[bank] && b->blocks->cpu != cpu) { + sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name); + per_cpu(threshold_banks, cpu)[bank] = NULL; + + return; + } +#endif + + /* remove all sibling symlinks before unregistering */ + for_each_cpu(i, b->cpus) { + if (i == cpu) + continue; + + sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name); + per_cpu(threshold_banks, i)[bank] = NULL; + } + + deallocate_threshold_block(cpu, bank); + +free_out: + kobject_del(b->kobj); + kobject_put(b->kobj); + free_cpumask_var(b->cpus); + kfree(b); + per_cpu(threshold_banks, cpu)[bank] = NULL; +} + +static void threshold_remove_device(unsigned int cpu) +{ + unsigned int bank; + + for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, cpu) & (1 << bank))) + continue; + threshold_remove_bank(cpu, bank); + } +} + +/* get notified when a cpu comes on/off */ +static void __cpuinit +amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) +{ + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + threshold_create_device(cpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + threshold_remove_device(cpu); + break; + default: + break; + } +} + +static __init int threshold_init_device(void) +{ + unsigned lcpu = 0; + + /* to hit CPUs online before the notifier is up */ + for_each_online_cpu(lcpu) { + int err = threshold_create_device(lcpu); + + if (err) + return err; + } + threshold_cpu_callback = amd_64_threshold_cpu_callback; + + return 0; +} +device_initcall(threshold_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c deleted file mode 100644 index ddae21620bd..00000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ /dev/null @@ -1,703 +0,0 @@ -/* - * (c) 2005, 2006 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - * - * Written by Jacob Shin - AMD, Inc. - * - * Support : jacob.shin@amd.com - * - * April 2006 - * - added support for AMD Family 0x10 processors - * - * All MC4_MISCi registers are shared between multi-cores - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#define PFX "mce_threshold: " -#define VERSION "version 1.1.1" -#define NR_BANKS 6 -#define NR_BLOCKS 9 -#define THRESHOLD_MAX 0xFFF -#define INT_TYPE_APIC 0x00020000 -#define MASK_VALID_HI 0x80000000 -#define MASK_CNTP_HI 0x40000000 -#define MASK_LOCKED_HI 0x20000000 -#define MASK_LVTOFF_HI 0x00F00000 -#define MASK_COUNT_EN_HI 0x00080000 -#define MASK_INT_TYPE_HI 0x00060000 -#define MASK_OVERFLOW_HI 0x00010000 -#define MASK_ERR_COUNT_HI 0x00000FFF -#define MASK_BLKPTR_LO 0xFF000000 -#define MCG_XBLK_ADDR 0xC0000400 - -struct threshold_block { - unsigned int block; - unsigned int bank; - unsigned int cpu; - u32 address; - u16 interrupt_enable; - u16 threshold_limit; - struct kobject kobj; - struct list_head miscj; -}; - -/* defaults used early on boot */ -static struct threshold_block threshold_defaults = { - .interrupt_enable = 0, - .threshold_limit = THRESHOLD_MAX, -}; - -struct threshold_bank { - struct kobject *kobj; - struct threshold_block *blocks; - cpumask_var_t cpus; -}; -static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); - -#ifdef CONFIG_SMP -static unsigned char shared_bank[NR_BANKS] = { - 0, 0, 0, 0, 1 -}; -#endif - -static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ - -static void amd_threshold_interrupt(void); - -/* - * CPU Initialization - */ - -struct thresh_restart { - struct threshold_block *b; - int reset; - u16 old_limit; -}; - -/* must be called with correct cpu affinity */ -/* Called via smp_call_function_single() */ -static void threshold_restart_bank(void *_tr) -{ - struct thresh_restart *tr = _tr; - u32 mci_misc_hi, mci_misc_lo; - - rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); - - if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) - tr->reset = 1; /* limit cannot be lower than err count */ - - if (tr->reset) { /* reset err count and overflow bit */ - mci_misc_hi = - (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | - (THRESHOLD_MAX - tr->b->threshold_limit); - } else if (tr->old_limit) { /* change limit w/o reset */ - int new_count = (mci_misc_hi & THRESHOLD_MAX) + - (tr->old_limit - tr->b->threshold_limit); - - mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | - (new_count & THRESHOLD_MAX); - } - - tr->b->interrupt_enable ? - (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : - (mci_misc_hi &= ~MASK_INT_TYPE_HI); - - mci_misc_hi |= MASK_COUNT_EN_HI; - wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); -} - -/* cpu init entry point, called from mce.c with preempt off */ -void mce_amd_feature_init(struct cpuinfo_x86 *c) -{ - unsigned int cpu = smp_processor_id(); - u32 low = 0, high = 0, address = 0; - unsigned int bank, block; - struct thresh_restart tr; - u8 lvt_off; - - for (bank = 0; bank < NR_BANKS; ++bank) { - for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) - address = MSR_IA32_MC0_MISC + bank * 4; - else if (block == 1) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - break; - address += MCG_XBLK_ADDR; - } else - ++address; - - if (rdmsr_safe(address, &low, &high)) - break; - - if (!(high & MASK_VALID_HI)) { - if (block) - continue; - else - break; - } - - if (!(high & MASK_CNTP_HI) || - (high & MASK_LOCKED_HI)) - continue; - - if (!block) - per_cpu(bank_map, cpu) |= (1 << bank); -#ifdef CONFIG_SMP - if (shared_bank[bank] && c->cpu_core_id) - break; -#endif - lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR, - APIC_EILVT_MSG_FIX, 0); - - high &= ~MASK_LVTOFF_HI; - high |= lvt_off << 20; - wrmsr(address, low, high); - - threshold_defaults.address = address; - tr.b = &threshold_defaults; - tr.reset = 0; - tr.old_limit = 0; - threshold_restart_bank(&tr); - - mce_threshold_vector = amd_threshold_interrupt; - } - } -} - -/* - * APIC Interrupt Handler - */ - -/* - * threshold interrupt handler will service THRESHOLD_APIC_VECTOR. - * the interrupt goes off when error_count reaches threshold_limit. - * the handler will simply log mcelog w/ software defined bank number. - */ -static void amd_threshold_interrupt(void) -{ - u32 low = 0, high = 0, address = 0; - unsigned int bank, block; - struct mce m; - - mce_setup(&m); - - /* assume first bank caused it */ - for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) - continue; - for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) { - address = MSR_IA32_MC0_MISC + bank * 4; - } else if (block == 1) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - break; - address += MCG_XBLK_ADDR; - } else { - ++address; - } - - if (rdmsr_safe(address, &low, &high)) - break; - - if (!(high & MASK_VALID_HI)) { - if (block) - continue; - else - break; - } - - if (!(high & MASK_CNTP_HI) || - (high & MASK_LOCKED_HI)) - continue; - - /* - * Log the machine check that caused the threshold - * event. - */ - machine_check_poll(MCP_TIMESTAMP, - &__get_cpu_var(mce_poll_banks)); - - if (high & MASK_OVERFLOW_HI) { - rdmsrl(address, m.misc); - rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, - m.status); - m.bank = K8_MCE_THRESHOLD_BASE - + bank * NR_BLOCKS - + block; - mce_log(&m); - return; - } - } - } -} - -/* - * Sysfs Interface - */ - -struct threshold_attr { - struct attribute attr; - ssize_t (*show) (struct threshold_block *, char *); - ssize_t (*store) (struct threshold_block *, const char *, size_t count); -}; - -#define SHOW_FIELDS(name) \ -static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ -{ \ - return sprintf(buf, "%lx\n", (unsigned long) b->name); \ -} -SHOW_FIELDS(interrupt_enable) -SHOW_FIELDS(threshold_limit) - -static ssize_t -store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) -{ - struct thresh_restart tr; - unsigned long new; - - if (strict_strtoul(buf, 0, &new) < 0) - return -EINVAL; - - b->interrupt_enable = !!new; - - tr.b = b; - tr.reset = 0; - tr.old_limit = 0; - - smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - - return size; -} - -static ssize_t -store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) -{ - struct thresh_restart tr; - unsigned long new; - - if (strict_strtoul(buf, 0, &new) < 0) - return -EINVAL; - - if (new > THRESHOLD_MAX) - new = THRESHOLD_MAX; - if (new < 1) - new = 1; - - tr.old_limit = b->threshold_limit; - b->threshold_limit = new; - tr.b = b; - tr.reset = 0; - - smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - - return size; -} - -struct threshold_block_cross_cpu { - struct threshold_block *tb; - long retval; -}; - -static void local_error_count_handler(void *_tbcc) -{ - struct threshold_block_cross_cpu *tbcc = _tbcc; - struct threshold_block *b = tbcc->tb; - u32 low, high; - - rdmsr(b->address, low, high); - tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); -} - -static ssize_t show_error_count(struct threshold_block *b, char *buf) -{ - struct threshold_block_cross_cpu tbcc = { .tb = b, }; - - smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1); - return sprintf(buf, "%lx\n", tbcc.retval); -} - -static ssize_t store_error_count(struct threshold_block *b, - const char *buf, size_t count) -{ - struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; - - smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - return 1; -} - -#define RW_ATTR(val) \ -static struct threshold_attr val = { \ - .attr = {.name = __stringify(val), .mode = 0644 }, \ - .show = show_## val, \ - .store = store_## val, \ -}; - -RW_ATTR(interrupt_enable); -RW_ATTR(threshold_limit); -RW_ATTR(error_count); - -static struct attribute *default_attrs[] = { - &interrupt_enable.attr, - &threshold_limit.attr, - &error_count.attr, - NULL -}; - -#define to_block(k) container_of(k, struct threshold_block, kobj) -#define to_attr(a) container_of(a, struct threshold_attr, attr) - -static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) -{ - struct threshold_block *b = to_block(kobj); - struct threshold_attr *a = to_attr(attr); - ssize_t ret; - - ret = a->show ? a->show(b, buf) : -EIO; - - return ret; -} - -static ssize_t store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t count) -{ - struct threshold_block *b = to_block(kobj); - struct threshold_attr *a = to_attr(attr); - ssize_t ret; - - ret = a->store ? a->store(b, buf, count) : -EIO; - - return ret; -} - -static struct sysfs_ops threshold_ops = { - .show = show, - .store = store, -}; - -static struct kobj_type threshold_ktype = { - .sysfs_ops = &threshold_ops, - .default_attrs = default_attrs, -}; - -static __cpuinit int allocate_threshold_blocks(unsigned int cpu, - unsigned int bank, - unsigned int block, - u32 address) -{ - struct threshold_block *b = NULL; - u32 low, high; - int err; - - if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) - return 0; - - if (rdmsr_safe_on_cpu(cpu, address, &low, &high)) - return 0; - - if (!(high & MASK_VALID_HI)) { - if (block) - goto recurse; - else - return 0; - } - - if (!(high & MASK_CNTP_HI) || - (high & MASK_LOCKED_HI)) - goto recurse; - - b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); - if (!b) - return -ENOMEM; - - b->block = block; - b->bank = bank; - b->cpu = cpu; - b->address = address; - b->interrupt_enable = 0; - b->threshold_limit = THRESHOLD_MAX; - - INIT_LIST_HEAD(&b->miscj); - - if (per_cpu(threshold_banks, cpu)[bank]->blocks) { - list_add(&b->miscj, - &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); - } else { - per_cpu(threshold_banks, cpu)[bank]->blocks = b; - } - - err = kobject_init_and_add(&b->kobj, &threshold_ktype, - per_cpu(threshold_banks, cpu)[bank]->kobj, - "misc%i", block); - if (err) - goto out_free; -recurse: - if (!block) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - return 0; - address += MCG_XBLK_ADDR; - } else { - ++address; - } - - err = allocate_threshold_blocks(cpu, bank, ++block, address); - if (err) - goto out_free; - - if (b) - kobject_uevent(&b->kobj, KOBJ_ADD); - - return err; - -out_free: - if (b) { - kobject_put(&b->kobj); - kfree(b); - } - return err; -} - -static __cpuinit long -local_allocate_threshold_blocks(int cpu, unsigned int bank) -{ - return allocate_threshold_blocks(cpu, bank, 0, - MSR_IA32_MC0_MISC + bank * 4); -} - -/* symlinks sibling shared banks to first core. first core owns dir/files. */ -static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) -{ - int i, err = 0; - struct threshold_bank *b = NULL; - char name[32]; - - sprintf(name, "threshold_bank%i", bank); - -#ifdef CONFIG_SMP - if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ - i = cpumask_first(cpu_core_mask(cpu)); - - /* first core not up yet */ - if (cpu_data(i).cpu_core_id) - goto out; - - /* already linked */ - if (per_cpu(threshold_banks, cpu)[bank]) - goto out; - - b = per_cpu(threshold_banks, i)[bank]; - - if (!b) - goto out; - - err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj, - b->kobj, name); - if (err) - goto out; - - cpumask_copy(b->cpus, cpu_core_mask(cpu)); - per_cpu(threshold_banks, cpu)[bank] = b; - - goto out; - } -#endif - - b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); - if (!b) { - err = -ENOMEM; - goto out; - } - if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) { - kfree(b); - err = -ENOMEM; - goto out; - } - - b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj); - if (!b->kobj) - goto out_free; - -#ifndef CONFIG_SMP - cpumask_setall(b->cpus); -#else - cpumask_copy(b->cpus, cpu_core_mask(cpu)); -#endif - - per_cpu(threshold_banks, cpu)[bank] = b; - - err = local_allocate_threshold_blocks(cpu, bank); - if (err) - goto out_free; - - for_each_cpu(i, b->cpus) { - if (i == cpu) - continue; - - err = sysfs_create_link(&per_cpu(mce_dev, i).kobj, - b->kobj, name); - if (err) - goto out; - - per_cpu(threshold_banks, i)[bank] = b; - } - - goto out; - -out_free: - per_cpu(threshold_banks, cpu)[bank] = NULL; - free_cpumask_var(b->cpus); - kfree(b); -out: - return err; -} - -/* create dir/files for all valid threshold banks */ -static __cpuinit int threshold_create_device(unsigned int cpu) -{ - unsigned int bank; - int err = 0; - - for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) - continue; - err = threshold_create_bank(cpu, bank); - if (err) - goto out; - } -out: - return err; -} - -/* - * let's be hotplug friendly. - * in case of multiple core processors, the first core always takes ownership - * of shared sysfs dir/files, and rest of the cores will be symlinked to it. - */ - -static void deallocate_threshold_block(unsigned int cpu, - unsigned int bank) -{ - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank]; - - if (!head) - return; - - list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { - kobject_put(&pos->kobj); - list_del(&pos->miscj); - kfree(pos); - } - - kfree(per_cpu(threshold_banks, cpu)[bank]->blocks); - per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; -} - -static void threshold_remove_bank(unsigned int cpu, int bank) -{ - struct threshold_bank *b; - char name[32]; - int i = 0; - - b = per_cpu(threshold_banks, cpu)[bank]; - if (!b) - return; - if (!b->blocks) - goto free_out; - - sprintf(name, "threshold_bank%i", bank); - -#ifdef CONFIG_SMP - /* sibling symlink */ - if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name); - per_cpu(threshold_banks, cpu)[bank] = NULL; - - return; - } -#endif - - /* remove all sibling symlinks before unregistering */ - for_each_cpu(i, b->cpus) { - if (i == cpu) - continue; - - sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name); - per_cpu(threshold_banks, i)[bank] = NULL; - } - - deallocate_threshold_block(cpu, bank); - -free_out: - kobject_del(b->kobj); - kobject_put(b->kobj); - free_cpumask_var(b->cpus); - kfree(b); - per_cpu(threshold_banks, cpu)[bank] = NULL; -} - -static void threshold_remove_device(unsigned int cpu) -{ - unsigned int bank; - - for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) - continue; - threshold_remove_bank(cpu, bank); - } -} - -/* get notified when a cpu comes on/off */ -static void __cpuinit -amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) -{ - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - threshold_create_device(cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - threshold_remove_device(cpu); - break; - default: - break; - } -} - -static __init int threshold_init_device(void) -{ - unsigned lcpu = 0; - - /* to hit CPUs online before the notifier is up */ - for_each_online_cpu(lcpu) { - int err = threshold_create_device(lcpu); - - if (err) - return err; - } - threshold_cpu_callback = amd_64_threshold_cpu_callback; - - return 0; -} -device_initcall(threshold_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c new file mode 100644 index 00000000000..663a88e8b3c --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -0,0 +1,225 @@ +/* + * Intel specific MCE features. + * Copyright 2004 Zwane Mwaikambo + * Copyright (C) 2008, 2009 Intel Corporation + * Author: Andi Kleen + */ + +#include +#include +#include +#include +#include +#include + +/* + * Support for Intel Correct Machine Check Interrupts. This allows + * the CPU to raise an interrupt when a corrected machine check happened. + * Normally we pick those up using a regular polling timer. + * Also supports reliable discovery of shared banks. + */ + +static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); + +/* + * cmci_discover_lock protects against parallel discovery attempts + * which could race against each other. + */ +static DEFINE_SPINLOCK(cmci_discover_lock); + +#define CMCI_THRESHOLD 1 + +static int cmci_supported(int *banks) +{ + u64 cap; + + if (mce_cmci_disabled || mce_ignore_ce) + return 0; + + /* + * Vendor check is not strictly needed, but the initial + * initialization is vendor keyed and this + * makes sure none of the backdoors are entered otherwise. + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + if (!cpu_has_apic || lapic_get_maxlvt() < 6) + return 0; + rdmsrl(MSR_IA32_MCG_CAP, cap); + *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); + return !!(cap & MCG_CMCI_P); +} + +/* + * The interrupt handler. This is called on every event. + * Just call the poller directly to log any events. + * This could in theory increase the threshold under high load, + * but doesn't for now. + */ +static void intel_threshold_interrupt(void) +{ + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + mce_notify_irq(); +} + +static void print_update(char *type, int *hdr, int num) +{ + if (*hdr == 0) + printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); + *hdr = 1; + printk(KERN_CONT " %s:%d", type, num); +} + +/* + * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks + * on this CPU. Use the algorithm recommended in the SDM to discover shared + * banks. + */ +static void cmci_discover(int banks, int boot) +{ + unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); + unsigned long flags; + int hdr = 0; + int i; + + spin_lock_irqsave(&cmci_discover_lock, flags); + for (i = 0; i < banks; i++) { + u64 val; + + if (test_bit(i, owned)) + continue; + + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + + /* Already owned by someone else? */ + if (val & CMCI_EN) { + if (test_and_clear_bit(i, owned) || boot) + print_update("SHD", &hdr, i); + __clear_bit(i, __get_cpu_var(mce_poll_banks)); + continue; + } + + val |= CMCI_EN | CMCI_THRESHOLD; + wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + + /* Did the enable bit stick? -- the bank supports CMCI */ + if (val & CMCI_EN) { + if (!test_and_set_bit(i, owned) || boot) + print_update("CMCI", &hdr, i); + __clear_bit(i, __get_cpu_var(mce_poll_banks)); + } else { + WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); + } + } + spin_unlock_irqrestore(&cmci_discover_lock, flags); + if (hdr) + printk(KERN_CONT "\n"); +} + +/* + * Just in case we missed an event during initialization check + * all the CMCI owned banks. + */ +void cmci_recheck(void) +{ + unsigned long flags; + int banks; + + if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) + return; + local_irq_save(flags); + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + local_irq_restore(flags); +} + +/* + * Disable CMCI on this CPU for all banks it owns when it goes down. + * This allows other CPUs to claim the banks on rediscovery. + */ +void cmci_clear(void) +{ + unsigned long flags; + int i; + int banks; + u64 val; + + if (!cmci_supported(&banks)) + return; + spin_lock_irqsave(&cmci_discover_lock, flags); + for (i = 0; i < banks; i++) { + if (!test_bit(i, __get_cpu_var(mce_banks_owned))) + continue; + /* Disable CMCI */ + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); + wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + __clear_bit(i, __get_cpu_var(mce_banks_owned)); + } + spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + +/* + * After a CPU went down cycle through all the others and rediscover + * Must run in process context. + */ +void cmci_rediscover(int dying) +{ + int banks; + int cpu; + cpumask_var_t old; + + if (!cmci_supported(&banks)) + return; + if (!alloc_cpumask_var(&old, GFP_KERNEL)) + return; + cpumask_copy(old, ¤t->cpus_allowed); + + for_each_online_cpu(cpu) { + if (cpu == dying) + continue; + if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) + continue; + /* Recheck banks in case CPUs don't all have the same */ + if (cmci_supported(&banks)) + cmci_discover(banks, 0); + } + + set_cpus_allowed_ptr(current, old); + free_cpumask_var(old); +} + +/* + * Reenable CMCI on this CPU in case a CPU down failed. + */ +void cmci_reenable(void) +{ + int banks; + if (cmci_supported(&banks)) + cmci_discover(banks, 0); +} + +static void intel_init_cmci(void) +{ + int banks; + + if (!cmci_supported(&banks)) + return; + + mce_threshold_vector = intel_threshold_interrupt; + cmci_discover(banks, 1); + /* + * For CPU #0 this runs with still disabled APIC, but that's + * ok because only the vector is set up. We still do another + * check for the banks later for CPU #0 just to make sure + * to not miss any events. + */ + apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); + cmci_recheck(); +} + +void mce_intel_feature_init(struct cpuinfo_x86 *c) +{ + intel_init_thermal(c); + intel_init_cmci(); +} diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c deleted file mode 100644 index 663a88e8b3c..00000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Intel specific MCE features. - * Copyright 2004 Zwane Mwaikambo - * Copyright (C) 2008, 2009 Intel Corporation - * Author: Andi Kleen - */ - -#include -#include -#include -#include -#include -#include - -/* - * Support for Intel Correct Machine Check Interrupts. This allows - * the CPU to raise an interrupt when a corrected machine check happened. - * Normally we pick those up using a regular polling timer. - * Also supports reliable discovery of shared banks. - */ - -static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); - -/* - * cmci_discover_lock protects against parallel discovery attempts - * which could race against each other. - */ -static DEFINE_SPINLOCK(cmci_discover_lock); - -#define CMCI_THRESHOLD 1 - -static int cmci_supported(int *banks) -{ - u64 cap; - - if (mce_cmci_disabled || mce_ignore_ce) - return 0; - - /* - * Vendor check is not strictly needed, but the initial - * initialization is vendor keyed and this - * makes sure none of the backdoors are entered otherwise. - */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) - return 0; - if (!cpu_has_apic || lapic_get_maxlvt() < 6) - return 0; - rdmsrl(MSR_IA32_MCG_CAP, cap); - *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); - return !!(cap & MCG_CMCI_P); -} - -/* - * The interrupt handler. This is called on every event. - * Just call the poller directly to log any events. - * This could in theory increase the threshold under high load, - * but doesn't for now. - */ -static void intel_threshold_interrupt(void) -{ - machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); - mce_notify_irq(); -} - -static void print_update(char *type, int *hdr, int num) -{ - if (*hdr == 0) - printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); - *hdr = 1; - printk(KERN_CONT " %s:%d", type, num); -} - -/* - * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks - * on this CPU. Use the algorithm recommended in the SDM to discover shared - * banks. - */ -static void cmci_discover(int banks, int boot) -{ - unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); - unsigned long flags; - int hdr = 0; - int i; - - spin_lock_irqsave(&cmci_discover_lock, flags); - for (i = 0; i < banks; i++) { - u64 val; - - if (test_bit(i, owned)) - continue; - - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); - - /* Already owned by someone else? */ - if (val & CMCI_EN) { - if (test_and_clear_bit(i, owned) || boot) - print_update("SHD", &hdr, i); - __clear_bit(i, __get_cpu_var(mce_poll_banks)); - continue; - } - - val |= CMCI_EN | CMCI_THRESHOLD; - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); - - /* Did the enable bit stick? -- the bank supports CMCI */ - if (val & CMCI_EN) { - if (!test_and_set_bit(i, owned) || boot) - print_update("CMCI", &hdr, i); - __clear_bit(i, __get_cpu_var(mce_poll_banks)); - } else { - WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); - } - } - spin_unlock_irqrestore(&cmci_discover_lock, flags); - if (hdr) - printk(KERN_CONT "\n"); -} - -/* - * Just in case we missed an event during initialization check - * all the CMCI owned banks. - */ -void cmci_recheck(void) -{ - unsigned long flags; - int banks; - - if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) - return; - local_irq_save(flags); - machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); - local_irq_restore(flags); -} - -/* - * Disable CMCI on this CPU for all banks it owns when it goes down. - * This allows other CPUs to claim the banks on rediscovery. - */ -void cmci_clear(void) -{ - unsigned long flags; - int i; - int banks; - u64 val; - - if (!cmci_supported(&banks)) - return; - spin_lock_irqsave(&cmci_discover_lock, flags); - for (i = 0; i < banks; i++) { - if (!test_bit(i, __get_cpu_var(mce_banks_owned))) - continue; - /* Disable CMCI */ - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); - val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); - __clear_bit(i, __get_cpu_var(mce_banks_owned)); - } - spin_unlock_irqrestore(&cmci_discover_lock, flags); -} - -/* - * After a CPU went down cycle through all the others and rediscover - * Must run in process context. - */ -void cmci_rediscover(int dying) -{ - int banks; - int cpu; - cpumask_var_t old; - - if (!cmci_supported(&banks)) - return; - if (!alloc_cpumask_var(&old, GFP_KERNEL)) - return; - cpumask_copy(old, ¤t->cpus_allowed); - - for_each_online_cpu(cpu) { - if (cpu == dying) - continue; - if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) - continue; - /* Recheck banks in case CPUs don't all have the same */ - if (cmci_supported(&banks)) - cmci_discover(banks, 0); - } - - set_cpus_allowed_ptr(current, old); - free_cpumask_var(old); -} - -/* - * Reenable CMCI on this CPU in case a CPU down failed. - */ -void cmci_reenable(void) -{ - int banks; - if (cmci_supported(&banks)) - cmci_discover(banks, 0); -} - -static void intel_init_cmci(void) -{ - int banks; - - if (!cmci_supported(&banks)) - return; - - mce_threshold_vector = intel_threshold_interrupt; - cmci_discover(banks, 1); - /* - * For CPU #0 this runs with still disabled APIC, but that's - * ok because only the vector is set up. We still do another - * check for the banks later for CPU #0 just to make sure - * to not miss any events. - */ - apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); - cmci_recheck(); -} - -void mce_intel_feature_init(struct cpuinfo_x86 *c) -{ - intel_init_thermal(c); - intel_init_cmci(); -} -- cgit v1.2.3 From 95ee14e4379c5e19c0897c872350570402014742 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 9 Jun 2009 18:20:39 -0700 Subject: x86: cap iomem_resource to addressable physical memory iomem_resource is by default initialized to -1, which means 64 bits of physical address space if 64-bit resources are enabled. However, x86 CPUs cannot address 64 bits of physical address space. Thus, we want to cap the physical address space to what the union of all CPU can actually address. Without this patch, we may end up assigning inaccessible values to uninitialized 64-bit PCI memory resources. Signed-off-by: H. Peter Anvin Cc: Matthew Wilcox Cc: Jesse Barnes Cc: Martin Mares Cc: stable@kernel.org --- arch/x86/kernel/cpu/common.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3ffdcfa9abd..5b9cb8839ca 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -853,6 +853,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) numa_add_cpu(smp_processor_id()); #endif + + /* Cap the iomem address space to what is addressable on all CPUs */ + iomem_resource.end &= (1ULL << c->x86_phys_bits) - 1; } #ifdef CONFIG_X86_64 -- cgit v1.2.3 From e2a7147640a54eb812c8ab5f3ee4424b92db4856 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Tue, 16 Jun 2009 16:43:40 -0500 Subject: x86: correct the conversion of EFI memory types This patch causes all the EFI_RESERVED_TYPE memory reservations to be recorded in the e820 table as type E820_RESERVED. (This patch replaces one called 'x86: vendor reserved memory type'. This version has been discussed a bit with Peter and Yinghai but not given a final opinion.) Without this patch EFI_RESERVED_TYPE memory reservations may be marked usable in the e820 table. There may be a collision between kernel use and some reserver's use of this memory. (An example use of this functionality is the UV system, which will access extremely large areas of memory with a memory engine that allows a user to address beyond the processor's range. Such areas are reserved in the EFI table by the BIOS. Some loaders have a restricted number of entries possible in the e820 table, hence the need to record the reservations in the unrestricted EFI table.) The call to do_add_efi_memmap() is only made if "add_efi_memmap" is specified on the kernel command line. Signed-off-by: Cliff Wickman Signed-off-by: H. Peter Anvin --- arch/x86/kernel/efi.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 1736acc4d7a..96f7ac0bbf0 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -240,10 +240,35 @@ static void __init do_add_efi_memmap(void) unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; int e820_type; - if (md->attribute & EFI_MEMORY_WB) - e820_type = E820_RAM; - else + switch (md->type) { + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + if (md->attribute & EFI_MEMORY_WB) + e820_type = E820_RAM; + else + e820_type = E820_RESERVED; + break; + case EFI_ACPI_RECLAIM_MEMORY: + e820_type = E820_ACPI; + break; + case EFI_ACPI_MEMORY_NVS: + e820_type = E820_NVS; + break; + case EFI_UNUSABLE_MEMORY: + e820_type = E820_UNUSABLE; + break; + default: + /* + * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE + * EFI_RUNTIME_SERVICES_DATA EFI_MEMORY_MAPPED_IO + * EFI_MEMORY_MAPPED_IO_PORT_SPACE EFI_PAL_CODE + */ e820_type = E820_RESERVED; + break; + } e820_add_region(start, size, e820_type); } sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); -- cgit v1.2.3 From 203abd67b75f7714ce98ab0cdbd6cfd7ad79dec4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 15 Jun 2009 14:52:01 +0200 Subject: x86: mce: Handle banks == 0 case in K7 quirk Vegard Nossum reported: > I get an MCE-related crash like this in latest linus tree: > > [ 0.115341] CPU: L1 I Cache: 64K (64 bytes/line), D cache 64K (64 bytes/line) > [ 0.116396] CPU: L2 Cache: 512K (64 bytes/line) > [ 0.120570] mce: CPU supports 0 MCE banks > [ 0.124870] BUG: unable to handle kernel NULL pointer dereference at 00000000 00000010 > [ 0.128001] IP: [] mcheck_init+0x278/0x320 > [ 0.128001] PGD 0 > [ 0.128001] Thread overran stack, or stack corrupted > [ 0.128001] Oops: 0002 [#1] PREEMPT SMP > [ 0.128001] last sysfs file: > [ 0.128001] CPU 0 > [ 0.128001] Modules linked in: > [ 0.128001] Pid: 0, comm: swapper Not tainted 2.6.30 #426 > [ 0.128001] RIP: 0010:[] [] mcheck_init+0x278/0x320 > [ 0.128001] RSP: 0018:ffffffff81595e38 EFLAGS: 00000246 > [ 0.128001] RAX: 0000000000000010 RBX: ffffffff8158f900 RCX: 0000000000000000 > [ 0.128001] RDX: 0000000000000000 RSI: 00000000000000ff RDI: 0000000000000010 > [ 0.128001] RBP: ffffffff81595e68 R08: 0000000000000001 R09: 0000000000000000 > [ 0.128001] R10: 0000000000000010 R11: 0000000000000000 R12: 0000000000000000 > [ 0.128001] R13: 00000000ffffffff R14: 0000000000000000 R15: 0000000000000000 > [ 0.128001] FS: 0000000000000000(0000) GS:ffff880002288000(0000) knlGS:00000 > 00000000000 > [ 0.128001] CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b > [ 0.128001] CR2: 0000000000000010 CR3: 0000000001001000 CR4: 00000000000006b0 > [ 0.128001] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > [ 0.128001] DR3: 0000000000000000 DR6: 0000000000000000 DR7: 0000000000000000 > [ 0.128001] Process swapper (pid: 0, threadinfo ffffffff81594000, task ffffff > ff8152a4a0) > [ 0.128001] Stack: > [ 0.128001] 0000000081595e68 5aa50ed3b4ddbe6e ffffffff8158f900 ffffffff8158f > 914 > [ 0.128001] ffffffff8158f948 0000000000000000 ffffffff81595eb8 ffffffff813b8 > 69c > [ 0.128001] 5aa50ed3b4ddbe6e 00000001078bfbfd 0000062300000800 5aa50ed3b4ddb > e6e > [ 0.128001] Call Trace: > [ 0.128001] [] identify_cpu+0x331/0x392 > [ 0.128001] [] identify_boot_cpu+0x23/0x6e > [ 0.128001] [] check_bugs+0x1c/0x60 > [ 0.128001] [] start_kernel+0x403/0x46e > [ 0.128001] [] x86_64_start_reservations+0xac/0xd5 > [ 0.128001] [] x86_64_start_kernel+0x115/0x14b > [ 0.128001] [] ? early_idt_handler+0x0/0x71 This happens on QEMU which reports MCA capability, but no banks. Without this patch there is a buffer overrun and boot ops because the code would try to initialize the 0 element of a zero length kmalloc() buffer. Reported-by: Vegard Nossum Tested-by: Pekka Enberg Signed-off-by: Andi Kleen LKML-Reference: <20090615125200.GD31969@one.firstfloor.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index fabba15e455..d9d77cfd8cc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1245,7 +1245,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * Various K7s with broken bank 0 around. Always disable * by default. */ - if (c->x86 == 6) + if (c->x86 == 6 && banks > 0) bank[0] = 0; } -- cgit v1.2.3 From 84599f8a59e77699f18f06948cea171a349a3f0f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Jun 2009 12:34:17 -0700 Subject: sched, x86: Fix cpufreq + sched_clock() TSC scaling For freqency dependent TSCs we only scale the cycles, we do not account for the discrepancy in absolute value. Our current formula is: time = cycles * mult (where mult is a function of the cpu-speed on variable tsc machines) Suppose our current cycle count is 10, and we have a multiplier of 5, then our time value would end up being 50. Now cpufreq comes along and changes the multiplier to say 3 or 7, which would result in our time being resp. 30 or 70. That means that we can observe random jumps in the time value due to frequency changes in both fwd and bwd direction. So what this patch does is change the formula to: time = cycles * frequency + offset And we calculate offset so that time_before == time_after, thereby ridding us of these jumps in time. [ Impact: fix/reduce sched_clock() jumps across frequency changing events ] Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar Chucked-on-by: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/kernel/tsc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3e1c057e98f..ef4dac50143 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -589,22 +589,26 @@ EXPORT_SYMBOL(recalibrate_cpu_khz); */ DEFINE_PER_CPU(unsigned long, cyc2ns); +DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { - unsigned long long tsc_now, ns_now; + unsigned long long tsc_now, ns_now, *offset; unsigned long flags, *scale; local_irq_save(flags); sched_clock_idle_sleep_event(); scale = &per_cpu(cyc2ns, cpu); + offset = &per_cpu(cyc2ns_offset, cpu); rdtscll(tsc_now); ns_now = __cycles_2_ns(tsc_now); - if (cpu_khz) + if (cpu_khz) { *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; + *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR); + } sched_clock_idle_wakeup_event(0); local_irq_restore(flags); -- cgit v1.2.3 From 5ce4243dcefbbc43791ffc36e1be55067ceec916 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 15 Jun 2009 22:26:33 +0400 Subject: x86: mce: Don't touch THERMAL_APIC_VECTOR if no active APIC present MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If APIC was disabled (for some reason) and as result it's not even mapped we should not try to enable thermal interrupts at all. Reported-by: Simon Holm Thøgersen Tested-by: Simon Holm Thøgersen Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090615182633.GA7606@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_intel.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 2b011d2d857..61e32881f41 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -21,9 +21,15 @@ void intel_init_thermal(struct cpuinfo_x86 *c) int tm2 = 0; u32 l, h; - /* Thermal monitoring depends on ACPI and clock modulation*/ - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + /* + * Thermal monitoring depends on ACPI, clock modulation + * and APIC as well + */ + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC) || + !cpu_has(c, X86_FEATURE_APIC)) { + pr_debug("Thermal monitoring disabled\n"); return; + } /* * First check if its enabled already, in which case there might -- cgit v1.2.3 From 50a8d4d29735ec99139e58ea3cb11bed3331cf87 Mon Sep 17 00:00:00 2001 From: "Figo.zhang" Date: Wed, 17 Jun 2009 22:25:20 +0800 Subject: x86, io_apic.c: Work around compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This compiler warning: arch/x86/kernel/apic/io_apic.c: In function ‘ioapic_write_entry’: arch/x86/kernel/apic/io_apic.c:466: warning: ‘eu’ is used uninitialized in this function arch/x86/kernel/apic/io_apic.c:465: note: ‘eu’ was declared here Is bogus as 'eu' is always initialized. But annotate it away by initializing the variable, to make it easier for people to notice real warnings. A compiler that sees through this logic will optimize away the initialization. Signed-off-by: Figo.zhang LKML-Reference: <1245248720.3312.27.camel@myhost> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ef8d9290c7e..95e03701106 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -462,7 +462,8 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { - union entry_union eu; + union entry_union eu = {{0, 0}}; + eu.entry = e; io_apic_write(apic, 0x11 + 2*pin, eu.w2); io_apic_write(apic, 0x10 + 2*pin, eu.w1); -- cgit v1.2.3 From 8f7007aabee22d6c186070e9532b0965f6036bb7 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 10 Jun 2009 12:41:01 -0700 Subject: x86: apic/io_apic.c: dmar_msi_type should be static Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 95e03701106..29d0752d951 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3568,7 +3568,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) #endif /* CONFIG_SMP */ -struct irq_chip dmar_msi_type = { +static struct irq_chip dmar_msi_type = { .name = "DMAR_MSI", .unmask = dmar_msi_unmask, .mask = dmar_msi_mask, -- cgit v1.2.3 From 1bf7b31efa0c322d93cb3f772cd9bc743e8bb42d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 17 Jun 2009 08:31:15 -0700 Subject: x86, mce: mce_intel.c needs mce_intel.c uses apic_write() and lapic_get_maxlvt(), and so it needs . Signed-off-by: H. Peter Anvin Cc: Andi Kleen Cc: Hidetoshi Seto --- arch/x86/kernel/cpu/mcheck/mce_intel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 663a88e8b3c..e1acec0f7a3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From fe955e5c793aab398794be4c5ede172d48446c4a Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 10 Jun 2009 12:41:02 -0700 Subject: x86: nmi: Add Intel processor 0x6f4 to NMI perfctr1 workaround Expand Intel NMI perfctr1 workaround to include a Core2 processor stepping (cpuid family-6, model-f, stepping-4). Resolves a situation where the NMI would not enable on these processors. Signed-off-by: Prarit Bhargava Acked-by: Suresh Siddha Cc: prarit@redhat.com Cc: suresh.b.siddha@intel.com Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perfctr-watchdog.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index d6f5b9fbde3..5c481f6205b 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -716,11 +716,15 @@ static void probe_nmi_watchdog(void) wd_ops = &k7_wd_ops; break; case X86_VENDOR_INTEL: - /* - * Work around Core Duo (Yonah) errata AE49 where perfctr1 - * doesn't have a working enable bit. + /* Work around where perfctr1 doesn't have a working enable + * bit as described in the following errata: + * AE49 Core Duo and Intel Core Solo 65 nm + * AN49 Intel Pentium Dual-Core + * AF49 Dual-Core Intel Xeon Processor LV */ - if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) { + if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) || + ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 && + boot_cpu_data.x86_mask == 4))) { intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; } -- cgit v1.2.3 From 8653f88ff93ba6bd6df4bac48908d41ad09dba7e Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Sat, 13 Jun 2009 20:21:26 +0800 Subject: x86: Remove duplicated #include's Signed-off-by: Huang Weiyi LKML-Reference: <1244895686-2348-1-git-send-email-weiyi.huang@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/probe_32.c | 11 ----------- arch/x86/kernel/apic/summit_32.c | 1 - 2 files changed, 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 440a8bccd91..0c0182cc947 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -20,23 +20,12 @@ #include #include -#include -#include -#include -#include -#include -#include -#include #include -#include #include -#include -#include #include #include #include -#include #ifdef CONFIG_HOTPLUG_CPU #define DEFAULT_SEND_IPI (1) diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 344eee4ac0a..eafdfbd1ea9 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -44,7 +44,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From 60f916dee612130c9977a8edd4abee98334202ba Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Jun 2009 19:00:20 +0200 Subject: perf_counter: x86: Set the period in the intel overflow handler Commit 9e350de37ac960 ("perf_counter: Accurate period data") missed a spot, which caused all Intel-PMU samples to have a period of 0. This broke auto-freq sampling. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index e8c68a5091d..ce1ae3f1f86 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1224,6 +1224,8 @@ again: if (!intel_pmu_save_and_restart(counter)) continue; + data.period = counter->hw.last_period; + if (perf_counter_overflow(counter, 1, &data)) intel_pmu_disable_counter(&counter->hw, bit); } -- cgit v1.2.3 From 3f4c3955ea320bde870ac2ce587466295aba5710 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 17 Jun 2009 22:13:22 +0400 Subject: x86, ioapic: Don't call disconnect_bsp_APIC if no APIC present Vegard Nossum reported: [ 503.576724] ACPI: Preparing to enter system sleep state S5 [ 503.710857] Disabling non-boot CPUs ... [ 503.716853] Power down. [ 503.717770] ------------[ cut here ]------------ [ 503.717770] WARNING: at arch/x86/kernel/apic/apic.c:249 native_apic_write_du) [ 503.717770] Hardware name: OptiPlex GX100 [ 503.717770] Modules linked in: [ 503.717770] Pid: 2136, comm: halt Not tainted 2.6.30 #443 [ 503.717770] Call Trace: [ 503.717770] [] ? printk+0x18/0x1a [ 503.717770] [] ? native_apic_write_dummy+0x38/0x50 [ 503.717770] [] warn_slowpath_common+0x6c/0xc0 [ 503.717770] [] ? native_apic_write_dummy+0x38/0x50 [ 503.717770] [] warn_slowpath_null+0x15/0x20 [ 503.717770] [] native_apic_write_dummy+0x38/0x50 [ 503.717770] [] disconnect_bsp_APIC+0x63/0x100 [ 503.717770] [] disable_IO_APIC+0xb8/0xc0 [ 503.717770] [] ? acpi_power_off+0x0/0x29 [ 503.717770] [] native_machine_shutdown+0x65/0x80 [ 503.717770] [] native_machine_power_off+0x26/0x30 [ 503.717770] [] machine_power_off+0x9/0x10 [ 503.717770] [] kernel_power_off+0x36/0x40 [ 503.717770] [] sys_reboot+0xfd/0x1f0 [ 503.717770] [] ? perf_swcounter_event+0xb0/0x130 [ 503.717770] [] ? perf_counter_task_sched_out+0x5d/0x120 [ 503.717770] [] ? finish_task_switch+0x56/0xd0 [ 503.717770] [] ? schedule+0x49e/0xb40 [ 503.717770] [] ? sys_kill+0x70/0x160 [ 503.717770] [] ? selinux_file_ioctl+0x3b/0x50 [ 503.717770] [] ? sys_ioctl+0x63/0x70 [ 503.717770] [] sysenter_do_call+0x12/0x22 [ 503.717770] ---[ end trace 8157b5d0ed378f15 ]--- | | That's including this commit: | | commit 103428e57be323c3c5545db8ad12667099bc6005 |Author: Cyrill Gorcunov |Date: Sun Jun 7 16:48:40 2009 +0400 | | x86, apic: Fix dummy apic read operation together with broken MP handling | If we have apic disabled we don't even switch to APIC mode and do not calling for connect_bsp_APIC. Though on SMP compiled kernel the native_machine_shutdown does try to write the apic register anyway. Fix it with explicit check if we really should touch apic registers. Reported-by: Vegard Nossum Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu LKML-Reference: <20090617181322.GG10822@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 29d0752d951..b7a79207295 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2004,7 +2004,9 @@ void disable_IO_APIC(void) /* * Use virtual wire A mode when interrupt remapping is enabled. */ - disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1); + if (cpu_has_apic) + disconnect_bsp_APIC(!intr_remapping_enabled && + ioapic_i8259.pin != -1); } #ifdef CONFIG_X86_32 -- cgit v1.2.3 From 2e04bc76560decd9270be2a805927316f617ef56 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Thu, 18 Jun 2009 00:35:57 +0200 Subject: i386: fix return to 16-bit stack from NMI handler Returning to a task with a 16-bit stack requires special care: the iret instruction does not restore the high word of esp in that case. The espfix code fixes this, but currently is not invoked on NMIs. This means that a running task gets the upper word of esp clobbered due intervening NMIs. To reproduce, compile and run the following program with the nmi watchdog enabled (nmi_watchdog=2 on the command line). Using gdb you can see that the high bits of esp contain garbage, while the low bits are still correct. This patch puts the espfix code back into the NMI code path. The patch is slightly complicated due to the irqtrace infrastructure not being NMI-safe. The NMI return path cannot call TRACE_IRQS_IRET. Otherwise, the tail of the normal iret-code is correct for the nmi code path too. To be able to share this code-path, the TRACE_IRQS_IRET was move up a bit. The espfix code exists after the TRACE_IRQS_IRET, but this code explicitly disables interrupts. This short interrupts-off section is now not traced anymore. The return-to-kernel path now always includes the preliminary test to decide if the espfix code should be called. This is never the case, but doing it this way keeps the patch as simple as possible and the few extra instructions should not affect timing in any significant way. #define _GNU_SOURCE #include #include #include #include #include #include int modify_ldt(int func, void *ptr, unsigned long bytecount) { return syscall(SYS_modify_ldt, func, ptr, bytecount); } /* this is assumed to be usable */ #define SEGBASEADDR 0x10000 #define SEGLIMIT 0x20000 /* 16-bit segment */ struct user_desc desc = { .entry_number = 0, .base_addr = SEGBASEADDR, .limit = SEGLIMIT, .seg_32bit = 0, .contents = 0, /* ??? */ .read_exec_only = 0, .limit_in_pages = 0, .seg_not_present = 0, .useable = 1 }; int main(void) { setvbuf(stdout, NULL, _IONBF, 0); /* map a 64 kb segment */ char *pointer = mmap((void *)SEGBASEADDR, SEGLIMIT+1, PROT_EXEC|PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0); if (pointer == NULL) { printf("could not map space\n"); return 0; } /* write ldt, new mode */ int err = modify_ldt(0x11, &desc, sizeof(desc)); if (err) { printf("error modifying ldt: %i\n", err); return 0; } for (int i=0; i<1000; i++) { asm volatile ( "pusha\n\t" "mov %ss, %eax\n\t" /* preserve ss:esp */ "mov %esp, %ebp\n\t" "push $7\n\t" /* index 0, ldt, user mode */ "push $65536-4096\n\t" /* esp */ "lss (%esp), %esp\n\t" /* switch to new stack */ "push %eax\n\t" /* save old ss:esp on new stack */ "push %ebp\n\t" "add $17*65536, %esp\n\t" /* set high bits */ "mov %esp, %edx\n\t" "mov $10000000, %ecx\n\t" /* wait... */ "1: loop 1b\n\t" /* ... a bit */ "cmp %esp, %edx\n\t" "je 1f\n\t" "ud2\n\t" /* esp changed inexplicably! */ "1:\n\t" "sub $17*65536, %esp\n\t" /* restore high bits */ "lss (%esp), %esp\n\t" /* restore old ss:esp */ "popa\n\t"); printf("\rx%ix", i); } return 0; } Signed-off-by: Alexander van Heukelum Acked-by: Stas Sergeev Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c929add475c..d7d1c7d20e4 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -84,7 +84,7 @@ #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else #define preempt_stop(clobbers) -#define resume_kernel restore_nocheck +#define resume_kernel restore_all #endif .macro TRACE_IRQS_IRET @@ -372,7 +372,7 @@ END(ret_from_exception) ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? - jnz restore_nocheck + jnz restore_all need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl @@ -540,6 +540,8 @@ syscall_exit: jne syscall_exit_work restore_all: + TRACE_IRQS_IRET +restore_all_notrace: movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS # Warning: PT_OLDSS(%esp) contains the wrong/random values if we # are returning to the kernel. @@ -551,8 +553,6 @@ restore_all: CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS restore_nocheck: - TRACE_IRQS_IRET -restore_nocheck_notrace: RESTORE_REGS 4 # skip orig_eax/error_code CFI_ADJUST_CFA_OFFSET -4 irq_return: @@ -601,8 +601,10 @@ ldt_ss: CFI_ADJUST_CFA_OFFSET 4 pushl %eax CFI_ADJUST_CFA_OFFSET 4 + /* Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the iret */ DISABLE_INTERRUPTS(CLBR_EAX) - TRACE_IRQS_OFF lss (%esp), %esp CFI_ADJUST_CFA_OFFSET -8 jmp restore_nocheck @@ -1329,7 +1331,7 @@ nmi_stack_correct: xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi - jmp restore_nocheck_notrace + jmp restore_all_notrace CFI_ENDPROC nmi_stack_fixup: -- cgit v1.2.3 From dc4c2a0aed3b09f6e255bd5c3faa50fe6e0b2ded Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Thu, 18 Jun 2009 00:35:58 +0200 Subject: i386: fix/simplify espfix stack switching, move it into assembly The espfix code triggers if we have a protected mode userspace application with a 16-bit stack. On returning to userspace, with iret, the CPU doesn't restore the high word of the stack pointer. This is an "official" bug, and the work-around used in the kernel is to temporarily switch to a 32-bit stack segment/pointer pair where the high word of the pointer is equal to the high word of the userspace stackpointer. The current implementation uses THREAD_SIZE to determine the cut-off, but there is no good reason not to use the more natural 64kb... However, implementing this by simply substituting THREAD_SIZE with 65536 in patch_espfix_desc crashed the test application. patch_espfix_desc tries to do what is described above, but gets it subtly wrong if the userspace stack pointer is just below a multiple of THREAD_SIZE: an overflow occurs to bit 13... With a bit of luck, when the kernelspace stackpointer is just below a 64kb-boundary, the overflow then ripples trough to bit 16 and userspace will see its stack pointer changed by 65536. This patch moves all espfix code into entry_32.S. Selecting a 16-bit cut-off simplifies the code. The game with changing the limit dynamically is removed too. It complicates matters and I see no value in it. Changing only the top 16-bit word of ESP is one instruction and it also implies that only two bytes of the ESPFIX GDT entry need to be changed and this can be implemented in just a handful simple to understand instructions. As a side effect, the operation to compute the original ESP from the ESPFIX ESP and the GDT entry simplifies a bit too, and the remaining three instructions have been expanded inline in entry_32.S. impact: can now reliably run userspace with ESP=xxxxfffc on 16-bit stack segment Signed-off-by: Alexander van Heukelum Acked-by: Stas Sergeev Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/entry_32.S | 49 ++++++++++++++++++++++++++++++-------------- 2 files changed, 35 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5b9cb8839ca..ba1131ac943 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -108,7 +108,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { /* data */ [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, - [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, + [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } }, [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, GDT_STACK_CANARY_INIT #endif diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index d7d1c7d20e4..848f73f0954 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -588,24 +588,34 @@ ldt_ss: jne restore_nocheck #endif - /* If returning to userspace with 16bit stack, - * try to fix the higher word of ESP, as the CPU - * won't restore it. - * This is an "official" bug of all the x86-compatible - * CPUs, which we can try to work around to make - * dosemu and wine happy. */ - movl PT_OLDESP(%esp), %eax - movl %esp, %edx - call patch_espfix_desc +/* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. + */ + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + PER_CPU(gdt_page, %ebx) + shr $16, %edx + mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ + mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ pushl $__ESPFIX_SS CFI_ADJUST_CFA_OFFSET 4 - pushl %eax + push %eax /* new kernel esp */ CFI_ADJUST_CFA_OFFSET 4 /* Disable interrupts, but do not irqtrace this section: we * will soon execute iret and the tracer was already set to * the irqstate after the iret */ DISABLE_INTERRUPTS(CLBR_EAX) - lss (%esp), %esp + lss (%esp), %esp /* switch to espfix segment */ CFI_ADJUST_CFA_OFFSET -8 jmp restore_nocheck CFI_ENDPROC @@ -718,15 +728,24 @@ PTREGSCALL(vm86) PTREGSCALL(vm86old) .macro FIXUP_ESPFIX_STACK - /* since we are on a wrong stack, we cant make it a C code :( */ +/* + * Switch back for ESPFIX stack to the normal zerobased stack + * + * We can't call C functions using the ESPFIX stack. This code reads + * the high word of the segment base from the GDT and swiches to the + * normal stack and adjusts ESP with the matching offset. + */ + /* fixup the stack */ PER_CPU(gdt_page, %ebx) - GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) - addl %esp, %eax + mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ + mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */ + shl $16, %eax + addl %esp, %eax /* the adjusted stack pointer */ pushl $__KERNEL_DS CFI_ADJUST_CFA_OFFSET 4 pushl %eax CFI_ADJUST_CFA_OFFSET 4 - lss (%esp), %esp + lss (%esp), %esp /* switch to the normal stack segment */ CFI_ADJUST_CFA_OFFSET -8 .endm .macro UNWIND_ESPFIX_STACK -- cgit v1.2.3 From bc3f5d3dbd576da94a575b1477b8e38551bf11da Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Thu, 18 Jun 2009 00:35:59 +0200 Subject: x86: de-assembler-ize asm/desc.h asm/desc.h is included in three assembly files, but the only macro it defines, GET_DESC_BASE, is never used. This patch removes the includes, removes the macro GET_DESC_BASE and the ASSEMBLY guard from asm/desc.h. Signed-off-by: Alexander van Heukelum Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 1 - arch/x86/kernel/head_32.S | 1 - arch/x86/kernel/head_64.S | 1 - 3 files changed, 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 848f73f0954..9f8ce77dbc6 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -48,7 +48,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index dc5ed4bdd88..8663afb5653 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 54b29bb24e7..fa54f78e2a0 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 74b602c7147212a7495879ec23fe6c2d3b470e06 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 17 Jun 2009 14:43:32 -0700 Subject: x86: fix duplicated sysfs attribute The sysfs attribute cmci_disabled was accidentall turned into a duplicate of ignore_ce, breaking all other attributes. Signed-off-by: Yinghai Lu Acked-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2a560cefb67..5aac9e4dd13 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1777,7 +1777,7 @@ static struct sysdev_ext_attribute attr_ignore_ce = { }; static struct sysdev_ext_attribute attr_cmci_disabled = { - _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_cmci_disabled), + _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), &mce_cmci_disabled }; -- cgit v1.2.3 From e92fae064ae42b2a4a77646f7655bca4c87bb1eb Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 17 Jun 2009 16:21:33 -0700 Subject: x86: use zalloc_cpumask_var for mce_dev_initialized We need a cleared cpu_mask to record if mce is initialized, especially when MAXSMP is used. used zalloc_... instead Signed-off-by: Yinghai Lu Reviewed-by: Hidetoshi Seto Cc: stable@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5aac9e4dd13..c2fb70d0286 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1969,7 +1969,7 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; - alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); + zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); err = mce_init_banks(); if (err) -- cgit v1.2.3 From b1f49f9582f9be6de5055cfa97eabf6246f2eaf7 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 18 Jun 2009 14:53:24 +0900 Subject: x86, mce: fix error path in mce_create_device() Don't skip removing mce_attrs in route from error2. Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Cc: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index c2fb70d0286..284d1de968b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1798,7 +1798,7 @@ static cpumask_var_t mce_dev_initialized; static __cpuinit int mce_create_device(unsigned int cpu) { int err; - int i; + int i, j; if (!mce_available(&boot_cpu_data)) return -EIO; @@ -1816,9 +1816,9 @@ static __cpuinit int mce_create_device(unsigned int cpu) if (err) goto error; } - for (i = 0; i < banks; i++) { + for (j = 0; j < banks; j++) { err = sysdev_create_file(&per_cpu(mce_dev, cpu), - &bank_attrs[i]); + &bank_attrs[j]); if (err) goto error2; } @@ -1826,8 +1826,8 @@ static __cpuinit int mce_create_device(unsigned int cpu) return 0; error2: - while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); + while (--j >= 0) + sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); error: while (--i >= 0) sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); -- cgit v1.2.3 From 71e308a239c098673570d0b417d42262bb535909 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 18 Jun 2009 12:45:08 -0400 Subject: function-graph: add stack frame test In case gcc does something funny with the stack frames, or the return from function code, we would like to detect that. An arch may implement passing of a variable that is unique to the function and can be saved on entering a function and can be tested when exiting the function. Usually the frame pointer can be used for this purpose. This patch also implements this for x86. Where it passes in the stack frame of the parent function, and will test that frame on exit. There was a case in x86_32 with optimize for size (-Os) where, for a few functions, gcc would align the stack frame and place a copy of the return address into it. The function graph tracer modified the copy and not the actual return address. On return from the funtion, it did not go to the tracer hook, but returned to the parent. This broke the function graph tracer, because the return of the parent (where gcc did not do this funky manipulation) returned to the location that the child function was suppose to. This caused strange kernel crashes. This test detected the problem and pointed out where the issue was. This modifies the parameters of one of the functions that the arch specific code calls, so it includes changes to arch code to accommodate the new prototype. Note, I notice that the parsic arch implements its own push_return_trace. This is now a generic function and the ftrace_push_return_trace should be used instead. This patch does not touch that code. Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Frederic Weisbecker Cc: Helge Deller Cc: Kyle McMartin Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_32.S | 2 ++ arch/x86/kernel/entry_64.S | 2 ++ arch/x86/kernel/ftrace.c | 6 ++++-- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c929add475c..0d4b28564c1 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1154,6 +1154,7 @@ ENTRY(ftrace_graph_caller) pushl %edx movl 0xc(%esp), %edx lea 0x4(%ebp), %eax + movl (%ebp), %ecx subl $MCOUNT_INSN_SIZE, %edx call prepare_ftrace_return popl %edx @@ -1168,6 +1169,7 @@ return_to_handler: pushl %eax pushl %ecx pushl %edx + movl %ebp, %eax call ftrace_return_to_handler movl %eax, 0xc(%esp) popl %edx diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index de74f0a3e0e..c251be74510 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -135,6 +135,7 @@ ENTRY(ftrace_graph_caller) leaq 8(%rbp), %rdi movq 0x38(%rsp), %rsi + movq (%rbp), %rdx subq $MCOUNT_INSN_SIZE, %rsi call prepare_ftrace_return @@ -150,6 +151,7 @@ GLOBAL(return_to_handler) /* Save the return values */ movq %rax, (%rsp) movq %rdx, 8(%rsp) + movq %rbp, %rdi call ftrace_return_to_handler diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index b79c5533c42..d94e1ea3b9f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -408,7 +408,8 @@ int ftrace_disable_ftrace_graph_caller(void) * Hook the return address and push it in the stack of return addrs * in current thread info. */ -void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, + unsigned long frame_pointer) { unsigned long old; int faulted; @@ -453,7 +454,8 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) { + if (ftrace_push_return_trace(old, self_addr, &trace.depth, + frame_pointer) == -EBUSY) { *parent = old; return; } -- cgit v1.2.3 From f9188e023c248d73f5b4a589b480e065c1864068 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Jun 2009 22:20:52 +0200 Subject: perf_counter: Make callchain samples extensible Before exposing upstream tools to a callchain-samples ABI, tidy it up to make it more extensible in the future: Use markers in the IP chain to denote context, use (u64)-1..-4095 range for these context markers because we use them for ERR_PTR(), so these addresses are unlikely to be mapped. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ce1ae3f1f86..76dfef23f78 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1555,9 +1555,9 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) */ static inline -void callchain_store(struct perf_callchain_entry *entry, unsigned long ip) +void callchain_store(struct perf_callchain_entry *entry, u64 ip) { - if (entry->nr < MAX_STACK_DEPTH) + if (entry->nr < PERF_MAX_STACK_DEPTH) entry->ip[entry->nr++] = ip; } @@ -1602,22 +1602,10 @@ static const struct stacktrace_ops backtrace_ops = { static void perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { - unsigned long bp; - char *stack; - int nr = entry->nr; - + callchain_store(entry, PERF_CONTEXT_KERNEL); callchain_store(entry, regs->ip); - stack = ((char *)regs + sizeof(struct pt_regs)); -#ifdef CONFIG_FRAME_POINTER - get_bp(bp); -#else - bp = 0; -#endif - - dump_trace(NULL, regs, (void *)&stack, bp, &backtrace_ops, entry); - - entry->kernel = entry->nr - nr; + dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); } /* @@ -1669,16 +1657,16 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) { struct stack_frame frame; const void __user *fp; - int nr = entry->nr; if (!user_mode(regs)) regs = task_pt_regs(current); fp = (void __user *)regs->bp; + callchain_store(entry, PERF_CONTEXT_USER); callchain_store(entry, regs->ip); - while (entry->nr < MAX_STACK_DEPTH) { + while (entry->nr < PERF_MAX_STACK_DEPTH) { frame.next_frame = NULL; frame.return_address = 0; @@ -1691,8 +1679,6 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, frame.return_address); fp = frame.next_frame; } - - entry->user = entry->nr - nr; } static void @@ -1728,9 +1714,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) entry = &__get_cpu_var(irq_entry); entry->nr = 0; - entry->hv = 0; - entry->kernel = 0; - entry->user = 0; perf_do_callchain(regs, entry); -- cgit v1.2.3 From 7b768f07dce463a054c9dd84862d15ccc3d2b712 Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Fri, 19 Jun 2009 17:14:59 -0700 Subject: ACPI: pdc init related memory leak with physical CPU hotplug arch_acpi_processor_cleanup_pdc() in x86 and ia64 results in memory allocated for _PDC objects that is never freed and will cause memory leak in case of physical CPU remove and add. Patch fixes the memory leak by freeing the objects soon after _PDC is evaluated. Reported-by: Bjorn Helgaas Signed-off-by: Venkatesh Pallipadi Signed-off-by: Len Brown --- arch/x86/kernel/acpi/processor.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index 7c074eec39f..d296f4a195c 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -72,6 +72,7 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) return; } + /* Initialize _PDC data based on the CPU vendor */ void arch_acpi_processor_init_pdc(struct acpi_processor *pr) { @@ -85,3 +86,15 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr) } EXPORT_SYMBOL(arch_acpi_processor_init_pdc); + +void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr) +{ + if (pr->pdc) { + kfree(pr->pdc->pointer->buffer.pointer); + kfree(pr->pdc->pointer); + kfree(pr->pdc); + pr->pdc = NULL; + } +} + +EXPORT_SYMBOL(arch_acpi_processor_cleanup_pdc); -- cgit v1.2.3 From 99bd0c0fc4b04da54cb311953ef9489931c19c63 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 19 Jun 2009 10:59:09 +0200 Subject: x86: Set cpu_llc_id on AMD CPUs This counts when building sched domains in case NUMA information is not available. ( See cpu_coregroup_mask() which uses llc_shared_map which in turn is created based on cpu_llc_id. ) Currently Linux builds domains as follows: (example from a dual socket quad-core system) CPU0 attaching sched-domain: domain 0: span 0-7 level CPU groups: 0 1 2 3 4 5 6 7 ... CPU7 attaching sched-domain: domain 0: span 0-7 level CPU groups: 7 0 1 2 3 4 5 6 Ever since that is borked for multi-core AMD CPU systems. This patch fixes that and now we get a proper: CPU0 attaching sched-domain: domain 0: span 0-3 level MC groups: 0 1 2 3 domain 1: span 0-7 level CPU groups: 0-3 4-7 ... CPU7 attaching sched-domain: domain 0: span 4-7 level MC groups: 7 4 5 6 domain 1: span 0-7 level CPU groups: 4-7 0-3 This allows scheduler to assign tasks to cores on different sockets (i.e. that don't share last level cache) for performance reasons. Signed-off-by: Andreas Herrmann LKML-Reference: <20090619085909.GJ5218@alberich.amd.com> Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e5b27d8f1b4..28e5f595604 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -258,13 +258,15 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT unsigned bits; + int cpu = smp_processor_id(); bits = c->x86_coreid_bits; - /* Low order bits define the core id (index of core in socket) */ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); /* Convert the initial APIC ID into the socket ID */ c->phys_proc_id = c->initial_apicid >> bits; + /* use socket ID also for last level cache */ + per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; #endif } -- cgit v1.2.3 From d9f2a5ecb2846d0fd368fb4c45182e43f38e4471 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 20 Jun 2009 13:19:25 +0530 Subject: perf_counter, x8: Fix L1-data-Cache-Store-Referencees for AMD Fix AMD's Data Cache Refills from System event. After this patch : ./tools/perf/perf stat -e l1d -e l1d-misses -e l1d-write -e l1d-prefetch -e l1d-prefetch-miss -e l1i -e l1i-misses -e l1i-prefetch -e l2 -e l2-misses -e l2-write -e dtlb -e dtlb-misses -e itlb -e itlb-misses -e bpu -e bpu-misses ls /dev/ > /dev/null Performance counter stats for 'ls /dev/': 2499484 L1-data-Cache-Load-Referencees (scaled from 3.97%) 70347 L1-data-Cache-Load-Misses (scaled from 7.30%) 9360 L1-data-Cache-Store-Referencees (scaled from 8.64%) 32804 L1-data-Cache-Prefetch-Referencees (scaled from 17.72%) 7693 L1-data-Cache-Prefetch-Misses (scaled from 22.97%) 2180945 L1-instruction-Cache-Load-Referencees (scaled from 28.48%) 14518 L1-instruction-Cache-Load-Misses (scaled from 35.00%) 2405 L1-instruction-Cache-Prefetch-Referencees (scaled from 34.89%) 71387 L2-Cache-Load-Referencees (scaled from 34.94%) 18732 L2-Cache-Load-Misses (scaled from 34.92%) 79918 L2-Cache-Store-Referencees (scaled from 36.02%) 1295294 Data-TLB-Cache-Load-Referencees (scaled from 35.99%) 30896 Data-TLB-Cache-Load-Misses (scaled from 33.36%) 1222030 Instruction-TLB-Cache-Load-Referencees (scaled from 29.46%) 357 Instruction-TLB-Cache-Load-Misses (scaled from 20.46%) 530888 Branch-Cache-Load-Referencees (scaled from 11.48%) 8638 Branch-Cache-Load-Misses (scaled from 5.09%) 0.011295149 seconds time elapsed. Earlier it always shows value 0. Signed-off-by: Jaswinder Singh Rajput LKML-Reference: <1245484165.3102.6.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 76dfef23f78..22eb3a1d4f9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -401,7 +401,7 @@ static const u64 amd_hw_cache_event_ids [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0042, /* Data Cache Refills from L2 */ + [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ [ C(RESULT_MISS) ] = 0, }, [ C(OP_PREFETCH) ] = { -- cgit v1.2.3 From c5806df9232d2a7f554b4839b57cac2e664fc256 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: fix duplicate free in setup_pcpu_remap() failure path In the failure path, setup_pcpu_remap() tries to free the area which has already been freed to make holes in the large page. Fix it. [ Impact: fix duplicate free in failure path ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 9c3f0823e6a..dfbc7e6c64d 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -228,7 +228,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) enomem: for_each_possible_cpu(cpu) if (pcpur_ptrs[cpu]) - free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); + free_bootmem(__pa(pcpur_ptrs[cpu]), pcpur_size); ret = -ENOMEM; out_free_ar: free_bootmem(__pa(pcpur_ptrs), ptrs_size); -- cgit v1.2.3 From 97c9bf0618cd40b05b4859c1f8a90d8ad97fefb2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: rename remap percpu first chunk allocator to lpage The "remap" allocator remaps large pages to build the first chunk; however, the name isn't very good because 4k allocator remaps too and the whole point of the remap allocator is using large page mapping. The allocator will be generalized and exported outside of x86, rename it to lpage before that happens. percpu_alloc kernel parameter is updated to accept both "remap" and "lpage" for lpage allocator. [ Impact: code cleanup, kernel parameter argument updated ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 50 +++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index dfbc7e6c64d..8794c0c94d2 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -124,7 +124,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, } /* - * Remap allocator + * Large page remap allocator * * This allocator uses PMD page as unit. A PMD page is allocated for * each cpu and each is remapped into vmalloc area using PMD mapping. @@ -137,20 +137,20 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, * better than only using 4k mappings while still being NUMA friendly. */ #ifdef CONFIG_NEED_MULTIPLE_NODES -static size_t pcpur_size __initdata; -static void **pcpur_ptrs __initdata; +static size_t pcpul_size __initdata; +static void **pcpul_ptrs __initdata; -static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) +static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) { size_t off = (size_t)pageno << PAGE_SHIFT; - if (off >= pcpur_size) + if (off >= pcpul_size) return NULL; - return virt_to_page(pcpur_ptrs[cpu] + off); + return virt_to_page(pcpul_ptrs[cpu] + off); } -static ssize_t __init setup_pcpu_remap(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size) { static struct vm_struct vm; size_t ptrs_size, dyn_size; @@ -170,36 +170,36 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) * Currently supports only single page. Supporting multiple * pages won't be too difficult if it ever becomes necessary. */ - pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE); - if (pcpur_size > PMD_SIZE) { + if (pcpul_size > PMD_SIZE) { pr_warning("PERCPU: static data is larger than large page, " "can't use large page\n"); return -EINVAL; } - dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; + dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; /* allocate pointer array and alloc large pages */ - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); - pcpur_ptrs = alloc_bootmem(ptrs_size); + ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_ptrs[0])); + pcpul_ptrs = alloc_bootmem(ptrs_size); for_each_possible_cpu(cpu) { - pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); - if (!pcpur_ptrs[cpu]) + pcpul_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); + if (!pcpul_ptrs[cpu]) goto enomem; /* - * Only use pcpur_size bytes and give back the rest. + * Only use pcpul_size bytes and give back the rest. * * Ingo: The 2MB up-rounding bootmem is needed to make * sure the partial 2MB page is still fully RAM - it's * not well-specified to have a PAT-incompatible area * (unmapped RAM, device memory, etc.) in that hole. */ - free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), - PMD_SIZE - pcpur_size); + free_bootmem(__pa(pcpul_ptrs[cpu] + pcpul_size), + PMD_SIZE - pcpul_size); - memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); + memcpy(pcpul_ptrs[cpu], __per_cpu_load, static_size); } /* allocate address and map */ @@ -212,7 +212,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) pmd = populate_extra_pmd((unsigned long)vm.addr + cpu * PMD_SIZE); - set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), + set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpul_ptrs[cpu])), PAGE_KERNEL_LARGE)); } @@ -220,22 +220,22 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) pr_info("PERCPU: Remapped at %p with large pages, static data " "%zu bytes\n", vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, + ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, PERCPU_FIRST_CHUNK_RESERVE, dyn_size, PMD_SIZE, vm.addr, NULL); goto out_free_ar; enomem: for_each_possible_cpu(cpu) - if (pcpur_ptrs[cpu]) - free_bootmem(__pa(pcpur_ptrs[cpu]), pcpur_size); + if (pcpul_ptrs[cpu]) + free_bootmem(__pa(pcpul_ptrs[cpu]), pcpul_size); ret = -ENOMEM; out_free_ar: - free_bootmem(__pa(pcpur_ptrs), ptrs_size); + free_bootmem(__pa(pcpul_ptrs), ptrs_size); return ret; } #else -static ssize_t __init setup_pcpu_remap(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size) { return -EINVAL; } @@ -367,7 +367,7 @@ void __init setup_per_cpu_areas(void) * of large page mappings. Please read comments on top of * each allocator for details. */ - ret = setup_pcpu_remap(static_size); + ret = setup_pcpu_lpage(static_size); if (ret < 0) ret = setup_pcpu_embed(static_size); if (ret < 0) -- cgit v1.2.3 From 0ff2587fd54bd6f66bc6914ada4eb77a7e819a5b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: prepare setup_pcpu_lpage() for pageattr fix Make the following changes in preparation of coming pageattr updates. * Define and use array of struct pcpul_ent instead of array of pointers. The only difference is ->cpu field which is set but unused yet. * Rename variables according to the above change. * Rename local variable vm to pcpul_vm and move it out of the function. [ Impact: no functional difference ] Signed-off-by: Tejun Heo Cc: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 58 ++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 8794c0c94d2..7d38941e2b8 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -137,8 +137,14 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, * better than only using 4k mappings while still being NUMA friendly. */ #ifdef CONFIG_NEED_MULTIPLE_NODES +struct pcpul_ent { + unsigned int cpu; + void *ptr; +}; + static size_t pcpul_size __initdata; -static void **pcpul_ptrs __initdata; +static struct pcpul_ent *pcpul_map __initdata; +static struct vm_struct pcpul_vm; static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) { @@ -147,13 +153,12 @@ static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) if (off >= pcpul_size) return NULL; - return virt_to_page(pcpul_ptrs[cpu] + off); + return virt_to_page(pcpul_map[cpu].ptr + off); } static ssize_t __init setup_pcpu_lpage(size_t static_size) { - static struct vm_struct vm; - size_t ptrs_size, dyn_size; + size_t map_size, dyn_size; unsigned int cpu; ssize_t ret; @@ -180,12 +185,14 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; /* allocate pointer array and alloc large pages */ - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_ptrs[0])); - pcpul_ptrs = alloc_bootmem(ptrs_size); + map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); + pcpul_map = alloc_bootmem(map_size); for_each_possible_cpu(cpu) { - pcpul_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); - if (!pcpul_ptrs[cpu]) + pcpul_map[cpu].cpu = cpu; + pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, + PMD_SIZE); + if (!pcpul_map[cpu].ptr) goto enomem; /* @@ -196,42 +203,43 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) * not well-specified to have a PAT-incompatible area * (unmapped RAM, device memory, etc.) in that hole. */ - free_bootmem(__pa(pcpul_ptrs[cpu] + pcpul_size), + free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), PMD_SIZE - pcpul_size); - memcpy(pcpul_ptrs[cpu], __per_cpu_load, static_size); + memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); } /* allocate address and map */ - vm.flags = VM_ALLOC; - vm.size = num_possible_cpus() * PMD_SIZE; - vm_area_register_early(&vm, PMD_SIZE); + pcpul_vm.flags = VM_ALLOC; + pcpul_vm.size = num_possible_cpus() * PMD_SIZE; + vm_area_register_early(&pcpul_vm, PMD_SIZE); for_each_possible_cpu(cpu) { - pmd_t *pmd; + pmd_t *pmd, pmd_v; - pmd = populate_extra_pmd((unsigned long)vm.addr - + cpu * PMD_SIZE); - set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpul_ptrs[cpu])), - PAGE_KERNEL_LARGE)); + pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + + cpu * PMD_SIZE); + pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), + PAGE_KERNEL_LARGE); + set_pmd(pmd, pmd_v); } /* we're ready, commit */ pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", vm.addr, static_size); + "%zu bytes\n", pcpul_vm.addr, static_size); ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, vm.addr, NULL); - goto out_free_ar; + PMD_SIZE, pcpul_vm.addr, NULL); + goto out_free_map; enomem: for_each_possible_cpu(cpu) - if (pcpul_ptrs[cpu]) - free_bootmem(__pa(pcpul_ptrs[cpu]), pcpul_size); + if (pcpul_map[cpu].ptr) + free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpul_ptrs), ptrs_size); +out_free_map: + free_bootmem(__pa(pcpul_map), map_size); return ret; } #else -- cgit v1.2.3 From e59a1bb2fdfb745c685f5b40ffbed126331d3223 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: fix pageattr handling for lpage percpu allocator and re-enable it lpage allocator aliases a PMD page for each cpu and returns whatever is unused to the page allocator. When the pageattr of the recycled pages are changed, this makes the two aliases point to the overlapping regions with different attributes which isn't allowed and known to cause subtle data corruption in certain cases. This can be handled in simliar manner to the x86_64 highmap alias. pageattr code should detect if the target pages have PMD alias and split the PMD alias and synchronize the attributes. pcpur allocator is updated to keep the allocated PMD pages map sorted in ascending address order and provide pcpu_lpage_remapped() function which binary searches the array to determine whether the given address is aliased and if so to which address. pageattr is updated to use pcpu_lpage_remapped() to detect the PMD alias and split it up as necessary from cpa_process_alias(). Jan Beulich spotted the original problem and incorrect usage of vaddr instead of laddr for lookup. With this, lpage percpu allocator should work correctly. Re-enable it. [ Impact: fix subtle lpage pageattr bug and re-enable lpage ] Signed-off-by: Tejun Heo Reported-by: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 72 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 7d38941e2b8..bad2fd22311 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -142,8 +142,8 @@ struct pcpul_ent { void *ptr; }; -static size_t pcpul_size __initdata; -static struct pcpul_ent *pcpul_map __initdata; +static size_t pcpul_size; +static struct pcpul_ent *pcpul_map; static struct vm_struct pcpul_vm; static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) @@ -160,15 +160,14 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) { size_t map_size, dyn_size; unsigned int cpu; + int i, j; ssize_t ret; /* * If large page isn't supported, there's no benefit in doing * this. Also, on non-NUMA, embedding is better. - * - * NOTE: disabled for now. */ - if (true || !cpu_has_pse || !pcpu_need_numa()) + if (!cpu_has_pse || !pcpu_need_numa()) return -EINVAL; /* @@ -231,16 +230,71 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, PERCPU_FIRST_CHUNK_RESERVE, dyn_size, PMD_SIZE, pcpul_vm.addr, NULL); - goto out_free_map; + + /* sort pcpul_map array for pcpu_lpage_remapped() */ + for (i = 0; i < num_possible_cpus() - 1; i++) + for (j = i + 1; j < num_possible_cpus(); j++) + if (pcpul_map[i].ptr > pcpul_map[j].ptr) { + struct pcpul_ent tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + return ret; enomem: for_each_possible_cpu(cpu) if (pcpul_map[cpu].ptr) free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); - ret = -ENOMEM; -out_free_map: free_bootmem(__pa(pcpul_map), map_size); - return ret; + return -ENOMEM; +} + +/** + * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area + * @kaddr: the kernel address in question + * + * Determine whether @kaddr falls in the pcpul recycled area. This is + * used by pageattr to detect VM aliases and break up the pcpu PMD + * mapping such that the same physical page is not mapped under + * different attributes. + * + * The recycled area is always at the tail of a partially used PMD + * page. + * + * RETURNS: + * Address of corresponding remapped pcpu address if match is found; + * otherwise, NULL. + */ +void *pcpu_lpage_remapped(void *kaddr) +{ + void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); + unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; + int left = 0, right = num_possible_cpus() - 1; + int pos; + + /* pcpul in use at all? */ + if (!pcpul_map) + return NULL; + + /* okay, perform binary search */ + while (left <= right) { + pos = (left + right) / 2; + + if (pcpul_map[pos].ptr < pmd_addr) + left = pos + 1; + else if (pcpul_map[pos].ptr > pmd_addr) + right = pos - 1; + else { + /* it shouldn't be in the area for the first chunk */ + WARN_ON(offset < pcpul_size); + + return pcpul_vm.addr + + pcpul_map[pos].cpu * PMD_SIZE + offset; + } + } + + return NULL; } #else static ssize_t __init setup_pcpu_lpage(size_t static_size) -- cgit v1.2.3 From fa8a7094ba1679b4b9b443e0ac9f5e046c79ee8d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: implement percpu_alloc kernel parameter According to Andi, it isn't clear whether lpage allocator is worth the trouble as there are many processors where PMD TLB is far scarcer than PTE TLB. The advantage or disadvantage probably depends on the actual size of percpu area and specific processor. As performance degradation due to TLB pressure tends to be highly workload specific and subtle, it is difficult to decide which way to go without more data. This patch implements percpu_alloc kernel parameter to allow selecting which first chunk allocator to use to ease debugging and testing. While at it, make sure all the failure paths report why something failed to help determining why certain allocator isn't working. Also, kill the "Great future plan" comment which had already been realized quite some time ago. [ Impact: allow explicit percpu first chunk allocator selection ] Signed-off-by: Tejun Heo Reported-by: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 69 ++++++++++++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index bad2fd22311..165ebd5ba83 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -156,20 +156,23 @@ static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) return virt_to_page(pcpul_map[cpu].ptr + off); } -static ssize_t __init setup_pcpu_lpage(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { size_t map_size, dyn_size; unsigned int cpu; int i, j; ssize_t ret; - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, on non-NUMA, embedding is better. - */ - if (!cpu_has_pse || !pcpu_need_numa()) + /* on non-NUMA, embedding is better */ + if (!chosen && !pcpu_need_numa()) return -EINVAL; + /* need PSE */ + if (!cpu_has_pse) { + pr_warning("PERCPU: lpage allocator requires PSE\n"); + return -EINVAL; + } + /* * Currently supports only single page. Supporting multiple * pages won't be too difficult if it ever becomes necessary. @@ -191,8 +194,11 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) pcpul_map[cpu].cpu = cpu; pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); - if (!pcpul_map[cpu].ptr) + if (!pcpul_map[cpu].ptr) { + pr_warning("PERCPU: failed to allocate large page " + "for cpu%u\n", cpu); goto enomem; + } /* * Only use pcpul_size bytes and give back the rest. @@ -297,7 +303,7 @@ void *pcpu_lpage_remapped(void *kaddr) return NULL; } #else -static ssize_t __init setup_pcpu_lpage(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { return -EINVAL; } @@ -311,7 +317,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) * mapping so that it can use PMD mapping without additional TLB * pressure. */ -static ssize_t __init setup_pcpu_embed(size_t static_size) +static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; @@ -320,7 +326,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) * this. Also, embedding allocation doesn't play well with * NUMA. */ - if (!cpu_has_pse || pcpu_need_numa()) + if (!chosen && (!cpu_has_pse || pcpu_need_numa())) return -EINVAL; return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, @@ -370,8 +376,11 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) void *ptr; ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) + if (!ptr) { + pr_warning("PERCPU: failed to allocate " + "4k page for cpu%u\n", cpu); goto enomem; + } memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); pcpu4k_pages[j++] = virt_to_page(ptr); @@ -395,6 +404,16 @@ out_free_ar: return ret; } +/* for explicit first chunk allocator selection */ +static char pcpu_chosen_alloc[16] __initdata; + +static int __init percpu_alloc_setup(char *str) +{ + strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1); + return 0; +} +early_param("percpu_alloc", percpu_alloc_setup); + static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -408,11 +427,6 @@ static inline void setup_percpu_segment(int cpu) #endif } -/* - * Great future plan: - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. - * Always point %gs to its beginning - */ void __init setup_per_cpu_areas(void) { size_t static_size = __per_cpu_end - __per_cpu_start; @@ -429,9 +443,26 @@ void __init setup_per_cpu_areas(void) * of large page mappings. Please read comments on top of * each allocator for details. */ - ret = setup_pcpu_lpage(static_size); - if (ret < 0) - ret = setup_pcpu_embed(static_size); + ret = -EINVAL; + if (strlen(pcpu_chosen_alloc)) { + if (strcmp(pcpu_chosen_alloc, "4k")) { + if (!strcmp(pcpu_chosen_alloc, "lpage")) + ret = setup_pcpu_lpage(static_size, true); + else if (!strcmp(pcpu_chosen_alloc, "embed")) + ret = setup_pcpu_embed(static_size, true); + else + pr_warning("PERCPU: unknown allocator %s " + "specified\n", pcpu_chosen_alloc); + if (ret < 0) + pr_warning("PERCPU: %s allocator failed (%zd), " + "falling back to 4k\n", + pcpu_chosen_alloc, ret); + } + } else { + ret = setup_pcpu_lpage(static_size, false); + if (ret < 0) + ret = setup_pcpu_embed(static_size, false); + } if (ret < 0) ret = setup_pcpu_4k(static_size); if (ret < 0) -- cgit v1.2.3 From 0017c869ddcb73069905d09f9e98e68627466237 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: ensure percpu lpage doesn't consume too much vmalloc space On extreme configuration (e.g. 32bit 32-way NUMA machine), lpage percpu first chunk allocator can consume too much of vmalloc space. Make it fall back to 4k allocator if the consumption goes over 20%. [ Impact: add sanity check for lpage percpu first chunk allocator ] Signed-off-by: Tejun Heo Reported-by: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 165ebd5ba83..29a3eef7cf4 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -163,9 +163,21 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) int i, j; ssize_t ret; - /* on non-NUMA, embedding is better */ - if (!chosen && !pcpu_need_numa()) - return -EINVAL; + if (!chosen) { + size_t vm_size = VMALLOC_END - VMALLOC_START; + size_t tot_size = num_possible_cpus() * PMD_SIZE; + + /* on non-NUMA, embedding is better */ + if (!pcpu_need_numa()) + return -EINVAL; + + /* don't consume more than 20% of vmalloc area */ + if (tot_size > vm_size / 5) { + pr_info("PERCPU: too large chunk size %zuMB for " + "large page remap\n", tot_size >> 20); + return -EINVAL; + } + } /* need PSE */ if (!cpu_has_pse) { -- cgit v1.2.3 From 854c879f5abf309ebd378bea1ee41acf4ddf7194 Mon Sep 17 00:00:00 2001 From: Pekka J Enberg Date: Mon, 22 Jun 2009 17:39:41 +0300 Subject: x86: Move init_gbpages() to setup_arch() The init_gbpages() function is conditionally called from init_memory_mapping() function. There are two call-sites where this 'after_bootmem' condition can be true: setup_arch() and mem_init() via pci_iommu_alloc(). Therefore, it's safe to move the call to init_gbpages() to setup_arch() as it's always called before mem_init(). This removes an after_bootmem use - paving the way to remove all uses of that state variable. Signed-off-by: Pekka Enberg Acked-by: Yinghai Lu LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index be5ae80f897..de2cab13284 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -289,6 +289,20 @@ void * __init extend_brk(size_t size, size_t align) return ret; } +#ifdef CONFIG_X86_64 +static void __init init_gbpages(void) +{ + if (direct_gbpages && cpu_has_gbpages) + printk(KERN_INFO "Using GB pages for direct mapping\n"); + else + direct_gbpages = 0; +} +#else +static inline void init_gbpages(void) +{ +} +#endif + static void __init reserve_brk(void) { if (_brk_end > _brk_start) @@ -871,6 +885,8 @@ void __init setup_arch(char **cmdline_p) reserve_brk(); + init_gbpages(); + /* max_pfn_mapped is updated here */ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< Date: Sat, 23 May 2009 00:41:15 +0800 Subject: Intel-IOMMU, intr-remap: source-id checking To support domain-isolation usages, the platform hardware must be capable of uniquely identifying the requestor (source-id) for each interrupt message. Without source-id checking for interrupt remapping , a rouge guest/VM with assigned devices can launch interrupt attacks to bring down anothe guest/VM or the VMM itself. This patch adds source-id checking for interrupt remapping, and then really isolates interrupts for guests/VMs with assigned devices. Because PCI subsystem is not initialized yet when set up IOAPIC entries, use read_pci_config_byte to access PCI config space directly. Signed-off-by: Weidong Han Signed-off-by: David Woodhouse --- arch/x86/kernel/apic/io_apic.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b7a79207295..4d0216fcb36 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1414,6 +1414,9 @@ int setup_ioapic_entry(int apic_id, int irq, irte.vector = vector; irte.dest_id = IRTE_DEST(destination); + /* Set source-id of interrupt request */ + set_ioapic_sid(&irte, apic_id); + modify_irte(irq, &irte); ir_entry->index2 = (index >> 15) & 0x1; @@ -3290,6 +3293,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); + /* Set source-id of interrupt request */ + set_msi_sid(&irte, pdev); + modify_irte(irq, &irte); msg->address_hi = MSI_ADDR_BASE_HI; -- cgit v1.2.3 From c14dab5c0782ef632742963a66276a195418a63c Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Wed, 24 Jun 2009 10:13:24 +0800 Subject: perf_counter, x86: Set global control MSR correctly Previous code made an assumption that the power on value of global control MSR has enabled all fixed and general purpose counters properly. However, this is not the case for certain Intel processors, such as Atom - and it might also be firmware dependent. Each enable bit in IA32_PERF_GLOBAL_CTRL is AND'ed with the enable bits for all privilege levels in the respective IA32_PERFEVTSELx or IA32_PERF_FIXED_CTR_CTRL MSRs to start/stop the counting of respective counters. Counting is enabled if the AND'ed results is true; counting is disabled when the result is false. The end result is that all fixed counters are always disabled on Atom processors because the assumption is just invalid. Fix this by not initializing the ctrl-mask out of the global MSR, but setting it to perf_counter_mask. Reported-by: Stephane Eranian Signed-off-by: Yong Wang Cc: Arjan van de Ven Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <20090624021324.GA2788@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 22eb3a1d4f9..a310d19faca 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -969,13 +969,6 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) if (!x86_pmu.num_counters_fixed) return -1; - /* - * Quirk, IA32_FIXED_CTRs do not work on current Atom processors: - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86_model == 28) - return -1; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) @@ -1428,8 +1421,6 @@ static int intel_pmu_init(void) */ x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); - /* * Install the hw-cache-events table: */ @@ -1514,6 +1505,7 @@ void __init init_hw_perf_counters(void) perf_counter_mask |= ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; + x86_pmu.intel_ctrl = perf_counter_mask; perf_counters_lapic_init(); register_die_notifier(&perf_counter_nmi_notifier); -- cgit v1.2.3 From 9c26f52b900f7207135bafc8789e1a4f5d43e096 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 24 Jun 2009 09:41:59 -0500 Subject: x86: Fix uv bau sending buffer initialization The initialization of the UV Broadcast Assist Unit's sending buffers was making an invalid assumption about the initialization of an MMR that defines its address. The BIOS will not be providing that MMR. So uv_activation_descriptor_init() should unconditionally set it. Tested on UV simulator. Signed-off-by: Cliff Wickman Cc: # for v2.6.30.x LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 124d40c575d..8ccabb8a2f6 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -711,7 +711,6 @@ uv_activation_descriptor_init(int node, int pnode) unsigned long pa; unsigned long m; unsigned long n; - unsigned long mmr_image; struct bau_desc *adp; struct bau_desc *ad2; @@ -727,12 +726,8 @@ uv_activation_descriptor_init(int node, int pnode) n = pa >> uv_nshift; m = pa & uv_mmask; - mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE); - if (mmr_image) { - uv_write_global_mmr64(pnode, (unsigned long) - UVH_LB_BAU_SB_DESCRIPTOR_BASE, - (n << UV_DESC_BASE_PNODE_SHIFT | m)); - } + uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, + (n << UV_DESC_BASE_PNODE_SHIFT | m)); /* * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each -- cgit v1.2.3 From 194002b274e9169a04beb1b23dcc132159bb566c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2009 16:35:24 +0200 Subject: perf_counter, x86: Add mmap counter read support Update the mmap control page with the needed information to use the userspace RDPMC instruction for self monitoring. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a310d19faca..b83474b6021 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -912,6 +912,8 @@ x86_perf_counter_set_period(struct perf_counter *counter, err = checking_wrmsrl(hwc->counter_base + idx, (u64)(-left) & x86_pmu.counter_mask); + perf_counter_update_userpage(counter); + return ret; } @@ -1034,6 +1036,8 @@ try_generic: x86_perf_counter_set_period(counter, hwc, idx); x86_pmu.enable(hwc, idx); + perf_counter_update_userpage(counter); + return 0; } @@ -1126,6 +1130,8 @@ static void x86_pmu_disable(struct perf_counter *counter) x86_perf_counter_update(counter, hwc, idx); cpuc->counters[idx] = NULL; clear_bit(idx, cpuc->used_mask); + + perf_counter_update_userpage(counter); } /* -- cgit v1.2.3 From 5211a242d0cbdded372aee59da18f80552b0a80a Mon Sep 17 00:00:00 2001 From: Kurt Garloff Date: Wed, 24 Jun 2009 14:32:11 -0700 Subject: x86: Add sysctl to allow panic on IOCK NMI error This patch introduces a new sysctl: /proc/sys/kernel/panic_on_io_nmi which defaults to 0 (off). When enabled, the kernel panics when the kernel receives an NMI caused by an IO error. The IO error triggered NMI indicates a serious system condition, which could result in IO data corruption. Rather than contiuing, panicing and dumping might be a better choice, so one can figure out what's causing the IO error. This could be especially important to companies running IO intensive applications where corruption must be avoided, e.g. a bank's databases. [ SuSE has been shipping it for a while, it was done at the request of a large database vendor, for their users. ] Signed-off-by: Kurt Garloff Signed-off-by: Roberto Angelino Signed-off-by: Greg Kroah-Hartman Cc: "Eric W. Biederman" LKML-Reference: <20090624213211.GA11291@kroah.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 1 + arch/x86/kernel/traps.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 95ea5fa7d44..c8405718a4c 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -22,6 +22,7 @@ #include "dumpstack.h" int panic_on_unrecovered_nmi; +int panic_on_io_nmi; unsigned int code_bytes = 64; int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a0f48f5671c..5204332f475 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -346,6 +346,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); + if (panic_on_io_nmi) + panic("NMI IOCK error: Not continuing"); + /* Re-enable the IOCK line, wait for a few seconds */ reason = (reason & 0xf) | 8; outb(reason, 0x61); -- cgit v1.2.3 From 5be6066a7f8d917db347d94f1b359b9b70dcb572 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 24 Jun 2009 09:21:10 +0900 Subject: x86, mce: percpu mcheck_timer should be pinned If CONFIG_NO_HZ + CONFIG_SMP, timer added via add_timer() might be migrated on other cpu. Use add_timer_on() instead. Avoids the following failure: Maciej Rutecki wrote: > > After normal boot I try: > > > > echo 1 > /sys/devices/system/machinecheck/machinecheck0/check_interval > > > > I found this in dmesg: > > > > [ 141.704025] ------------[ cut here ]------------ > > [ 141.704039] WARNING: at arch/x86/kernel/cpu/mcheck/mce.c:1102 > > mcheck_timer+0xf5/0x100() Reported-by: Maciej Rutecki Signed-off-by: Hidetoshi Seto Tested-by: Maciej Rutecki Acked-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 284d1de968b..af425b83202 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1117,7 +1117,7 @@ static void mcheck_timer(unsigned long data) *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); t->expires = jiffies + *n; - add_timer(t); + add_timer_on(t, smp_processor_id()); } static void mce_do_trigger(struct work_struct *work) @@ -1321,7 +1321,7 @@ static void mce_init_timer(void) return; setup_timer(t, mcheck_timer, smp_processor_id()); t->expires = round_jiffies(jiffies + *n); - add_timer(t); + add_timer_on(t, smp_processor_id()); } /* -- cgit v1.2.3 From ff8a4bae459a9b6455504127fcb78fdbc8e50e4c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sat, 27 Jun 2009 12:22:27 -0700 Subject: Revert "x86: cap iomem_resource to addressable physical memory" This reverts commit 95ee14e4379c5e19c0897c872350570402014742. Mikael Petterson reported that at least one of his systems will not boot as a result. We have ruled out the detection algorithm malfunctioning, so it is not a matter of producing the incorrect bitmasks; rather, something in the application of them fails. Revert the commit until we can root cause and correct this problem. -stable team: this means the underlying commit should be rejected. Reported-and-isolated-by: Mikael Petterson Signed-off-by: H. Peter Anvin LKML-Reference: <200906261559.n5QFxJH8027336@pilspetsen.it.uu.se> Cc: stable@kernel.org Cc: Grant Grundler --- arch/x86/kernel/cpu/common.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6b26d4deada..f1961c07af9 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -848,9 +848,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) numa_add_cpu(smp_processor_id()); #endif - - /* Cap the iomem address space to what is addressable on all CPUs */ - iomem_resource.end &= (1ULL << c->x86_phys_bits) - 1; } #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 4078c444cf667f018c3fc7ebf141131a2b7c9480 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 29 Jun 2009 00:41:11 -0700 Subject: perf_counter, x86: Update x86_pmu after WARN() The print out should read the value before changing the value. Signed-off-by: Yinghai Lu Cc: Peter Zijlstra LKML-Reference: <4A487017.4090007@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b83474b6021..d4cf4ce19aa 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1496,17 +1496,17 @@ void __init init_hw_perf_counters(void) pr_cont("%s PMU driver.\n", x86_pmu.name); if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { - x86_pmu.num_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", x86_pmu.num_counters, X86_PMC_MAX_GENERIC); + x86_pmu.num_counters = X86_PMC_MAX_GENERIC; } perf_counter_mask = (1 << x86_pmu.num_counters) - 1; perf_max_counters = x86_pmu.num_counters; if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { - x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; } perf_counter_mask |= -- cgit v1.2.3 From 3238c0c4d68d9a9022b411a11a4b933fbdb53a14 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 1 Jul 2009 18:56:16 +0100 Subject: intel-iommu: Make iommu=pt work on i386 too Signed-off-by: David Woodhouse --- arch/x86/kernel/pci-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 47630479b06..1a041bcf506 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -211,11 +211,11 @@ static __init int iommu_setup(char *p) #ifdef CONFIG_SWIOTLB if (!strncmp(p, "soft", 4)) swiotlb = 1; +#endif if (!strncmp(p, "pt", 2)) { iommu_pass_through = 1; return 1; } -#endif gart_parse_options(p); -- cgit v1.2.3 From 7c5371c403abb29f01bc6cff6c5096abdf2dc524 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 1 Jul 2009 12:32:18 -0700 Subject: x86: add boundary check for 32bit res before expand e820 resource to alignment fix hang with HIGHMEM_64G and 32bit resource. According to hpa and Linus, use (resource_size_t)-1 to fend off big ranges. Analyzed by hpa Reported-and-tested-by: Mikael Pettersson Signed-off-by: Yinghai Lu Signed-off-by: Linus Torvalds --- arch/x86/kernel/e820.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7271fa33d79..c4ca89d9aaf 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1383,6 +1383,8 @@ static unsigned long ram_alignment(resource_size_t pos) return 32*1024*1024; } +#define MAX_RESOURCE_SIZE ((resource_size_t)-1) + void __init e820_reserve_resources_late(void) { int i; @@ -1400,17 +1402,19 @@ void __init e820_reserve_resources_late(void) * avoid stolen RAM: */ for (i = 0; i < e820.nr_map; i++) { - struct e820entry *entry = &e820_saved.map[i]; - resource_size_t start, end; + struct e820entry *entry = &e820.map[i]; + u64 start, end; if (entry->type != E820_RAM) continue; start = entry->addr + entry->size; - end = round_up(start, ram_alignment(start)); - if (start == end) + end = round_up(start, ram_alignment(start)) - 1; + if (end > MAX_RESOURCE_SIZE) + end = MAX_RESOURCE_SIZE; + if (start >= end) continue; - reserve_region_with_split(&iomem_resource, start, - end - 1, "RAM buffer"); + reserve_region_with_split(&iomem_resource, start, end, + "RAM buffer"); } } -- cgit v1.2.3