From 241771ef016b5c0c83cd7a4372a74321c973c1e6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Dec 2008 10:39:53 +0100 Subject: performance counters: x86 support Implement performance counters for x86 Intel CPUs. It's simplified right now: the PERFMON CPU feature is assumed, which is available in Core2 and later Intel CPUs. The design is flexible to be extended to more CPU types as well. Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 2 + arch/x86/kernel/cpu/Makefile | 12 +- arch/x86/kernel/cpu/common.c | 2 + arch/x86/kernel/cpu/perf_counter.c | 571 +++++++++++++++++++++++++++++++++++++ arch/x86/kernel/entry_64.S | 5 + arch/x86/kernel/irq.c | 5 + arch/x86/kernel/irqinit_32.c | 3 + arch/x86/kernel/irqinit_64.c | 5 + arch/x86/kernel/signal.c | 7 +- arch/x86/kernel/syscall_table_32.S | 1 + 10 files changed, 607 insertions(+), 6 deletions(-) create mode 100644 arch/x86/kernel/cpu/perf_counter.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 16f94879b52..8ab8c185867 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -1147,6 +1148,7 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif + perf_counters_lapic_init(0); preempt_disable(); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 82ec6075c05..89e53361fe2 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -1,5 +1,5 @@ # -# Makefile for x86-compatible CPU details and quirks +# Makefile for x86-compatible CPU details, features and quirks # obj-y := intel_cacheinfo.o addon_cpuid_features.o @@ -16,11 +16,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_X86_MCE) += mcheck/ -obj-$(CONFIG_MTRR) += mtrr/ -obj-$(CONFIG_CPU_FREQ) += cpufreq/ +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o +obj-$(CONFIG_X86_MCE) += mcheck/ +obj-$(CONFIG_MTRR) += mtrr/ +obj-$(CONFIG_CPU_FREQ) += cpufreq/ + +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b9c9ea0217a..4461011db47 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -750,6 +751,7 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif + init_hw_perf_counters(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c new file mode 100644 index 00000000000..82440cbed0e --- /dev/null +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -0,0 +1,571 @@ +/* + * Performance counter x86 architecture code + * + * Copyright(C) 2008 Thomas Gleixner + * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static bool perf_counters_initialized __read_mostly; + +/* + * Number of (generic) HW counters: + */ +static int nr_hw_counters __read_mostly; +static u32 perf_counter_mask __read_mostly; + +/* No support for fixed function counters yet */ + +#define MAX_HW_COUNTERS 8 + +struct cpu_hw_counters { + struct perf_counter *counters[MAX_HW_COUNTERS]; + unsigned long used[BITS_TO_LONGS(MAX_HW_COUNTERS)]; + int enable_all; +}; + +/* + * Intel PerfMon v3. Used on Core2 and later. + */ +static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); + +const int intel_perfmon_event_map[] = +{ + [PERF_COUNT_CYCLES] = 0x003c, + [PERF_COUNT_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_CACHE_MISSES] = 0x412e, + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_BRANCH_MISSES] = 0x00c5, +}; + +const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); + +/* + * Setup the hardware configuration for a given hw_event_type + */ +int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) +{ + struct hw_perf_counter *hwc = &counter->hw; + + if (unlikely(!perf_counters_initialized)) + return -EINVAL; + + /* + * Count user events, and generate PMC IRQs: + * (keep 'enabled' bit clear for now) + */ + hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT; + + /* + * If privileged enough, count OS events too, and allow + * NMI events as well: + */ + hwc->nmi = 0; + if (capable(CAP_SYS_ADMIN)) { + hwc->config |= ARCH_PERFMON_EVENTSEL_OS; + if (hw_event_type & PERF_COUNT_NMI) + hwc->nmi = 1; + } + + hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; + hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; + + hwc->irq_period = counter->__irq_period; + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic counter period: + */ + if (!hwc->irq_period) + hwc->irq_period = 0x7FFFFFFF; + + hwc->next_count = -((s32) hwc->irq_period); + + /* + * Negative event types mean raw encoded event+umask values: + */ + if (hw_event_type < 0) { + counter->hw_event_type = -hw_event_type; + counter->hw_event_type &= ~PERF_COUNT_NMI; + } else { + hw_event_type &= ~PERF_COUNT_NMI; + if (hw_event_type >= max_intel_perfmon_events) + return -EINVAL; + /* + * The generic map: + */ + counter->hw_event_type = intel_perfmon_event_map[hw_event_type]; + } + hwc->config |= counter->hw_event_type; + counter->wakeup_pending = 0; + + return 0; +} + +static void __hw_perf_enable_all(void) +{ + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); +} + +void hw_perf_enable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + cpuc->enable_all = 1; + __hw_perf_enable_all(); +} + +void hw_perf_disable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + cpuc->enable_all = 0; + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); +} + +static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]); + +static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +{ + per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count; + + wrmsr(hwc->counter_base + idx, hwc->next_count, 0); + wrmsr(hwc->config_base + idx, hwc->config, 0); +} + +void hw_perf_counter_enable(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + int idx = hwc->idx; + + /* Try to get the previous counter again */ + if (test_and_set_bit(idx, cpuc->used)) { + idx = find_first_zero_bit(cpuc->used, nr_hw_counters); + set_bit(idx, cpuc->used); + hwc->idx = idx; + } + + perf_counters_lapic_init(hwc->nmi); + + wrmsr(hwc->config_base + idx, + hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + + cpuc->counters[idx] = counter; + counter->hw.config |= ARCH_PERFMON_EVENTSEL0_ENABLE; + __hw_perf_counter_enable(hwc, idx); +} + +#ifdef CONFIG_X86_64 +static inline void atomic64_counter_set(struct perf_counter *counter, u64 val) +{ + atomic64_set(&counter->count, val); +} + +static inline u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic64_read(&counter->count); +} +#else +/* + * Todo: add proper atomic64_t support to 32-bit x86: + */ +static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64) +{ + u32 *val32 = (void *)&val64; + + atomic_set(counter->count32 + 0, *(val32 + 0)); + atomic_set(counter->count32 + 1, *(val32 + 1)); +} + +static inline u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic_read(counter->count32 + 0) | + (u64) atomic_read(counter->count32 + 1) << 32; +} +#endif + +static void __hw_perf_save_counter(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) +{ + s64 raw = -1; + s64 delta; + int err; + + /* + * Get the raw hw counter value: + */ + err = rdmsrl_safe(hwc->counter_base + idx, &raw); + WARN_ON_ONCE(err); + + /* + * Rebase it to zero (it started counting at -irq_period), + * to see the delta since ->prev_count: + */ + delta = (s64)hwc->irq_period + (s64)(s32)raw; + + atomic64_counter_set(counter, hwc->prev_count + delta); + + /* + * Adjust the ->prev_count offset - if we went beyond + * irq_period of units, then we got an IRQ and the counter + * was set back to -irq_period: + */ + while (delta >= (s64)hwc->irq_period) { + hwc->prev_count += hwc->irq_period; + delta -= (s64)hwc->irq_period; + } + + /* + * Calculate the next raw counter value we'll write into + * the counter at the next sched-in time: + */ + delta -= (s64)hwc->irq_period; + + hwc->next_count = (s32)delta; +} + +void perf_counter_print_debug(void) +{ + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count; + int cpu, err, idx; + + local_irq_disable(); + + cpu = smp_processor_id(); + + err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &ctrl); + WARN_ON_ONCE(err); + + err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_STATUS, &status); + WARN_ON_ONCE(err); + + err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_OVF_CTRL, &overflow); + WARN_ON_ONCE(err); + + printk(KERN_INFO "\n"); + printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); + printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); + printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); + + for (idx = 0; idx < nr_hw_counters; idx++) { + err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); + WARN_ON_ONCE(err); + + err = rdmsrl_safe(MSR_ARCH_PERFMON_PERFCTR0 + idx, &pmc_count); + WARN_ON_ONCE(err); + + next_count = per_cpu(prev_next_count[idx], cpu); + + printk(KERN_INFO "CPU#%d: PMC%d ctrl: %016llx\n", + cpu, idx, pmc_ctrl); + printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n", + cpu, idx, pmc_count); + printk(KERN_INFO "CPU#%d: PMC%d next: %016llx\n", + cpu, idx, next_count); + } + local_irq_enable(); +} + +void hw_perf_counter_disable(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + unsigned int idx = hwc->idx; + + counter->hw.config &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(hwc->config_base + idx, hwc->config, 0); + + clear_bit(idx, cpuc->used); + cpuc->counters[idx] = NULL; + __hw_perf_save_counter(counter, hwc, idx); +} + +void hw_perf_counter_read(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + unsigned long addr = hwc->counter_base + hwc->idx; + s64 offs, val = -1LL; + s32 val32; + int err; + + /* Careful: NMI might modify the counter offset */ + do { + offs = hwc->prev_count; + err = rdmsrl_safe(addr, &val); + WARN_ON_ONCE(err); + } while (offs != hwc->prev_count); + + val32 = (s32) val; + val = (s64)hwc->irq_period + (s64)val32; + atomic64_counter_set(counter, hwc->prev_count + val); +} + +static void perf_store_irq_data(struct perf_counter *counter, u64 data) +{ + struct perf_data *irqdata = counter->irqdata; + + if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { + irqdata->overrun++; + } else { + u64 *p = (u64 *) &irqdata->data[irqdata->len]; + + *p = data; + irqdata->len += sizeof(u64); + } +} + +static void perf_save_and_restart(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + int idx = hwc->idx; + + wrmsr(hwc->config_base + idx, + hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + + if (hwc->config & ARCH_PERFMON_EVENTSEL0_ENABLE) { + __hw_perf_save_counter(counter, hwc, idx); + __hw_perf_counter_enable(hwc, idx); + } +} + +static void +perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) +{ + struct perf_counter_context *ctx = leader->ctx; + struct perf_counter *counter; + int bit; + + list_for_each_entry(counter, &ctx->counters, list) { + if (counter->record_type != PERF_RECORD_SIMPLE || + counter == leader) + continue; + + if (counter->active) { + /* + * When counter was not in the overflow mask, we have to + * read it from hardware. We read it as well, when it + * has not been read yet and clear the bit in the + * status mask. + */ + bit = counter->hw.idx; + if (!test_bit(bit, (unsigned long *) overflown) || + test_bit(bit, (unsigned long *) status)) { + clear_bit(bit, (unsigned long *) status); + perf_save_and_restart(counter); + } + } + perf_store_irq_data(leader, counter->hw_event_type); + perf_store_irq_data(leader, atomic64_counter_read(counter)); + } +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) +{ + int bit, cpu = smp_processor_id(); + struct cpu_hw_counters *cpuc; + u64 ack, status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + if (!status) { + ack_APIC_irq(); + return; + } + + /* Disable counters globally */ + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + ack_APIC_irq(); + + cpuc = &per_cpu(cpu_hw_counters, cpu); + +again: + ack = status; + for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { + struct perf_counter *counter = cpuc->counters[bit]; + + clear_bit(bit, (unsigned long *) &status); + if (!counter) + continue; + + perf_save_and_restart(counter); + + switch (counter->record_type) { + case PERF_RECORD_SIMPLE: + continue; + case PERF_RECORD_IRQ: + perf_store_irq_data(counter, instruction_pointer(regs)); + break; + case PERF_RECORD_GROUP: + perf_store_irq_data(counter, counter->hw_event_type); + perf_store_irq_data(counter, + atomic64_counter_read(counter)); + perf_handle_group(counter, &status, &ack); + break; + } + /* + * From NMI context we cannot call into the scheduler to + * do a task wakeup - but we mark these counters as + * wakeup_pending and initate a wakeup callback: + */ + if (nmi) { + counter->wakeup_pending = 1; + set_tsk_thread_flag(current, TIF_PERF_COUNTERS); + } else { + wake_up(&counter->waitq); + } + } + + wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0); + + /* + * Repeat if there is more work to be done: + */ + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + if (status) + goto again; + + /* + * Do not reenable when global enable is off: + */ + if (cpuc->enable_all) + __hw_perf_enable_all(); +} + +void smp_perf_counter_interrupt(struct pt_regs *regs) +{ + irq_enter(); +#ifdef CONFIG_X86_64 + add_pda(apic_perf_irqs, 1); +#else + per_cpu(irq_stat, smp_processor_id()).apic_perf_irqs++; +#endif + apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); + __smp_perf_counter_interrupt(regs, 0); + + irq_exit(); +} + +/* + * This handler is triggered by NMI contexts: + */ +void perf_counter_notify(struct pt_regs *regs) +{ + struct cpu_hw_counters *cpuc; + unsigned long flags; + int bit, cpu; + + local_irq_save(flags); + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); + + for_each_bit(bit, cpuc->used, nr_hw_counters) { + struct perf_counter *counter = cpuc->counters[bit]; + + if (!counter) + continue; + + if (counter->wakeup_pending) { + counter->wakeup_pending = 0; + wake_up(&counter->waitq); + } + } + + local_irq_restore(flags); +} + +void __cpuinit perf_counters_lapic_init(int nmi) +{ + u32 apic_val; + + if (!perf_counters_initialized) + return; + /* + * Enable the performance counter vector in the APIC LVT: + */ + apic_val = apic_read(APIC_LVTERR); + + apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED); + if (nmi) + apic_write(APIC_LVTPC, APIC_DM_NMI); + else + apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); + apic_write(APIC_LVTERR, apic_val); +} + +static int __kprobes +perf_counter_nmi_handler(struct notifier_block *self, + unsigned long cmd, void *__args) +{ + struct die_args *args = __args; + struct pt_regs *regs; + + if (likely(cmd != DIE_NMI_IPI)) + return NOTIFY_DONE; + + regs = args->regs; + + apic_write(APIC_LVTPC, APIC_DM_NMI); + __smp_perf_counter_interrupt(regs, 1); + + return NOTIFY_STOP; +} + +static __read_mostly struct notifier_block perf_counter_nmi_notifier = { + .notifier_call = perf_counter_nmi_handler +}; + +void __init init_hw_perf_counters(void) +{ + union cpuid10_eax eax; + unsigned int unused; + unsigned int ebx; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return; + + /* + * Check whether the Architectural PerfMon supports + * Branch Misses Retired Event or not. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + return; + + printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); + + printk(KERN_INFO "... version: %d\n", eax.split.version_id); + printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters); + nr_hw_counters = eax.split.num_counters; + if (nr_hw_counters > MAX_HW_COUNTERS) { + nr_hw_counters = MAX_HW_COUNTERS; + WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", + nr_hw_counters, MAX_HW_COUNTERS); + } + perf_counter_mask = (1 << nr_hw_counters) - 1; + perf_max_counters = nr_hw_counters; + + printk(KERN_INFO "... bit_width: %d\n", eax.split.bit_width); + printk(KERN_INFO "... mask_length: %d\n", eax.split.mask_length); + + perf_counters_lapic_init(0); + register_die_notifier(&perf_counter_nmi_notifier); + + perf_counters_initialized = true; +} diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3194636a429..fc013cfde30 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -984,6 +984,11 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt +#ifdef CONFIG_PERF_COUNTERS +apicinterrupt LOCAL_PERF_VECTOR \ + perf_counter_interrupt smp_perf_counter_interrupt +#endif + /* * Exception entry points. */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d1d4dc52f64..d92bc71e41a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -56,6 +56,10 @@ static int show_other_interrupts(struct seq_file *p) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); seq_printf(p, " Local timer interrupts\n"); + seq_printf(p, "CNT: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); + seq_printf(p, " Performance counter interrupts\n"); #endif #ifdef CONFIG_SMP seq_printf(p, "RES: "); @@ -160,6 +164,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; + sum += irq_stats(cpu)->apic_perf_irqs; #endif #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 607db63044a..6a33b5e3016 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -160,6 +160,9 @@ void __init native_init_IRQ(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +# ifdef CONFIG_PERF_COUNTERS + alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); +# endif #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 8670b3ce626..91d785c25ad 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -138,6 +138,11 @@ static void __init apic_intr_init(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + /* Performance monitoring interrupt: */ +#ifdef CONFIG_PERF_COUNTERS + alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); +#endif } void __init native_init_IRQ(void) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b1cc6da6420..dee553c503d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,7 +6,7 @@ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ - +#include #include #include #include @@ -891,6 +891,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) tracehook_notify_resume(regs); } + if (thread_info_flags & _TIF_PERF_COUNTERS) { + clear_thread_flag(TIF_PERF_COUNTERS); + perf_counter_notify(regs); + } + #ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d44395ff34c..496726ddcea 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -332,3 +332,4 @@ ENTRY(sys_call_table) .long sys_dup3 /* 330 */ .long sys_pipe2 .long sys_inotify_init1 + .long sys_perf_counter_open -- cgit v1.2.3 From 87b9cf4623ad4e5fc009e48c020593dffd5d3793 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Dec 2008 14:20:16 +0100 Subject: x86, perfcounters: read out MSR_CORE_PERF_GLOBAL_STATUS with counters disabled Impact: make perfcounter NMI and IRQ sequence more robust Make __smp_perf_counter_interrupt() a bit more conservative: first disable all counters, then read out the status. Most invocations are because there are real events, so there's no performance impact. Code flow gets a bit simpler as well this way. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 82440cbed0e..615e953208e 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -383,18 +383,16 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) struct cpu_hw_counters *cpuc; u64 ack, status; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - if (!status) { - ack_APIC_irq(); - return; - } - /* Disable counters globally */ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); ack_APIC_irq(); cpuc = &per_cpu(cpu_hw_counters, cpu); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + if (!status) + goto out; + again: ack = status; for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { @@ -440,7 +438,7 @@ again: rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); if (status) goto again; - +out: /* * Do not reenable when global enable is off: */ -- cgit v1.2.3 From 7e2ae34749edf19e76e594b9c4b2cdde1066afc5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 9 Dec 2008 11:40:46 +0100 Subject: perfcounters, x86: simplify disable/enable of counters Impact: fix spurious missed counter wakeups In the case of NMI events, close a race window that can occur if an NMI hits counter code that temporarily disables+enables a counter, and the NMI leaks into the disabled section. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 40 ++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 615e953208e..7d528ffc2d2 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -136,14 +136,25 @@ void hw_perf_disable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); } +static inline void +__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) +{ + wrmsr(hwc->config_base + idx, hwc->config, 0); +} + static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]); -static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx) { per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count; wrmsr(hwc->counter_base + idx, hwc->next_count, 0); - wrmsr(hwc->config_base + idx, hwc->config, 0); +} + +static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +{ + wrmsr(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } void hw_perf_counter_enable(struct perf_counter *counter) @@ -161,11 +172,11 @@ void hw_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - wrmsr(hwc->config_base + idx, - hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + __hw_perf_counter_disable(hwc, idx); cpuc->counters[idx] = counter; - counter->hw.config |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + __hw_perf_counter_set_period(hwc, idx); __hw_perf_counter_enable(hwc, idx); } @@ -286,8 +297,7 @@ void hw_perf_counter_disable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - counter->hw.config &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(hwc->config_base + idx, hwc->config, 0); + __hw_perf_counter_disable(hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; @@ -328,18 +338,24 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data) } } +/* + * NMI-safe enable method: + */ static void perf_save_and_restart(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; + u64 pmc_ctrl; + int err; - wrmsr(hwc->config_base + idx, - hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); + WARN_ON_ONCE(err); - if (hwc->config & ARCH_PERFMON_EVENTSEL0_ENABLE) { - __hw_perf_save_counter(counter, hwc, idx); + __hw_perf_save_counter(counter, hwc, idx); + __hw_perf_counter_set_period(hwc, idx); + + if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) __hw_perf_counter_enable(hwc, idx); - } } static void -- cgit v1.2.3 From 1e12567678054bc1d4c944ecfad17624b3e49345 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 9 Dec 2008 12:18:18 +0100 Subject: perfcounters, x86: clean up debug code Impact: cleanup Get rid of unused debug code. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7d528ffc2d2..919ec46679b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -214,13 +214,11 @@ static void __hw_perf_save_counter(struct perf_counter *counter, { s64 raw = -1; s64 delta; - int err; /* * Get the raw hw counter value: */ - err = rdmsrl_safe(hwc->counter_base + idx, &raw); - WARN_ON_ONCE(err); + rdmsrl(hwc->counter_base + idx, raw); /* * Rebase it to zero (it started counting at -irq_period), @@ -252,20 +250,18 @@ static void __hw_perf_save_counter(struct perf_counter *counter, void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count; - int cpu, err, idx; + int cpu, idx; + + if (!nr_hw_counters) + return; local_irq_disable(); cpu = smp_processor_id(); - err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &ctrl); - WARN_ON_ONCE(err); - - err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_STATUS, &status); - WARN_ON_ONCE(err); - - err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_OVF_CTRL, &overflow); - WARN_ON_ONCE(err); + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); printk(KERN_INFO "\n"); printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); @@ -273,11 +269,8 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); for (idx = 0; idx < nr_hw_counters; idx++) { - err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); - WARN_ON_ONCE(err); - - err = rdmsrl_safe(MSR_ARCH_PERFMON_PERFCTR0 + idx, &pmc_count); - WARN_ON_ONCE(err); + rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); + rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); next_count = per_cpu(prev_next_count[idx], cpu); @@ -310,13 +303,11 @@ void hw_perf_counter_read(struct perf_counter *counter) unsigned long addr = hwc->counter_base + hwc->idx; s64 offs, val = -1LL; s32 val32; - int err; /* Careful: NMI might modify the counter offset */ do { offs = hwc->prev_count; - err = rdmsrl_safe(addr, &val); - WARN_ON_ONCE(err); + rdmsrl(addr, val); } while (offs != hwc->prev_count); val32 = (s32) val; @@ -346,10 +337,8 @@ static void perf_save_and_restart(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; u64 pmc_ctrl; - int err; - err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); - WARN_ON_ONCE(err); + rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); __hw_perf_save_counter(counter, hwc, idx); __hw_perf_counter_set_period(hwc, idx); -- cgit v1.2.3 From 43874d238d5f208854a73c3225ca2a22833eec8b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 9 Dec 2008 12:23:59 +0100 Subject: perfcounters: consolidate global-disable codepaths Impact: cleanup Simplify global disable handling. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 919ec46679b..6a93d1f04d9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -33,7 +33,6 @@ static u32 perf_counter_mask __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[MAX_HW_COUNTERS]; unsigned long used[BITS_TO_LONGS(MAX_HW_COUNTERS)]; - int enable_all; }; /* @@ -115,24 +114,13 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) return 0; } -static void __hw_perf_enable_all(void) -{ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); -} - void hw_perf_enable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - cpuc->enable_all = 1; - __hw_perf_enable_all(); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } void hw_perf_disable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - cpuc->enable_all = 0; wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); } @@ -385,8 +373,10 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); + u64 ack, status, saved_global; struct cpu_hw_counters *cpuc; - u64 ack, status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); /* Disable counters globally */ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); @@ -445,10 +435,9 @@ again: goto again; out: /* - * Do not reenable when global enable is off: + * Restore - do not reenable when global enable is off: */ - if (cpuc->enable_all) - __hw_perf_enable_all(); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0); } void smp_perf_counter_interrupt(struct pt_regs *regs) -- cgit v1.2.3 From 4ac13294e44664bb7edf4daf52edb71e7c6bbe84 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Dec 2008 21:43:39 +0100 Subject: perf counters: protect them against CSTATE transitions Impact: fix rare lost events problem There are CPUs whose performance counters misbehave on CSTATE transitions, so provide a way to just disable/enable them around deep idle methods. (hw_perf_enable_all() is cheap on x86.) Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6a93d1f04d9..0a7f3bea2dc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -119,10 +120,21 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_disable_all(void) +void hw_perf_restore_ctrl(u64 ctrl) { + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); +} +EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl); + +u64 hw_perf_disable_all(void) +{ + u64 ctrl; + + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + return ctrl; } +EXPORT_SYMBOL_GPL(hw_perf_disable_all); static inline void __hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) -- cgit v1.2.3 From dfa7c899b401d7dc5d85aca416aee64ac82812f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Dec 2008 19:35:37 +0100 Subject: perf counters: expand use of counter->event Impact: change syscall, cleanup Make use of the new perf_counters event type. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 0a7f3bea2dc..30e7ebf7827 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -56,9 +56,10 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* * Setup the hardware configuration for a given hw_event_type */ -int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) +int hw_perf_counter_init(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; + u32 hw_event_type = counter->event.hw_event_type; if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -83,7 +84,7 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = counter->__irq_period; + hwc->irq_period = counter->event.hw_event_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -95,21 +96,19 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) hwc->next_count = -((s32) hwc->irq_period); /* - * Negative event types mean raw encoded event+umask values: + * Raw event type provide the config in the event structure */ - if (hw_event_type < 0) { - counter->hw_event_type = -hw_event_type; - counter->hw_event_type &= ~PERF_COUNT_NMI; + hw_event_type &= ~PERF_COUNT_NMI; + if (hw_event_type == PERF_COUNT_RAW) { + hwc->config |= counter->event.hw_raw_ctrl; } else { - hw_event_type &= ~PERF_COUNT_NMI; if (hw_event_type >= max_intel_perfmon_events) return -EINVAL; /* * The generic map: */ - counter->hw_event_type = intel_perfmon_event_map[hw_event_type]; + hwc->config |= intel_perfmon_event_map[hw_event_type]; } - hwc->config |= counter->hw_event_type; counter->wakeup_pending = 0; return 0; @@ -373,7 +372,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->hw_event_type); + perf_store_irq_data(leader, counter->event.hw_event_type); perf_store_irq_data(leader, atomic64_counter_read(counter)); } } @@ -418,7 +417,8 @@ again: perf_store_irq_data(counter, instruction_pointer(regs)); break; case PERF_RECORD_GROUP: - perf_store_irq_data(counter, counter->hw_event_type); + perf_store_irq_data(counter, + counter->event.hw_event_type); perf_store_irq_data(counter, atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); -- cgit v1.2.3 From 9f66a3810fe0d4100972db84290f3ae4a4d77025 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 10 Dec 2008 12:33:23 +0100 Subject: perf counters: restructure the API Impact: clean up new API Thorough cleanup of the new perf counters API, we now get clean separation of the various concepts: - introduce perf_counter_hw_event to separate out the event source details - move special type flags into separate attributes: PERF_COUNT_NMI, PERF_COUNT_RAW - extend the type to u64 and reserve it fully to the architecture in the raw type case. And make use of all these changes in the core and x86 perfcounters code. Also change the syscall signature to: asmlinkage int sys_perf_counter_open( struct perf_counter_hw_event *hw_event_uptr __user, pid_t pid, int cpu, int group_fd); ( Note that group_fd is unused for now - it's reserved for the counter groups abstraction. ) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 30e7ebf7827..ef1936a871a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -58,8 +58,8 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); */ int hw_perf_counter_init(struct perf_counter *counter) { + struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; - u32 hw_event_type = counter->event.hw_event_type; if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -77,14 +77,14 @@ int hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 0; if (capable(CAP_SYS_ADMIN)) { hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (hw_event_type & PERF_COUNT_NMI) + if (hw_event->nmi) hwc->nmi = 1; } - hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; - hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; + hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; + hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = counter->event.hw_event_period; + hwc->irq_period = hw_event->irq_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -93,21 +93,20 @@ int hw_perf_counter_init(struct perf_counter *counter) if (!hwc->irq_period) hwc->irq_period = 0x7FFFFFFF; - hwc->next_count = -((s32) hwc->irq_period); + hwc->next_count = -(s32)hwc->irq_period; /* * Raw event type provide the config in the event structure */ - hw_event_type &= ~PERF_COUNT_NMI; - if (hw_event_type == PERF_COUNT_RAW) { - hwc->config |= counter->event.hw_raw_ctrl; + if (hw_event->raw) { + hwc->config |= hw_event->type; } else { - if (hw_event_type >= max_intel_perfmon_events) + if (hw_event->type >= max_intel_perfmon_events) return -EINVAL; /* * The generic map: */ - hwc->config |= intel_perfmon_event_map[hw_event_type]; + hwc->config |= intel_perfmon_event_map[hw_event->type]; } counter->wakeup_pending = 0; @@ -354,7 +353,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) int bit; list_for_each_entry(counter, &ctx->counters, list) { - if (counter->record_type != PERF_RECORD_SIMPLE || + if (counter->hw_event.record_type != PERF_RECORD_SIMPLE || counter == leader) continue; @@ -372,7 +371,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->event.hw_event_type); + perf_store_irq_data(leader, counter->hw_event.type); perf_store_irq_data(leader, atomic64_counter_read(counter)); } } @@ -410,7 +409,7 @@ again: perf_save_and_restart(counter); - switch (counter->record_type) { + switch (counter->hw_event.record_type) { case PERF_RECORD_SIMPLE: continue; case PERF_RECORD_IRQ: @@ -418,7 +417,7 @@ again: break; case PERF_RECORD_GROUP: perf_store_irq_data(counter, - counter->event.hw_event_type); + counter->hw_event.type); perf_store_irq_data(counter, atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); -- cgit v1.2.3 From 04289bb9891882202d7e961c4c04d2376930e9f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 08:38:42 +0100 Subject: perf counters: add support for group counters Impact: add group counters This patch adds the "counter groups" abstraction. Groups of counters behave much like normal 'single' counters, with a few semantic and behavioral extensions on top of that. A counter group is created by creating a new counter with the open() syscall's group-leader group_fd file descriptor parameter pointing to another, already existing counter. Groups of counters are scheduled in and out in one atomic group, and they are also roundrobin-scheduled atomically. Counters that are member of a group can also record events with an (atomic) extended timestamp that extends to all members of the group, if the record type is set to PERF_RECORD_GROUP. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ef1936a871a..54b4ad0cce6 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -346,18 +346,22 @@ static void perf_save_and_restart(struct perf_counter *counter) } static void -perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) +perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) { - struct perf_counter_context *ctx = leader->ctx; - struct perf_counter *counter; + struct perf_counter *counter, *group_leader = sibling->group_leader; int bit; - list_for_each_entry(counter, &ctx->counters, list) { - if (counter->hw_event.record_type != PERF_RECORD_SIMPLE || - counter == leader) - continue; + /* + * Store the counter's own timestamp first: + */ + perf_store_irq_data(sibling, sibling->hw_event.type); + perf_store_irq_data(sibling, atomic64_counter_read(sibling)); - if (counter->active) { + /* + * Then store sibling timestamps (if any): + */ + list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { + if (!counter->active) { /* * When counter was not in the overflow mask, we have to * read it from hardware. We read it as well, when it @@ -371,8 +375,8 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->hw_event.type); - perf_store_irq_data(leader, atomic64_counter_read(counter)); + perf_store_irq_data(sibling, counter->hw_event.type); + perf_store_irq_data(sibling, atomic64_counter_read(counter)); } } @@ -416,10 +420,6 @@ again: perf_store_irq_data(counter, instruction_pointer(regs)); break; case PERF_RECORD_GROUP: - perf_store_irq_data(counter, - counter->hw_event.type); - perf_store_irq_data(counter, - atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); break; } -- cgit v1.2.3 From 621a01eac89b5e2f81a4cf576568b31f40a02724 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 12:46:46 +0100 Subject: perf counters: hw driver API Impact: restructure code, introduce hw_ops driver abstraction Introduce this abstraction to handle counter details: struct hw_perf_counter_ops { void (*hw_perf_counter_enable) (struct perf_counter *counter); void (*hw_perf_counter_disable) (struct perf_counter *counter); void (*hw_perf_counter_read) (struct perf_counter *counter); }; This will be useful to support assymetric hw details, and it will also be useful to implement "software counters". (Counters that count kernel managed sw events such as pagefaults, context-switches, wall-clock time or task-local time.) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 54b4ad0cce6..718b635dece 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -56,7 +56,7 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* * Setup the hardware configuration for a given hw_event_type */ -int hw_perf_counter_init(struct perf_counter *counter) +static int __hw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; @@ -135,7 +135,7 @@ u64 hw_perf_disable_all(void) EXPORT_SYMBOL_GPL(hw_perf_disable_all); static inline void -__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) +__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) { wrmsr(hwc->config_base + idx, hwc->config, 0); } @@ -149,13 +149,13 @@ static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx) wrmsr(hwc->counter_base + idx, hwc->next_count, 0); } -static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } -void hw_perf_counter_enable(struct perf_counter *counter) +static void x86_perf_counter_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -170,12 +170,12 @@ void hw_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - __hw_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(hwc, idx); cpuc->counters[idx] = counter; __hw_perf_counter_set_period(hwc, idx); - __hw_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(hwc, idx); } #ifdef CONFIG_X86_64 @@ -282,20 +282,20 @@ void perf_counter_print_debug(void) local_irq_enable(); } -void hw_perf_counter_disable(struct perf_counter *counter) +static void x86_perf_counter_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __hw_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; __hw_perf_save_counter(counter, hwc, idx); } -void hw_perf_counter_read(struct perf_counter *counter) +static void x86_perf_counter_read(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; unsigned long addr = hwc->counter_base + hwc->idx; @@ -342,7 +342,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __hw_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(hwc, idx); } static void @@ -572,3 +572,20 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } + +static struct hw_perf_counter_ops x86_perf_counter_ops = { + .hw_perf_counter_enable = x86_perf_counter_enable, + .hw_perf_counter_disable = x86_perf_counter_disable, + .hw_perf_counter_read = x86_perf_counter_read, +}; + +struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter) +{ + int err; + + err = __hw_perf_counter_init(counter); + if (err) + return NULL; + + return &x86_perf_counter_ops; +} -- cgit v1.2.3 From 5c92d12411dfe5f0f3d1b1c1e2f756245e6f7249 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 13:21:10 +0100 Subject: perf counters: implement PERF_COUNT_CPU_CLOCK Impact: add new perf-counter type The 'CPU clock' counter counts the amount of CPU clock time that is elapsing, in nanoseconds. (regardless of how much of it the task is spending on a CPU executing) This counter type is a Linux kernel based abstraction, it is available even if the hardware does not support native hardware performance counters. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 36 ++++-------------------------------- 1 file changed, 4 insertions(+), 32 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 718b635dece..43c8e9a38b4 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -178,35 +178,6 @@ static void x86_perf_counter_enable(struct perf_counter *counter) __x86_perf_counter_enable(hwc, idx); } -#ifdef CONFIG_X86_64 -static inline void atomic64_counter_set(struct perf_counter *counter, u64 val) -{ - atomic64_set(&counter->count, val); -} - -static inline u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic64_read(&counter->count); -} -#else -/* - * Todo: add proper atomic64_t support to 32-bit x86: - */ -static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64) -{ - u32 *val32 = (void *)&val64; - - atomic_set(counter->count32 + 0, *(val32 + 0)); - atomic_set(counter->count32 + 1, *(val32 + 1)); -} - -static inline u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic_read(counter->count32 + 0) | - (u64) atomic_read(counter->count32 + 1) << 32; -} -#endif - static void __hw_perf_save_counter(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { @@ -309,7 +280,7 @@ static void x86_perf_counter_read(struct perf_counter *counter) } while (offs != hwc->prev_count); val32 = (s32) val; - val = (s64)hwc->irq_period + (s64)val32; + val = (s64)hwc->irq_period + (s64)val32; atomic64_counter_set(counter, hwc->prev_count + val); } @@ -573,13 +544,14 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } -static struct hw_perf_counter_ops x86_perf_counter_ops = { +static const struct hw_perf_counter_ops x86_perf_counter_ops = { .hw_perf_counter_enable = x86_perf_counter_enable, .hw_perf_counter_disable = x86_perf_counter_disable, .hw_perf_counter_read = x86_perf_counter_read, }; -struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter) +const struct hw_perf_counter_ops * +hw_perf_counter_init(struct perf_counter *counter) { int err; -- cgit v1.2.3 From 01b2838c4298c5e0d30b4993c195ac34dd9df61e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 13:45:51 +0100 Subject: perf counters: consolidate hw_perf save/restore APIs Impact: cleanup Rename them to better match up the usual IRQ disable/enable APIs: hw_perf_disable_all() => hw_perf_save_disable() hw_perf_restore_ctrl() => hw_perf_restore() Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 43c8e9a38b4..3e1dbebe22b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -118,13 +118,13 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_restore_ctrl(u64 ctrl) +void hw_perf_restore(u64 ctrl) { wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); } -EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl); +EXPORT_SYMBOL_GPL(hw_perf_restore); -u64 hw_perf_disable_all(void) +u64 hw_perf_save_disable(void) { u64 ctrl; @@ -132,7 +132,7 @@ u64 hw_perf_disable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); return ctrl; } -EXPORT_SYMBOL_GPL(hw_perf_disable_all); +EXPORT_SYMBOL_GPL(hw_perf_save_disable); static inline void __x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) -- cgit v1.2.3 From 6a930700c8b655a9e25e42fc4adc0b225ebbcefc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 15:17:03 +0100 Subject: perf counters: clean up state transitions Impact: cleanup Introduce a proper enum for the 3 states of a counter: PERF_COUNTER_STATE_OFF = -1 PERF_COUNTER_STATE_INACTIVE = 0 PERF_COUNTER_STATE_ACTIVE = 1 and rename counter->active to counter->state and propagate the changes everywhere. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3e1dbebe22b..4854cca7fff 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -332,7 +332,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) * Then store sibling timestamps (if any): */ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - if (!counter->active) { + if (counter->state != PERF_COUNTER_STATE_ACTIVE) { /* * When counter was not in the overflow mask, we have to * read it from hardware. We read it as well, when it -- cgit v1.2.3 From ee06094f8279e1312fc0a31591320cc7b6f0ab1e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 13 Dec 2008 09:00:03 +0100 Subject: perfcounters: restructure x86 counter math Impact: restructure code Change counter math from absolute values to clear delta logic. We try to extract elapsed deltas from the raw hw counter - and put that into the generic counter. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 230 ++++++++++++++++++++----------------- 1 file changed, 124 insertions(+), 106 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b903f8df72b..5afae13d8d5 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -53,6 +53,48 @@ const int intel_perfmon_event_map[] = const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); +/* + * Propagate counter elapsed time into the generic counter. + * Can only be executed on the CPU where the counter is active. + * Returns the delta events processed. + */ +static void +x86_perf_counter_update(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) +{ + u64 prev_raw_count, new_raw_count, delta; + + WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE); + /* + * Careful: an NMI might modify the previous counter value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic counter atomically: + */ +again: + prev_raw_count = atomic64_read(&hwc->prev_count); + rdmsrl(hwc->counter_base + idx, new_raw_count); + + if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (counter-)time and add that to the generic counter. + * + * Careful, not all hw sign-extends above the physical width + * of the count, so we do that by clipping the delta to 32 bits: + */ + delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); + WARN_ON_ONCE((int)delta < 0); + + atomic64_add(delta, &counter->count); + atomic64_sub(delta, &hwc->period_left); +} + /* * Setup the hardware configuration for a given hw_event_type */ @@ -90,10 +132,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * so we install an artificial 1<<31 period regardless of * the generic counter period: */ - if (!hwc->irq_period) + if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) hwc->irq_period = 0x7FFFFFFF; - hwc->next_count = -(s32)hwc->irq_period; + atomic64_set(&hwc->period_left, hwc->irq_period); /* * Raw event type provide the config in the event structure @@ -118,12 +160,6 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_restore(u64 ctrl) -{ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); -} -EXPORT_SYMBOL_GPL(hw_perf_restore); - u64 hw_perf_save_disable(void) { u64 ctrl; @@ -134,27 +170,74 @@ u64 hw_perf_save_disable(void) } EXPORT_SYMBOL_GPL(hw_perf_save_disable); +void hw_perf_restore(u64 ctrl) +{ + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); +} +EXPORT_SYMBOL_GPL(hw_perf_restore); + static inline void -__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) +__x86_perf_counter_disable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int idx) { - wrmsr(hwc->config_base + idx, hwc->config, 0); + int err; + + err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); + WARN_ON_ONCE(err); } -static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]); +static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); -static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx) +/* + * Set the next IRQ period, based on the hwc->period_left value. + * To be called with the counter disabled in hw: + */ +static void +__hw_perf_counter_set_period(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { - per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count; + s32 left = atomic64_read(&hwc->period_left); + s32 period = hwc->irq_period; + + WARN_ON_ONCE(period <= 0); + + /* + * If we are way outside a reasoable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_set(&hwc->period_left, left); + } - wrmsr(hwc->counter_base + idx, hwc->next_count, 0); + WARN_ON_ONCE(left <= 0); + + per_cpu(prev_left[idx], smp_processor_id()) = left; + + /* + * The hw counter starts counting from this counter offset, + * mark it to be able to extra future deltas: + */ + atomic64_set(&hwc->prev_count, (u64)(s64)-left); + + wrmsr(hwc->counter_base + idx, -left, 0); } -static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +static void +__x86_perf_counter_enable(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } +/* + * Find a PMC slot for the freshly enabled / scheduled in counter: + */ static void x86_perf_counter_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -170,55 +253,17 @@ static void x86_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - __x86_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(counter, hwc, idx); cpuc->counters[idx] = counter; - __hw_perf_counter_set_period(hwc, idx); - __x86_perf_counter_enable(hwc, idx); -} - -static void __hw_perf_save_counter(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) -{ - s64 raw = -1; - s64 delta; - - /* - * Get the raw hw counter value: - */ - rdmsrl(hwc->counter_base + idx, raw); - - /* - * Rebase it to zero (it started counting at -irq_period), - * to see the delta since ->prev_count: - */ - delta = (s64)hwc->irq_period + (s64)(s32)raw; - - atomic64_counter_set(counter, hwc->prev_count + delta); - - /* - * Adjust the ->prev_count offset - if we went beyond - * irq_period of units, then we got an IRQ and the counter - * was set back to -irq_period: - */ - while (delta >= (s64)hwc->irq_period) { - hwc->prev_count += hwc->irq_period; - delta -= (s64)hwc->irq_period; - } - - /* - * Calculate the next raw counter value we'll write into - * the counter at the next sched-in time: - */ - delta -= (s64)hwc->irq_period; - - hwc->next_count = (s32)delta; + __hw_perf_counter_set_period(counter, hwc, idx); + __x86_perf_counter_enable(counter, hwc, idx); } void perf_counter_print_debug(void) { - u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count; + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; int cpu, idx; if (!nr_hw_counters) @@ -241,14 +286,14 @@ void perf_counter_print_debug(void) rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); - next_count = per_cpu(prev_next_count[idx], cpu); + prev_left = per_cpu(prev_left[idx], cpu); printk(KERN_INFO "CPU#%d: PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n", cpu, idx, pmc_count); - printk(KERN_INFO "CPU#%d: PMC%d next: %016llx\n", - cpu, idx, next_count); + printk(KERN_INFO "CPU#%d: PMC%d left: %016llx\n", + cpu, idx, prev_left); } local_irq_enable(); } @@ -259,29 +304,16 @@ static void x86_perf_counter_disable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __x86_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; - __hw_perf_save_counter(counter, hwc, idx); -} -static void x86_perf_counter_read(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - unsigned long addr = hwc->counter_base + hwc->idx; - s64 offs, val = -1LL; - s32 val32; - - /* Careful: NMI might modify the counter offset */ - do { - offs = hwc->prev_count; - rdmsrl(addr, val); - } while (offs != hwc->prev_count); - - val32 = (s32) val; - val = (s64)hwc->irq_period + (s64)val32; - atomic64_counter_set(counter, hwc->prev_count + val); + /* + * Drain the remaining delta count out of a counter + * that we are disabling: + */ + x86_perf_counter_update(counter, hwc, idx); } static void perf_store_irq_data(struct perf_counter *counter, u64 data) @@ -299,7 +331,8 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data) } /* - * NMI-safe enable method: + * Save and restart an expired counter. Called by NMI contexts, + * so it has to be careful about preempting normal counter ops: */ static void perf_save_and_restart(struct perf_counter *counter) { @@ -309,45 +342,25 @@ static void perf_save_and_restart(struct perf_counter *counter) rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); - __hw_perf_save_counter(counter, hwc, idx); - __hw_perf_counter_set_period(hwc, idx); + x86_perf_counter_update(counter, hwc, idx); + __hw_perf_counter_set_period(counter, hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __x86_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(counter, hwc, idx); } static void perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) { struct perf_counter *counter, *group_leader = sibling->group_leader; - int bit; - - /* - * Store the counter's own timestamp first: - */ - perf_store_irq_data(sibling, sibling->hw_event.type); - perf_store_irq_data(sibling, atomic64_counter_read(sibling)); /* - * Then store sibling timestamps (if any): + * Store sibling timestamps (if any): */ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) { - /* - * When counter was not in the overflow mask, we have to - * read it from hardware. We read it as well, when it - * has not been read yet and clear the bit in the - * status mask. - */ - bit = counter->hw.idx; - if (!test_bit(bit, (unsigned long *) overflown) || - test_bit(bit, (unsigned long *) status)) { - clear_bit(bit, (unsigned long *) status); - perf_save_and_restart(counter); - } - } + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); perf_store_irq_data(sibling, counter->hw_event.type); - perf_store_irq_data(sibling, atomic64_counter_read(counter)); + perf_store_irq_data(sibling, atomic64_read(&counter->count)); } } @@ -540,6 +553,11 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } +static void x86_perf_counter_read(struct perf_counter *counter) +{ + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); +} + static const struct hw_perf_counter_ops x86_perf_counter_ops = { .hw_perf_counter_enable = x86_perf_counter_enable, .hw_perf_counter_disable = x86_perf_counter_disable, -- cgit v1.2.3 From 2b9ff0db19b5e2c77000b7201525f9c3d6e8328d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 18:36:30 +0100 Subject: perfcounters: fix non-intel-perfmon CPUs Do not write MSR_CORE_PERF_GLOBAL_CTRL on CPUs where it does not exist. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5afae13d8d5..6d30f603b62 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -157,6 +157,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) void hw_perf_enable_all(void) { + if (unlikely(!perf_counters_initialized)) + return; + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } @@ -164,14 +167,21 @@ u64 hw_perf_save_disable(void) { u64 ctrl; + if (unlikely(!perf_counters_initialized)) + return 0; + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + return ctrl; } EXPORT_SYMBOL_GPL(hw_perf_save_disable); void hw_perf_restore(u64 ctrl) { + if (unlikely(!perf_counters_initialized)) + return; + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); } EXPORT_SYMBOL_GPL(hw_perf_restore); -- cgit v1.2.3 From 75f224cf7700ed6006574dc3f2efa29860727570 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 21:58:46 +0100 Subject: perfcounters: fix lapic initialization Fix non-working NMI sampling in certain bootup scenarios. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6d30f603b62..8a154bd7ba9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -557,10 +557,10 @@ void __init init_hw_perf_counters(void) printk(KERN_INFO "... bit_width: %d\n", eax.split.bit_width); printk(KERN_INFO "... mask_length: %d\n", eax.split.mask_length); + perf_counters_initialized = true; + perf_counters_lapic_init(0); register_die_notifier(&perf_counter_nmi_notifier); - - perf_counters_initialized = true; } static void x86_perf_counter_read(struct perf_counter *counter) -- cgit v1.2.3 From 94c46572a6d9bb497eda0a14099d9f1360d57d5d Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Fri, 19 Dec 2008 22:37:58 +0530 Subject: x86: perf_counter.c intel_perfmon_event_map and max_intel_perfmon_events should be static Impact: cleanup, avoid sparse warnings, reduce kernel size a bit Fixes these sparse warnings: arch/x86/kernel/cpu/perf_counter.c:44:11: warning: symbol 'intel_perfmon_event_map' was not declared. Should it be static? arch/x86/kernel/cpu/perf_counter.c:54:11: warning: symbol 'max_intel_perfmon_events' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8a154bd7ba9..bdbdb56eaa3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -41,7 +41,7 @@ struct cpu_hw_counters { */ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); -const int intel_perfmon_event_map[] = +static const int intel_perfmon_event_map[] = { [PERF_COUNT_CYCLES] = 0x003c, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, @@ -51,7 +51,7 @@ const int intel_perfmon_event_map[] = [PERF_COUNT_BRANCH_MISSES] = 0x00c5, }; -const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); +static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* * Propagate counter elapsed time into the generic counter. -- cgit v1.2.3 From 8fb9331391af95ca1f4e5c0a0da8120b13cbae01 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:04:16 +0100 Subject: perfcounters: remove warnings Impact: remove debug checks Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index bdbdb56eaa3..89fad5d4fb3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -64,7 +64,6 @@ x86_perf_counter_update(struct perf_counter *counter, { u64 prev_raw_count, new_raw_count, delta; - WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE); /* * Careful: an NMI might modify the previous counter value. * @@ -89,7 +88,6 @@ again: * of the count, so we do that by clipping the delta to 32 bits: */ delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); - WARN_ON_ONCE((int)delta < 0); atomic64_add(delta, &counter->count); atomic64_sub(delta, &hwc->period_left); @@ -193,7 +191,6 @@ __x86_perf_counter_disable(struct perf_counter *counter, int err; err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); - WARN_ON_ONCE(err); } static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); @@ -209,8 +206,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter, s32 left = atomic64_read(&hwc->period_left); s32 period = hwc->irq_period; - WARN_ON_ONCE(period <= 0); - /* * If we are way outside a reasoable range then just skip forward: */ @@ -224,8 +219,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter, atomic64_set(&hwc->period_left, left); } - WARN_ON_ONCE(left <= 0); - per_cpu(prev_left[idx], smp_processor_id()) = left; /* -- cgit v1.2.3 From 5c167b8585c8d91206b395d57011ead7711e322f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 09:02:19 +0100 Subject: x86, perfcounters: rename intel_arch_perfmon.h => perf_counter.h Impact: rename include file We'll be providing an asm/perf_counter.h to the generic perfcounter code, so use the already existing x86 file for this purpose and rename it. Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 2 +- arch/x86/kernel/cpu/perfctr-watchdog.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 0579ec1cd6e..4f859acb156 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -31,7 +31,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4461011db47..ad331b4d623 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 89fad5d4fb3..a4a3a09a654 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include static bool perf_counters_initialized __read_mostly; diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 9abd48b2267..d6f5b9fbde3 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -20,7 +20,7 @@ #include #include -#include +#include struct nmi_watchdog_ctlblk { unsigned int cccr_msr; -- cgit v1.2.3 From eb2b861810d4ff72454c83996b891df4e0aaff9a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 09:09:13 +0100 Subject: x86, perfcounters: prepare for fixed-mode PMCs Impact: refactor the x86 code for fixed-mode PMCs Extend the data structures and rename the existing facilities to allow for a 'generic' versus 'fixed' counter distinction. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 53 +++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 27 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a4a3a09a654..fc3af868823 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -27,13 +27,12 @@ static bool perf_counters_initialized __read_mostly; static int nr_hw_counters __read_mostly; static u32 perf_counter_mask __read_mostly; -/* No support for fixed function counters yet */ - -#define MAX_HW_COUNTERS 8 - struct cpu_hw_counters { - struct perf_counter *counters[MAX_HW_COUNTERS]; - unsigned long used[BITS_TO_LONGS(MAX_HW_COUNTERS)]; + struct perf_counter *generic[X86_PMC_MAX_GENERIC]; + unsigned long used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)]; + + struct perf_counter *fixed[X86_PMC_MAX_FIXED]; + unsigned long used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)]; }; /* @@ -185,7 +184,7 @@ void hw_perf_restore(u64 ctrl) EXPORT_SYMBOL_GPL(hw_perf_restore); static inline void -__x86_perf_counter_disable(struct perf_counter *counter, +__pmc_generic_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int idx) { int err; @@ -193,7 +192,7 @@ __x86_perf_counter_disable(struct perf_counter *counter, err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); } -static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); +static DEFINE_PER_CPU(u64, prev_left[X86_PMC_MAX_GENERIC]); /* * Set the next IRQ period, based on the hwc->period_left value. @@ -231,7 +230,7 @@ __hw_perf_counter_set_period(struct perf_counter *counter, } static void -__x86_perf_counter_enable(struct perf_counter *counter, +__pmc_generic_enable(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, @@ -241,7 +240,7 @@ __x86_perf_counter_enable(struct perf_counter *counter, /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static void x86_perf_counter_enable(struct perf_counter *counter) +static void pmc_generic_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -256,12 +255,12 @@ static void x86_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - __x86_perf_counter_disable(counter, hwc, idx); + __pmc_generic_disable(counter, hwc, idx); - cpuc->counters[idx] = counter; + cpuc->generic[idx] = counter; __hw_perf_counter_set_period(counter, hwc, idx); - __x86_perf_counter_enable(counter, hwc, idx); + __pmc_generic_enable(counter, hwc, idx); } void perf_counter_print_debug(void) @@ -301,16 +300,16 @@ void perf_counter_print_debug(void) local_irq_enable(); } -static void x86_perf_counter_disable(struct perf_counter *counter) +static void pmc_generic_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __x86_perf_counter_disable(counter, hwc, idx); + __pmc_generic_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); - cpuc->counters[idx] = NULL; + cpuc->generic[idx] = NULL; /* * Drain the remaining delta count out of a counter @@ -349,7 +348,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __x86_perf_counter_enable(counter, hwc, idx); + __pmc_generic_enable(counter, hwc, idx); } static void @@ -392,7 +391,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) again: ack = status; for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { - struct perf_counter *counter = cpuc->counters[bit]; + struct perf_counter *counter = cpuc->generic[bit]; clear_bit(bit, (unsigned long *) &status); if (!counter) @@ -412,7 +411,7 @@ again: } /* * From NMI context we cannot call into the scheduler to - * do a task wakeup - but we mark these counters as + * do a task wakeup - but we mark these generic as * wakeup_pending and initate a wakeup callback: */ if (nmi) { @@ -462,7 +461,7 @@ void perf_counter_notify(struct pt_regs *regs) cpuc = &per_cpu(cpu_hw_counters, cpu); for_each_bit(bit, cpuc->used, nr_hw_counters) { - struct perf_counter *counter = cpuc->counters[bit]; + struct perf_counter *counter = cpuc->generic[bit]; if (!counter) continue; @@ -539,10 +538,10 @@ void __init init_hw_perf_counters(void) printk(KERN_INFO "... version: %d\n", eax.split.version_id); printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters); nr_hw_counters = eax.split.num_counters; - if (nr_hw_counters > MAX_HW_COUNTERS) { - nr_hw_counters = MAX_HW_COUNTERS; + if (nr_hw_counters > X86_PMC_MAX_GENERIC) { + nr_hw_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - nr_hw_counters, MAX_HW_COUNTERS); + nr_hw_counters, X86_PMC_MAX_GENERIC); } perf_counter_mask = (1 << nr_hw_counters) - 1; perf_max_counters = nr_hw_counters; @@ -556,15 +555,15 @@ void __init init_hw_perf_counters(void) register_die_notifier(&perf_counter_nmi_notifier); } -static void x86_perf_counter_read(struct perf_counter *counter) +static void pmc_generic_read(struct perf_counter *counter) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); } static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .hw_perf_counter_enable = x86_perf_counter_enable, - .hw_perf_counter_disable = x86_perf_counter_disable, - .hw_perf_counter_read = x86_perf_counter_read, + .hw_perf_counter_enable = pmc_generic_enable, + .hw_perf_counter_disable = pmc_generic_disable, + .hw_perf_counter_read = pmc_generic_read, }; const struct hw_perf_counter_ops * -- cgit v1.2.3 From 703e937c83bbad79075a7846e062e447c2fee6a4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 10:51:15 +0100 Subject: perfcounters: add fixed-mode PMC enumeration Enumerate fixed-mode PMCs based on CPUID, and feed that into the perfcounter code. Does not use fixed-mode PMCs yet. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fc3af868823..2fca50c4597 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -27,6 +27,8 @@ static bool perf_counters_initialized __read_mostly; static int nr_hw_counters __read_mostly; static u32 perf_counter_mask __read_mostly; +static int nr_hw_counters_fixed __read_mostly; + struct cpu_hw_counters { struct perf_counter *generic[X86_PMC_MAX_GENERIC]; unsigned long used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)]; @@ -519,8 +521,9 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { void __init init_hw_perf_counters(void) { union cpuid10_eax eax; - unsigned int unused; unsigned int ebx; + unsigned int unused; + union cpuid10_edx edx; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; @@ -529,14 +532,14 @@ void __init init_hw_perf_counters(void) * Check whether the Architectural PerfMon supports * Branch Misses Retired Event or not. */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); + cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return; printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); - printk(KERN_INFO "... version: %d\n", eax.split.version_id); - printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters); + printk(KERN_INFO "... version: %d\n", eax.split.version_id); + printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters); nr_hw_counters = eax.split.num_counters; if (nr_hw_counters > X86_PMC_MAX_GENERIC) { nr_hw_counters = X86_PMC_MAX_GENERIC; @@ -546,8 +549,16 @@ void __init init_hw_perf_counters(void) perf_counter_mask = (1 << nr_hw_counters) - 1; perf_max_counters = nr_hw_counters; - printk(KERN_INFO "... bit_width: %d\n", eax.split.bit_width); - printk(KERN_INFO "... mask_length: %d\n", eax.split.mask_length); + printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); + printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); + + nr_hw_counters_fixed = edx.split.num_counters_fixed; + if (nr_hw_counters_fixed > X86_PMC_MAX_FIXED) { + nr_hw_counters_fixed = X86_PMC_MAX_FIXED; + WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", + nr_hw_counters_fixed, X86_PMC_MAX_FIXED); + } + printk(KERN_INFO "... fixed counters: %d\n", nr_hw_counters_fixed); perf_counters_initialized = true; -- cgit v1.2.3 From 862a1a5f346fe7e9181ea51eaae48cf2cd70f746 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 13:09:20 +0100 Subject: x86, perfcounters: refactor code for fixed-function PMCs Impact: clean up Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 73 ++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 34 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2fca50c4597..358af526640 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -24,17 +24,14 @@ static bool perf_counters_initialized __read_mostly; /* * Number of (generic) HW counters: */ -static int nr_hw_counters __read_mostly; -static u32 perf_counter_mask __read_mostly; +static int nr_counters_generic __read_mostly; +static u64 perf_counter_mask __read_mostly; -static int nr_hw_counters_fixed __read_mostly; +static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { - struct perf_counter *generic[X86_PMC_MAX_GENERIC]; - unsigned long used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)]; - - struct perf_counter *fixed[X86_PMC_MAX_FIXED]; - unsigned long used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)]; + struct perf_counter *counters[X86_PMC_IDX_MAX]; + unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; }; /* @@ -159,7 +156,7 @@ void hw_perf_enable_all(void) if (unlikely(!perf_counters_initialized)) return; - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask); } u64 hw_perf_save_disable(void) @@ -170,7 +167,7 @@ u64 hw_perf_save_disable(void) return 0; rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); return ctrl; } @@ -181,7 +178,7 @@ void hw_perf_restore(u64 ctrl) if (unlikely(!perf_counters_initialized)) return; - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); } EXPORT_SYMBOL_GPL(hw_perf_restore); @@ -239,6 +236,11 @@ __pmc_generic_enable(struct perf_counter *counter, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } +static int fixed_mode_idx(struct hw_perf_counter *hwc) +{ + return -1; +} + /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ @@ -250,7 +252,7 @@ static void pmc_generic_enable(struct perf_counter *counter) /* Try to get the previous counter again */ if (test_and_set_bit(idx, cpuc->used)) { - idx = find_first_zero_bit(cpuc->used, nr_hw_counters); + idx = find_first_zero_bit(cpuc->used, nr_counters_generic); set_bit(idx, cpuc->used); hwc->idx = idx; } @@ -259,7 +261,7 @@ static void pmc_generic_enable(struct perf_counter *counter) __pmc_generic_disable(counter, hwc, idx); - cpuc->generic[idx] = counter; + cpuc->counters[idx] = counter; __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); @@ -270,7 +272,7 @@ void perf_counter_print_debug(void) u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; int cpu, idx; - if (!nr_hw_counters) + if (!nr_counters_generic) return; local_irq_disable(); @@ -286,7 +288,7 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); - for (idx = 0; idx < nr_hw_counters; idx++) { + for (idx = 0; idx < nr_counters_generic; idx++) { rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); @@ -311,7 +313,7 @@ static void pmc_generic_disable(struct perf_counter *counter) __pmc_generic_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); - cpuc->generic[idx] = NULL; + cpuc->counters[idx] = NULL; /* * Drain the remaining delta count out of a counter @@ -381,7 +383,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); /* Disable counters globally */ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ack_APIC_irq(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -392,8 +394,8 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) again: ack = status; - for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { - struct perf_counter *counter = cpuc->generic[bit]; + for_each_bit(bit, (unsigned long *) &status, nr_counters_generic) { + struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); if (!counter) @@ -424,7 +426,7 @@ again: } } - wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); /* * Repeat if there is more work to be done: @@ -436,7 +438,7 @@ out: /* * Restore - do not reenable when global enable is off: */ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); } void smp_perf_counter_interrupt(struct pt_regs *regs) @@ -462,8 +464,8 @@ void perf_counter_notify(struct pt_regs *regs) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - for_each_bit(bit, cpuc->used, nr_hw_counters) { - struct perf_counter *counter = cpuc->generic[bit]; + for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) { + struct perf_counter *counter = cpuc->counters[bit]; if (!counter) continue; @@ -540,26 +542,29 @@ void __init init_hw_perf_counters(void) printk(KERN_INFO "... version: %d\n", eax.split.version_id); printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters); - nr_hw_counters = eax.split.num_counters; - if (nr_hw_counters > X86_PMC_MAX_GENERIC) { - nr_hw_counters = X86_PMC_MAX_GENERIC; + nr_counters_generic = eax.split.num_counters; + if (nr_counters_generic > X86_PMC_MAX_GENERIC) { + nr_counters_generic = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - nr_hw_counters, X86_PMC_MAX_GENERIC); + nr_counters_generic, X86_PMC_MAX_GENERIC); } - perf_counter_mask = (1 << nr_hw_counters) - 1; - perf_max_counters = nr_hw_counters; + perf_counter_mask = (1 << nr_counters_generic) - 1; + perf_max_counters = nr_counters_generic; printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); - nr_hw_counters_fixed = edx.split.num_counters_fixed; - if (nr_hw_counters_fixed > X86_PMC_MAX_FIXED) { - nr_hw_counters_fixed = X86_PMC_MAX_FIXED; + nr_counters_fixed = edx.split.num_counters_fixed; + if (nr_counters_fixed > X86_PMC_MAX_FIXED) { + nr_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", - nr_hw_counters_fixed, X86_PMC_MAX_FIXED); + nr_counters_fixed, X86_PMC_MAX_FIXED); } - printk(KERN_INFO "... fixed counters: %d\n", nr_hw_counters_fixed); + printk(KERN_INFO "... fixed counters: %d\n", nr_counters_fixed); + + perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED; + printk(KERN_INFO "... counter mask: %016Lx\n", perf_counter_mask); perf_counters_initialized = true; perf_counters_lapic_init(0); -- cgit v1.2.3 From 7671581f1666ef4b54a1c1e598c51ac44c060a9b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 14:20:28 +0100 Subject: perfcounters: hw ops rename Impact: rename field names Shorten them. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 358af526640..b6755712142 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -577,9 +577,9 @@ static void pmc_generic_read(struct perf_counter *counter) } static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .hw_perf_counter_enable = pmc_generic_enable, - .hw_perf_counter_disable = pmc_generic_disable, - .hw_perf_counter_read = pmc_generic_read, + .enable = pmc_generic_enable, + .disable = pmc_generic_disable, + .read = pmc_generic_read, }; const struct hw_perf_counter_ops * -- cgit v1.2.3 From 95cdd2e7851cce79ab839cb0b3cbe68d7911d0f1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 21 Dec 2008 13:50:42 +0100 Subject: perfcounters: enable lowlevel pmc code to schedule counters Allow lowlevel ->enable() op to return an error if a counter can not be added. This can be used to handle counter constraints. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b6755712142..74090a393a7 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -244,7 +244,7 @@ static int fixed_mode_idx(struct hw_perf_counter *hwc) /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static void pmc_generic_enable(struct perf_counter *counter) +static int pmc_generic_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -253,6 +253,8 @@ static void pmc_generic_enable(struct perf_counter *counter) /* Try to get the previous counter again */ if (test_and_set_bit(idx, cpuc->used)) { idx = find_first_zero_bit(cpuc->used, nr_counters_generic); + if (idx == nr_counters_generic) + return -EAGAIN; set_bit(idx, cpuc->used); hwc->idx = idx; } @@ -265,6 +267,8 @@ static void pmc_generic_enable(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); + + return 0; } void perf_counter_print_debug(void) -- cgit v1.2.3 From 0dff86aa7b9ec65a6d07167b7afb050b5fc98ddc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:28:12 +0100 Subject: x86, perfcounters: print out the ->used bitmask Impact: extend debug printouts Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 74090a393a7..f3359c2b391 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -255,6 +255,7 @@ static int pmc_generic_enable(struct perf_counter *counter) idx = find_first_zero_bit(cpuc->used, nr_counters_generic); if (idx == nr_counters_generic) return -EAGAIN; + set_bit(idx, cpuc->used); hwc->idx = idx; } @@ -274,6 +275,7 @@ static int pmc_generic_enable(struct perf_counter *counter) void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; + struct cpu_hw_counters *cpuc; int cpu, idx; if (!nr_counters_generic) @@ -282,6 +284,7 @@ void perf_counter_print_debug(void) local_irq_disable(); cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); @@ -291,6 +294,7 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); + printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); -- cgit v1.2.3 From f650a672359819454c3d8d4135ecd1558cde0b24 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:17:29 +0100 Subject: perfcounters: add PERF_COUNT_BUS_CYCLES Generalize "bus cycles" hw events - and map them to CPU_CLK_Unhalted.Ref on x86. (which is a good enough approximation) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f3359c2b391..86b2fdd344a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -41,12 +41,13 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); static const int intel_perfmon_event_map[] = { - [PERF_COUNT_CYCLES] = 0x003c, + [PERF_COUNT_CPU_CYCLES] = 0x003c, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e, [PERF_COUNT_CACHE_MISSES] = 0x412e, [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, [PERF_COUNT_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_BUS_CYCLES] = 0x013c, }; static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); -- cgit v1.2.3 From 2f18d1e8d07ae67dd0afce875287756d4bd31a46 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 22 Dec 2008 11:10:42 +0100 Subject: x86, perfcounters: add support for fixed-function pmcs Impact: extend performance counter support on x86 Intel CPUs Modern Intel CPUs have 3 "fixed-function" performance counters, which count these hardware events: Instr_Retired.Any CPU_CLK_Unhalted.Core CPU_CLK_Unhalted.Ref Add support for them to the performance counters subsystem. Their use is transparent to user-space: the counter scheduler is extended to automatically recognize the cases where a fixed-function PMC can be utilized instead of a generic PMC. In such cases the generic PMC is kept available for more counters. The above fixed-function events map to these generic counter hw events: PERF_COUNT_INSTRUCTIONS PERF_COUNT_CPU_CYCLES PERF_COUNT_BUS_CYCLES (The 'bus' cycles are in reality often CPU-ish cycles, just with a fixed frequency.) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 149 +++++++++++++++++++++++++++++++------ 1 file changed, 125 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 86b2fdd344a..da46eca1254 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -26,6 +26,7 @@ static bool perf_counters_initialized __read_mostly; */ static int nr_counters_generic __read_mostly; static u64 perf_counter_mask __read_mostly; +static u64 counter_value_mask __read_mostly; static int nr_counters_fixed __read_mostly; @@ -120,9 +121,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; } - hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; - hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = hw_event->irq_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, @@ -183,16 +181,34 @@ void hw_perf_restore(u64 ctrl) } EXPORT_SYMBOL_GPL(hw_perf_restore); +static inline void +__pmc_fixed_disable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, mask; + int err; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + static inline void __pmc_generic_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int idx) { int err; + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) + return __pmc_fixed_disable(counter, hwc, idx); + err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); } -static DEFINE_PER_CPU(u64, prev_left[X86_PMC_MAX_GENERIC]); +static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); /* * Set the next IRQ period, based on the hwc->period_left value. @@ -202,8 +218,9 @@ static void __hw_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { - s32 left = atomic64_read(&hwc->period_left); + s64 left = atomic64_read(&hwc->period_left); s32 period = hwc->irq_period; + int err; /* * If we are way outside a reasoable range then just skip forward: @@ -224,21 +241,64 @@ __hw_perf_counter_set_period(struct perf_counter *counter, * The hw counter starts counting from this counter offset, * mark it to be able to extra future deltas: */ - atomic64_set(&hwc->prev_count, (u64)(s64)-left); + atomic64_set(&hwc->prev_count, (u64)-left); - wrmsr(hwc->counter_base + idx, -left, 0); + err = checking_wrmsrl(hwc->counter_base + idx, + (u64)(-left) & counter_value_mask); +} + +static inline void +__pmc_fixed_enable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + int err; + + /* + * Enable IRQ generation (0x8) and ring-3 counting (0x2), + * and enable ring-0 counting if allowed: + */ + bits = 0x8ULL | 0x2ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + err = checking_wrmsrl(hwc->config_base, ctrl_val); } static void __pmc_generic_enable(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) + return __pmc_fixed_enable(counter, hwc, idx); + wrmsr(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } -static int fixed_mode_idx(struct hw_perf_counter *hwc) +static int +fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { + unsigned int event; + + if (unlikely(hwc->nmi)) + return -1; + + event = hwc->config & ARCH_PERFMON_EVENT_MASK; + + if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS])) + return X86_PMC_IDX_FIXED_INSTRUCTIONS; + if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES])) + return X86_PMC_IDX_FIXED_CPU_CYCLES; + if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES])) + return X86_PMC_IDX_FIXED_BUS_CYCLES; + return -1; } @@ -249,16 +309,39 @@ static int pmc_generic_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; - int idx = hwc->idx; + int idx; - /* Try to get the previous counter again */ - if (test_and_set_bit(idx, cpuc->used)) { - idx = find_first_zero_bit(cpuc->used, nr_counters_generic); - if (idx == nr_counters_generic) - return -EAGAIN; + idx = fixed_mode_idx(counter, hwc); + if (idx >= 0) { + /* + * Try to get the fixed counter, if that is already taken + * then try to get a generic counter: + */ + if (test_and_set_bit(idx, cpuc->used)) + goto try_generic; - set_bit(idx, cpuc->used); + hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + /* + * We set it so that counter_base + idx in wrmsr/rdmsr maps to + * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: + */ + hwc->counter_base = + MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; hwc->idx = idx; + } else { + idx = hwc->idx; + /* Try to get the previous generic counter again */ + if (test_and_set_bit(idx, cpuc->used)) { +try_generic: + idx = find_first_zero_bit(cpuc->used, nr_counters_generic); + if (idx == nr_counters_generic) + return -EAGAIN; + + set_bit(idx, cpuc->used); + hwc->idx = idx; + } + hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; + hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; } perf_counters_lapic_init(hwc->nmi); @@ -266,6 +349,10 @@ static int pmc_generic_enable(struct perf_counter *counter) __pmc_generic_disable(counter, hwc, idx); cpuc->counters[idx] = counter; + /* + * Make it visible before enabling the hw: + */ + smp_wmb(); __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); @@ -275,7 +362,7 @@ static int pmc_generic_enable(struct perf_counter *counter) void perf_counter_print_debug(void) { - u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; struct cpu_hw_counters *cpuc; int cpu, idx; @@ -290,11 +377,13 @@ void perf_counter_print_debug(void) rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); printk(KERN_INFO "\n"); printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); + printk(KERN_INFO "CPU#%d: fixed: %016llx\n", cpu, fixed); printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { @@ -303,13 +392,19 @@ void perf_counter_print_debug(void) prev_left = per_cpu(prev_left[idx], cpu); - printk(KERN_INFO "CPU#%d: PMC%d ctrl: %016llx\n", + printk(KERN_INFO "CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); - printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n", + printk(KERN_INFO "CPU#%d: gen-PMC%d count: %016llx\n", cpu, idx, pmc_count); - printk(KERN_INFO "CPU#%d: PMC%d left: %016llx\n", + printk(KERN_INFO "CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } + for (idx = 0; idx < nr_counters_fixed; idx++) { + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); + + printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n", + cpu, idx, pmc_count); + } local_irq_enable(); } @@ -323,6 +418,11 @@ static void pmc_generic_disable(struct perf_counter *counter) clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; + /* + * Make sure the cleared pointer becomes visible before we + * (potentially) free the counter: + */ + smp_wmb(); /* * Drain the remaining delta count out of a counter @@ -353,14 +453,11 @@ static void perf_save_and_restart(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; - u64 pmc_ctrl; - - rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); x86_perf_counter_update(counter, hwc, idx); __hw_perf_counter_set_period(counter, hwc, idx); - if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) + if (counter->state == PERF_COUNTER_STATE_ACTIVE) __pmc_generic_enable(counter, hwc, idx); } @@ -373,6 +470,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) * Store sibling timestamps (if any): */ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); perf_store_irq_data(sibling, counter->hw_event.type); perf_store_irq_data(sibling, atomic64_read(&counter->count)); @@ -403,7 +501,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) again: ack = status; - for_each_bit(bit, (unsigned long *) &status, nr_counters_generic) { + for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); @@ -561,6 +659,9 @@ void __init init_hw_perf_counters(void) perf_max_counters = nr_counters_generic; printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); + counter_value_mask = (1ULL << eax.split.bit_width) - 1; + printk(KERN_INFO "... value mask: %016Lx\n", counter_value_mask); + printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); nr_counters_fixed = edx.split.num_counters_fixed; -- cgit v1.2.3 From 2b583d8bc8d7105b58d7481a4a0ceb718dac49c6 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 27 Dec 2008 19:15:43 +0530 Subject: x86: perf_counter remove unwanted hw_perf_enable_all Impact: clean, reduce kernel size a bit, avoid sparse warnings Fixes sparse warnings: arch/x86/kernel/cpu/perf_counter.c:153:6: warning: symbol 'hw_perf_enable_all' was not declared. Should it be static? arch/x86/kernel/cpu/perf_counter.c:279:3: warning: returning void-valued expression arch/x86/kernel/cpu/perf_counter.c:206:3: warning: returning void-valued expression arch/x86/kernel/cpu/perf_counter.c:206:3: warning: returning void-valued expression Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index da46eca1254..9376771f757 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -150,14 +150,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return 0; } -void hw_perf_enable_all(void) -{ - if (unlikely(!perf_counters_initialized)) - return; - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask); -} - u64 hw_perf_save_disable(void) { u64 ctrl; @@ -200,12 +192,10 @@ static inline void __pmc_generic_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int idx) { - int err; - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - return __pmc_fixed_disable(counter, hwc, idx); - - err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); + __pmc_fixed_disable(counter, hwc, idx); + else + wrmsr_safe(hwc->config_base + idx, hwc->config, 0); } static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); @@ -276,10 +266,10 @@ __pmc_generic_enable(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - return __pmc_fixed_enable(counter, hwc, idx); - - wrmsr(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + __pmc_fixed_enable(counter, hwc, idx); + else + wrmsr(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } static int -- cgit v1.2.3 From 1b023a96d9b44f50f4d8ff28c15f5b80e354760f Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 10:13:01 +0100 Subject: perfcounters: throttle on too high IRQ rates Starting kerneltop with only -c 100 seems to be a bad idea, it can easily lock the system due to perfcounter IRQ overload. So add throttling: if a new IRQ arrives in a shorter than PERFMON_MIN_PERIOD_NS time, turn off perfcounters and untrottle them from the next timer tick. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 2 ++ arch/x86/kernel/cpu/perf_counter.c | 38 ++++++++++++++++++++++++++++++++------ 2 files changed, 34 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 7b434e5b14c..849c23009bf 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -781,6 +781,8 @@ static void local_apic_timer_interrupt(void) inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); + + perf_counter_unthrottle(); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9376771f757..1a040b179b5 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -33,6 +33,9 @@ static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 last_interrupt; + u64 global_enable; + int throttled; }; /* @@ -474,16 +477,19 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); - u64 ack, status, saved_global; - struct cpu_hw_counters *cpuc; + u64 ack, status, now; + struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); /* Disable counters globally */ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ack_APIC_irq(); - cpuc = &per_cpu(cpu_hw_counters, cpu); + now = sched_clock(); + if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS) + cpuc->throttled = 1; + cpuc->last_interrupt = now; rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); if (!status) @@ -533,9 +539,29 @@ again: goto again; out: /* - * Restore - do not reenable when global enable is off: + * Restore - do not reenable when global enable is off or throttled: */ - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); + if (!cpuc->throttled) + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); +} + +void perf_counter_unthrottle(void) +{ + struct cpu_hw_counters *cpuc; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return; + + if (unlikely(!perf_counters_initialized)) + return; + + cpuc = &per_cpu(cpu_hw_counters, smp_processor_id()); + if (cpuc->throttled) { + if (printk_ratelimit()) + printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n"); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + cpuc->throttled = 0; + } } void smp_perf_counter_interrupt(struct pt_regs *regs) -- cgit v1.2.3 From 4b39fd96855254a244f71245b41a91cdecb87d63 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 14:36:16 +0100 Subject: perfcounters: ratelimit performance counter interrupts Ratelimit performance counter interrupts to 100KHz per CPU. This replaces the irq-delta-time based method. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1a040b179b5..a56d4cf92f3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -33,9 +33,8 @@ static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - u64 last_interrupt; + unsigned long interrupts; u64 global_enable; - int throttled; }; /* @@ -470,6 +469,11 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) } } +/* + * Maximum interrupt frequency of 100KHz per CPU + */ +#define PERFMON_MAX_INTERRUPTS 100000/HZ + /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -477,7 +481,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); - u64 ack, status, now; + u64 ack, status; struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); @@ -486,11 +490,6 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ack_APIC_irq(); - now = sched_clock(); - if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS) - cpuc->throttled = 1; - cpuc->last_interrupt = now; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); if (!status) goto out; @@ -541,13 +540,14 @@ out: /* * Restore - do not reenable when global enable is off or throttled: */ - if (!cpuc->throttled) + if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); } void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; + u64 global_enable; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; @@ -556,12 +556,15 @@ void perf_counter_unthrottle(void) return; cpuc = &per_cpu(cpu_hw_counters, smp_processor_id()); - if (cpuc->throttled) { + if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) - printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n"); + printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n"); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); - cpuc->throttled = 0; } + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable); + if (unlikely(cpuc->global_enable && !global_enable)) + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + cpuc->interrupts = 0; } void smp_perf_counter_interrupt(struct pt_regs *regs) -- cgit v1.2.3 From 3415dd9146c574bffe8f012c096bfc2bc62b9508 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 14:16:53 +0100 Subject: perfcounters fix section mismatch warning in perf_counter.c::perf_counters_lapic_init() Fix: WARNING: arch/x86/kernel/built-in.o(.text+0xdd0f): Section mismatch in reference from the function pmc_generic_enable() to the function .cpuinit.text:perf_counters_lapic_init() The function pmc_generic_enable() references the function __cpuinit perf_counters_lapic_init(). This is often because pmc_generic_enable lacks a __cpuinit annotation or the annotation of perf_counters_lapic_init is wrong. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a56d4cf92f3..46c436cdd73 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -605,7 +605,7 @@ void perf_counter_notify(struct pt_regs *regs) local_irq_restore(flags); } -void __cpuinit perf_counters_lapic_init(int nmi) +void perf_counters_lapic_init(int nmi) { u32 apic_val; -- cgit v1.2.3 From bb3f0b59ad005d2d2ecbbe9bd048eab6d1ecbd31 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 25 Jan 2009 02:38:09 -0800 Subject: x86: make irqinit_32.c more like irqinit_64.c, v2 Impact: cleanup 1. add smp_intr_init and apic_intr_init for 32bit, the same as 64bit 2. move the apic_intr_init() call before set gate with interrupt[i] 3. for 64bit, if ia32_emulation is not used, will make per_cpu to use 0x80 vector. [ v2: should use !test_bit() instead of test_bit() with 32bit ] Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 56 ++++++++++++++++++++++++++------------------ arch/x86/kernel/irqinit_64.c | 7 +++--- arch/x86/kernel/traps.c | 15 +++++------- 3 files changed, 43 insertions(+), 35 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index c56496f8c6f..ddf3eb72f86 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -120,28 +120,8 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) +static void __init smp_intr_init(void) { - int i; - - /* all the set up before the call gates are initialised */ - pre_intr_init_hook(); - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); - } - - #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper @@ -170,8 +150,13 @@ void __init native_init_IRQ(void) set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif +} +static void __init apic_intr_init(void) +{ #ifdef CONFIG_X86_LOCAL_APIC + smp_intr_init(); + /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -181,12 +166,37 @@ void __init native_init_IRQ(void) # ifdef CONFIG_PERF_COUNTERS alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); # endif -#endif -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) +# ifdef CONFIG_X86_MCE_P4THERMAL /* thermal monitor LVT interrupt */ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +# endif #endif +} + +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) +{ + int i; + + /* all the set up before the call gates are initialised */ + pre_intr_init_hook(); + + apic_intr_init(); + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + /* SYSCALL_VECTOR was reserved in trap_init. */ + if (!test_bit(vector, used_vectors)) + set_intr_gate(vector, interrupt[i]); + } if (!acpi_ioapic) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 6a71bfc51e5..16e1fc68750 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -162,6 +162,9 @@ void __init native_init_IRQ(void) int i; init_ISA_irqs(); + + apic_intr_init(); + /* * Cover the whole vector space, no vector can escape * us. (some of these will be overridden and become @@ -169,12 +172,10 @@ void __init native_init_IRQ(void) */ for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) + if (!test_bit(vector, used_vectors)) set_intr_gate(vector, interrupt[i]); } - apic_intr_init(); - if (!acpi_ioapic) setup_irq(2, &irq2); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ed5aee5f3fc..d36a502d87a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -979,8 +979,13 @@ void __init trap_init(void) #endif set_intr_gate(19, &simd_coprocessor_error); + /* Reserve all the builtin and the syscall vector: */ + for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) + set_bit(i, used_vectors); + #ifdef CONFIG_IA32_EMULATION set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); + set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif #ifdef CONFIG_X86_32 @@ -997,17 +1002,9 @@ void __init trap_init(void) } set_system_trap_gate(SYSCALL_VECTOR, &system_call); -#endif - - /* Reserve all the builtin and the syscall vector: */ - for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) - set_bit(i, used_vectors); - -#ifdef CONFIG_X86_64 - set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#else set_bit(SYSCALL_VECTOR, used_vectors); #endif + /* * Should be a barrier for any external CPU state: */ -- cgit v1.2.3 From 15081c61362618a0c81cc8d04e45e7427bc1ed71 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 1 Feb 2009 22:07:39 +0530 Subject: x86: irqinit_32.c fix compilation warning Fix: arch/x86/kernel/irqinit_32.c:124: warning: 'smp_intr_init' defined but not used Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index ddf3eb72f86..520e6c1c5d2 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -154,9 +154,9 @@ static void __init smp_intr_init(void) static void __init apic_intr_init(void) { -#ifdef CONFIG_X86_LOCAL_APIC smp_intr_init(); +#ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); -- cgit v1.2.3 From 5b75af0a02fcf3b8899f38ff6f22164c5d8e2fdd Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 4 Feb 2009 17:11:34 +0100 Subject: perfcounters: fix "perf counters kill oprofile" bug With oprofile as a module, and unloaded by profiling script, both oprofile and kerneltop work fine.. unless you leave kerneltop running when you start profiling, then you may see badness. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 46c436cdd73..8bb213323fe 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -643,7 +643,9 @@ perf_counter_nmi_handler(struct notifier_block *self, } static __read_mostly struct notifier_block perf_counter_nmi_notifier = { - .notifier_call = perf_counter_nmi_handler + .notifier_call = perf_counter_nmi_handler, + .next = NULL, + .priority = 1 }; void __init init_hw_perf_counters(void) -- cgit v1.2.3 From d278c48435625cb6b7edcf6a547620768b175709 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 9 Feb 2009 07:38:50 +0100 Subject: perf_counters: account NMI interrupts I noticed that kerneltop interrupts were accounted as NMI, but not their perf counter origin. Account NMI performance counter interrupts. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8bb213323fe..9901e46998d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -495,6 +495,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) goto out; again: + inc_irq_stat(apic_perf_irqs); ack = status; for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_counter *counter = cpuc->counters[bit]; @@ -570,7 +571,6 @@ void perf_counter_unthrottle(void) void smp_perf_counter_interrupt(struct pt_regs *regs) { irq_enter(); - inc_irq_stat(apic_perf_irqs); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); __smp_perf_counter_interrupt(regs, 0); -- cgit v1.2.3 From 0475f9ea8e2cc030298908949e0d5da9f2fc2cfe Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 11 Feb 2009 14:35:35 +1100 Subject: perf_counters: allow users to count user, kernel and/or hypervisor events Impact: new perf_counter feature This extends the perf_counter_hw_event struct with bits that specify that events in user, kernel and/or hypervisor mode should not be counted (i.e. should be excluded), and adds code to program the PMU mode selection bits accordingly on x86 and powerpc. For software counters, we don't currently have the infrastructure to distinguish which mode an event occurs in, so we currently fail the counter initialization if the setting of the hw_event.exclude_* bits would require us to distinguish. Context switches and CPU migrations are currently considered to occur in kernel mode. On x86, this changes the previous policy that only root can count kernel events. Now non-root users can count kernel events or exclude them. Non-root users still can't use NMI events, though. On x86 we don't appear to have any way to control whether hypervisor events are counted or not, so hw_event.exclude_hv is ignored. On powerpc, the selection of whether to count events in user, kernel and/or hypervisor mode is PMU-wide, not per-counter, so this adds a check that the hw_event.exclude_* settings are the same as other events on the PMU. Counters being added to a group have to have the same settings as the other hardware counters in the group. Counters and groups can only be enabled in hw_perf_group_sched_in or power_perf_enable if they have the same settings as any other counters already on the PMU. If we are not running on a hypervisor, the exclude_hv setting is ignored (by forcing it to 0) since we can't ever get any hypervisor events. Signed-off-by: Paul Mackerras --- arch/x86/kernel/cpu/perf_counter.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9901e46998d..383d4c6423a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -107,21 +107,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -EINVAL; /* - * Count user events, and generate PMC IRQs: + * Generate PMC IRQs: * (keep 'enabled' bit clear for now) */ - hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT; + hwc->config = ARCH_PERFMON_EVENTSEL_INT; /* - * If privileged enough, count OS events too, and allow - * NMI events as well: + * Count user and OS events unless requested not to. */ - hwc->nmi = 0; - if (capable(CAP_SYS_ADMIN)) { + if (!hw_event->exclude_user) + hwc->config |= ARCH_PERFMON_EVENTSEL_USR; + if (!hw_event->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (hw_event->nmi) - hwc->nmi = 1; - } + + /* + * If privileged enough, allow NMI events: + */ + hwc->nmi = 0; + if (capable(CAP_SYS_ADMIN) && hw_event->nmi) + hwc->nmi = 1; hwc->irq_period = hw_event->irq_period; /* @@ -248,10 +252,13 @@ __pmc_fixed_enable(struct perf_counter *counter, int err; /* - * Enable IRQ generation (0x8) and ring-3 counting (0x2), - * and enable ring-0 counting if allowed: + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: */ - bits = 0x8ULL | 0x2ULL; + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) bits |= 0x1; bits <<= (idx * 4); -- cgit v1.2.3 From b56a3802dc6df29aa27d2c12edf420258091ad66 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 27 Feb 2009 18:09:09 +0530 Subject: x86: prepare perf_counter to add more cpus Introduced struct pmc_x86_ops to add more cpus. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 106 +++++++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 383d4c6423a..a3c88529bb7 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -3,6 +3,7 @@ * * Copyright(C) 2008 Thomas Gleixner * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar + * Copyright(C) 2009 Jaswinder Singh Rajput * * For licencing details see kernel-base/COPYING */ @@ -38,10 +39,24 @@ struct cpu_hw_counters { }; /* - * Intel PerfMon v3. Used on Core2 and later. + * struct pmc_x86_ops - performance counter x86 ops */ +struct pmc_x86_ops { + u64 (*save_disable_all) (void); + void (*restore_all) (u64 ctrl); + unsigned eventsel; + unsigned perfctr; + int (*event_map) (int event); + int max_events; +}; + +static struct pmc_x86_ops *pmc_ops; + static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); +/* + * Intel PerfMon v3. Used on Core2 and later. + */ static const int intel_perfmon_event_map[] = { [PERF_COUNT_CPU_CYCLES] = 0x003c, @@ -53,7 +68,10 @@ static const int intel_perfmon_event_map[] = [PERF_COUNT_BUS_CYCLES] = 0x013c, }; -static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); +static int pmc_intel_event_map(int event) +{ + return intel_perfmon_event_map[event]; +} /* * Propagate counter elapsed time into the generic counter. @@ -144,38 +162,48 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (hw_event->raw) { hwc->config |= hw_event->type; } else { - if (hw_event->type >= max_intel_perfmon_events) + if (hw_event->type >= pmc_ops->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= intel_perfmon_event_map[hw_event->type]; + hwc->config |= pmc_ops->event_map(hw_event->type); } counter->wakeup_pending = 0; return 0; } -u64 hw_perf_save_disable(void) +static u64 pmc_intel_save_disable_all(void) { u64 ctrl; - if (unlikely(!perf_counters_initialized)) - return 0; - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); return ctrl; } + +u64 hw_perf_save_disable(void) +{ + if (unlikely(!perf_counters_initialized)) + return 0; + + return pmc_ops->save_disable_all(); +} EXPORT_SYMBOL_GPL(hw_perf_save_disable); +static void pmc_intel_restore_all(u64 ctrl) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); +} + void hw_perf_restore(u64 ctrl) { if (unlikely(!perf_counters_initialized)) return; - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + pmc_ops->restore_all(ctrl); } EXPORT_SYMBOL_GPL(hw_perf_restore); @@ -291,11 +319,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS])) + if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES])) + if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES])) + if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; @@ -339,8 +367,8 @@ try_generic: set_bit(idx, cpuc->used); hwc->idx = idx; } - hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; - hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; + hwc->config_base = pmc_ops->eventsel; + hwc->counter_base = pmc_ops->perfctr; } perf_counters_lapic_init(hwc->nmi); @@ -386,8 +414,8 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { - rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); - rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); + rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl); + rdmsrl(pmc_ops->perfctr + idx, pmc_count); prev_left = per_cpu(prev_left[idx], cpu); @@ -655,29 +683,56 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { .priority = 1 }; -void __init init_hw_perf_counters(void) +static struct pmc_x86_ops pmc_intel_ops = { + .save_disable_all = pmc_intel_save_disable_all, + .restore_all = pmc_intel_restore_all, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = pmc_intel_event_map, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), +}; + +static struct pmc_x86_ops *pmc_intel_init(void) { union cpuid10_eax eax; unsigned int ebx; unsigned int unused; union cpuid10_edx edx; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return; - /* * Check whether the Architectural PerfMon supports * Branch Misses Retired Event or not. */ cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) - return; + return NULL; printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); - printk(KERN_INFO "... version: %d\n", eax.split.version_id); - printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters); + printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); + printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); + nr_counters_generic = eax.split.num_counters; + nr_counters_fixed = edx.split.num_counters_fixed; + counter_value_mask = (1ULL << eax.split.bit_width) - 1; + + return &pmc_intel_ops; +} + +void __init init_hw_perf_counters(void) +{ + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + pmc_ops = pmc_intel_init(); + break; + } + if (!pmc_ops) + return; + + printk(KERN_INFO "... num counters: %d\n", nr_counters_generic); if (nr_counters_generic > X86_PMC_MAX_GENERIC) { nr_counters_generic = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", @@ -686,13 +741,8 @@ void __init init_hw_perf_counters(void) perf_counter_mask = (1 << nr_counters_generic) - 1; perf_max_counters = nr_counters_generic; - printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); - counter_value_mask = (1ULL << eax.split.bit_width) - 1; printk(KERN_INFO "... value mask: %016Lx\n", counter_value_mask); - printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); - - nr_counters_fixed = edx.split.num_counters_fixed; if (nr_counters_fixed > X86_PMC_MAX_FIXED) { nr_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", -- cgit v1.2.3 From f87ad35d37fa543925210550f7db20a54c83ed70 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 27 Feb 2009 20:15:14 +0530 Subject: x86: AMD Support for perf_counter Supported basic performance counter for AMD K7 and later: $ perfstat -e 0,1,2,3,4,5,-1,-2,-3,-4,-5 ls > /dev/null Performance counter stats for 'ls': 12.298610 task clock ticks (msecs) 3298477 CPU cycles (events) 1406354 instructions (events) 749035 cache references (events) 16939 cache misses (events) 100589 branches (events) 11159 branch misses (events) 7.627540 cpu clock ticks (msecs) 12.298610 task clock ticks (msecs) 500 pagefaults (events) 6 context switches (events) 3 CPU migrations (events) Wall-clock time elapsed: 8.672290 msecs Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 4 ++ arch/x86/kernel/cpu/perf_counter.c | 83 +++++++++++++++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 25423a5b80e..edcde52bd17 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -368,6 +368,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 6) set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); + /* Enable Performance counter for K7 and later */ + if (c->x86 > 6 && c->x86 <= 0x11) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + if (!c->x86_model_id[0]) { switch (c->x86) { case 0xf: diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a3c88529bb7..266618aa1a0 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -73,6 +73,24 @@ static int pmc_intel_event_map(int event) return intel_perfmon_event_map[event]; } +/* + * AMD Performance Monitor K7 and later. + */ +static const int amd_perfmon_event_map[] = +{ + [PERF_COUNT_CPU_CYCLES] = 0x0076, + [PERF_COUNT_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_CACHE_MISSES] = 0x0081, + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_BRANCH_MISSES] = 0x00c5, +}; + +static int pmc_amd_event_map(int event) +{ + return amd_perfmon_event_map[event]; +} + /* * Propagate counter elapsed time into the generic counter. * Can only be executed on the CPU where the counter is active. @@ -151,8 +169,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * so we install an artificial 1<<31 period regardless of * the generic counter period: */ - if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) - hwc->irq_period = 0x7FFFFFFF; + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) + hwc->irq_period = 0x7FFFFFFF; atomic64_set(&hwc->period_left, hwc->irq_period); @@ -184,6 +203,22 @@ static u64 pmc_intel_save_disable_all(void) return ctrl; } +static u64 pmc_amd_save_disable_all(void) +{ + int idx; + u64 val, ctrl = 0; + + for (idx = 0; idx < nr_counters_generic; idx++) { + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + ctrl |= (1 << idx); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } + + return ctrl; +} + u64 hw_perf_save_disable(void) { if (unlikely(!perf_counters_initialized)) @@ -198,6 +233,20 @@ static void pmc_intel_restore_all(u64 ctrl) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); } +static void pmc_amd_restore_all(u64 ctrl) +{ + u64 val; + int idx; + + for (idx = 0; idx < nr_counters_generic; idx++) { + if (ctrl & (1 << idx)) { + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } + } +} + void hw_perf_restore(u64 ctrl) { if (unlikely(!perf_counters_initialized)) @@ -314,6 +363,9 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { unsigned int event; + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return -1; + if (unlikely(hwc->nmi)) return -1; @@ -401,6 +453,7 @@ void perf_counter_print_debug(void) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); @@ -411,6 +464,7 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); printk(KERN_INFO "CPU#%d: fixed: %016llx\n", cpu, fixed); + } printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { @@ -588,6 +642,9 @@ void perf_counter_unthrottle(void) if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return; + if (unlikely(!perf_counters_initialized)) return; @@ -692,6 +749,15 @@ static struct pmc_x86_ops pmc_intel_ops = { .max_events = ARRAY_SIZE(intel_perfmon_event_map), }; +static struct pmc_x86_ops pmc_amd_ops = { + .save_disable_all = pmc_amd_save_disable_all, + .restore_all = pmc_amd_restore_all, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .event_map = pmc_amd_event_map, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), +}; + static struct pmc_x86_ops *pmc_intel_init(void) { union cpuid10_eax eax; @@ -719,6 +785,16 @@ static struct pmc_x86_ops *pmc_intel_init(void) return &pmc_intel_ops; } +static struct pmc_x86_ops *pmc_amd_init(void) +{ + nr_counters_generic = 4; + nr_counters_fixed = 0; + + printk(KERN_INFO "AMD Performance Monitoring support detected.\n"); + + return &pmc_amd_ops; +} + void __init init_hw_perf_counters(void) { if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) @@ -728,6 +804,9 @@ void __init init_hw_perf_counters(void) case X86_VENDOR_INTEL: pmc_ops = pmc_intel_init(); break; + case X86_VENDOR_AMD: + pmc_ops = pmc_amd_init(); + break; } if (!pmc_ops) return; -- cgit v1.2.3 From 169e41eb7f5464c077a7e0e129f025759d04cc54 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 28 Feb 2009 18:37:49 +0530 Subject: x86: decent declarations in perf_counter.c Impact: cleanup making decent declrations for struct pmc_x86_ops and fix checkpatch error: ERROR: Macros with complex values should be enclosed in parenthesis Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 266618aa1a0..a1f3646a3e8 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -42,12 +42,12 @@ struct cpu_hw_counters { * struct pmc_x86_ops - performance counter x86 ops */ struct pmc_x86_ops { - u64 (*save_disable_all) (void); - void (*restore_all) (u64 ctrl); - unsigned eventsel; - unsigned perfctr; - int (*event_map) (int event); - int max_events; + u64 (*save_disable_all)(void); + void (*restore_all)(u64 ctrl); + unsigned eventsel; + unsigned perfctr; + int (*event_map)(int event); + int max_events; }; static struct pmc_x86_ops *pmc_ops; @@ -561,7 +561,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) /* * Maximum interrupt frequency of 100KHz per CPU */ -#define PERFMON_MAX_INTERRUPTS 100000/HZ +#define PERFMON_MAX_INTERRUPTS (100000/HZ) /* * This handler is triggered by the local APIC, so the APIC IRQ handling -- cgit v1.2.3 From a1ef58f442542d8b3e3b963339fbc522c36e827c Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 28 Feb 2009 18:45:39 +0530 Subject: x86: use pr_info in perf_counter.c Impact: cleanup using pr_info in perf_counter.c fixes various 80 characters warnings and also indenting for conditional statement Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 48 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a1f3646a3e8..3b65f19a668 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -454,18 +454,18 @@ void perf_counter_print_debug(void) cpuc = &per_cpu(cpu_hw_counters, cpu); if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); - - printk(KERN_INFO "\n"); - printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); - printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); - printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); - printk(KERN_INFO "CPU#%d: fixed: %016llx\n", cpu, fixed); + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); + + pr_info("\n"); + pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); + pr_info("CPU#%d: status: %016llx\n", cpu, status); + pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); + pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); } - printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); + pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl); @@ -473,17 +473,17 @@ void perf_counter_print_debug(void) prev_left = per_cpu(prev_left[idx], cpu); - printk(KERN_INFO "CPU#%d: gen-PMC%d ctrl: %016llx\n", + pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); - printk(KERN_INFO "CPU#%d: gen-PMC%d count: %016llx\n", + pr_info("CPU#%d: gen-PMC%d count: %016llx\n", cpu, idx, pmc_count); - printk(KERN_INFO "CPU#%d: gen-PMC%d left: %016llx\n", + pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } for (idx = 0; idx < nr_counters_fixed; idx++) { rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); - printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n", + pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); } local_irq_enable(); @@ -773,10 +773,10 @@ static struct pmc_x86_ops *pmc_intel_init(void) if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return NULL; - printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); - printk(KERN_INFO "... version: %d\n", eax.split.version_id); - printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); - printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); + pr_info("Intel Performance Monitoring support detected.\n"); + pr_info("... version: %d\n", eax.split.version_id); + pr_info("... bit width: %d\n", eax.split.bit_width); + pr_info("... mask length: %d\n", eax.split.mask_length); nr_counters_generic = eax.split.num_counters; nr_counters_fixed = edx.split.num_counters_fixed; @@ -790,7 +790,7 @@ static struct pmc_x86_ops *pmc_amd_init(void) nr_counters_generic = 4; nr_counters_fixed = 0; - printk(KERN_INFO "AMD Performance Monitoring support detected.\n"); + pr_info("AMD Performance Monitoring support detected.\n"); return &pmc_amd_ops; } @@ -811,7 +811,7 @@ void __init init_hw_perf_counters(void) if (!pmc_ops) return; - printk(KERN_INFO "... num counters: %d\n", nr_counters_generic); + pr_info("... num counters: %d\n", nr_counters_generic); if (nr_counters_generic > X86_PMC_MAX_GENERIC) { nr_counters_generic = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", @@ -820,18 +820,18 @@ void __init init_hw_perf_counters(void) perf_counter_mask = (1 << nr_counters_generic) - 1; perf_max_counters = nr_counters_generic; - printk(KERN_INFO "... value mask: %016Lx\n", counter_value_mask); + pr_info("... value mask: %016Lx\n", counter_value_mask); if (nr_counters_fixed > X86_PMC_MAX_FIXED) { nr_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", nr_counters_fixed, X86_PMC_MAX_FIXED); } - printk(KERN_INFO "... fixed counters: %d\n", nr_counters_fixed); + pr_info("... fixed counters: %d\n", nr_counters_fixed); perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED; - printk(KERN_INFO "... counter mask: %016Lx\n", perf_counter_mask); + pr_info("... counter mask: %016Lx\n", perf_counter_mask); perf_counters_initialized = true; perf_counters_lapic_init(0); -- cgit v1.2.3 From b0f3f28e0f14eb335f67bfaae33ce8b8d74fd58b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Mar 2009 18:08:27 +0100 Subject: perfcounters: IRQ and NMI support on AMD CPUs The below completes the K7+ performance counter support: - IRQ support - NMI support KernelTop output works now as well. Signed-off-by: Peter Zijlstra Cc: Jaswinder Singh Rajput Cc: Paul Mackerras LKML-Reference: <1236273633.5187.286.camel@laptop> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 272 +++++++++++++++++++++++++++++++------ 1 file changed, 228 insertions(+), 44 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3b65f19a668..6ebe9abf6ae 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -28,6 +28,7 @@ static bool perf_counters_initialized __read_mostly; static int nr_counters_generic __read_mostly; static u64 perf_counter_mask __read_mostly; static u64 counter_value_mask __read_mostly; +static int counter_value_bits __read_mostly; static int nr_counters_fixed __read_mostly; @@ -35,7 +36,9 @@ struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; - u64 global_enable; + u64 throttle_ctrl; + u64 active_mask; + int enabled; }; /* @@ -43,21 +46,28 @@ struct cpu_hw_counters { */ struct pmc_x86_ops { u64 (*save_disable_all)(void); - void (*restore_all)(u64 ctrl); + void (*restore_all)(u64); + u64 (*get_status)(u64); + void (*ack_status)(u64); + void (*enable)(int, u64); + void (*disable)(int, u64); unsigned eventsel; unsigned perfctr; - int (*event_map)(int event); + u64 (*event_map)(int); + u64 (*raw_event)(u64); int max_events; }; static struct pmc_x86_ops *pmc_ops; -static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); +static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { + .enabled = 1, +}; /* * Intel PerfMon v3. Used on Core2 and later. */ -static const int intel_perfmon_event_map[] = +static const u64 intel_perfmon_event_map[] = { [PERF_COUNT_CPU_CYCLES] = 0x003c, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, @@ -68,15 +78,29 @@ static const int intel_perfmon_event_map[] = [PERF_COUNT_BUS_CYCLES] = 0x013c, }; -static int pmc_intel_event_map(int event) +static u64 pmc_intel_event_map(int event) { return intel_perfmon_event_map[event]; } +static u64 pmc_intel_raw_event(u64 event) +{ +#define CORE_EVNTSEL_EVENT_MASK 0x000000FF +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00 +#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000 + +#define CORE_EVNTSEL_MASK \ + (CORE_EVNTSEL_EVENT_MASK | \ + CORE_EVNTSEL_UNIT_MASK | \ + CORE_EVNTSEL_COUNTER_MASK) + + return event & CORE_EVNTSEL_MASK; +} + /* * AMD Performance Monitor K7 and later. */ -static const int amd_perfmon_event_map[] = +static const u64 amd_perfmon_event_map[] = { [PERF_COUNT_CPU_CYCLES] = 0x0076, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, @@ -86,11 +110,25 @@ static const int amd_perfmon_event_map[] = [PERF_COUNT_BRANCH_MISSES] = 0x00c5, }; -static int pmc_amd_event_map(int event) +static u64 pmc_amd_event_map(int event) { return amd_perfmon_event_map[event]; } +static u64 pmc_amd_raw_event(u64 event) +{ +#define K7_EVNTSEL_EVENT_MASK 0x7000000FF +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00 +#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000 + +#define K7_EVNTSEL_MASK \ + (K7_EVNTSEL_EVENT_MASK | \ + K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_COUNTER_MASK) + + return event & K7_EVNTSEL_MASK; +} + /* * Propagate counter elapsed time into the generic counter. * Can only be executed on the CPU where the counter is active. @@ -179,7 +217,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * Raw event type provide the config in the event structure */ if (hw_event->raw) { - hwc->config |= hw_event->type; + hwc->config |= pmc_ops->raw_event(hw_event->type); } else { if (hw_event->type >= pmc_ops->max_events) return -EINVAL; @@ -205,18 +243,24 @@ static u64 pmc_intel_save_disable_all(void) static u64 pmc_amd_save_disable_all(void) { - int idx; - u64 val, ctrl = 0; + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + int enabled, idx; + + enabled = cpuc->enabled; + cpuc->enabled = 0; + barrier(); for (idx = 0; idx < nr_counters_generic; idx++) { + u64 val; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) - ctrl |= (1 << idx); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) { + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } } - return ctrl; + return enabled; } u64 hw_perf_save_disable(void) @@ -226,6 +270,9 @@ u64 hw_perf_save_disable(void) return pmc_ops->save_disable_all(); } +/* + * Exported because of ACPI idle + */ EXPORT_SYMBOL_GPL(hw_perf_save_disable); static void pmc_intel_restore_all(u64 ctrl) @@ -235,11 +282,18 @@ static void pmc_intel_restore_all(u64 ctrl) static void pmc_amd_restore_all(u64 ctrl) { - u64 val; + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int idx; + cpuc->enabled = ctrl; + barrier(); + if (!ctrl) + return; + for (idx = 0; idx < nr_counters_generic; idx++) { - if (ctrl & (1 << idx)) { + if (test_bit(idx, (unsigned long *)&cpuc->active_mask)) { + u64 val; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); val |= ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(MSR_K7_EVNTSEL0 + idx, val); @@ -254,8 +308,112 @@ void hw_perf_restore(u64 ctrl) pmc_ops->restore_all(ctrl); } +/* + * Exported because of ACPI idle + */ EXPORT_SYMBOL_GPL(hw_perf_restore); +static u64 pmc_intel_get_status(u64 mask) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static u64 pmc_amd_get_status(u64 mask) +{ + u64 status = 0; + int idx; + + for (idx = 0; idx < nr_counters_generic; idx++) { + s64 val; + + if (!(mask & (1 << idx))) + continue; + + rdmsrl(MSR_K7_PERFCTR0 + idx, val); + val <<= (64 - counter_value_bits); + if (val >= 0) + status |= (1 << idx); + } + + return status; +} + +static u64 hw_perf_get_status(u64 mask) +{ + if (unlikely(!perf_counters_initialized)) + return 0; + + return pmc_ops->get_status(mask); +} + +static void pmc_intel_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static void pmc_amd_ack_status(u64 ack) +{ +} + +static void hw_perf_ack_status(u64 ack) +{ + if (unlikely(!perf_counters_initialized)) + return; + + pmc_ops->ack_status(ack); +} + +static void pmc_intel_enable(int idx, u64 config) +{ + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, + config | ARCH_PERFMON_EVENTSEL0_ENABLE); +} + +static void pmc_amd_enable(int idx, u64 config) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + set_bit(idx, (unsigned long *)&cpuc->active_mask); + if (cpuc->enabled) + config |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + wrmsrl(MSR_K7_EVNTSEL0 + idx, config); +} + +static void hw_perf_enable(int idx, u64 config) +{ + if (unlikely(!perf_counters_initialized)) + return; + + pmc_ops->enable(idx, config); +} + +static void pmc_intel_disable(int idx, u64 config) +{ + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config); +} + +static void pmc_amd_disable(int idx, u64 config) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + clear_bit(idx, (unsigned long *)&cpuc->active_mask); + wrmsrl(MSR_K7_EVNTSEL0 + idx, config); + +} + +static void hw_perf_disable(int idx, u64 config) +{ + if (unlikely(!perf_counters_initialized)) + return; + + pmc_ops->disable(idx, config); +} + static inline void __pmc_fixed_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int __idx) @@ -278,7 +436,7 @@ __pmc_generic_disable(struct perf_counter *counter, if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_disable(counter, hwc, idx); else - wrmsr_safe(hwc->config_base + idx, hwc->config, 0); + hw_perf_disable(idx, hwc->config); } static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); @@ -354,8 +512,7 @@ __pmc_generic_enable(struct perf_counter *counter, if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_enable(counter, hwc, idx); else - wrmsr(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + hw_perf_enable(idx, hwc->config); } static int @@ -567,22 +724,20 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: */ -static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) +static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); u64 ack, status; struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); + int ret = 0; - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); - - /* Disable counters globally */ - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - ack_APIC_irq(); + cpuc->throttle_ctrl = hw_perf_save_disable(); - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + status = hw_perf_get_status(cpuc->throttle_ctrl); if (!status) goto out; + ret = 1; again: inc_irq_stat(apic_perf_irqs); ack = status; @@ -618,12 +773,12 @@ again: } } - wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); + hw_perf_ack_status(ack); /* * Repeat if there is more work to be done: */ - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + status = hw_perf_get_status(cpuc->throttle_ctrl); if (status) goto again; out: @@ -631,32 +786,27 @@ out: * Restore - do not reenable when global enable is off or throttled: */ if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + hw_perf_restore(cpuc->throttle_ctrl); + + return ret; } void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; - u64 global_enable; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return; - if (unlikely(!perf_counters_initialized)) return; - cpuc = &per_cpu(cpu_hw_counters, smp_processor_id()); + cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n"); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + hw_perf_restore(cpuc->throttle_ctrl); } - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable); - if (unlikely(cpuc->global_enable && !global_enable)) - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); cpuc->interrupts = 0; } @@ -664,8 +814,8 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) { irq_enter(); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); + ack_APIC_irq(); __smp_perf_counter_interrupt(regs, 0); - irq_exit(); } @@ -722,16 +872,23 @@ perf_counter_nmi_handler(struct notifier_block *self, { struct die_args *args = __args; struct pt_regs *regs; + int ret; + + switch (cmd) { + case DIE_NMI: + case DIE_NMI_IPI: + break; - if (likely(cmd != DIE_NMI_IPI)) + default: return NOTIFY_DONE; + } regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - __smp_perf_counter_interrupt(regs, 1); + ret = __smp_perf_counter_interrupt(regs, 1); - return NOTIFY_STOP; + return ret ? NOTIFY_STOP : NOTIFY_OK; } static __read_mostly struct notifier_block perf_counter_nmi_notifier = { @@ -743,18 +900,28 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { static struct pmc_x86_ops pmc_intel_ops = { .save_disable_all = pmc_intel_save_disable_all, .restore_all = pmc_intel_restore_all, + .get_status = pmc_intel_get_status, + .ack_status = pmc_intel_ack_status, + .enable = pmc_intel_enable, + .disable = pmc_intel_disable, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, .event_map = pmc_intel_event_map, + .raw_event = pmc_intel_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), }; static struct pmc_x86_ops pmc_amd_ops = { .save_disable_all = pmc_amd_save_disable_all, .restore_all = pmc_amd_restore_all, + .get_status = pmc_amd_get_status, + .ack_status = pmc_amd_ack_status, + .enable = pmc_amd_enable, + .disable = pmc_amd_disable, .eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, .event_map = pmc_amd_event_map, + .raw_event = pmc_amd_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), }; @@ -787,8 +954,25 @@ static struct pmc_x86_ops *pmc_intel_init(void) static struct pmc_x86_ops *pmc_amd_init(void) { + u64 old; + int bits; + nr_counters_generic = 4; nr_counters_fixed = 0; + counter_value_mask = ~0ULL; + + rdmsrl(MSR_K7_PERFCTR0, old); + wrmsrl(MSR_K7_PERFCTR0, counter_value_mask); + /* + * read the truncated mask + */ + rdmsrl(MSR_K7_PERFCTR0, counter_value_mask); + wrmsrl(MSR_K7_PERFCTR0, old); + + bits = 32 + fls(counter_value_mask >> 32); + if (bits == 32) + bits = fls((u32)counter_value_mask); + counter_value_bits = bits; pr_info("AMD Performance Monitoring support detected.\n"); -- cgit v1.2.3 From b5e8acf66ff5db707c7e08df49fdf6b415878442 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Mar 2009 20:34:21 +0100 Subject: perfcounters: IRQ and NMI support on AMD CPUs, fix The BKGD suggests that counter width on AMD CPUs is 48 for all existing models (it certainly is for mine). Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6ebe9abf6ae..f5853718d4d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -959,20 +959,8 @@ static struct pmc_x86_ops *pmc_amd_init(void) nr_counters_generic = 4; nr_counters_fixed = 0; - counter_value_mask = ~0ULL; - - rdmsrl(MSR_K7_PERFCTR0, old); - wrmsrl(MSR_K7_PERFCTR0, counter_value_mask); - /* - * read the truncated mask - */ - rdmsrl(MSR_K7_PERFCTR0, counter_value_mask); - wrmsrl(MSR_K7_PERFCTR0, old); - - bits = 32 + fls(counter_value_mask >> 32); - if (bits == 32) - bits = fls((u32)counter_value_mask); - counter_value_bits = bits; + counter_value_mask = 0x0000FFFFFFFFFFFFULL; + counter_value_bits = 48; pr_info("AMD Performance Monitoring support detected.\n"); -- cgit v1.2.3 From 184fe4ab1f2e4dfa45584889bb3820031648386b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 8 Mar 2009 11:34:19 +0100 Subject: x86: perf_counter cleanup Use and actual unsigned long bitmap instead of casting our way around. Signed-off-by: Peter Zijlstra Cc: Jaswinder Singh Rajput LKML-Reference: <1236508459.22914.3645.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f5853718d4d..1df421042b2 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -37,7 +37,7 @@ struct cpu_hw_counters { unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; u64 throttle_ctrl; - u64 active_mask; + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; int enabled; }; @@ -291,7 +291,7 @@ static void pmc_amd_restore_all(u64 ctrl) return; for (idx = 0; idx < nr_counters_generic; idx++) { - if (test_bit(idx, (unsigned long *)&cpuc->active_mask)) { + if (test_bit(idx, cpuc->active_mask)) { u64 val; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); @@ -377,7 +377,7 @@ static void pmc_amd_enable(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - set_bit(idx, (unsigned long *)&cpuc->active_mask); + set_bit(idx, cpuc->active_mask); if (cpuc->enabled) config |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -401,7 +401,7 @@ static void pmc_amd_disable(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - clear_bit(idx, (unsigned long *)&cpuc->active_mask); + clear_bit(idx, cpuc->active_mask); wrmsrl(MSR_K7_EVNTSEL0 + idx, config); } -- cgit v1.2.3 From e255357764f92afcafafbd4879b222b8c752065a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 8 Mar 2009 17:09:49 +0530 Subject: x86: perf_counter cleanup Remove unused variables and duplicate header file. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1df421042b2..155bc3c239b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -17,7 +17,6 @@ #include #include -#include #include static bool perf_counters_initialized __read_mostly; @@ -954,9 +953,6 @@ static struct pmc_x86_ops *pmc_intel_init(void) static struct pmc_x86_ops *pmc_amd_init(void) { - u64 old; - int bits; - nr_counters_generic = 4; nr_counters_fixed = 0; counter_value_mask = 0x0000FFFFFFFFFFFFULL; -- cgit v1.2.3 From b8bcfe997e46150fedcc3f5b26b846400122fdd9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Feb 2009 23:05:19 -0800 Subject: x86/paravirt: remove lazy mode in interrupts Impact: simplification, robustness Make paravirt_lazy_mode() always return PARAVIRT_LAZY_NONE when in an interrupt. This prevents interrupt code from accidentally inheriting an outer lazy state, and instead does everything synchronously. Outer batched operations are left deferred. Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra Cc: Thomas Gleixner --- arch/x86/kernel/paravirt.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 63dd358d8ee..8ab250ac498 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -282,6 +282,9 @@ void paravirt_leave_lazy_cpu(void) enum paravirt_lazy_mode paravirt_get_lazy_mode(void) { + if (in_interrupt()) + return PARAVIRT_LAZY_NONE; + return __get_cpu_var(paravirt_lazy_mode); } -- cgit v1.2.3 From 7fd7d83d49914f03aefffba6aee09032fcd54cce Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Feb 2009 23:24:03 -0800 Subject: x86/pvops: replace arch_enter_lazy_cpu_mode with arch_start_context_switch Impact: simplification, prepare for later changes Make lazy cpu mode more specific to context switching, so that it makes sense to do more context-switch specific things in the callbacks. Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- arch/x86/kernel/paravirt.c | 13 ------------- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- 3 files changed, 2 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 8ab250ac498..5eea9548216 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -301,19 +301,6 @@ void arch_flush_lazy_mmu_mode(void) preempt_enable(); } -void arch_flush_lazy_cpu_mode(void) -{ - preempt_disable(); - - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { - WARN_ON(preempt_count() == 1); - arch_leave_lazy_cpu_mode(); - arch_enter_lazy_cpu_mode(); - } - - preempt_enable(); -} - struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 14014d766ca..57e49a8278a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -407,7 +407,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_leave_lazy_cpu_mode(); + arch_end_context_switch(); /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index abb7e6a7f0c..7115e608532 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -428,7 +428,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_leave_lazy_cpu_mode(); + arch_end_context_switch(); /* * Switch FS and GS. -- cgit v1.2.3 From b407fc57b815b2016186220baabc76cc8264206e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Feb 2009 23:46:21 -0800 Subject: x86/paravirt: flush pending mmu updates on context switch Impact: allow preemption during lazy mmu updates If we're in lazy mmu mode when context switching, leave lazy mmu mode, but remember the task's state in TIF_LAZY_MMU_UPDATES. When we resume the task, check this flag and re-enter lazy mmu mode if its set. This sets things up for allowing lazy mmu mode while preemptible, though that won't actually be active until the next change. Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- arch/x86/kernel/kvm.c | 2 +- arch/x86/kernel/paravirt.c | 13 ++++++++++--- arch/x86/kernel/vmi_32.c | 14 ++++++++++---- 3 files changed, 21 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 478bca986ec..5d7f6e76b5d 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -201,7 +201,7 @@ static void kvm_leave_lazy_mmu(void) struct kvm_para_state *state = kvm_para_state(); mmu_queue_flush(state); - paravirt_leave_lazy(paravirt_get_lazy_mode()); + paravirt_leave_lazy_mmu(); state->mode = paravirt_get_lazy_mode(); } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 5eea9548216..430a0e30577 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -252,7 +252,7 @@ static inline void enter_lazy(enum paravirt_lazy_mode mode) __get_cpu_var(paravirt_lazy_mode) = mode; } -void paravirt_leave_lazy(enum paravirt_lazy_mode mode) +static void leave_lazy(enum paravirt_lazy_mode mode) { BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); BUG_ON(preemptible()); @@ -267,17 +267,24 @@ void paravirt_enter_lazy_mmu(void) void paravirt_leave_lazy_mmu(void) { - paravirt_leave_lazy(PARAVIRT_LAZY_MMU); + leave_lazy(PARAVIRT_LAZY_MMU); } void paravirt_enter_lazy_cpu(void) { + if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + set_thread_flag(TIF_LAZY_MMU_UPDATES); + } enter_lazy(PARAVIRT_LAZY_CPU); } void paravirt_leave_lazy_cpu(void) { - paravirt_leave_lazy(PARAVIRT_LAZY_CPU); + leave_lazy(PARAVIRT_LAZY_CPU); + + if (test_and_clear_thread_flag(TIF_LAZY_MMU_UPDATES)) + arch_enter_lazy_mmu_mode(); } enum paravirt_lazy_mode paravirt_get_lazy_mode(void) diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 2cc4a90e2cb..950929c607d 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -473,16 +473,22 @@ static void vmi_enter_lazy_cpu(void) vmi_ops.set_lazy_mode(2); } +static void vmi_leave_lazy_cpu(void) +{ + vmi_ops.set_lazy_mode(0); + paravirt_leave_lazy_cpu(); +} + static void vmi_enter_lazy_mmu(void) { paravirt_enter_lazy_mmu(); vmi_ops.set_lazy_mode(1); } -static void vmi_leave_lazy(void) +static void vmi_leave_lazy_mmu(void) { - paravirt_leave_lazy(paravirt_get_lazy_mode()); vmi_ops.set_lazy_mode(0); + paravirt_leave_lazy_mmu(); } static inline int __init check_vmi_rom(struct vrom_header *rom) @@ -718,12 +724,12 @@ static inline int __init activate_vmi(void) para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, set_lazy_mode, SetLazyMode); - para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, + para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy_cpu, set_lazy_mode, SetLazyMode); para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, set_lazy_mode, SetLazyMode); - para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, + para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu, set_lazy_mode, SetLazyMode); /* user and kernel flush are just handled with different flags to FlushTLB */ -- cgit v1.2.3 From 224101ed69d3fbb486868e0f6e0f9fa37302efb4 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 18 Feb 2009 11:18:57 -0800 Subject: x86/paravirt: finish change from lazy cpu to context switch start/end Impact: fix lazy context switch API Pass the previous and next tasks into the context switch start end calls, so that the called functions can properly access the task state (esp in end_context_switch, in which the next task is not yet completely current). Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- arch/x86/kernel/paravirt.c | 14 ++++++-------- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/vmi_32.c | 12 ++++++------ 4 files changed, 14 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 430a0e30577..cf1437503ba 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -270,20 +270,20 @@ void paravirt_leave_lazy_mmu(void) leave_lazy(PARAVIRT_LAZY_MMU); } -void paravirt_enter_lazy_cpu(void) +void paravirt_start_context_switch(struct task_struct *prev) { if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { arch_leave_lazy_mmu_mode(); - set_thread_flag(TIF_LAZY_MMU_UPDATES); + set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); } enter_lazy(PARAVIRT_LAZY_CPU); } -void paravirt_leave_lazy_cpu(void) +void paravirt_end_context_switch(struct task_struct *next) { leave_lazy(PARAVIRT_LAZY_CPU); - if (test_and_clear_thread_flag(TIF_LAZY_MMU_UPDATES)) + if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) arch_enter_lazy_mmu_mode(); } @@ -399,10 +399,8 @@ struct pv_cpu_ops pv_cpu_ops = { .set_iopl_mask = native_set_iopl_mask, .io_delay = native_io_delay, - .lazy_mode = { - .enter = paravirt_nop, - .leave = paravirt_nop, - }, + .start_context_switch = paravirt_nop, + .end_context_switch = paravirt_nop, }; struct pv_apic_ops pv_apic_ops = { diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 57e49a8278a..d766c7616fd 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -407,7 +407,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_end_context_switch(); + arch_end_context_switch(next_p); /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 7115e608532..e8a9aaf9df8 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -428,7 +428,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_end_context_switch(); + arch_end_context_switch(next_p); /* * Switch FS and GS. diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 950929c607d..55a5d6938e5 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -467,16 +467,16 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, } #endif -static void vmi_enter_lazy_cpu(void) +static void vmi_start_context_switch(struct task_struct *prev) { - paravirt_enter_lazy_cpu(); + paravirt_start_context_switch(prev); vmi_ops.set_lazy_mode(2); } -static void vmi_leave_lazy_cpu(void) +static void vmi_end_context_switch(struct task_struct *next) { vmi_ops.set_lazy_mode(0); - paravirt_leave_lazy_cpu(); + paravirt_end_context_switch(next); } static void vmi_enter_lazy_mmu(void) @@ -722,9 +722,9 @@ static inline int __init activate_vmi(void) para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); para_fill(pv_cpu_ops.io_delay, IODelay); - para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, + para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch, set_lazy_mode, SetLazyMode); - para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy_cpu, + para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch, set_lazy_mode, SetLazyMode); para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, -- cgit v1.2.3 From 2829b449276aed45f3d649efb21e3418e39dd5d1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Feb 2009 23:53:19 -0800 Subject: x86/paravirt: allow preemption with lazy mmu mode Impact: remove obsolete checks, simplification Lift restrictions on preemption with lazy mmu mode, as it is now allowed. Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- arch/x86/kernel/paravirt.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index cf1437503ba..bf2e86eee80 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -247,7 +247,6 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA static inline void enter_lazy(enum paravirt_lazy_mode mode) { BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); - BUG_ON(preemptible()); __get_cpu_var(paravirt_lazy_mode) = mode; } @@ -255,7 +254,6 @@ static inline void enter_lazy(enum paravirt_lazy_mode mode) static void leave_lazy(enum paravirt_lazy_mode mode) { BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); - BUG_ON(preemptible()); __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; } @@ -272,6 +270,8 @@ void paravirt_leave_lazy_mmu(void) void paravirt_start_context_switch(struct task_struct *prev) { + BUG_ON(preemptible()); + if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { arch_leave_lazy_mmu_mode(); set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); @@ -281,6 +281,8 @@ void paravirt_start_context_switch(struct task_struct *prev) void paravirt_end_context_switch(struct task_struct *next) { + BUG_ON(preemptible()); + leave_lazy(PARAVIRT_LAZY_CPU); if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) @@ -300,7 +302,6 @@ void arch_flush_lazy_mmu_mode(void) preempt_disable(); if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { - WARN_ON(preempt_count() == 1); arch_leave_lazy_mmu_mode(); arch_enter_lazy_mmu_mode(); } -- cgit v1.2.3 From ab2f75f0b760d2b0c9a875b669a1b51dce02c85a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 18 Feb 2009 00:18:50 -0800 Subject: x86/paravirt: use percpu_ rather than __get_cpu_var Impact: minor optimisation percpu_read/write is a slightly more direct way of getting to percpu data. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/paravirt.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index bf2e86eee80..254e8aa8bfd 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -246,16 +246,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA static inline void enter_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); + BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); - __get_cpu_var(paravirt_lazy_mode) = mode; + percpu_write(paravirt_lazy_mode, mode); } static void leave_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); + BUG_ON(percpu_read(paravirt_lazy_mode) != mode); - __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; + percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); } void paravirt_enter_lazy_mmu(void) @@ -294,7 +294,7 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) if (in_interrupt()) return PARAVIRT_LAZY_NONE; - return __get_cpu_var(paravirt_lazy_mode); + return percpu_read(paravirt_lazy_mode); } void arch_flush_lazy_mmu_mode(void) -- cgit v1.2.3 From 595258aaeac4cc6e187b98b1bf29bb176febe763 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:28 +0100 Subject: perf_counter: x86: fix 32-bit irq_period assumption No need to assume the irq_period is 32bit. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 155bc3c239b..1cedc3468ce 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -449,7 +449,7 @@ __hw_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s32 period = hwc->irq_period; + s64 period = hwc->irq_period; int err; /* -- cgit v1.2.3 From 60b3df9c1e24a18aabb412da9905208c5f04ebea Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:30 +0100 Subject: perf_counter: add comment to barrier We need to ensure the enabled=0 write happens before we start disabling the actual counters, so that a pcm_amd_enable() will not enable one underneath us. I think the race is impossible anyway, we always balance the ops within any one context and perform enable() with IRQs disabled. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1cedc3468ce..a2e3b76bfdc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -247,6 +247,10 @@ static u64 pmc_amd_save_disable_all(void) enabled = cpuc->enabled; cpuc->enabled = 0; + /* + * ensure we write the disable before we start disabling the + * counters proper, so that pcm_amd_enable() does the right thing. + */ barrier(); for (idx = 0; idx < nr_counters_generic; idx++) { -- cgit v1.2.3 From 82bae4f8c2fd64a2bb1e2e72c508853ed2b4a299 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:31 +0100 Subject: perf_counter: x86: use ULL postfix for 64bit constants Fix a build warning on 32bit machines by explicitly marking the constants as 64-bit. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a2e3b76bfdc..22dab06c08a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -84,9 +84,9 @@ static u64 pmc_intel_event_map(int event) static u64 pmc_intel_raw_event(u64 event) { -#define CORE_EVNTSEL_EVENT_MASK 0x000000FF -#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00 -#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000 +#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ @@ -116,9 +116,9 @@ static u64 pmc_amd_event_map(int event) static u64 pmc_amd_raw_event(u64 event) { -#define K7_EVNTSEL_EVENT_MASK 0x7000000FF -#define K7_EVNTSEL_UNIT_MASK 0x00000FF00 -#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000 +#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL #define K7_EVNTSEL_MASK \ (K7_EVNTSEL_EVENT_MASK | \ -- cgit v1.2.3 From 7bb497bd885eedd0f56dfe3cc1b5ff20710d33b9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Mar 2009 08:59:21 +0100 Subject: perf_counter: fix crash on perfmon v1 systems Impact: fix boot crash on Intel Perfmon Version 1 systems Intel Perfmon v1 does not support the global MSRs, nor does it offer the generalized MSR ranges. So support v2 and later CPUs only. Also mark pmc_ops as read-mostly - to avoid false cacheline sharing. Cc: Paul Mackerras Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 22dab06c08a..6cba9d47b71 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -57,12 +57,14 @@ struct pmc_x86_ops { int max_events; }; -static struct pmc_x86_ops *pmc_ops; +static struct pmc_x86_ops *pmc_ops __read_mostly; static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, }; +static __read_mostly int intel_perfmon_version; + /* * Intel PerfMon v3. Used on Core2 and later. */ @@ -613,7 +615,7 @@ void perf_counter_print_debug(void) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + if (intel_perfmon_version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); @@ -930,10 +932,10 @@ static struct pmc_x86_ops pmc_amd_ops = { static struct pmc_x86_ops *pmc_intel_init(void) { + union cpuid10_edx edx; union cpuid10_eax eax; - unsigned int ebx; unsigned int unused; - union cpuid10_edx edx; + unsigned int ebx; /* * Check whether the Architectural PerfMon supports @@ -943,8 +945,12 @@ static struct pmc_x86_ops *pmc_intel_init(void) if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return NULL; + intel_perfmon_version = eax.split.version_id; + if (intel_perfmon_version < 2) + return NULL; + pr_info("Intel Performance Monitoring support detected.\n"); - pr_info("... version: %d\n", eax.split.version_id); + pr_info("... version: %d\n", intel_perfmon_version); pr_info("... bit width: %d\n", eax.split.bit_width); pr_info("... mask length: %d\n", eax.split.mask_length); -- cgit v1.2.3 From b8e83514b64577b48bfb794fe85fcde40a9343ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:18 +0100 Subject: perf_counter: revamp syscall input ABI Impact: modify ABI The hardware/software classification in hw_event->type became a little strained due to the addition of tracepoint tracing. Instead split up the field and provide a type field to explicitly specify the counter type, while using the event_id field to specify which event to use. Raw counters still work as before, only the raw config now goes into raw_event. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.836807573@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6cba9d47b71..d844ae41d5a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (hw_event->raw) { - hwc->config |= pmc_ops->raw_event(hw_event->type); + if (hw_event->raw_type) { + hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id); } else { - if (hw_event->type >= pmc_ops->max_events) + if (hw_event->event_id >= pmc_ops->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= pmc_ops->event_map(hw_event->type); + hwc->config |= pmc_ops->event_map(hw_event->event_id); } counter->wakeup_pending = 0; @@ -715,7 +715,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); - perf_store_irq_data(sibling, counter->hw_event.type); + perf_store_irq_data(sibling, counter->hw_event.event_config); perf_store_irq_data(sibling, atomic64_read(&counter->count)); } } -- cgit v1.2.3 From 0322cd6ec504b0bf08ca7b2c3d7f43bda37d79c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:19 +0100 Subject: perf_counter: unify irq output code Impact: cleanup Having 3 slightly different copies of the same code around does nobody any good. First step in revamping the output format. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.929962222@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 53 +------------------------------------- 1 file changed, 1 insertion(+), 52 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d844ae41d5a..902282d68b0 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -674,20 +674,6 @@ static void pmc_generic_disable(struct perf_counter *counter) x86_perf_counter_update(counter, hwc, idx); } -static void perf_store_irq_data(struct perf_counter *counter, u64 data) -{ - struct perf_data *irqdata = counter->irqdata; - - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; - - *p = data; - irqdata->len += sizeof(u64); - } -} - /* * Save and restart an expired counter. Called by NMI contexts, * so it has to be careful about preempting normal counter ops: @@ -704,22 +690,6 @@ static void perf_save_and_restart(struct perf_counter *counter) __pmc_generic_enable(counter, hwc, idx); } -static void -perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) -{ - struct perf_counter *counter, *group_leader = sibling->group_leader; - - /* - * Store sibling timestamps (if any): - */ - list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - - x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); - perf_store_irq_data(sibling, counter->hw_event.event_config); - perf_store_irq_data(sibling, atomic64_read(&counter->count)); - } -} - /* * Maximum interrupt frequency of 100KHz per CPU */ @@ -754,28 +724,7 @@ again: continue; perf_save_and_restart(counter); - - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - continue; - case PERF_RECORD_IRQ: - perf_store_irq_data(counter, instruction_pointer(regs)); - break; - case PERF_RECORD_GROUP: - perf_handle_group(counter, &status, &ack); - break; - } - /* - * From NMI context we cannot call into the scheduler to - * do a task wakeup - but we mark these generic as - * wakeup_pending and initate a wakeup callback: - */ - if (nmi) { - counter->wakeup_pending = 1; - set_tsk_thread_flag(current, TIF_PERF_COUNTERS); - } else { - wake_up(&counter->waitq); - } + perf_counter_output(counter, nmi, regs); } hw_perf_ack_status(ack); -- cgit v1.2.3 From f4a2deb4860497f4332cf6a1acddab3dd628ddf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:06 +0100 Subject: perf_counter: remove the event config bitfields Since the bitfields turned into a bit of a mess, remove them and rely on good old masks. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.059499915@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 902282d68b0..3f95b0cdc55 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (hw_event->raw_type) { - hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id); + if (perf_event_raw(hw_event)) { + hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event)); } else { - if (hw_event->event_id >= pmc_ops->max_events) + if (perf_event_id(hw_event) >= pmc_ops->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= pmc_ops->event_map(hw_event->event_id); + hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); } counter->wakeup_pending = 0; -- cgit v1.2.3 From 925d519ab82b6dd7aca9420d809ee83819c08db2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:02 +0200 Subject: perf_counter: unify and fix delayed counter wakeup While going over the wakeup code I noticed delayed wakeups only work for hardware counters but basically all software counters rely on them. This patch unifies and generalizes the delayed wakeup to fix this issue. Since we're dealing with NMI context bits here, use a cmpxchg() based single link list implementation to track counters that have pending wakeups. [ This should really be generic code for delayed wakeups, but since we cannot use cmpxchg()/xchg() in generic code, I've let it live in the perf_counter code. -- Eric Dumazet could use it to aggregate the network wakeups. ] Furthermore, the x86 method of using TIF flags was flawed in that its quite possible to end up setting the bit on the idle task, loosing the wakeup. The powerpc method uses per-cpu storage and does appear to be sufficient. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.153932974@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 29 ----------------------------- arch/x86/kernel/signal.c | 6 ------ 2 files changed, 35 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3f95b0cdc55..7aab177fb56 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -227,7 +227,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) */ hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); } - counter->wakeup_pending = 0; return 0; } @@ -773,34 +772,6 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_exit(); } -/* - * This handler is triggered by NMI contexts: - */ -void perf_counter_notify(struct pt_regs *regs) -{ - struct cpu_hw_counters *cpuc; - unsigned long flags; - int bit, cpu; - - local_irq_save(flags); - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); - - for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) { - struct perf_counter *counter = cpuc->counters[bit]; - - if (!counter) - continue; - - if (counter->wakeup_pending) { - counter->wakeup_pending = 0; - wake_up(&counter->waitq); - } - } - - local_irq_restore(flags); -} - void perf_counters_lapic_init(int nmi) { u32 apic_val; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 611615a92c9..0a813b17b17 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,7 +6,6 @@ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ -#include #include #include #include @@ -872,11 +871,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) tracehook_notify_resume(regs); } - if (thread_info_flags & _TIF_PERF_COUNTERS) { - clear_thread_flag(TIF_PERF_COUNTERS); - perf_counter_notify(regs); - } - #ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); #endif /* CONFIG_X86_32 */ -- cgit v1.2.3 From 9ea98e191255ee642e64a5745014424fc63f83b0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:09 +0200 Subject: perf_counter: x86: proper error propagation for the x86 hw_perf_counter_init() Now that Paul cleaned up the error propagation paths, pass down the x86 error as well. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.792822360@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7aab177fb56..b8885ccd804 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -954,7 +954,7 @@ hw_perf_counter_init(struct perf_counter *counter) err = __hw_perf_counter_init(counter); if (err) - return NULL; + return ERR_PTR(err); return &x86_perf_counter_ops; } -- cgit v1.2.3 From d7d59fb323833682b117b528d77eeb8ef587036a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:15 +0200 Subject: perf_counter: x86: callchain support Provide the x86 perf_callchain() implementation. Code based on the ftrace/sysprof code from Soeren Sandmann Pedersen. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Cc: Soeren Sandmann Pedersen Cc: Frederic Weisbecker Cc: Steven Rostedt Orig-LKML-Reference: <20090330171024.341993293@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 154 +++++++++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b8885ccd804..e16dfafc6d7 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -16,8 +16,10 @@ #include #include #include +#include #include +#include static bool perf_counters_initialized __read_mostly; @@ -958,3 +960,155 @@ hw_perf_counter_init(struct perf_counter *counter) return &x86_perf_counter_ops; } + +/* + * callchain support + */ + +static inline +void callchain_store(struct perf_callchain_entry *entry, unsigned long ip) +{ + if (entry->nr < MAX_STACK_DEPTH) + entry->ip[entry->nr++] = ip; +} + +static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); + + +static void +backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + /* Ignore warnings */ +} + +static void backtrace_warning(void *data, char *msg) +{ + /* Ignore warnings */ +} + +static int backtrace_stack(void *data, char *name) +{ + /* Don't bother with IRQ stacks for now */ + return -1; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ + struct perf_callchain_entry *entry = data; + + if (reliable) + callchain_store(entry, addr); +} + +static const struct stacktrace_ops backtrace_ops = { + .warning = backtrace_warning, + .warning_symbol = backtrace_warning_symbol, + .stack = backtrace_stack, + .address = backtrace_address, +}; + +static void +perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + unsigned long bp; + char *stack; + + callchain_store(entry, instruction_pointer(regs)); + + stack = ((char *)regs + sizeof(struct pt_regs)); +#ifdef CONFIG_FRAME_POINTER + bp = frame_pointer(regs); +#else + bp = 0; +#endif + + dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); +} + + +struct stack_frame { + const void __user *next_fp; + unsigned long return_address; +}; + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + int ret; + + if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) + return 0; + + ret = 1; + pagefault_disable(); + if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + ret = 0; + pagefault_enable(); + + return ret; +} + +static void +perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + struct stack_frame frame; + const void __user *fp; + + regs = (struct pt_regs *)current->thread.sp0 - 1; + fp = (void __user *)regs->bp; + + callchain_store(entry, regs->ip); + + while (entry->nr < MAX_STACK_DEPTH) { + frame.next_fp = NULL; + frame.return_address = 0; + + if (!copy_stack_frame(fp, &frame)) + break; + + if ((unsigned long)fp < user_stack_pointer(regs)) + break; + + callchain_store(entry, frame.return_address); + fp = frame.next_fp; + } +} + +static void +perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + int is_user; + + if (!regs) + return; + + is_user = user_mode(regs); + + if (!current || current->pid == 0) + return; + + if (is_user && current->state != TASK_RUNNING) + return; + + if (!is_user) + perf_callchain_kernel(regs, entry); + + if (current->mm) + perf_callchain_user(regs, entry); +} + +struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + struct perf_callchain_entry *entry; + + if (in_nmi()) + entry = &__get_cpu_var(nmi_entry); + else + entry = &__get_cpu_var(irq_entry); + + entry->nr = 0; + + perf_do_callchain(regs, entry); + + return entry; +} -- cgit v1.2.3 From 4e935e47177c3b26cf383e79849bae2a464d0160 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:16 +0200 Subject: perf_counter: pmc arbitration Follow the example set by powerpc and try to play nice with oprofile and the nmi watchdog. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171024.459968444@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 75 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index e16dfafc6d7..2a946a160ca 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -20,6 +20,7 @@ #include #include +#include static bool perf_counters_initialized __read_mostly; @@ -172,6 +173,65 @@ again: atomic64_sub(delta, &hwc->period_left); } +static atomic_t num_counters; +static DEFINE_MUTEX(pmc_reserve_mutex); + +static bool reserve_pmc_hardware(void) +{ + int i; + + if (nmi_watchdog == NMI_LOCAL_APIC) + disable_lapic_nmi_watchdog(); + + for (i = 0; i < nr_counters_generic; i++) { + if (!reserve_perfctr_nmi(pmc_ops->perfctr + i)) + goto perfctr_fail; + } + + for (i = 0; i < nr_counters_generic; i++) { + if (!reserve_evntsel_nmi(pmc_ops->eventsel + i)) + goto eventsel_fail; + } + + return true; + +eventsel_fail: + for (i--; i >= 0; i--) + release_evntsel_nmi(pmc_ops->eventsel + i); + + i = nr_counters_generic; + +perfctr_fail: + for (i--; i >= 0; i--) + release_perfctr_nmi(pmc_ops->perfctr + i); + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); + + return false; +} + +static void release_pmc_hardware(void) +{ + int i; + + for (i = 0; i < nr_counters_generic; i++) { + release_perfctr_nmi(pmc_ops->perfctr + i); + release_evntsel_nmi(pmc_ops->eventsel + i); + } + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); +} + +static void hw_perf_counter_destroy(struct perf_counter *counter) +{ + if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) { + release_pmc_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + /* * Setup the hardware configuration for a given hw_event_type */ @@ -179,10 +239,23 @@ static int __hw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; + int err; if (unlikely(!perf_counters_initialized)) return -EINVAL; + err = 0; + if (atomic_inc_not_zero(&num_counters)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware()) + err = -EBUSY; + else + atomic_inc(&num_counters); + mutex_unlock(&pmc_reserve_mutex); + } + if (err) + return err; + /* * Generate PMC IRQs: * (keep 'enabled' bit clear for now) @@ -230,6 +303,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); } + counter->destroy = hw_perf_counter_destroy; + return 0; } -- cgit v1.2.3 From 5872bdb88a35fae7d224bd6b21e5f377e854ccfc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:03 +0200 Subject: perf_counter: add more context information Put in counts to tell which ips belong to what context. ----- | | hv | -- nr | | kernel | -- | | user ----- Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.493101305@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2a946a160ca..c74e20d593a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1088,6 +1088,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { unsigned long bp; char *stack; + int nr = entry->nr; callchain_store(entry, instruction_pointer(regs)); @@ -1099,6 +1100,8 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) #endif dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); + + entry->kernel = entry->nr - nr; } @@ -1128,6 +1131,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) { struct stack_frame frame; const void __user *fp; + int nr = entry->nr; regs = (struct pt_regs *)current->thread.sp0 - 1; fp = (void __user *)regs->bp; @@ -1147,6 +1151,8 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, frame.return_address); fp = frame.next_fp; } + + entry->user = entry->nr - nr; } static void @@ -1182,6 +1188,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) entry = &__get_cpu_var(irq_entry); entry->nr = 0; + entry->hv = 0; + entry->kernel = 0; + entry->user = 0; perf_do_callchain(regs, entry); -- cgit v1.2.3 From b6276f353bf490add62dcf7db0ebd75baa3e1a37 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:03 +0200 Subject: perf_counter: x86: self-IPI for pending work Implement set_perf_counter_pending() with a self-IPI so that it will run ASAP in a usable context. For now use a second IRQ vector, because the primary vector pokes the apic in funny ways that seem to confuse things. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.724626696@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 ++++++++++++++ arch/x86/kernel/entry_64.S | 2 ++ arch/x86/kernel/irq.c | 5 +++++ arch/x86/kernel/irqinit_32.c | 1 + arch/x86/kernel/irqinit_64.c | 1 + 5 files changed, 23 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c74e20d593a..438415866fe 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -849,6 +849,20 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_exit(); } +void smp_perf_pending_interrupt(struct pt_regs *regs) +{ + irq_enter(); + ack_APIC_irq(); + inc_irq_stat(apic_pending_irqs); + perf_counter_do_pending(); + irq_exit(); +} + +void set_perf_counter_pending(void) +{ + apic->send_IPI_self(LOCAL_PENDING_VECTOR); +} + void perf_counters_lapic_init(int nmi) { u32 apic_val; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3f129d963a0..1d46cba56fd 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1028,6 +1028,8 @@ apicinterrupt SPURIOUS_APIC_VECTOR \ #ifdef CONFIG_PERF_COUNTERS apicinterrupt LOCAL_PERF_VECTOR \ perf_counter_interrupt smp_perf_counter_interrupt +apicinterrupt LOCAL_PENDING_VECTOR \ + perf_pending_interrupt smp_perf_pending_interrupt #endif /* diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 9c2754302ec..d465487da58 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -67,6 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance counter interrupts\n"); + seq_printf(p, "PND: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); + seq_printf(p, " Performance pending work\n"); #endif if (generic_interrupt_extension) { seq_printf(p, "PLT: "); @@ -171,6 +175,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; sum += irq_stats(cpu)->apic_perf_irqs; + sum += irq_stats(cpu)->apic_pending_irqs; #endif if (generic_interrupt_extension) sum += irq_stats(cpu)->generic_irqs; diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 925d87cfc55..3190a6b961e 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -166,6 +166,7 @@ static void __init apic_intr_init(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); # ifdef CONFIG_PERF_COUNTERS alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); + alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); # endif # ifdef CONFIG_X86_MCE_P4THERMAL diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 665e2ab48ab..53ceb26f80f 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -156,6 +156,7 @@ static void __init apic_intr_init(void) /* Performance monitoring interrupt: */ #ifdef CONFIG_PERF_COUNTERS alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); + alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); #endif } -- cgit v1.2.3 From f6c7d5fe58b4846ee0cb4b98b6042489705eced4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:04 +0200 Subject: perf_counter: theres more to overflow than writing events Prepare for more generic overflow handling. The new perf_counter_overflow() method will handle the generic bits of the counter overflow, and can return a !0 return value, in which case the counter should be (soft) disabled, so that it won't count until it's properly disabled. XXX: do powerpc and swcounter Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.812109629@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 438415866fe..1116a41bc7b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -800,7 +800,8 @@ again: continue; perf_save_and_restart(counter); - perf_counter_output(counter, nmi, regs); + if (perf_counter_overflow(counter, nmi, regs)) + __pmc_generic_disable(counter, &counter->hw, bit); } hw_perf_ack_status(ack); -- cgit v1.2.3 From fcb2ac5bdfa3a7a04fb9749b916f64400f4c35a8 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 8 Apr 2009 13:31:58 +0200 Subject: x86_32: introduce restore_fpu_checking() Impact: cleanup, prepare FPU code unificaton Like on x86_64, return an error from restore_fpu and kill the task if it fails. Also rename restore_fpu to restore_fpu_checking which allows ifdefs to be removed in math_state_restore(). Signed-off-by: Jiri Slaby LKML-Reference: <1239190320-23952-1-git-send-email-jirislaby@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a1d288327ff..d696145855b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -839,9 +839,6 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ -#ifdef CONFIG_X86_32 - restore_fpu(tsk); -#else /* * Paranoid restore. send a SIGSEGV if we fail to restore the state. */ @@ -850,7 +847,7 @@ asmlinkage void math_state_restore(void) force_sig(SIGSEGV, tsk); return; } -#endif + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; } -- cgit v1.2.3 From a59dacfdc9ba06903652fa4883bf1106278b18ec Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 17 Oct 2008 14:38:08 +0200 Subject: x86 early quirks: eliminate unused function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: cleanup this warning: arch/x86/kernel/early-quirks.c:99: warning: ‘ati_ixp4x0_rev’ defined but not used triggers because ati_ixp4x0_rev() is only used in the ACPI && X86_IO_APIC case. Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 76b8cd953de..ebdb85cf268 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -96,6 +96,7 @@ static void __init nvidia_bugs(int num, int slot, int func) } +#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) static u32 __init ati_ixp4x0_rev(int num, int slot, int func) { @@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func) d &= 0xff; return d; } +#endif static void __init ati_bugs(int num, int slot, int func) { -- cgit v1.2.3 From cdc1cb0d4445f39561a65204d26f89365f917550 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 3 Apr 2009 17:15:14 -0700 Subject: x86: make wakeup_secondary_cpu_via_init static Impact: cleanup Signed-off-by: Yinghai Lu LKML-Reference: <49D6A692.6040400@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 58d24ef917d..bddf2ccaf32 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) return (send_status | accept_status); } -int __devinit +static int __devinit wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; -- cgit v1.2.3 From 02421f98ec55c3ff118f358740ff640f096c7ad6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 3 Apr 2009 17:15:53 -0700 Subject: x86: consistent about warm_reset_vector for UN_NON_UNIQUE_APIC Impact: cleanup didn't set it for UV_NON_UNIQUE_APIC, so don't restore it Signed-off-by: Yinghai Lu LKML-Reference: <49D6A6B9.6060501@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index bddf2ccaf32..bf8ad6344b1 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -822,10 +822,12 @@ do_rest: /* mark "stuck" area as not stuck */ *((volatile unsigned long *)trampoline_base) = 0; - /* - * Cleanup possible dangling ends... - */ - smpboot_restore_warm_reset_vector(); + if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { + /* + * Cleanup possible dangling ends... + */ + smpboot_restore_warm_reset_vector(); + } return boot_error; } -- cgit v1.2.3 From 42d7c5e353cef9062129b0de3ec9ddf10567b9ca Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Wed, 8 Apr 2009 09:09:21 -0500 Subject: swiotlb: change swiotlb_bus_to[phys,virt] prototypes Add a hwdev argument that is needed on some architectures in order to access a per-device offset that is taken into account when producing a physical address (also needed to get from bus address to virtual address because the physical address is an intermediate step). Also make swiotlb_bus_to_virt weak so architectures can override it. Signed-off-by: Becky Bruce Acked-by: FUJITA Tomonori Signed-off-by: Kumar Gala Cc: jeremy@goop.org Cc: ian.campbell@citrix.com LKML-Reference: <1239199761-22886-8-git-send-email-galak@kernel.crashing.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-swiotlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 34f12e9996e..887388a1c57 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) return paddr; } -phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) +phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) { return baddr; } -- cgit v1.2.3 From 7333a8003cdc0470e8c0ae8b949cbc44f3165ff3 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 25 Mar 2009 10:50:34 +0900 Subject: x86: smarten /proc/interrupts output for new counters Now /proc/interrupts of tip tree has new counters: CNT: Performance counter interrupts Format change of output, as like that by commit: commit 7a81d9a7da03d2f27840d659f97ef140d032f609 x86: smarten /proc/interrupts output should be applied to these new counters too. Signed-off-by: Hidetoshi Seto Cc: Jan Beulich LKML-Reference: <49C98DEA.8060208@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d465487da58..dccaaa85578 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -63,7 +63,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "CNT: "); + seq_printf(p, "%*s: ", prec, "CNT"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance counter interrupts\n"); -- cgit v1.2.3 From 78f13e9525ba777da25c4ddab89f28e9366a8b7c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:33 +0200 Subject: perf_counter: allow for data addresses to be recorded Paul suggested we allow for data addresses to be recorded along with the traditional IPs as power can provide these. For now, only the software pagefault events provide data addresses, but in the future power might as well for some events. x86 doesn't seem capable of providing this atm. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.394816925@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1116a41bc7b..0fcbaab83f9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -800,7 +800,7 @@ again: continue; perf_save_and_restart(counter); - if (perf_counter_overflow(counter, nmi, regs)) + if (perf_counter_overflow(counter, nmi, regs, 0)) __pmc_generic_disable(counter, &counter->hw, bit); } -- cgit v1.2.3 From e85abf8f432bb2a13733ab7609fbb8e1500af51d Mon Sep 17 00:00:00 2001 From: Gary Hade Date: Wed, 8 Apr 2009 14:07:25 -0700 Subject: x86: consolidate SMP code in io_apic.c Impact: Cleanup Reorganizes the code in arch/x86/kernel/io_apic.c by combining two '#ifdef CONFIG_SMP' regions. In addition to making the code easier to understand the first '#ifdef CONFIG_SMP' region is moved to a location later in the file which will reduce the need for function forward declarations when the code subsequently revised. The only changes other than relocating code to a different position in the file were the removal of the assign_irq_vector() forward declaration which was no longer needed and some line length reduction formatting changes. Signed-off-by: Gary Hade Cc: lcm@us.ibm.com LKML-Reference: <20090408210725.GC11159@us.ibm.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 223 ++++++++++++++++++++--------------------- 1 file changed, 109 insertions(+), 114 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 767fe7e46d6..7c9d045ac83 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -518,120 +518,6 @@ static void ioapic_mask_entry(int apic, int pin) spin_unlock_irqrestore(&ioapic_lock, flags); } -#ifdef CONFIG_SMP -static void send_cleanup_vector(struct irq_cfg *cfg) -{ - cpumask_var_t cleanup_mask; - - if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { - unsigned int i; - cfg->move_cleanup_count = 0; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - cfg->move_cleanup_count++; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); - } else { - cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); - cfg->move_cleanup_count = cpumask_weight(cleanup_mask); - apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - free_cpumask_var(cleanup_mask); - } - cfg->move_in_progress = 0; -} - -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - u8 vector = cfg->vector; - - entry = cfg->irq_2_pin; - for (;;) { - unsigned int reg; - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - /* - * With interrupt-remapping, destination information comes - * from interrupt-remapping table entry. - */ - if (!irq_remapped(irq)) - io_apic_write(apic, 0x11 + pin*2, dest); - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; - } -} - -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); - -/* - * Either sets desc->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that, or returns BAD_APICID and - * leaves desc->affinity untouched. - */ -static unsigned int -set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg; - unsigned int irq; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return BAD_APICID; - - irq = desc->irq; - cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, mask)) - return BAD_APICID; - - /* check that before desc->addinity get updated */ - set_extra_move_desc(desc, mask); - - cpumask_copy(desc->affinity, mask); - - return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); -} - -static void -set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int dest; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - dest = set_desc_affinity(desc, mask); - if (dest != BAD_APICID) { - /* Only the high 8 bits are valid. */ - dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, cfg); - } - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void -set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - - set_ioapic_affinity_irq_desc(desc, mask); -} -#endif /* CONFIG_SMP */ - /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. We are super @@ -2360,6 +2246,115 @@ static int ioapic_retrigger_irq(unsigned int irq) */ #ifdef CONFIG_SMP +static void send_cleanup_vector(struct irq_cfg *cfg) +{ + cpumask_var_t cleanup_mask; + + if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { + unsigned int i; + cfg->move_cleanup_count = 0; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + cfg->move_cleanup_count++; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + } else { + cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cfg->move_cleanup_count = cpumask_weight(cleanup_mask); + apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + free_cpumask_var(cleanup_mask); + } + cfg->move_in_progress = 0; +} + +static void +__target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + u8 vector = cfg->vector; + + entry = cfg->irq_2_pin; + for (;;) { + unsigned int reg; + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. + */ + if (!irq_remapped(irq)) + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~IO_APIC_REDIR_VECTOR_MASK; + reg |= vector; + io_apic_modify(apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = entry->next; + } +} + +/* + * Either sets desc->affinity to a valid value, and returns + * ->cpu_mask_to_apicid of that, or returns BAD_APICID and + * leaves desc->affinity untouched. + */ +static unsigned int +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned int irq; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return BAD_APICID; + + irq = desc->irq; + cfg = desc->chip_data; + if (assign_irq_vector(irq, cfg, mask)) + return BAD_APICID; + + /* check that before desc->addinity get updated */ + set_extra_move_desc(desc, mask); + + cpumask_copy(desc->affinity, mask); + + return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); +} + +static void +set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int dest; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + dest = set_desc_affinity(desc, mask); + if (dest != BAD_APICID) { + /* Only the high 8 bits are valid. */ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, cfg); + } + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void +set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + set_ioapic_affinity_irq_desc(desc, mask); +} #ifdef CONFIG_INTR_REMAP -- cgit v1.2.3 From 7a734e7dd93b9aea08ed51036a9a0e2c9dfd8dac Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 1 Apr 2009 18:08:28 -0700 Subject: x86, setup: "glove box" BIOS calls -- infrastructure Impact: new interfaces (not yet used) For all the platforms out there, there is an infinite number of buggy BIOSes. This adds infrastructure to treat BIOS interrupts more like toxic waste and "glove box" them -- we switch out the register set, perform the BIOS interrupt, and then restore the previous state. LKML-Reference: <49DE7F79.4030106@zytor.com> Signed-off-by: H. Peter Anvin Cc: Pavel Machek Cc: Rafael J. Wysocki --- arch/x86/kernel/acpi/realmode/Makefile | 2 +- arch/x86/kernel/acpi/realmode/bioscall.S | 1 + arch/x86/kernel/acpi/realmode/regs.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/acpi/realmode/bioscall.S create mode 100644 arch/x86/kernel/acpi/realmode/regs.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile index 1c31cc0e9de..167bc16ce0e 100644 --- a/arch/x86/kernel/acpi/realmode/Makefile +++ b/arch/x86/kernel/acpi/realmode/Makefile @@ -9,7 +9,7 @@ always := wakeup.bin targets := wakeup.elf wakeup.lds -wakeup-y += wakeup.o wakemain.o video-mode.o copy.o +wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o # The link order of the video-*.o modules can matter. In particular, # video-vga.o *must* be listed first, followed by video-vesa.o. diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S new file mode 100644 index 00000000000..f51eb0bb56c --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/bioscall.S @@ -0,0 +1 @@ +#include "../../../boot/bioscall.S" diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c new file mode 100644 index 00000000000..6206033ba20 --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/regs.c @@ -0,0 +1 @@ +#include "../../../boot/regs.c" -- cgit v1.2.3 From bda869c614c937c318547c3ee1d65a316b693c21 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 9 Apr 2009 15:05:10 +0200 Subject: x86: cacheinfo: use L3 cache index disable feature only for CPUs that support it AMD family 0x11 CPU doesn't support the feature. Some AMD family 0x10 CPUs do not support it or have an erratum, see erratum #382 in "Revision Guide for AMD Family 10h Processors, 41322 Rev. 3.40 February 2009". Signed-off-by: Andreas Herrmann CC: Mark Langsdorf Cc: Andrew Morton LKML-Reference: <20090409130510.GG31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 483eda96e10..72401264912 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -291,6 +291,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) { if (index < 3) return; + + if (boot_cpu_data.x86 == 0x11) + return; + + /* see erratum #382 */ + if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) + return; + this_leaf->can_disable = 1; } -- cgit v1.2.3 From 845d8c761ec763871936c62b837c4a9ea6d0fbdb Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 9 Apr 2009 15:07:29 +0200 Subject: x86: cacheinfo: correct return value when cache_disable feature is not active Impact: bug fix If user writes to "cache_disable" attribute on a CPU that does not support this feature, the process hangs due to an invalid return value in store_cache_disable(). Signed-off-by: Andreas Herrmann Cc: Andrew Morton Cc: Mark Langsdorf LKML-Reference: <20090409130729.GH31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 72401264912..1ab46e05adf 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -771,7 +771,7 @@ store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, unsigned int ret, index, val; if (!this_leaf->can_disable) - return 0; + return -EINVAL; if (strlen(buf) > 15) return -EINVAL; -- cgit v1.2.3 From afd9fceec55225d33be878927056a548c2eef26c Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 9 Apr 2009 15:16:17 +0200 Subject: x86: cacheinfo: use cached K8 NB_MISC devices instead of scanning for it Impact: avoid code duplication Signed-off-by: Andreas Herrmann Cc: Andrew Morton Cc: Mark Langsdorf LKML-Reference: <20090409131617.GI31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 37 +++-------------------------------- 1 file changed, 3 insertions(+), 34 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 1ab46e05adf..0cde0715369 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -17,6 +17,7 @@ #include #include +#include #define LVL_1_INST 1 #define LVL_1_DATA 2 @@ -159,14 +160,6 @@ struct _cpuid4_info_regs { unsigned long can_disable; }; -#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS) -static struct pci_device_id k8_nb_id[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, - {} -}; -#endif - unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same @@ -704,30 +697,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) -#ifdef CONFIG_PCI -static struct pci_dev *get_k8_northbridge(int node) -{ - struct pci_dev *dev = NULL; - int i; - - for (i = 0; i <= node; i++) { - do { - dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); - if (!dev) - break; - } while (!pci_match_id(&k8_nb_id[0], dev)); - if (!dev) - break; - } - return dev; -} -#else -static struct pci_dev *get_k8_northbridge(int node) -{ - return NULL; -} -#endif - static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) { const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); @@ -739,7 +708,7 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) if (!this_leaf->can_disable) return sprintf(buf, "Feature not enabled\n"); - dev = get_k8_northbridge(node); + dev = node_to_k8_nb_misc(node); if (!dev) { printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); return -EINVAL; @@ -783,7 +752,7 @@ store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, return -EINVAL; val |= 0xc0000000; - dev = get_k8_northbridge(node); + dev = node_to_k8_nb_misc(node); if (!dev) { printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); return -EINVAL; -- cgit v1.2.3 From f8b201fc7110c3673437254e8ba02451461ece0b Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Thu, 9 Apr 2009 15:18:49 +0200 Subject: x86: cacheinfo: replace sysfs interface for cache_disable feature Impact: replace sysfs attribute Current interface violates against "one-value-per-sysfs-attribute rule". This patch replaces current attribute with two attributes -- one for each L3 Cache Index Disable register. Signed-off-by: Mark Langsdorf Signed-off-by: Andreas Herrmann Cc: Andrew Morton LKML-Reference: <20090409131849.GJ31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 90 +++++++++++++++++------------------ 1 file changed, 45 insertions(+), 45 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 0cde0715369..fc28291e40b 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -697,73 +697,69 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) -static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, + unsigned int index) { - const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); - int node = cpu_to_node(cpumask_first(mask)); - struct pci_dev *dev = NULL; - ssize_t ret = 0; - int i; + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned int reg = 0; if (!this_leaf->can_disable) - return sprintf(buf, "Feature not enabled\n"); - - dev = node_to_k8_nb_misc(node); - if (!dev) { - printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); return -EINVAL; - } - for (i = 0; i < 2; i++) { - unsigned int reg; + if (!dev) + return -EINVAL; - pci_read_config_dword(dev, 0x1BC + i * 4, ®); + pci_read_config_dword(dev, 0x1BC + index * 4, ®); + return sprintf(buf, "%x\n", reg); +} - ret += sprintf(buf, "%sEntry: %d\n", buf, i); - ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", - buf, - reg & 0x80000000 ? "Disabled" : "Allowed", - reg & 0x40000000 ? "Disabled" : "Allowed"); - ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n", - buf, (reg & 0x30000) >> 16, reg & 0xfff); - } - return ret; +#define SHOW_CACHE_DISABLE(index) \ +static ssize_t \ +show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ +{ \ + return show_cache_disable(this_leaf, buf, index); \ } +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) -static ssize_t -store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, - size_t count) +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, unsigned int index) { - const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); - int node = cpu_to_node(cpumask_first(mask)); - struct pci_dev *dev = NULL; - unsigned int ret, index, val; + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned long val = 0; if (!this_leaf->can_disable) return -EINVAL; - if (strlen(buf) > 15) - return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; - ret = sscanf(buf, "%x %x", &index, &val); - if (ret != 2) - return -EINVAL; - if (index > 1) + if (!dev) return -EINVAL; - val |= 0xc0000000; - dev = node_to_k8_nb_misc(node); - if (!dev) { - printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); + if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; - } + val |= 0xc0000000; pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); wbinvd(); pci_write_config_dword(dev, 0x1BC + index * 4, val); + return count; +} - return 1; +#define STORE_CACHE_DISABLE(index) \ +static ssize_t \ +store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ + const char *buf, size_t count) \ +{ \ + return store_cache_disable(this_leaf, buf, count, index); \ } +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) struct _cache_attr { struct attribute attr; @@ -785,7 +781,10 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); +static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, + show_cache_disable_0, store_cache_disable_0); +static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, + show_cache_disable_1, store_cache_disable_1); static struct attribute * default_attrs[] = { &type.attr, @@ -797,7 +796,8 @@ static struct attribute * default_attrs[] = { &size.attr, &shared_cpu_map.attr, &shared_cpu_list.attr, - &cache_disable.attr, + &cache_disable_0.attr, + &cache_disable_1.attr, NULL }; -- cgit v1.2.3 From ba518bea2db21c72d44a6cbfd825b026ef9cdcb6 Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Thu, 9 Apr 2009 15:24:06 +0200 Subject: x86: cacheinfo: disable L3 ECC scrubbing when L3 cache index is disabled (Use correct mask to zero out bits 24-28 by Andreas) Signed-off-by: Mark Langsdorf Signed-off-by: Andreas Herrmann Cc: Andrew Morton LKML-Reference: <20090409132406.GK31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index fc28291e40b..d46a849f44a 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -731,6 +731,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, int node = cpu_to_node(cpu); struct pci_dev *dev = node_to_k8_nb_misc(node); unsigned long val = 0; + unsigned int scrubber = 0; if (!this_leaf->can_disable) return -EINVAL; @@ -745,6 +746,11 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, return -EINVAL; val |= 0xc0000000; + + pci_read_config_dword(dev, 0x58, &scrubber); + scrubber &= ~0x1f000000; + pci_write_config_dword(dev, 0x58, scrubber); + pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); wbinvd(); pci_write_config_dword(dev, 0x1BC + index * 4, val); -- cgit v1.2.3 From f465145235313c451164bdfa9037ac254bf00c9a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:18 +0300 Subject: x86: move x86_quirk_pre_intr_init() to irqinit_32.c Impact: cleanup In preparation for unifying irqinit_{32,64}.c, make x86_quirk_pre_intr_init() local to irqinit_32.c. Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 20 +++++++++++++++++++- arch/x86/kernel/setup.c | 18 ------------------ 2 files changed, 19 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 368b0a8836f..0c0dedccd03 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -53,7 +53,7 @@ static struct irqaction fpu_irq = { .name = "fpu", }; -void __init init_ISA_irqs(void) +static void __init init_ISA_irqs(void) { int i; @@ -121,6 +121,24 @@ int vector_used_by_percpu_irq(unsigned int vector) /* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); +/** + * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors + * + * Description: + * Perform any necessary interrupt initialisation prior to setting up + * the "ordinary" interrupt call gates. For legacy reasons, the ISA + * interrupts should be initialised here if the machine emulates a PC + * in any way. + **/ +static void __init x86_quirk_pre_intr_init(void) +{ + if (x86_quirks->arch_pre_intr_init) { + if (x86_quirks->arch_pre_intr_init()) + return; + } + init_ISA_irqs(); +} + void __init native_init_IRQ(void) { int i; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4158439bf6..523bb697120 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -996,24 +996,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 -/** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. - **/ -void __init x86_quirk_pre_intr_init(void) -{ - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } - init_ISA_irqs(); -} - /** * x86_quirk_intr_init - post gate setup interrupt initialisation * -- cgit v1.2.3 From 7371d9fcb88dc9185be9719f64744a339c537a92 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:19 +0300 Subject: x86: move init_ISA_irqs() in irqinit_32.c to match ordering in irqinit_64.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 48 ++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 0c0dedccd03..c5cb769db7b 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -53,30 +53,6 @@ static struct irqaction fpu_irq = { .name = "fpu", }; -static void __init init_ISA_irqs(void) -{ - int i; - -#ifdef CONFIG_X86_LOCAL_APIC - init_bsp_APIC(); -#endif - init_8259A(0); - - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < NR_IRQS_LEGACY; i++) { - struct irq_desc *desc = irq_to_desc(i); - - desc->status = IRQ_DISABLED; - desc->action = NULL; - desc->depth = 1; - - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} - /* * IRQ2 is cascade interrupt to second interrupt controller */ @@ -118,6 +94,30 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } +static void __init init_ISA_irqs(void) +{ + int i; + +#ifdef CONFIG_X86_LOCAL_APIC + init_bsp_APIC(); +#endif + init_8259A(0); + + /* + * 16 old-style INTA-cycle interrupts: + */ + for (i = 0; i < NR_IRQS_LEGACY; i++) { + struct irq_desc *desc = irq_to_desc(i); + + desc->status = IRQ_DISABLED; + desc->action = NULL; + desc->depth = 1; + + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); + } +} + /* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); -- cgit v1.2.3 From 36290d87f5abf260a543e5b711be4ceed03e6b1a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:20 +0300 Subject: x86: introduce smp_intr_init() in irqinit_32.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 61 ++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index c5cb769db7b..df0aad5a062 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -121,6 +121,38 @@ static void __init init_ISA_irqs(void) /* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); +static void __init smp_intr_init(void) +{ +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. + */ + alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPIs for invalidation */ + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); + + /* IPI for generic function call */ + alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* IPI for single call function */ + alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, + call_function_single_interrupt); + + /* Low priority IPI to cleanup after moving an irq */ + set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); + set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); +#endif +} + /** * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors * @@ -158,34 +190,7 @@ void __init native_init_IRQ(void) } -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPIs for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for single call function */ - alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); - set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); -#endif + smp_intr_init(); #ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ -- cgit v1.2.3 From 22813c45228160b07244a7c4ed7580388ac0f33d Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:21 +0300 Subject: x86: introduce apic_intr_init() in irqinit_32.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index df0aad5a062..9ba68c4557b 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -171,25 +171,8 @@ static void __init x86_quirk_pre_intr_init(void) init_ISA_irqs(); } -void __init native_init_IRQ(void) +static void __init apic_intr_init(void) { - int i; - - /* Execute any quirks before the call gates are initialised: */ - x86_quirk_pre_intr_init(); - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); - } - - smp_intr_init(); #ifdef CONFIG_X86_LOCAL_APIC @@ -208,6 +191,27 @@ void __init native_init_IRQ(void) /* thermal monitor LVT interrupt */ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif +} + +void __init native_init_IRQ(void) +{ + int i; + + /* Execute any quirks before the call gates are initialised: */ + x86_quirk_pre_intr_init(); + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { + /* SYSCALL_VECTOR was reserved in trap_init. */ + if (i != SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); + } + + apic_intr_init(); if (!acpi_ioapic) setup_irq(2, &irq2); -- cgit v1.2.3 From d3496c85cae22fb7713af6ed542a6aeae8ee4210 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:22 +0300 Subject: x86: use identical loop constructs in 32-bit and 64-bit native_init_IRQ() Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 2 +- arch/x86/kernel/irqinit_64.c | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 9ba68c4557b..1029a1855f9 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -205,7 +205,7 @@ void __init native_init_IRQ(void) * us. (some of these will be overridden and become * 'special' SMP interrupts) */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { /* SYSCALL_VECTOR was reserved in trap_init. */ if (i != SYSCALL_VECTOR) set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 8cd10537fd4..1c8858bb27f 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -159,15 +159,16 @@ void __init native_init_IRQ(void) int i; init_ISA_irqs(); + /* * Cover the whole vector space, no vector can escape * us. (some of these will be overridden and become * 'special' SMP interrupts) */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { + /* IA32_SYSCALL_VECTOR was reserved in trap_init. */ + if (i != IA32_SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); } apic_intr_init(); -- cgit v1.2.3 From b0096bb0b640d0a7713618b3472fd0f4adf30a96 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:23 +0300 Subject: x86: unify smp_intr_init() in irqinit_{32,64}.h Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 8 +++++--- arch/x86/kernel/irqinit_64.c | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 1029a1855f9..ef2528d298b 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -123,7 +123,8 @@ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); static void __init smp_intr_init(void) { -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper * IPI, driven by wakeup. @@ -143,14 +144,15 @@ static void __init smp_intr_init(void) /* IPI for generic function call */ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - /* IPI for single call function */ + /* IPI for generic single function call */ alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); + call_function_single_interrupt); /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif +#endif /* CONFIG_SMP */ } /** diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 1c8858bb27f..9e7c57dc79e 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -107,6 +107,7 @@ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); static void __init smp_intr_init(void) { #ifdef CONFIG_SMP +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper * IPI, driven by wakeup. @@ -134,6 +135,7 @@ static void __init smp_intr_init(void) set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif +#endif /* CONFIG_SMP */ } static void __init apic_intr_init(void) -- cgit v1.2.3 From 598c73d250ffb112715aa48fb325d79e255be23b Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:24 +0300 Subject: x86: unify init_ISA_irqs() in irqinit_{32,64}.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 2 +- arch/x86/kernel/irqinit_64.c | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index ef2528d298b..4488b713396 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -98,7 +98,7 @@ static void __init init_ISA_irqs(void) { int i; -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) init_bsp_APIC(); #endif init_8259A(0); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 9e7c57dc79e..61c9a922e80 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -84,9 +84,14 @@ static void __init init_ISA_irqs(void) { int i; +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) init_bsp_APIC(); +#endif init_8259A(0); + /* + * 16 old-style INTA-cycle interrupts: + */ for (i = 0; i < NR_IRQS_LEGACY; i++) { struct irq_desc *desc = irq_to_desc(i); @@ -94,11 +99,8 @@ static void __init init_ISA_irqs(void) desc->action = NULL; desc->depth = 1; - /* - * 16 old-style INTA-cycle interrupts: - */ set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); + handle_level_irq, "XT"); } } -- cgit v1.2.3 From 320fd99672a44ece6d1cd0d838ba31c8ebbf5979 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:25 +0300 Subject: x86: unify native_init_IRQ() in irqinit_{32,64}.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 53 ++++++++++++++++++---------- arch/x86/kernel/irqinit_64.c | 82 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 115 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 4488b713396..a780de3ad5d 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -22,7 +22,7 @@ #include #include - +#ifdef CONFIG_X86_32 /* * Note that on a 486, we don't want to do a SIGFPE on an irq13 * as the irq is unreliable, and exception 16 works correctly @@ -52,6 +52,7 @@ static struct irqaction fpu_irq = { .handler = math_error_irq, .name = "fpu", }; +#endif /* * IRQ2 is cascade interrupt to second interrupt controller @@ -155,24 +156,6 @@ static void __init smp_intr_init(void) #endif /* CONFIG_SMP */ } -/** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. - **/ -static void __init x86_quirk_pre_intr_init(void) -{ - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } - init_ISA_irqs(); -} - static void __init apic_intr_init(void) { smp_intr_init(); @@ -195,12 +178,36 @@ static void __init apic_intr_init(void) #endif } +#ifdef CONFIG_X86_32 +/** + * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors + * + * Description: + * Perform any necessary interrupt initialisation prior to setting up + * the "ordinary" interrupt call gates. For legacy reasons, the ISA + * interrupts should be initialised here if the machine emulates a PC + * in any way. + **/ +static void __init x86_quirk_pre_intr_init(void) +{ + if (x86_quirks->arch_pre_intr_init) { + if (x86_quirks->arch_pre_intr_init()) + return; + } + init_ISA_irqs(); +} +#endif + void __init native_init_IRQ(void) { int i; +#ifdef CONFIG_X86_32 /* Execute any quirks before the call gates are initialised: */ x86_quirk_pre_intr_init(); +#else + init_ISA_irqs(); +#endif /* * Cover the whole vector space, no vector can escape @@ -208,9 +215,15 @@ void __init native_init_IRQ(void) * 'special' SMP interrupts) */ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { +#ifdef CONFIG_X86_32 /* SYSCALL_VECTOR was reserved in trap_init. */ if (i != SYSCALL_VECTOR) set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +#else + /* IA32_SYSCALL_VECTOR was reserved in trap_init. */ + if (i != IA32_SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +#endif } apic_intr_init(); @@ -218,6 +231,7 @@ void __init native_init_IRQ(void) if (!acpi_ioapic) setup_irq(2, &irq2); +#ifdef CONFIG_X86_32 /* * Call quirks after call gates are initialised (usually add in * the architecture specific gates): @@ -232,4 +246,5 @@ void __init native_init_IRQ(void) setup_irq(FPU_IRQ, &fpu_irq); irq_ctx_init(smp_processor_id()); +#endif } diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 61c9a922e80..ed50e35ce97 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -39,14 +39,46 @@ * (these are usually mapped into the 0x30-0xff vector range) */ +#ifdef CONFIG_X86_32 /* - * IRQ2 is cascade interrupt to second interrupt controller + * Note that on a 486, we don't want to do a SIGFPE on an irq13 + * as the irq is unreliable, and exception 16 works correctly + * (ie as explained in the intel literature). On a 386, you + * can't use exception 16 due to bad IBM design, so we have to + * rely on the less exact irq13. + * + * Careful.. Not only is IRQ13 unreliable, but it is also + * leads to races. IBM designers who came up with it should + * be shot. + */ + +static irqreturn_t math_error_irq(int cpl, void *dev_id) +{ + outb(0, 0xF0); + if (ignore_fpu_irq || !boot_cpu_data.hard_math) + return IRQ_NONE; + math_error((void __user *)get_irq_regs()->ip); + return IRQ_HANDLED; +} + +/* + * New motherboards sometimes make IRQ 13 be a PCI interrupt, + * so allow interrupt sharing. */ +static struct irqaction fpu_irq = { + .handler = math_error_irq, + .name = "fpu", +}; +#endif +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ static struct irqaction irq2 = { .handler = no_action, .name = "cascade", }; + DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [0 ... IRQ0_VECTOR - 1] = -1, [IRQ0_VECTOR] = 0, @@ -158,11 +190,36 @@ static void __init apic_intr_init(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); } +#ifdef CONFIG_X86_32 +/** + * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors + * + * Description: + * Perform any necessary interrupt initialisation prior to setting up + * the "ordinary" interrupt call gates. For legacy reasons, the ISA + * interrupts should be initialised here if the machine emulates a PC + * in any way. + **/ +static void __init x86_quirk_pre_intr_init(void) +{ + if (x86_quirks->arch_pre_intr_init) { + if (x86_quirks->arch_pre_intr_init()) + return; + } + init_ISA_irqs(); +} +#endif + void __init native_init_IRQ(void) { int i; +#ifdef CONFIG_X86_32 + /* Execute any quirks before the call gates are initialised: */ + x86_quirk_pre_intr_init(); +#else init_ISA_irqs(); +#endif /* * Cover the whole vector space, no vector can escape @@ -170,13 +227,36 @@ void __init native_init_IRQ(void) * 'special' SMP interrupts) */ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { +#ifdef CONFIG_X86_32 + /* SYSCALL_VECTOR was reserved in trap_init. */ + if (i != SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +#else /* IA32_SYSCALL_VECTOR was reserved in trap_init. */ if (i != IA32_SYSCALL_VECTOR) set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +#endif } apic_intr_init(); if (!acpi_ioapic) setup_irq(2, &irq2); + +#ifdef CONFIG_X86_32 + /* + * Call quirks after call gates are initialised (usually add in + * the architecture specific gates): + */ + x86_quirk_intr_init(); + + /* + * External FPU? Set up irq13 if so, for + * original braindamaged IBM FERR coupling. + */ + if (boot_cpu_data.hard_math && !cpu_has_fpu) + setup_irq(FPU_IRQ, &fpu_irq); + + irq_ctx_init(smp_processor_id()); +#endif } -- cgit v1.2.3 From 778838600eb6973bdb6fd11e7f91b43cea4d6f45 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:26 +0300 Subject: x86: unify trivial differences in irqinit_{32,64}.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 20 ++++++++++++++++++++ arch/x86/kernel/irqinit_64.c | 4 ++++ 2 files changed, 24 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index a780de3ad5d..72ce94268d3 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -1,20 +1,24 @@ +#include #include #include #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -22,6 +26,22 @@ #include #include +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x30-0x3f) + */ + +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ + #ifdef CONFIG_X86_32 /* * Note that on a 486, we don't want to do a SIGFPE on an irq13 diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index ed50e35ce97..687b6c33cd7 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -17,11 +17,14 @@ #include #include +#include #include #include #include #include +#include #include +#include /* * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: @@ -136,6 +139,7 @@ static void __init init_ISA_irqs(void) } } +/* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); static void __init smp_intr_init(void) -- cgit v1.2.3 From ab19c25abd14db28d7454f00805ea59f22ed6057 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:27 +0300 Subject: x86: unify apic_intr_init() in irqinit_{32,64}.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 9 ++++++++- arch/x86/kernel/irqinit_64.c | 11 +++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 72ce94268d3..f3be5e97427 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -180,7 +180,12 @@ static void __init apic_intr_init(void) { smp_intr_init(); -#ifdef CONFIG_X86_LOCAL_APIC +#ifdef CONFIG_X86_64 + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); +#endif + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -192,10 +197,12 @@ static void __init apic_intr_init(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); #endif +#ifdef CONFIG_X86_32 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) /* thermal monitor LVT interrupt */ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif +#endif } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 687b6c33cd7..f3be5e97427 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -180,9 +180,12 @@ static void __init apic_intr_init(void) { smp_intr_init(); +#ifdef CONFIG_X86_64 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); +#endif +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -192,6 +195,14 @@ static void __init apic_intr_init(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +#endif + +#ifdef CONFIG_X86_32 +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) + /* thermal monitor LVT interrupt */ + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +#endif +#endif } #ifdef CONFIG_X86_32 -- cgit v1.2.3 From 31cb45ef2600d47191d51253ec94b5e3f689260d Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:28 +0300 Subject: x86: unify irqinit_{32,64}.c into irqinit.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/irqinit.c | 277 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/irqinit_32.c | 277 ------------------------------------------- arch/x86/kernel/irqinit_64.c | 277 ------------------------------------------- 4 files changed, 278 insertions(+), 555 deletions(-) create mode 100644 arch/x86/kernel/irqinit.c delete mode 100644 arch/x86/kernel/irqinit_32.c delete mode 100644 arch/x86/kernel/irqinit_64.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 145cce75cda..16e3acfe19e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -28,7 +28,7 @@ CFLAGS_paravirt.o := $(nostackp) obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o -obj-y += setup.o i8259.o irqinit_$(BITS).o +obj-y += setup.o i8259.o irqinit.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c new file mode 100644 index 00000000000..f3be5e97427 --- /dev/null +++ b/arch/x86/kernel/irqinit.c @@ -0,0 +1,277 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x30-0x3f) + */ + +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ + +#ifdef CONFIG_X86_32 +/* + * Note that on a 486, we don't want to do a SIGFPE on an irq13 + * as the irq is unreliable, and exception 16 works correctly + * (ie as explained in the intel literature). On a 386, you + * can't use exception 16 due to bad IBM design, so we have to + * rely on the less exact irq13. + * + * Careful.. Not only is IRQ13 unreliable, but it is also + * leads to races. IBM designers who came up with it should + * be shot. + */ + +static irqreturn_t math_error_irq(int cpl, void *dev_id) +{ + outb(0, 0xF0); + if (ignore_fpu_irq || !boot_cpu_data.hard_math) + return IRQ_NONE; + math_error((void __user *)get_irq_regs()->ip); + return IRQ_HANDLED; +} + +/* + * New motherboards sometimes make IRQ 13 be a PCI interrupt, + * so allow interrupt sharing. + */ +static struct irqaction fpu_irq = { + .handler = math_error_irq, + .name = "fpu", +}; +#endif + +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ +static struct irqaction irq2 = { + .handler = no_action, + .name = "cascade", +}; + +DEFINE_PER_CPU(vector_irq_t, vector_irq) = { + [0 ... IRQ0_VECTOR - 1] = -1, + [IRQ0_VECTOR] = 0, + [IRQ1_VECTOR] = 1, + [IRQ2_VECTOR] = 2, + [IRQ3_VECTOR] = 3, + [IRQ4_VECTOR] = 4, + [IRQ5_VECTOR] = 5, + [IRQ6_VECTOR] = 6, + [IRQ7_VECTOR] = 7, + [IRQ8_VECTOR] = 8, + [IRQ9_VECTOR] = 9, + [IRQ10_VECTOR] = 10, + [IRQ11_VECTOR] = 11, + [IRQ12_VECTOR] = 12, + [IRQ13_VECTOR] = 13, + [IRQ14_VECTOR] = 14, + [IRQ15_VECTOR] = 15, + [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 +}; + +int vector_used_by_percpu_irq(unsigned int vector) +{ + int cpu; + + for_each_online_cpu(cpu) { + if (per_cpu(vector_irq, cpu)[vector] != -1) + return 1; + } + + return 0; +} + +static void __init init_ISA_irqs(void) +{ + int i; + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) + init_bsp_APIC(); +#endif + init_8259A(0); + + /* + * 16 old-style INTA-cycle interrupts: + */ + for (i = 0; i < NR_IRQS_LEGACY; i++) { + struct irq_desc *desc = irq_to_desc(i); + + desc->status = IRQ_DISABLED; + desc->action = NULL; + desc->depth = 1; + + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); + } +} + +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +static void __init smp_intr_init(void) +{ +#ifdef CONFIG_SMP +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. + */ + alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPIs for invalidation */ + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); + + /* IPI for generic function call */ + alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* IPI for generic single function call */ + alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, + call_function_single_interrupt); + + /* Low priority IPI to cleanup after moving an irq */ + set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); + set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); +#endif +#endif /* CONFIG_SMP */ +} + +static void __init apic_intr_init(void) +{ + smp_intr_init(); + +#ifdef CONFIG_X86_64 + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); +#endif + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) + /* self generated IPI for local APIC timer */ + alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + + /* generic IPI for platform specific use */ + alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); + + /* IPI vectors for APIC spurious and error interrupts */ + alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +#endif + +#ifdef CONFIG_X86_32 +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) + /* thermal monitor LVT interrupt */ + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +#endif +#endif +} + +#ifdef CONFIG_X86_32 +/** + * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors + * + * Description: + * Perform any necessary interrupt initialisation prior to setting up + * the "ordinary" interrupt call gates. For legacy reasons, the ISA + * interrupts should be initialised here if the machine emulates a PC + * in any way. + **/ +static void __init x86_quirk_pre_intr_init(void) +{ + if (x86_quirks->arch_pre_intr_init) { + if (x86_quirks->arch_pre_intr_init()) + return; + } + init_ISA_irqs(); +} +#endif + +void __init native_init_IRQ(void) +{ + int i; + +#ifdef CONFIG_X86_32 + /* Execute any quirks before the call gates are initialised: */ + x86_quirk_pre_intr_init(); +#else + init_ISA_irqs(); +#endif + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { +#ifdef CONFIG_X86_32 + /* SYSCALL_VECTOR was reserved in trap_init. */ + if (i != SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +#else + /* IA32_SYSCALL_VECTOR was reserved in trap_init. */ + if (i != IA32_SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +#endif + } + + apic_intr_init(); + + if (!acpi_ioapic) + setup_irq(2, &irq2); + +#ifdef CONFIG_X86_32 + /* + * Call quirks after call gates are initialised (usually add in + * the architecture specific gates): + */ + x86_quirk_intr_init(); + + /* + * External FPU? Set up irq13 if so, for + * original braindamaged IBM FERR coupling. + */ + if (boot_cpu_data.hard_math && !cpu_has_fpu) + setup_irq(FPU_IRQ, &fpu_irq); + + irq_ctx_init(smp_processor_id()); +#endif +} diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c deleted file mode 100644 index f3be5e97427..00000000000 --- a/arch/x86/kernel/irqinit_32.c +++ /dev/null @@ -1,277 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: - * (these are usually mapped to vectors 0x30-0x3f) - */ - -/* - * The IO-APIC gives us many more interrupt sources. Most of these - * are unused but an SMP system is supposed to have enough memory ... - * sometimes (mostly wrt. hw bugs) we get corrupted vectors all - * across the spectrum, so we really want to be prepared to get all - * of these. Plus, more powerful systems might have more than 64 - * IO-APIC registers. - * - * (these are usually mapped into the 0x30-0xff vector range) - */ - -#ifdef CONFIG_X86_32 -/* - * Note that on a 486, we don't want to do a SIGFPE on an irq13 - * as the irq is unreliable, and exception 16 works correctly - * (ie as explained in the intel literature). On a 386, you - * can't use exception 16 due to bad IBM design, so we have to - * rely on the less exact irq13. - * - * Careful.. Not only is IRQ13 unreliable, but it is also - * leads to races. IBM designers who came up with it should - * be shot. - */ - -static irqreturn_t math_error_irq(int cpl, void *dev_id) -{ - outb(0, 0xF0); - if (ignore_fpu_irq || !boot_cpu_data.hard_math) - return IRQ_NONE; - math_error((void __user *)get_irq_regs()->ip); - return IRQ_HANDLED; -} - -/* - * New motherboards sometimes make IRQ 13 be a PCI interrupt, - * so allow interrupt sharing. - */ -static struct irqaction fpu_irq = { - .handler = math_error_irq, - .name = "fpu", -}; -#endif - -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ -static struct irqaction irq2 = { - .handler = no_action, - .name = "cascade", -}; - -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... IRQ0_VECTOR - 1] = -1, - [IRQ0_VECTOR] = 0, - [IRQ1_VECTOR] = 1, - [IRQ2_VECTOR] = 2, - [IRQ3_VECTOR] = 3, - [IRQ4_VECTOR] = 4, - [IRQ5_VECTOR] = 5, - [IRQ6_VECTOR] = 6, - [IRQ7_VECTOR] = 7, - [IRQ8_VECTOR] = 8, - [IRQ9_VECTOR] = 9, - [IRQ10_VECTOR] = 10, - [IRQ11_VECTOR] = 11, - [IRQ12_VECTOR] = 12, - [IRQ13_VECTOR] = 13, - [IRQ14_VECTOR] = 14, - [IRQ15_VECTOR] = 15, - [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 -}; - -int vector_used_by_percpu_irq(unsigned int vector) -{ - int cpu; - - for_each_online_cpu(cpu) { - if (per_cpu(vector_irq, cpu)[vector] != -1) - return 1; - } - - return 0; -} - -static void __init init_ISA_irqs(void) -{ - int i; - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) - init_bsp_APIC(); -#endif - init_8259A(0); - - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < NR_IRQS_LEGACY; i++) { - struct irq_desc *desc = irq_to_desc(i); - - desc->status = IRQ_DISABLED; - desc->action = NULL; - desc->depth = 1; - - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} - -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -static void __init smp_intr_init(void) -{ -#ifdef CONFIG_SMP -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPIs for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for generic single function call */ - alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); - set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); -#endif -#endif /* CONFIG_SMP */ -} - -static void __init apic_intr_init(void) -{ - smp_intr_init(); - -#ifdef CONFIG_X86_64 - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); - alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); -#endif - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) - /* self generated IPI for local APIC timer */ - alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* generic IPI for platform specific use */ - alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); -#endif - -#ifdef CONFIG_X86_32 -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) - /* thermal monitor LVT interrupt */ - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -#endif -#endif -} - -#ifdef CONFIG_X86_32 -/** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. - **/ -static void __init x86_quirk_pre_intr_init(void) -{ - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } - init_ISA_irqs(); -} -#endif - -void __init native_init_IRQ(void) -{ - int i; - -#ifdef CONFIG_X86_32 - /* Execute any quirks before the call gates are initialised: */ - x86_quirk_pre_intr_init(); -#else - init_ISA_irqs(); -#endif - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { -#ifdef CONFIG_X86_32 - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); -#else - /* IA32_SYSCALL_VECTOR was reserved in trap_init. */ - if (i != IA32_SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); -#endif - } - - apic_intr_init(); - - if (!acpi_ioapic) - setup_irq(2, &irq2); - -#ifdef CONFIG_X86_32 - /* - * Call quirks after call gates are initialised (usually add in - * the architecture specific gates): - */ - x86_quirk_intr_init(); - - /* - * External FPU? Set up irq13 if so, for - * original braindamaged IBM FERR coupling. - */ - if (boot_cpu_data.hard_math && !cpu_has_fpu) - setup_irq(FPU_IRQ, &fpu_irq); - - irq_ctx_init(smp_processor_id()); -#endif -} diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c deleted file mode 100644 index f3be5e97427..00000000000 --- a/arch/x86/kernel/irqinit_64.c +++ /dev/null @@ -1,277 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: - * (these are usually mapped to vectors 0x30-0x3f) - */ - -/* - * The IO-APIC gives us many more interrupt sources. Most of these - * are unused but an SMP system is supposed to have enough memory ... - * sometimes (mostly wrt. hw bugs) we get corrupted vectors all - * across the spectrum, so we really want to be prepared to get all - * of these. Plus, more powerful systems might have more than 64 - * IO-APIC registers. - * - * (these are usually mapped into the 0x30-0xff vector range) - */ - -#ifdef CONFIG_X86_32 -/* - * Note that on a 486, we don't want to do a SIGFPE on an irq13 - * as the irq is unreliable, and exception 16 works correctly - * (ie as explained in the intel literature). On a 386, you - * can't use exception 16 due to bad IBM design, so we have to - * rely on the less exact irq13. - * - * Careful.. Not only is IRQ13 unreliable, but it is also - * leads to races. IBM designers who came up with it should - * be shot. - */ - -static irqreturn_t math_error_irq(int cpl, void *dev_id) -{ - outb(0, 0xF0); - if (ignore_fpu_irq || !boot_cpu_data.hard_math) - return IRQ_NONE; - math_error((void __user *)get_irq_regs()->ip); - return IRQ_HANDLED; -} - -/* - * New motherboards sometimes make IRQ 13 be a PCI interrupt, - * so allow interrupt sharing. - */ -static struct irqaction fpu_irq = { - .handler = math_error_irq, - .name = "fpu", -}; -#endif - -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ -static struct irqaction irq2 = { - .handler = no_action, - .name = "cascade", -}; - -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... IRQ0_VECTOR - 1] = -1, - [IRQ0_VECTOR] = 0, - [IRQ1_VECTOR] = 1, - [IRQ2_VECTOR] = 2, - [IRQ3_VECTOR] = 3, - [IRQ4_VECTOR] = 4, - [IRQ5_VECTOR] = 5, - [IRQ6_VECTOR] = 6, - [IRQ7_VECTOR] = 7, - [IRQ8_VECTOR] = 8, - [IRQ9_VECTOR] = 9, - [IRQ10_VECTOR] = 10, - [IRQ11_VECTOR] = 11, - [IRQ12_VECTOR] = 12, - [IRQ13_VECTOR] = 13, - [IRQ14_VECTOR] = 14, - [IRQ15_VECTOR] = 15, - [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 -}; - -int vector_used_by_percpu_irq(unsigned int vector) -{ - int cpu; - - for_each_online_cpu(cpu) { - if (per_cpu(vector_irq, cpu)[vector] != -1) - return 1; - } - - return 0; -} - -static void __init init_ISA_irqs(void) -{ - int i; - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) - init_bsp_APIC(); -#endif - init_8259A(0); - - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < NR_IRQS_LEGACY; i++) { - struct irq_desc *desc = irq_to_desc(i); - - desc->status = IRQ_DISABLED; - desc->action = NULL; - desc->depth = 1; - - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} - -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -static void __init smp_intr_init(void) -{ -#ifdef CONFIG_SMP -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPIs for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for generic single function call */ - alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); - set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); -#endif -#endif /* CONFIG_SMP */ -} - -static void __init apic_intr_init(void) -{ - smp_intr_init(); - -#ifdef CONFIG_X86_64 - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); - alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); -#endif - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) - /* self generated IPI for local APIC timer */ - alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* generic IPI for platform specific use */ - alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); -#endif - -#ifdef CONFIG_X86_32 -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) - /* thermal monitor LVT interrupt */ - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -#endif -#endif -} - -#ifdef CONFIG_X86_32 -/** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. - **/ -static void __init x86_quirk_pre_intr_init(void) -{ - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } - init_ISA_irqs(); -} -#endif - -void __init native_init_IRQ(void) -{ - int i; - -#ifdef CONFIG_X86_32 - /* Execute any quirks before the call gates are initialised: */ - x86_quirk_pre_intr_init(); -#else - init_ISA_irqs(); -#endif - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { -#ifdef CONFIG_X86_32 - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); -#else - /* IA32_SYSCALL_VECTOR was reserved in trap_init. */ - if (i != IA32_SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); -#endif - } - - apic_intr_init(); - - if (!acpi_ioapic) - setup_irq(2, &irq2); - -#ifdef CONFIG_X86_32 - /* - * Call quirks after call gates are initialised (usually add in - * the architecture specific gates): - */ - x86_quirk_intr_init(); - - /* - * External FPU? Set up irq13 if so, for - * original braindamaged IBM FERR coupling. - */ - if (boot_cpu_data.hard_math && !cpu_has_fpu) - setup_irq(FPU_IRQ, &fpu_irq); - - irq_ctx_init(smp_processor_id()); -#endif -} -- cgit v1.2.3 From ac3048dfd4740becf8d768844cf47ebee363c9f8 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:29 +0300 Subject: x86: define IA32_SYSCALL_VECTOR on 32-bit to reduce ifdefs Impact: cleanup We can remove some #ifdefs if we define IA32_SYSCALL_VECTOR on 32-bit. Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit.c | 6 ------ arch/x86/kernel/traps.c | 5 +---- 2 files changed, 1 insertion(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index f3be5e97427..f2c60a59f47 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -242,15 +242,9 @@ void __init native_init_IRQ(void) * 'special' SMP interrupts) */ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { -#ifdef CONFIG_X86_32 - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); -#else /* IA32_SYSCALL_VECTOR was reserved in trap_init. */ if (i != IA32_SYSCALL_VECTOR) set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); -#endif } apic_intr_init(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a1d288327ff..2310700faca 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -969,11 +969,8 @@ void __init trap_init(void) for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) set_bit(i, used_vectors); -#ifdef CONFIG_X86_64 set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#else - set_bit(SYSCALL_VECTOR, used_vectors); -#endif + /* * Should be a barrier for any external CPU state: */ -- cgit v1.2.3 From abdb5a5713330e17dfe91ab0d3e29c4744d95162 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:30 +0300 Subject: x86: remove some ifdefs from native_init_IRQ() Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index f2c60a59f47..626977200a5 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -205,7 +205,6 @@ static void __init apic_intr_init(void) #endif } -#ifdef CONFIG_X86_32 /** * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors * @@ -217,24 +216,21 @@ static void __init apic_intr_init(void) **/ static void __init x86_quirk_pre_intr_init(void) { +#ifdef CONFIG_X86_32 if (x86_quirks->arch_pre_intr_init) { if (x86_quirks->arch_pre_intr_init()) return; } +#endif init_ISA_irqs(); } -#endif void __init native_init_IRQ(void) { int i; -#ifdef CONFIG_X86_32 /* Execute any quirks before the call gates are initialised: */ x86_quirk_pre_intr_init(); -#else - init_ISA_irqs(); -#endif /* * Cover the whole vector space, no vector can escape -- cgit v1.2.3 From 6265ff19ca08df0d96c859ae5e4dc2d9ad07070e Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 9 Apr 2009 15:47:10 +0200 Subject: x86: cacheinfo: complete L2/L3 Cache and TLB associativity field definitions See "CPUID Specification" (AMD Publication #: 25481, Rev. 2.28, April 2008) Signed-off-by: Andreas Herrmann Cc: Mark Langsdorf LKML-Reference: <20090409134710.GA8026@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index d46a849f44a..789efe217e1 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -200,10 +200,17 @@ union l3_cache { }; static const unsigned short __cpuinitconst assocs[] = { - [1] = 1, [2] = 2, [4] = 4, [6] = 8, - [8] = 16, [0xa] = 32, [0xb] = 48, + [1] = 1, + [2] = 2, + [4] = 4, + [6] = 8, + [8] = 16, + [0xa] = 32, + [0xb] = 48, [0xc] = 64, - [0xf] = 0xffff // ?? + [0xd] = 96, + [0xe] = 128, + [0xf] = 0xffff /* fully associative - no way to show this currently */ }; static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; @@ -264,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, eax->split.type = types[leaf]; eax->split.level = levels[leaf]; if (leaf == 3) - eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; + eax->split.num_threads_sharing = + current_cpu_data.x86_max_cores - 1; else eax->split.num_threads_sharing = 0; eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; -- cgit v1.2.3 From 47f16ca7631f9c6bad8e6d968cfb1433029b09ec Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 10 Apr 2009 14:58:05 +0200 Subject: x86, irqinit: preempt merge conflicts To make the topic merge life easier for tip:perfcounters/core, include two (inactive in this topic) IRQ vector initializations here. Also fix build bug - missing kprobes.h inclusion. Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 626977200a5..b424c32c4a0 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -195,6 +196,13 @@ static void __init apic_intr_init(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + /* Performance monitoring interrupts: */ +# ifdef CONFIG_PERF_COUNTERS + alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); + alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); +# endif + #endif #ifdef CONFIG_X86_32 -- cgit v1.2.3 From 2de1f33e99cec5fd79542a1d0e26efb9c36a98bb Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 11 Apr 2009 12:55:26 +0530 Subject: x86: apic/x2apic_cluster.c x86_cpu_to_logical_apicid should be static Impact: reduce kernel size a bit, address sparse warning Addresses the problem pointed out by this sparse warning: arch/x86/kernel/apic/x2apic_cluster.c:13:1: warning: symbol 'per_cpu__x86_cpu_to_logical_apicid' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Rajput Cc: Suresh Siddha LKML-Reference: <1239434726.4418.24.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 4a903e2f0d1..8e4cbb255c3 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -10,7 +10,7 @@ #include #include -DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { -- cgit v1.2.3 From 2c1b284e4fa260fd922b9a65c99169e2630c6862 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 11 Apr 2009 00:03:10 +0530 Subject: x86: clean up declarations and variables Impact: cleanup, no code changed - syscalls.h update declarations due to unifications - irq.c declare smp_generic_interrupt() before it gets used - process.c declare sys_fork() and sys_vfork() before they get used - tsc.c rename tsc_khz shadowed variable - apic/probe_32.c declare apic_default before it gets used - apic/nmi.c prev_nmi_count should be unsigned - apic/io_apic.c declare smp_irq_move_cleanup_interrupt() before it gets used - mm/init.c declare direct_gbpages and free_initrd_mem before they get used Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 1 + arch/x86/kernel/apic/nmi.c | 2 +- arch/x86/kernel/apic/probe_32.c | 1 - arch/x86/kernel/irq.c | 1 + arch/x86/kernel/process.c | 1 + arch/x86/kernel/tsc.c | 8 ++++---- 6 files changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 767fe7e46d6..870c92ddaf9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index d6bd6240715..02056310f2f 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data) } #endif -static void report_broken_nmi(int cpu, int *prev_nmi_count) +static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count) { printk(KERN_CONT "\n"); diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 01eda2ac65e..440a8bccd91 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -160,7 +160,6 @@ extern struct apic apic_summit; extern struct apic apic_bigsmp; extern struct apic apic_es7000; extern struct apic apic_es7000_cluster; -extern struct apic apic_default; struct apic *apic = &apic_default; EXPORT_SYMBOL_GPL(apic); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3aaf7b9e3a8..2188267f523 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -12,6 +12,7 @@ #include #include #include +#include atomic_t irq_err_count; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ca989158e84..3e21e38d7e3 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 7a567ebe636..a8dc0d00b83 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -384,13 +384,13 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate, tsc_khz; + unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; int hpet = is_hpet_enabled(), i, loopmin; - tsc_khz = get_hypervisor_tsc_freq(); - if (tsc_khz) { + hv_tsc_khz = get_hypervisor_tsc_freq(); + if (hv_tsc_khz) { printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); - return tsc_khz; + return hv_tsc_khz; } local_irq_save(flags); -- cgit v1.2.3 From edea7148a87c099e5d5d4838285cc27e459588b7 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 12 Apr 2009 20:47:39 +0400 Subject: x86: irq.c - tiny cleanup Impact: cleanup, robustization 1) guard ack_bad_irq with printk_ratelimit since there is no guarantee we will not be flooded one day 2) use pr_emerg() helper Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090412165058.277579847@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3aaf7b9e3a8..6603492e8b7 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -24,7 +24,8 @@ void (*generic_interrupt_extension)(void) = NULL; */ void ack_bad_irq(unsigned int irq) { - printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); + if (printk_ratelimit()) + pr_err("unexpected IRQ trap at vector %02x\n", irq); #ifdef CONFIG_X86_LOCAL_APIC /* @@ -178,7 +179,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_thermal_count; # ifdef CONFIG_X86_64 sum += irq_stats(cpu)->irq_threshold_count; -#endif +# endif #endif return sum; } @@ -219,8 +220,8 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) #endif if (printk_ratelimit()) - printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n", - __func__, smp_processor_id(), vector, irq); + pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", + __func__, smp_processor_id(), vector, irq); } irq_exit(); -- cgit v1.2.3 From c0eaa4536f08b98fbcfa7fce5b7b0de1bebcb0e1 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 12 Apr 2009 20:47:40 +0400 Subject: x86: apic - introduce imcr_ helpers Impact: cleanup Distinguish port writting magic into helpers with comments. Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090412165058.535921550@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 098ec84b8c0..c3be10f5773 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -98,6 +98,29 @@ early_param("lapic", parse_lapic); /* Local APIC was disabled by the BIOS and enabled by the kernel */ static int enabled_via_apicbase; +/* + * Handle interrupt mode configuration register (IMCR). + * This register controls whether the interrupt signals + * that reach the BSP come from the master PIC or from the + * local APIC. Before entering Symmetric I/O Mode, either + * the BIOS or the operating system must switch out of + * PIC Mode by changing the IMCR. + */ +static inline imcr_pic_to_apic(void) +{ + /* select IMCR register */ + outb(0x70, 0x22); + /* NMI and 8259 INTR go through APIC */ + outb(0x01, 0x23); +} + +static inline imcr_apic_to_pic(void) +{ + /* select IMCR register */ + outb(0x70, 0x22); + /* NMI and 8259 INTR go directly to BSP */ + outb(0x00, 0x23); +} #endif #ifdef CONFIG_X86_64 @@ -1727,8 +1750,7 @@ void __init connect_bsp_APIC(void) */ apic_printk(APIC_VERBOSE, "leaving PIC mode, " "enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); + imcr_pic_to_apic(); } #endif if (apic->enable_apic_mode) @@ -1756,8 +1778,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) */ apic_printk(APIC_VERBOSE, "disabling APIC mode, " "entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); + imcr_apic_to_pic(); return; } #endif -- cgit v1.2.3 From 08306ce61d6848e6fbf74fa4cc693c3fb29e943f Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 12 Apr 2009 20:47:41 +0400 Subject: x86: apic - introduce dummy apic operations Impact: refactor, speed up and robustize code In case if apic was disabled by kernel option or by hardware limits we can use dummy operations in apic->write to simplify the ack_APIC_irq() code. At the lame time the patch fixes the missed EOI in do_IRQ function (which has place if kernel is compiled as X86-32 and interrupt without handler happens where apic was not asked to be disabled via kernel option). Note that native_apic_write_dummy() consists of WARN_ON_ONCE to catch any buggy writes on enabled APICs. Could be removed after some time of testing. Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090412165058.724788431@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 24 ++++++++++++++++++++++++ arch/x86/kernel/irq.c | 10 ++-------- 2 files changed, 26 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index c3be10f5773..9b849d4957d 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -232,6 +232,24 @@ static int modern_apic(void) return lapic_get_version() >= 0x14; } +/* + * bare function to substitute write operation + * and it's _that_ fast :) + */ +void native_apic_write_dummy(u32 reg, u32 v) +{ + WARN_ON_ONCE((cpu_has_apic || !disable_apic)); +} + +/* + * right after this call apic->write doesn't do anything + * note that there is no restore operation it works one way + */ +void apic_disable(void) +{ + apic->write = native_apic_write_dummy; +} + void native_apic_wait_icr_idle(void) { while (apic_read(APIC_ICR) & APIC_ICR_BUSY) @@ -1582,6 +1600,12 @@ void __init init_apic_mappings(void) */ if (boot_cpu_physical_apicid == -1U) boot_cpu_physical_apicid = read_apic_id(); + + /* lets check if we may to NOP'ify apic operations */ + if (!cpu_has_apic) { + pr_info("APIC: disable apic facility\n"); + apic_disable(); + } } /* diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 6603492e8b7..fd57bf35d0f 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -27,7 +27,6 @@ void ack_bad_irq(unsigned int irq) if (printk_ratelimit()) pr_err("unexpected IRQ trap at vector %02x\n", irq); -#ifdef CONFIG_X86_LOCAL_APIC /* * Currently unexpected vectors happen only on SMP and APIC. * We _must_ ack these because every local APIC has only N @@ -37,9 +36,7 @@ void ack_bad_irq(unsigned int irq) * completely. * But only ack when the APIC is enabled -AK */ - if (cpu_has_apic) - ack_APIC_irq(); -#endif + ack_APIC_irq(); } #define irq_stats(x) (&per_cpu(irq_stat, x)) @@ -214,10 +211,7 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) irq = __get_cpu_var(vector_irq)[vector]; if (!handle_irq(irq, regs)) { -#ifdef CONFIG_X86_64 - if (!disable_apic) - ack_APIC_irq(); -#endif + ack_APIC_irq(); if (printk_ratelimit()) pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", -- cgit v1.2.3 From b9b34f24b23ba9e79e07c0980e7fff16af2a67d1 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 12 Apr 2009 20:47:42 +0400 Subject: x86: smp.c - align smp_ops assignments Impact: cleanup It's a bit hard to parse by eyes without them being aligned. Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090412165058.924175574@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/smp.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 13f33ea8cca..f6db48c405b 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -193,19 +193,19 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) } struct smp_ops smp_ops = { - .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, - .smp_prepare_cpus = native_smp_prepare_cpus, - .smp_cpus_done = native_smp_cpus_done, + .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, + .smp_prepare_cpus = native_smp_prepare_cpus, + .smp_cpus_done = native_smp_cpus_done, - .smp_send_stop = native_smp_send_stop, - .smp_send_reschedule = native_smp_send_reschedule, + .smp_send_stop = native_smp_send_stop, + .smp_send_reschedule = native_smp_send_reschedule, - .cpu_up = native_cpu_up, - .cpu_die = native_cpu_die, - .cpu_disable = native_cpu_disable, - .play_dead = native_play_dead, + .cpu_up = native_cpu_up, + .cpu_die = native_cpu_die, + .cpu_disable = native_cpu_disable, + .play_dead = native_play_dead, - .send_call_func_ipi = native_send_call_func_ipi, + .send_call_func_ipi = native_send_call_func_ipi, .send_call_func_single_ipi = native_send_call_func_single_ipi, }; EXPORT_SYMBOL_GPL(smp_ops); -- cgit v1.2.3 From 0f3fd87ce43727d6b8573191ce89e874533b1429 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Mon, 13 Apr 2009 20:24:50 +0100 Subject: perf_counter: fix alignment in /proc/interrupts Trivial fix on columns alignment in /proc/interrupts file. Signed-off-by: Luis Henriques Cc: Peter Zijlstra LKML-Reference: <20090413192449.GA3920@hades.domain.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index dccaaa85578..849cfabb1fd 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -67,7 +67,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance counter interrupts\n"); - seq_printf(p, "PND: "); + seq_printf(p, "%*s: ", prec, "PND"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); seq_printf(p, " Performance pending work\n"); -- cgit v1.2.3 From 5cda395f4a262788d8ed79ac8a26a2b821e5f751 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Mon, 13 Apr 2009 17:39:24 +0200 Subject: x86: fix function definitions after: x86: apic - introduce imcr_ helpers The patch "introduce imcr_ helpers" introduced good comments, but also a few new compile warnings. This fixes the function definitions to have a 'void' return type. Signed-off-by: Alexander van Heukelum Acked-by: Cyrill Gorcunov LKML-Reference: <20090413153924.GA20287@mailshack.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 9b849d4957d..4b48ff9163c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -106,7 +106,7 @@ static int enabled_via_apicbase; * the BIOS or the operating system must switch out of * PIC Mode by changing the IMCR. */ -static inline imcr_pic_to_apic(void) +static inline void imcr_pic_to_apic(void) { /* select IMCR register */ outb(0x70, 0x22); @@ -114,7 +114,7 @@ static inline imcr_pic_to_apic(void) outb(0x01, 0x23); } -static inline imcr_apic_to_pic(void) +static inline void imcr_apic_to_pic(void) { /* select IMCR register */ outb(0x70, 0x22); -- cgit v1.2.3 From e7d43a74cb07cbc4b8e9b5e4a914816b33fb0719 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Tue, 14 Apr 2009 13:18:28 +0530 Subject: x86: avoid multiple declaration of kstack_depth_to_print Impact: cleanup asm/stacktrace.h is more appropriate so removing other 2 declarations. Signed-off-by: Jaswinder Singh Rajput Cc: Neil Horman LKML-Reference: <1239695308.3033.34.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index da87590b869..81086c227ab 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, unsigned long bp, char *log_lvl); extern unsigned int code_bytes; -extern int kstack_depth_to_print; /* The form of the top of the frame on the stack */ struct stack_frame { -- cgit v1.2.3 From 7e05575c422d45f393c2d9b5900e97a30bf69bea Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 14 Apr 2009 12:12:29 +0900 Subject: x86: calgary: remove IOMMU_DEBUG CONFIG_IOMMU_DEBUG has depends on CONFIG_GART_IOMMU: config IOMMU_DEBUG bool "Enable IOMMU debugging" depends on GART_IOMMU && DEBUG_KERNEL depends on X86_64 So it's not useful to have CONFIG_IOMMU_DEBUG in Calgary IOMMU code, which does the extra checking of the bitmap space management. And Calgary uses the iommu helper for the bitmap space management now so it would be better to have the extra checking feature in the iommu helper rather than Calgary code (if necessary). Signed-off-by: FUJITA Tomonori Acked-by: Muli Ben-Yehuda Cc: Joerg Roedel Cc: alexisb@us.ibm.com LKML-Reference: <20090414120827G.fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-calgary_64.c | 54 ++-------------------------------------- 1 file changed, 2 insertions(+), 52 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 755c21e906f..971a3bec47a 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = { static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; -/* enable this to stress test the chip's TCE cache */ -#ifdef CONFIG_IOMMU_DEBUG -static int debugging = 1; - -static inline unsigned long verify_bit_range(unsigned long* bitmap, - int expected, unsigned long start, unsigned long end) -{ - unsigned long idx = start; - - BUG_ON(start >= end); - - while (idx < end) { - if (!!test_bit(idx, bitmap) != expected) - return idx; - ++idx; - } - - /* all bits have the expected value */ - return ~0UL; -} -#else /* debugging is disabled */ -static int debugging; - -static inline unsigned long verify_bit_range(unsigned long* bitmap, - int expected, unsigned long start, unsigned long end) -{ - return ~0UL; -} - -#endif /* CONFIG_IOMMU_DEBUG */ - static inline int translation_enabled(struct iommu_table *tbl) { /* only PHBs with translation enabled have an IOMMU table */ @@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl, { unsigned long index; unsigned long end; - unsigned long badbit; unsigned long flags; index = start_addr >> PAGE_SHIFT; @@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl, spin_lock_irqsave(&tbl->it_lock, flags); - badbit = verify_bit_range(tbl->it_map, 0, index, end); - if (badbit != ~0UL) { - if (printk_ratelimit()) - printk(KERN_ERR "Calgary: entry already allocated at " - "0x%lx tbl %p dma 0x%lx npages %u\n", - badbit, tbl, start_addr, npages); - } - iommu_area_reserve(tbl->it_map, index, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); @@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned int npages) { unsigned long entry; - unsigned long badbit; unsigned long badend; unsigned long flags; @@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, spin_lock_irqsave(&tbl->it_lock, flags); - badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); - if (badbit != ~0UL) { - if (printk_ratelimit()) - printk(KERN_ERR "Calgary: bit is off at 0x%lx " - "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", - badbit, tbl, dma_addr, entry, npages); - } - iommu_area_free(tbl->it_map, entry, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); @@ -1488,9 +1439,8 @@ void __init detect_calgary(void) iommu_detected = 1; calgary_detected = 1; printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); - printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " - "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, - debugging ? "enabled" : "disabled"); + printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", + specified_table_size); /* swiotlb for devices that aren't behind the Calgary. */ if (max_pfn > MAX_DMA32_PFN) -- cgit v1.2.3 From 19c1a6f5764d787113fa323ffb18be7991208f82 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 14 Apr 2009 09:43:19 +0900 Subject: x86 gart: reimplement IOMMU_LEAK feature by using DMA_API_DEBUG IOMMU_LEAK, GART's own feature, dumps the used IOMMU entries when IOMMU entries is full, which might be useful to find a bad driver that eats IOMMU entries. DMA_API_DEBUG provides the similar feature, debug_dma_dump_mappings, and it's better than GART's IOMMU_LEAK feature. GART's IOMMU_LEAK feature doesn't say who uses IOMMU entries so it's hard to find a bad driver. This patch reimplements the GART's IOMMU_LEAK feature by using DMA_API_DEBUG. Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Cc: Andrew Morton LKML-Reference: <1239669799-23579-2-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 45 ++++++++----------------------------------- 1 file changed, 8 insertions(+), 37 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index b284b58c035..1e8920d98f7 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -144,48 +144,21 @@ static void flush_gart(void) } #ifdef CONFIG_IOMMU_LEAK - -#define SET_LEAK(x) \ - do { \ - if (iommu_leak_tab) \ - iommu_leak_tab[x] = __builtin_return_address(0);\ - } while (0) - -#define CLEAR_LEAK(x) \ - do { \ - if (iommu_leak_tab) \ - iommu_leak_tab[x] = NULL; \ - } while (0) - /* Debugging aid for drivers that don't free their IOMMU tables */ -static void **iommu_leak_tab; static int leak_trace; static int iommu_leak_pages = 20; static void dump_leak(void) { - int i; static int dump; - if (dump || !iommu_leak_tab) + if (dump) return; dump = 1; - show_stack(NULL, NULL); - /* Very crude. dump some from the end of the table too */ - printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n", - iommu_leak_pages); - for (i = 0; i < iommu_leak_pages; i += 2) { - printk(KERN_DEBUG "%lu: ", iommu_pages-i); - printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], - 0); - printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); - } - printk(KERN_DEBUG "\n"); + show_stack(NULL, NULL); + debug_dma_dump_mappings(NULL); } -#else -# define SET_LEAK(x) -# define CLEAR_LEAK(x) #endif static void iommu_full(struct device *dev, size_t size, int dir) @@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); - SET_LEAK(iommu_page + i); phys_mem += PAGE_SIZE; } return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); @@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; - CLEAR_LEAK(iommu_page + i); } free_iommu(iommu_page, npages); } @@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); while (pages--) { iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); - SET_LEAK(iommu_page); addr += PAGE_SIZE; iommu_page++; } @@ -801,11 +771,12 @@ void __init gart_iommu_init(void) #ifdef CONFIG_IOMMU_LEAK if (leak_trace) { - iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, - get_order(iommu_pages*sizeof(void *))); - if (!iommu_leak_tab) + int ret; + + ret = dma_debug_resize_entries(iommu_pages); + if (ret) printk(KERN_DEBUG - "PCI-DMA: Cannot allocate leak trace area\n"); + "PCI-DMA: Cannot trace all the entries\n"); } #endif -- cgit v1.2.3 From 77857dc07247ed5fa700a197c96ef842d8dbebdf Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 15 Apr 2009 11:57:01 -0700 Subject: x86: use used_vectors in init_IRQ() Impact: fix crash with many devices I found this crash: [ 552.616646] general protection fault: 0403 [#1] SMP [ 552.620013] last sysfs file: /sys/devices/pci0000:00/0000:00:02.0/usb1/1-1/1-1:1.0/host13/target13:0:0/13:0:0:0/block/sr0/size [ 552.620013] CPU 0 [ 552.620013] Modules linked in: [ 552.620013] Pid: 0, comm: swapper Not tainted 2.6.30-rc1-tip-01931-g8fcafd8-dirty #28 Sun Fire X4440 [ 552.620013] RIP: 0010:[] [] default_idle+0x7d/0xda [ 552.620013] RSP: 0018:ffffffff81345e68 EFLAGS: 00010246 [ 552.620013] RAX: 0000000000000000 RBX: ffffffff8133d870 RCX: ffffc20000000000 [ 552.620013] RDX: 00000000001d0620 RSI: ffffffff8023bad8 RDI: ffffffff802a3169 [ 552.620013] RBP: ffffffff81345e98 R08: 0000000000000000 R09: ffffffff812244a0 [ 552.620013] R10: ffffffff81345dc8 R11: 7ebe1b6fa0bcac50 R12: 4ec4ec4ec4ec4ec5 [ 552.620013] R13: ffffffff813a54d0 R14: ffffffff813a7a40 R15: 0000000000000000 [ 552.620013] FS: 00000000006d1880(0000) GS:ffffc20000000000(0000) knlGS:0000000000000000 [ 552.620013] CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b [ 552.620013] CR2: 00007fec9d936a50 CR3: 000000007d1a9000 CR4: 00000000000006e0 [ 552.620013] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 552.620013] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 552.620013] Process swapper (pid: 0, threadinfo ffffffff81344000,task ffffffff812244a0) [ 552.620013] Stack: [ 552.620013] 0000000000000000 ffffc20000000000 00000000001d0620 7ebe1b6fa0bcac50 [ 552.620013] ffffffff8133d870 4ec4ec4ec4ec4ec5 ffffffff81345ec8 ffffffff8023bd84 [ 552.620013] 4ec4ec4ec4ec4ec5 ffffffff813a54d0 7ebe1b6fa0bcac50 ffffffff8133d870 [ 552.620013] Call Trace: [ 552.620013] [] c1e_idle+0x109/0x124 [ 552.620013] [] cpu_idle+0xb8/0x101 [ 552.620013] [] rest_init+0x7e/0x94 [ 552.620013] [] start_kernel+0x3dc/0x3fd [ 552.620013] [] x86_64_start_reservations+0xb9/0xd4 [ 552.620013] [] x86_64_start_kernel+0xee/0x109 [ 552.620013] Code: 48 8b 04 25 f8 b4 00 00 83 a0 3c e0 ff ff fb 0f ae f0 65 48 8b 04 25 f8 b4 00 00 f6 80 38 e0 ff ff 08 75 09 e8 71 76 06 00 fb f4 06 e8 68 76 06 00 fb 65 48 8b 04 25 f8 b4 00 00 83 88 3c e0 [ 552.620013] RIP [] default_idle+0x7d/0xda [ 552.620013] RSP [ 552.828646] ---[ end trace 4cbfc5c01382af7f ]--- Joerg Roedel said "The 0403 error code means that there was an external interrupt with vector 0x80. Yinghai, my theory is that the kernel on this machine has no 32bit emulation compiled in, right? In this case the selector points to a zero entry which may cause the #gpf right after the hlt. But I have no idea where the external int 0x80 comes from" it turns out that we could use 0x80 for external device on 64-bit when 32-bit emulation is disabled. But we forgot to set the gate for it. try to set gate for it by checking used_vectors. Also move apic_intr_init() early to avoid setting that gate two times. Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Joerg Roedel LKML-Reference: <49E62DFD.6010904@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index b424c32c4a0..2e08b10ad51 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -240,19 +240,19 @@ void __init native_init_IRQ(void) /* Execute any quirks before the call gates are initialised: */ x86_quirk_pre_intr_init(); + apic_intr_init(); + /* * Cover the whole vector space, no vector can escape * us. (some of these will be overridden and become * 'special' SMP interrupts) */ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { - /* IA32_SYSCALL_VECTOR was reserved in trap_init. */ - if (i != IA32_SYSCALL_VECTOR) + /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ + if (!test_bit(i, used_vectors)) set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); } - apic_intr_init(); - if (!acpi_ioapic) setup_irq(2, &irq2); -- cgit v1.2.3 From 9b94b3a19b13e094c10f65f24bc358f6ffe4eacd Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 17 Apr 2009 12:07:46 +0200 Subject: x86: fixup numa_node information for AMD CPU northbridge functions Currently the numa_node attribute for these PCI devices is 0 (it corresponds to the numa_node for PCI bus 0). This is not a big issue but incorrect. This inconsistency can be fixed by reading the node number from CPU NB function 0. [ Impact: fill in dev->numa_node information, to optimize DMA allocations ] Signed-off-by: Andreas Herrmann Cc: jbarnes@virtuousgeek.org LKML-Reference: <20090417100746.GG16198@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/quirks.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index e95022e4f5d..94ad0c029f0 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -493,5 +493,42 @@ void force_hpet_resume(void) break; } } +#endif + +#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) +/* Set correct numa_node information for AMD NB functions */ +static void __init quirk_amd_nb_node(struct pci_dev *dev) +{ + struct pci_dev *nb_ht; + unsigned int devfn; + u32 val; + + devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); + nb_ht = pci_get_slot(dev->bus, devfn); + if (!nb_ht) + return; + + pci_read_config_dword(nb_ht, 0x60, &val); + set_dev_node(&dev->dev, val & 7); + pci_dev_put(dev); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, + quirk_amd_nb_node); #endif -- cgit v1.2.3 From 5d0ae2db6deac4f15dac4f42f23bc56448fc8d4d Mon Sep 17 00:00:00 2001 From: Weidong Han Date: Fri, 17 Apr 2009 16:42:13 +0800 Subject: x86, intr-remap: fix ack for interrupt remapping Shouldn't call ack_apic_edge() in ir_ack_apic_edge(), because ack_apic_edge() does more than just ack: it also does irq migration in the non-interrupt-remapping case. But there is no such need for interrupt-remapping case, as irq migration is done in the process context. Similarly, ir_ack_apic_level() shouldn't call ack_apic_level, and instead should do the local cpu's EOI + directed EOI to the io-apic. ack_x2APIC_irq() is not neccessary, because ack_APIC_irq() will use MSR write for x2apic, and uncached write for non-x2apic. [ Impact: simplify/standardize intr-remap IRQ acking, fix on !x2apic ] Signed-off-by: Suresh Siddha Signed-off-by: Weidong Han Acked-by: David Woodhouse Cc: iommu@lists.linux-foundation.org Cc: allen.m.kay@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1239957736-6161-3-git-send-email-weidong.han@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 32 +++++--------------------------- 1 file changed, 5 insertions(+), 27 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 84990002240..ea22a86e3cd 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2552,20 +2552,6 @@ eoi_ioapic_irq(struct irq_desc *desc) spin_unlock_irqrestore(&ioapic_lock, flags); } -#ifdef CONFIG_X86_X2APIC -static void ack_x2apic_level(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - ack_x2APIC_irq(); - eoi_ioapic_irq(desc); -} - -static void ack_x2apic_edge(unsigned int irq) -{ - ack_x2APIC_irq(); -} -#endif - static void ack_apic_edge(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -2629,9 +2615,6 @@ static void ack_apic_level(unsigned int irq) */ ack_APIC_irq(); - if (irq_remapped(irq)) - eoi_ioapic_irq(desc); - /* Now we can move and renable the irq */ if (unlikely(do_unmask_irq)) { /* Only migrate the irq if the ack has been received. @@ -2680,20 +2663,15 @@ static void ack_apic_level(unsigned int irq) #ifdef CONFIG_INTR_REMAP static void ir_ack_apic_edge(unsigned int irq) { -#ifdef CONFIG_X86_X2APIC - if (x2apic_enabled()) - return ack_x2apic_edge(irq); -#endif - return ack_apic_edge(irq); + ack_APIC_irq(); } static void ir_ack_apic_level(unsigned int irq) { -#ifdef CONFIG_X86_X2APIC - if (x2apic_enabled()) - return ack_x2apic_level(irq); -#endif - return ack_apic_level(irq); + struct irq_desc *desc = irq_to_desc(irq); + + ack_APIC_irq(); + eoi_ioapic_irq(desc); } #endif /* CONFIG_INTR_REMAP */ -- cgit v1.2.3 From 937582382c71b75b29fbb92615629494e1a05ac0 Mon Sep 17 00:00:00 2001 From: Weidong Han Date: Fri, 17 Apr 2009 16:42:14 +0800 Subject: x86, intr-remap: enable interrupt remapping early Currently, when x2apic is not enabled, interrupt remapping will be enabled in init_dmars(), where it is too late to remap ioapic interrupts, that is, ioapic interrupts are really in compatibility mode, not remappable mode. This patch always enables interrupt remapping before ioapic setup, it guarantees all interrupts will be remapped when interrupt remapping is enabled. Thus it doesn't need to set the compatibility interrupt bit. [ Impact: refactor intr-remap init sequence, enable fuller remap mode ] Signed-off-by: Suresh Siddha Signed-off-by: Weidong Han Acked-by: David Woodhouse Cc: iommu@lists.linux-foundation.org Cc: allen.m.kay@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1239957736-6161-4-git-send-email-weidong.han@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 76 +++++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 40 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 83e47febcc8..0cf1eea750c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -141,6 +141,8 @@ static int x2apic_preenabled; static int disable_x2apic; static __init int setup_nox2apic(char *str) { + if (x2apic_enabled()) + panic("Bios already enabled x2apic, can't enforce nox2apic"); disable_x2apic = 1; setup_clear_cpu_cap(X86_FEATURE_X2APIC); return 0; @@ -1345,6 +1347,7 @@ void enable_x2apic(void) wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); } } +#endif /* CONFIG_X86_X2APIC */ void __init enable_IR_x2apic(void) { @@ -1353,32 +1356,21 @@ void __init enable_IR_x2apic(void) unsigned long flags; struct IO_APIC_route_entry **ioapic_entries = NULL; - if (!cpu_has_x2apic) - return; - - if (!x2apic_preenabled && disable_x2apic) { - pr_info("Skipped enabling x2apic and Interrupt-remapping " - "because of nox2apic\n"); - return; + ret = dmar_table_init(); + if (ret) { + pr_debug("dmar_table_init() failed with %d:\n", ret); + goto ir_failed; } - if (x2apic_preenabled && disable_x2apic) - panic("Bios already enabled x2apic, can't enforce nox2apic"); - - if (!x2apic_preenabled && skip_ioapic_setup) { - pr_info("Skipped enabling x2apic and Interrupt-remapping " - "because of skipping io-apic setup\n"); - return; + if (!intr_remapping_supported()) { + pr_debug("intr-remapping not supported\n"); + goto ir_failed; } - ret = dmar_table_init(); - if (ret) { - pr_info("dmar_table_init() failed with %d:\n", ret); - if (x2apic_preenabled) - panic("x2apic enabled by bios. But IR enabling failed"); - else - pr_info("Not enabling x2apic,Intr-remapping\n"); + if (!x2apic_preenabled && skip_ioapic_setup) { + pr_info("Skipped enabling intr-remap because of skipping " + "io-apic setup\n"); return; } @@ -1398,20 +1390,25 @@ void __init enable_IR_x2apic(void) mask_IO_APIC_setup(ioapic_entries); mask_8259A(); - ret = enable_intr_remapping(EIM_32BIT_APIC_ID); - - if (ret && x2apic_preenabled) { - local_irq_restore(flags); - panic("x2apic enabled by bios. But IR enabling failed"); - } +#ifdef CONFIG_X86_X2APIC + if (cpu_has_x2apic) + ret = enable_intr_remapping(EIM_32BIT_APIC_ID); + else +#endif + ret = enable_intr_remapping(EIM_8BIT_APIC_ID); if (ret) goto end_restore; - if (!x2apic) { + pr_info("Enabled Interrupt-remapping\n"); + +#ifdef CONFIG_X86_X2APIC + if (cpu_has_x2apic && !x2apic) { x2apic = 1; enable_x2apic(); + pr_info("Enabled x2apic\n"); } +#endif end_restore: if (ret) @@ -1426,30 +1423,29 @@ end_restore: local_irq_restore(flags); end: - if (!ret) { - if (!x2apic_preenabled) - pr_info("Enabled x2apic and interrupt-remapping\n"); - else - pr_info("Enabled Interrupt-remapping\n"); - } else - pr_err("Failed to enable Interrupt-remapping and x2apic\n"); if (ioapic_entries) free_ioapic_entries(ioapic_entries); + + if (!ret) + return; + +ir_failed: + if (x2apic_preenabled) + panic("x2apic enabled by bios. But IR enabling failed"); + else if (cpu_has_x2apic) + pr_info("Not enabling x2apic,Intr-remapping\n"); #else if (!cpu_has_x2apic) return; if (x2apic_preenabled) panic("x2apic enabled prior OS handover," - " enable CONFIG_INTR_REMAP"); - - pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping " - " and x2apic\n"); + " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP"); #endif return; } -#endif /* CONFIG_X86_X2APIC */ + #ifdef CONFIG_X86_64 /* -- cgit v1.2.3 From 9a2755c3569e4db92bd9b1daadeddb4045b0cccd Mon Sep 17 00:00:00 2001 From: Weidong Han Date: Fri, 17 Apr 2009 16:42:16 +0800 Subject: x86, intr-remap: fix x2apic/intr-remap resume Interrupt remapping was decoupled from x2apic. Shouldn't check x2apic before resume interrupt remapping. Otherwise, interrupt remapping won't be resumed when x2apic is not enabled. [ Impact: fix potential intr-remap resume hang on !x2apic ] Signed-off-by: Suresh Siddha Signed-off-by: Weidong Han Acked-by: David Woodhouse Cc: iommu@lists.linux-foundation.org Cc: allen.m.kay@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1239957736-6161-6-git-send-email-weidong.han@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0cf1eea750c..7b41a32339e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2032,7 +2032,7 @@ static int lapic_resume(struct sys_device *dev) return 0; local_irq_save(flags); - if (x2apic) { + if (intr_remapping_enabled) { ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { WARN(1, "Alloc ioapic_entries in lapic resume failed."); @@ -2048,8 +2048,10 @@ static int lapic_resume(struct sys_device *dev) mask_IO_APIC_setup(ioapic_entries); mask_8259A(); - enable_x2apic(); } + + if (x2apic) + enable_x2apic(); #else if (!apic_pm_state.active) return 0; @@ -2097,10 +2099,12 @@ static int lapic_resume(struct sys_device *dev) apic_read(APIC_ESR); #ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) - reenable_intr_remapping(EIM_32BIT_APIC_ID); + if (intr_remapping_enabled) { + if (x2apic) + reenable_intr_remapping(EIM_32BIT_APIC_ID); + else + reenable_intr_remapping(EIM_8BIT_APIC_ID); - if (x2apic) { unmask_8259A(); restore_IO_APIC_setup(ioapic_entries); free_ioapic_entries(ioapic_entries); @@ -2109,7 +2113,6 @@ static int lapic_resume(struct sys_device *dev) local_irq_restore(flags); - return 0; } -- cgit v1.2.3 From cece3155d869a50ba534ed161b5a05e8a29dcad0 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 18 Apr 2009 23:45:28 +0400 Subject: x86: smpboot - wakeup_secondary should be done via __cpuinit section A caller (do_boot_cpu) already has __cpuinit attribute. Since HOTPLUG_CPU depends on SMP && HOTPLUG it doesn't lead to panic at moment. [ Impact: cleanup ] Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090418194528.GD25510@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index bf8ad6344b1..d2e8de95815 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -504,7 +504,7 @@ void __inquire_remote_apic(int apicid) * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this * won't ... remember to clear down the APIC, etc later. */ -int __devinit +int __cpuinit wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; @@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) return (send_status | accept_status); } -static int __devinit +static int __cpuinit wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; -- cgit v1.2.3 From 667c5296cc76fefe0abcb79228952b28d9af45e3 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 19 Apr 2009 11:43:11 +0400 Subject: x86: es7000, uv - use __cpuinit for kicking secondary cpus The caller already has __cpuinit attribute. [ Impact: save memory, address section mismatch warning ] Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu Cc: Pavel Emelyanov LKML-Reference: <20090419074311.GA8670@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/es7000_32.c | 2 +- arch/x86/kernel/apic/x2apic_uv_x.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 1c11b819f24..8e07c141866 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -145,7 +145,7 @@ es7000_rename_gsi(int ioapic, int gsi) return gsi; } -static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) +static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) { unsigned long vect = 0, psaival = 0; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index de1a50af807..873bf7121e8 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -91,7 +91,7 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) cpumask_set_cpu(cpu, retmask); } -static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) +static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) { #ifdef CONFIG_SMP unsigned long val; -- cgit v1.2.3 From fc1edaf9e7cc4d4696f83dee495b8f158d01c4eb Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 20 Apr 2009 13:02:27 -0700 Subject: x86: x2apic, IR: Clean up X86_X2APIC and INTR_REMAP config checks Add x2apic_supported() to clean up CONFIG_X86_X2APIC checks. Fix CONFIG_INTR_REMAP checks. [ Impact: cleanup ] Signed-off-by: Suresh Siddha Cc: dwmw2@infradead.org Cc: Suresh Siddha Cc: Weidong Han LKML-Reference: <20090420200450.128993000@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 49 ++++++++++------------------------------- arch/x86/kernel/apic/io_apic.c | 2 -- arch/x86/kernel/apic/probe_64.c | 2 +- 3 files changed, 13 insertions(+), 40 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7b41a32339e..2b30e520dce 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -134,8 +134,8 @@ static __init int setup_apicpmtimer(char *s) __setup("apicpmtimer", setup_apicpmtimer); #endif +int x2apic_mode; #ifdef CONFIG_X86_X2APIC -int x2apic; /* x2apic enabled before OS handover */ static int x2apic_preenabled; static int disable_x2apic; @@ -858,7 +858,7 @@ void clear_local_APIC(void) u32 v; /* APIC hasn't been mapped yet */ - if (!x2apic && !apic_phys) + if (!x2apic_mode && !apic_phys) return; maxlvt = lapic_get_maxlvt(); @@ -1330,7 +1330,7 @@ void check_x2apic(void) { if (x2apic_enabled()) { pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); - x2apic_preenabled = x2apic = 1; + x2apic_preenabled = x2apic_mode = 1; } } @@ -1338,7 +1338,7 @@ void enable_x2apic(void) { int msr, msr2; - if (!x2apic) + if (!x2apic_mode) return; rdmsr(MSR_IA32_APICBASE, msr, msr2); @@ -1390,25 +1390,17 @@ void __init enable_IR_x2apic(void) mask_IO_APIC_setup(ioapic_entries); mask_8259A(); -#ifdef CONFIG_X86_X2APIC - if (cpu_has_x2apic) - ret = enable_intr_remapping(EIM_32BIT_APIC_ID); - else -#endif - ret = enable_intr_remapping(EIM_8BIT_APIC_ID); - + ret = enable_intr_remapping(x2apic_supported()); if (ret) goto end_restore; pr_info("Enabled Interrupt-remapping\n"); -#ifdef CONFIG_X86_X2APIC - if (cpu_has_x2apic && !x2apic) { - x2apic = 1; + if (x2apic_supported() && !x2apic_mode) { + x2apic_mode = 1; enable_x2apic(); pr_info("Enabled x2apic\n"); } -#endif end_restore: if (ret) @@ -1576,7 +1568,7 @@ void __init early_init_lapic_mapping(void) */ void __init init_apic_mappings(void) { - if (x2apic) { + if (x2apic_mode) { boot_cpu_physical_apicid = read_apic_id(); return; } @@ -2010,10 +2002,10 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) local_irq_save(flags); disable_local_APIC(); -#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) disable_intr_remapping(); -#endif + local_irq_restore(flags); return 0; } @@ -2023,8 +2015,6 @@ static int lapic_resume(struct sys_device *dev) unsigned int l, h; unsigned long flags; int maxlvt; - -#ifdef CONFIG_INTR_REMAP int ret; struct IO_APIC_route_entry **ioapic_entries = NULL; @@ -2050,17 +2040,8 @@ static int lapic_resume(struct sys_device *dev) mask_8259A(); } - if (x2apic) + if (x2apic_mode) enable_x2apic(); -#else - if (!apic_pm_state.active) - return 0; - - local_irq_save(flags); - if (x2apic) - enable_x2apic(); -#endif - else { /* * Make sure the APICBASE points to the right address @@ -2098,18 +2079,12 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_ESR, 0); apic_read(APIC_ESR); -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) { - if (x2apic) - reenable_intr_remapping(EIM_32BIT_APIC_ID); - else - reenable_intr_remapping(EIM_8BIT_APIC_ID); - + reenable_intr_remapping(x2apic_mode); unmask_8259A(); restore_IO_APIC_setup(ioapic_entries); free_ioapic_entries(ioapic_entries); } -#endif local_irq_restore(flags); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ea22a86e3cd..3a45d2ec974 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -736,7 +736,6 @@ static int __init ioapic_pirq_setup(char *str) __setup("pirq=", ioapic_pirq_setup); #endif /* CONFIG_X86_32 */ -#ifdef CONFIG_INTR_REMAP struct IO_APIC_route_entry **alloc_ioapic_entries(void) { int apic; @@ -857,7 +856,6 @@ void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) kfree(ioapic_entries); } -#endif /* * Find the IRQ entry number of a certain pin. diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 1783652bb0e..bc3e880f9b8 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = { void __init default_setup_apic_routing(void) { #ifdef CONFIG_X86_X2APIC - if (x2apic && (apic != &apic_x2apic_phys && + if (x2apic_mode && (apic != &apic_x2apic_phys && #ifdef CONFIG_X86_UV apic != &apic_x2apic_uv_x && #endif -- cgit v1.2.3 From 25629d810a52176758401184d9b437fbb7f79195 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 20 Apr 2009 13:02:28 -0700 Subject: x86: x2apic, IR: Move eoi_ioapic_irq() into a CONFIG_INTR_REMAP section Address the following complier warning: arch/x86/kernel/apic/io_apic.c:2543: warning: `eoi_ioapic_irq' defined but not used By moving that function (and eoi_ioapic_irq()) into an existing #ifdef CONFIG_INTR_REMAP section of the code. [ Impact: cleanup ] Signed-off-by: Suresh Siddha Cc: dwmw2@infradead.org Cc: Weidong Han LKML-Reference: <20090420200450.271099000@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar Cc: Weidong Han --- arch/x86/kernel/apic/io_apic.c | 66 +++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 33 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3a45d2ec974..4baa9cbd630 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2517,39 +2517,6 @@ static void irq_complete_move(struct irq_desc **descp) static inline void irq_complete_move(struct irq_desc **descp) {} #endif -static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - - entry = cfg->irq_2_pin; - for (;;) { - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - io_apic_eoi(apic, pin); - entry = entry->next; - } -} - -static void -eoi_ioapic_irq(struct irq_desc *desc) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - static void ack_apic_edge(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -2659,6 +2626,39 @@ static void ack_apic_level(unsigned int irq) } #ifdef CONFIG_INTR_REMAP +static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + + entry = cfg->irq_2_pin; + for (;;) { + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; + io_apic_eoi(apic, pin); + entry = entry->next; + } +} + +static void +eoi_ioapic_irq(struct irq_desc *desc) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_irq(irq, cfg); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void ir_ack_apic_edge(unsigned int irq) { ack_APIC_irq(); -- cgit v1.2.3 From 39d83a5d684a457046aa2a6dac60f105966e78e9 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 20 Apr 2009 13:02:29 -0700 Subject: x86: x2apic, IR: Clean up panic() with nox2apic boot option Instead of panic() ignore the "nox2apic" boot option when BIOS has already enabled x2apic prior to OS handover. [ Impact: printk warning instead of panic() when BIOS has enabled x2apic already ] Signed-off-by: Suresh Siddha Cc: dwmw2@infradead.org Cc: Weidong Han LKML-Reference: <20090420200450.425091000@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2b30e520dce..d32f5589f1d 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -141,8 +141,12 @@ static int x2apic_preenabled; static int disable_x2apic; static __init int setup_nox2apic(char *str) { - if (x2apic_enabled()) - panic("Bios already enabled x2apic, can't enforce nox2apic"); + if (x2apic_enabled()) { + pr_warning("Bios already enabled x2apic, " + "can't enforce nox2apic"); + return 0; + } + disable_x2apic = 1; setup_clear_cpu_cap(X86_FEATURE_X2APIC); return 0; -- cgit v1.2.3 From ff166cb57a17124af75714a9c11f448f56f1a4a3 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 20 Apr 2009 13:02:30 -0700 Subject: x86: x2apic, IR: remove reinit_intr_remapped_IO_APIC() When interrupt-remapping is enabled, we are relying on setup_IO_APIC_irqs() to configure remapped entries in the IO-APIC, which comes little bit later after enabling interrupt-remapping. Meanwhile, restoration of old io-apic entries after enabling interrupt-remapping will not make the interrupts through io-apic functional anyway. So remove the unnecessary reinit_intr_remapped_IO_APIC() step. The longer story: When interrupt-remapping is enabled, IO-APIC entries need to be setup in the re-mappable format (pointing to interrupt-remapping table entries setup by the OS). This remapping configuration is happening in the same place where we traditionally configure IO-APIC (i.e., in setup_IO_APIC_irqs()). So when we enable interrupt-remapping successfully, there is no need to restore old io-apic RTE entries before we actually do a complete configuration shortly in setup_IO_APIC_irqs(). Old IO-APIC RTE's may be in traditional format (non re-mappable) or in re-mappable format pointing to interrupt-remapping table entries setup by BIOS. Restoring both of these will not make IO-APIC functional. We have to rely on setup_IO_APIC_irqs() for proper configuration by OS. So I am removing this unnecessary and broken step. [ Impact: remove unnecessary/broken IO-APIC setup step ] Signed-off-by: Suresh Siddha Acked-by: Weidong Han Cc: dwmw2@infradead.org LKML-Reference: <20090420200450.552359000@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 -- arch/x86/kernel/apic/io_apic.c | 14 -------------- 2 files changed, 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d32f5589f1d..1386dbec552 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1412,8 +1412,6 @@ end_restore: * IR enabling failed */ restore_IO_APIC_setup(ioapic_entries); - else - reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries); unmask_8259A(); local_irq_restore(flags); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4baa9cbd630..8aef5f9d947 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -833,20 +833,6 @@ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) return 0; } -void reinit_intr_remapped_IO_APIC(int intr_remapping, - struct IO_APIC_route_entry **ioapic_entries) - -{ - /* - * for now plain restore of previous settings. - * TBD: In the case of OS enabling interrupt-remapping, - * IO-APIC RTE's need to be setup to point to interrupt-remapping - * table entries. for now, do a plain restore, and wait for - * the setup_IO_APIC_irqs() to do proper initialization. - */ - restore_IO_APIC_setup(ioapic_entries); -} - void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) { int apic; -- cgit v1.2.3 From 0a3ec21fcd311b26ab0f249d62960e127bc20ca8 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sun, 26 Apr 2009 23:07:42 +0200 Subject: x86: beautify vmlinux_64.lds.S Beautify vmlinux_64.lds.S: - Use tabs for indent - Located curly braces like in C code - Rearranged a few comments There is no functional changes in this patch The beautification is done to prepare a unification of the _32 and the _64 variants of the linker scripts. [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <20090426210742.GA3464@uranus.ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux_64.lds.S | 448 +++++++++++++++++++++------------------ 1 file changed, 243 insertions(+), 205 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index c8742507b03..6d5a5b05eaa 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -15,69 +15,79 @@ OUTPUT_ARCH(i386:x86-64) ENTRY(phys_startup_64) jiffies_64 = jiffies; PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - user PT_LOAD FLAGS(7); /* RWE */ + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + user PT_LOAD FLAGS(7); /* RWE */ data.init PT_LOAD FLAGS(7); /* RWE */ #ifdef CONFIG_SMP percpu PT_LOAD FLAGS(7); /* RWE */ #endif data.init2 PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ + note PT_NOTE FLAGS(0); /* ___ */ } SECTIONS { - . = __START_KERNEL; - phys_startup_64 = startup_64 - LOAD_OFFSET; - .text : AT(ADDR(.text) - LOAD_OFFSET) { - _text = .; /* Text and read-only data */ - /* First the code that has to be first for bootstrapping */ - *(.text.head) - _stext = .; - /* Then the rest */ - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - _etext = .; /* End of text section */ - } :text = 0x9090 - - NOTES :text :note - - . = ALIGN(16); /* Exception table */ - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 - - RODATA - - . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ - /* Data */ - .data : AT(ADDR(.data) - LOAD_OFFSET) { - DATA_DATA - CONSTRUCTORS - _edata = .; /* End of data section */ - } :data + . = __START_KERNEL; + phys_startup_64 = startup_64 - LOAD_OFFSET; + + /* Text and read-only data */ + .text : AT(ADDR(.text) - LOAD_OFFSET) { + _text = .; + /* First the code that has to be first for bootstrapping */ + *(.text.head) + _stext = .; + /* Then the rest */ + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) + /* End of text section */ + _etext = .; + } :text = 0x9090 + + NOTES :text :note + + /* Exception table */ + . = ALIGN(16); + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } :text = 0x9090 + RODATA - .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + /* Align data segment to page size boundary */ . = ALIGN(PAGE_SIZE); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - *(.data.cacheline_aligned) - } - . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - } + /* Data */ + .data : AT(ADDR(.data) - LOAD_OFFSET) { + DATA_DATA + CONSTRUCTORS + /* End of data section */ + _edata = .; + } :data + + + .data.cacheline_aligned : + AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + *(.data.cacheline_aligned) + } + + . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { + *(.data.read_mostly) + } #define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) @@ -85,37 +95,53 @@ SECTIONS #define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user - __vsyscall_0 = VSYSCALL_VIRT_ADDR; + . = VSYSCALL_ADDR; + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { + *(.vsyscall_0) + } :user + + __vsyscall_0 = VSYSCALL_VIRT_ADDR; + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { + *(.vsyscall_fn) + } + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { + *(.vsyscall_gtod_data) + } - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) - { *(.vsyscall_gtod_data) } - vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); - .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) - { *(.vsyscall_clock) } - vsyscall_clock = VVIRT(.vsyscall_clock); + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { + *(.vsyscall_clock) + } + vsyscall_clock = VVIRT(.vsyscall_clock); - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) - { *(.vsyscall_1) } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) - { *(.vsyscall_2) } + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { + *(.vsyscall_1) + } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { + *(.vsyscall_2) + } - .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } - vgetcpu_mode = VVIRT(.vgetcpu_mode); + .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { + *(.vgetcpu_mode) + } + vgetcpu_mode = VVIRT(.vgetcpu_mode); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } - jiffies = VVIRT(.jiffies); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .jiffies : AT(VLOAD(.jiffies)) { + *(.jiffies) + } + jiffies = VVIRT(.jiffies); - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) - { *(.vsyscall_3) } + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { + *(.vsyscall_3) + } - . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; + . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; #undef VSYSCALL_ADDR #undef VSYSCALL_PHYS_ADDR @@ -125,156 +151,168 @@ SECTIONS #undef VVIRT_OFFSET #undef VVIRT - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - . = ALIGN(THREAD_SIZE); /* init_task */ - *(.data.init_task) - }:data.init + /* init_task */ + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + . = ALIGN(THREAD_SIZE); + *(.data.init_task) + } :data.init - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - *(.data.page_aligned) - } + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + *(.data.page_aligned) + } - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - __smp_alt_begin = .; - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + /* might get freed after init */ + . = ALIGN(PAGE_SIZE); + __smp_alt_begin = .; + __smp_locks = .; + *(.smp_locks) + __smp_locks_end = .; + . = ALIGN(PAGE_SIZE); + __smp_alt_end = .; + } + + /* Init code and data */ . = ALIGN(PAGE_SIZE); - __smp_alt_end = .; - } - - . = ALIGN(PAGE_SIZE); /* Init code and data */ - __init_begin = .; /* paired with __init_end */ - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - _sinittext = .; - INIT_TEXT - _einittext = .; - } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - __initdata_begin = .; - INIT_DATA - __initdata_end = .; - } - - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - . = ALIGN(16); - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - SECURITY_INIT - - . = ALIGN(8); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + __init_begin = .; /* paired with __init_end */ + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + _sinittext = .; + INIT_TEXT + _einittext = .; + } + + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + __initdata_begin = .; + INIT_DATA + __initdata_end = .; + } + + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + . = ALIGN(16); + __setup_start = .; + *(.init.setup) + __setup_end = .; + } + + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; + INITCALLS + __initcall_end = .; + } + + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; + *(.con_initcall.init) + __con_initcall_end = .; + } + + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; + } + + SECURITY_INIT + . = ALIGN(8); - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - /* .exit.text is discard at runtime, not link time, to deal with references - from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + . = ALIGN(8); + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } + + /* + * .exit.text is discard at runtime, not link time, to deal with + * references from .altinstructions and .eh_frame + */ + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } #ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } + . = ALIGN(PAGE_SIZE); + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } #endif #ifdef CONFIG_SMP - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - __data_nosave - should - * start another section data.init2. Also, pda should be at the head of - * percpu area. Preallocate it and define the percpu offset symbol - * so that it can be accessed as a percpu variable. - */ - . = ALIGN(PAGE_SIZE); - PERCPU_VADDR(0, :percpu) + /* + * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the + * output PHDR, so the next output section - __data_nosave - should + * start another section data.init2. Also, pda should be at the head of + * percpu area. Preallocate it and define the percpu offset symbol + * so that it can be accessed as a percpu variable. + */ + . = ALIGN(PAGE_SIZE); + PERCPU_VADDR(0, :percpu) #else - PERCPU(PAGE_SIZE) + PERCPU(PAGE_SIZE) #endif - . = ALIGN(PAGE_SIZE); - __init_end = .; - - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */ - - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { . = ALIGN(PAGE_SIZE); - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - __bss_stop = .; - } + __init_end = .; + + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; + } :data.init2 + /* use another section data.init2, see PERCPU_VADDR() above */ + + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __bss_start = .; /* BSS */ + *(.bss.page_aligned) + *(.bss) + __bss_stop = .; + } - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __brk_base = . ; - . += 64 * 1024 ; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = . ; - } - - _end = . ; - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __brk_base = .; + . += 64 * 1024; /* 64k alignment slop space */ + *(.brk_reservation) /* areas brk users have reserved */ + __brk_limit = .; } - STABS_DEBUG + _end = . ; - DWARF_DEBUG + /* Sections to be discarded */ + /DISCARD/ : { + *(.exitcall.exit) + *(.eh_frame) + *(.discard) + } + + STABS_DEBUG + DWARF_DEBUG } - /* - * Per-cpu symbols which need to be offset from __per_cpu_load - * for the boot processor. - */ +/* + * Per-cpu symbols which need to be offset from __per_cpu_load + * for the boot processor. + */ #define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load INIT_PER_CPU(gdt_page); INIT_PER_CPU(irq_stack_union); -- cgit v1.2.3 From b2ba83ff4f4405cebc10884121ee71338a1a6c94 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 26 Apr 2009 23:38:08 -0700 Subject: x86: apic: Remove duplicated macros XAPIC_DEST_* is dupliicated to the one in apicdef.h [ Impact: cleanup ] Signed-off-by: Yinghai Lu LKML-Reference: <49F552D0.5050505@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/summit_32.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 9cfe1f415d8..344eee4ac0a 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -173,13 +173,6 @@ static inline int is_WPEG(struct rio_detail *rio){ rio->type == LookOutAWPEG || rio->type == LookOutBWPEG); } - -/* In clustered mode, the high nibble of APIC ID is a cluster number. - * The low nibble is a 4-bit bitmap. */ -#define XAPIC_DEST_CPUS_SHIFT 4 -#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1) -#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT) - #define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER) static const struct cpumask *summit_target_cpus(void) -- cgit v1.2.3 From e0e42142bab96404de535cceb85d6533d5ad7942 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 26 Apr 2009 23:39:38 -0700 Subject: x86: Use dmi check in apic_is_clustered() on 64-bit to mark the TSC unstable We will have systems with 2 and more sockets 8cores/2thread, but we treat them as multi chassis - while they could have a stable TSC domain. Use DMI check instead. [ Impact: do not turn possibly stable TSCs off incorrectly ] Signed-off-by: Yinghai Lu Cc: Ravikiran Thirumalai LKML-Reference: <49F5532A.5000802@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 86 +++++++++++++++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 27 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 1386dbec552..28f747d61d7 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2138,31 +2138,14 @@ static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ #ifdef CONFIG_X86_64 -/* - * apic_is_clustered_box() -- Check if we can expect good TSC - * - * Thus far, the major user of this is IBM's Summit2 series: - * - * Clustered boxes may have unsynced TSC problems if they are - * multi-chassis. Use available data to take a good guess. - * If in doubt, go HPET. - */ -__cpuinit int apic_is_clustered_box(void) + +static int __cpuinit apic_cluster_num(void) { int i, clusters, zeros; unsigned id; u16 *bios_cpu_apicid; DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); - /* - * there is not this kind of box with AMD CPU yet. - * Some AMD box with quadcore cpu and 8 sockets apicid - * will be [4, 0x23] or [8, 0x27] could be thought to - * vsmp box still need checking... - */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) - return 0; - bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); bitmap_zero(clustermap, NUM_APIC_CLUSTERS); @@ -2198,18 +2181,67 @@ __cpuinit int apic_is_clustered_box(void) ++zeros; } - /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are - * not guaranteed to be synced between boards - */ - if (is_vsmp_box() && clusters > 1) + return clusters; +} + +static int __cpuinitdata multi_checked; +static int __cpuinitdata multi; + +static int __cpuinit set_multi(const struct dmi_system_id *d) +{ + if (multi) + return 0; + printk(KERN_INFO "APIC: %s detected, Multi Chassis\n", d->ident); + multi = 1; + return 0; +} + +static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = { + { + .callback = set_multi, + .ident = "IBM System Summit2", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"), + }, + }, + {} +}; + +static void __cpuinit dmi_check_multi(void) +{ + if (multi_checked) + return; + + dmi_check_system(multi_dmi_table); + multi_checked = 1; +} + +/* + * apic_is_clustered_box() -- Check if we can expect good TSC + * + * Thus far, the major user of this is IBM's Summit2 series: + * Clustered boxes may have unsynced TSC problems if they are + * multi-chassis. + * Use DMI to check them + */ +__cpuinit int apic_is_clustered_box(void) +{ + dmi_check_multi(); + if (multi) return 1; + if (!is_vsmp_box()) + return 0; + /* - * If clusters > 2, then should be multi-chassis. - * May have to revisit this when multi-core + hyperthreaded CPUs come - * out, but AFAIK this will work even for them. + * ScaleMP vSMPowered boxes have one cluster per board and TSCs are + * not guaranteed to be synced between boards */ - return (clusters > 2); + if (apic_cluster_num() > 1) + return 1; + + return 0; } #endif -- cgit v1.2.3 From fcef5911c7ea89b80d5bfc727f402f37c9eefd57 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 17:58:23 -0700 Subject: x86/irq: remove leftover code from NUMA_MIGRATE_IRQ_DESC The original feature of migrating irq_desc dynamic was too fragile and was causing problems: it caused crashes on systems with lots of cards with MSI-X when user-space irq-balancer was enabled. We now have new patches that create irq_desc according to device numa node. This patch removes the leftover bits of the dynamic balancer. [ Impact: remove dead code ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49F654AF.8000808@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 56 +++--------------------------------------- 1 file changed, 4 insertions(+), 52 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 30da617d18e..9fbf0f7ec7e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -148,9 +148,6 @@ struct irq_cfg { unsigned move_cleanup_count; u8 vector; u8 move_in_progress : 1; -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - u8 move_desc_pending : 1; -#endif }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ @@ -254,8 +251,7 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu) return 0; } -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - +/* for move_irq_desc */ static void init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) { @@ -356,19 +352,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) old_desc->chip_data = NULL; } } - -static void -set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg = desc->chip_data; - - if (!cfg->move_in_progress) { - /* it means that domain is not changed */ - if (!cpumask_intersects(desc->affinity, mask)) - cfg->move_desc_pending = 1; - } -} -#endif +/* end for move_irq_desc */ #else static struct irq_cfg *irq_cfg(unsigned int irq) @@ -378,13 +362,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq) #endif -#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC -static inline void -set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) -{ -} -#endif - struct io_apic { unsigned int index; unsigned int unused[3]; @@ -592,9 +569,6 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) if (assign_irq_vector(irq, cfg, mask)) return BAD_APICID; - /* check that before desc->addinity get updated */ - set_extra_move_desc(desc, mask); - cpumask_copy(desc->affinity, mask); return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); @@ -2393,8 +2367,6 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) if (assign_irq_vector(irq, cfg, mask)) return; - set_extra_move_desc(desc, mask); - dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); irte.vector = cfg->vector; @@ -2491,34 +2463,14 @@ static void irq_complete_move(struct irq_desc **descp) struct irq_cfg *cfg = desc->chip_data; unsigned vector, me; - if (likely(!cfg->move_in_progress)) { -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - if (likely(!cfg->move_desc_pending)) - return; - - /* domain has not changed, but affinity did */ - me = smp_processor_id(); - if (cpumask_test_cpu(me, desc->affinity)) { - *descp = desc = move_irq_desc(desc, me); - /* get the new one */ - cfg = desc->chip_data; - cfg->move_desc_pending = 0; - } -#endif + if (likely(!cfg->move_in_progress)) return; - } vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) { -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - *descp = desc = move_irq_desc(desc, me); - /* get the new one */ - cfg = desc->chip_data; -#endif + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) send_cleanup_vector(cfg); - } } #else static inline void irq_complete_move(struct irq_desc **descp) {} -- cgit v1.2.3 From d5dedd4507d307eb3f35f21b6e16f336fdc0d82a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 17:59:21 -0700 Subject: irq: change ->set_affinity() to return status according to Ingo, change set_affinity() in irq_chip should return int, because that way we can handle failure cases in a much cleaner way, in the genirq layer. v2: fix two typos [ Impact: extend API ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell Cc: linux-arch@vger.kernel.org LKML-Reference: <49F654E9.4070809@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 64 +++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9fbf0f7ec7e..5c7630b40a5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -574,13 +574,14 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); } -static void +static int set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; unsigned long flags; unsigned int dest; unsigned int irq; + int ret = -1; irq = desc->irq; cfg = desc->chip_data; @@ -591,18 +592,21 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) /* Only the high 8 bits are valid. */ dest = SET_APIC_LOGICAL_ID(dest); __target_IO_APIC_irq(irq, dest, cfg); + ret = 0; } spin_unlock_irqrestore(&ioapic_lock, flags); + + return ret; } -static void +static int set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc; desc = irq_to_desc(irq); - set_ioapic_affinity_irq_desc(desc, mask); + return set_ioapic_affinity_irq_desc(desc, mask); } #endif /* CONFIG_SMP */ @@ -2348,24 +2352,25 @@ static int ioapic_retrigger_irq(unsigned int irq) * Real vector that is used for interrupting cpu will be coming from * the interrupt-remapping table entry. */ -static void +static int migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; struct irte irte; unsigned int dest; unsigned int irq; + int ret = -1; if (!cpumask_intersects(mask, cpu_online_mask)) - return; + return ret; irq = desc->irq; if (get_irte(irq, &irte)) - return; + return ret; cfg = desc->chip_data; if (assign_irq_vector(irq, cfg, mask)) - return; + return ret; dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); @@ -2381,27 +2386,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) send_cleanup_vector(cfg); cpumask_copy(desc->affinity, mask); + + return 0; } /* * Migrates the IRQ destination in the process context. */ -static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { - migrate_ioapic_irq_desc(desc, mask); + return migrate_ioapic_irq_desc(desc, mask); } -static void set_ir_ioapic_affinity_irq(unsigned int irq, +static int set_ir_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); - set_ir_ioapic_affinity_irq_desc(desc, mask); + return set_ir_ioapic_affinity_irq_desc(desc, mask); } #else -static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { + return 0; } #endif @@ -3318,7 +3326,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms } #ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3327,7 +3335,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3339,13 +3347,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg_desc(desc, &msg); + + return 0; } #ifdef CONFIG_INTR_REMAP /* * Migrate the MSI irq to another cpumask. This migration is * done in the process context using interrupt-remapping hardware. */ -static void +static int ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); @@ -3354,11 +3364,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) struct irte irte; if (get_irte(irq, &irte)) - return; + return -1; dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@ -3375,6 +3385,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) */ if (cfg->move_in_progress) send_cleanup_vector(cfg); + + return 0; } #endif @@ -3528,7 +3540,7 @@ void arch_teardown_msi_irq(unsigned int irq) #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) #ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3537,7 +3549,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3549,6 +3561,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); + + return 0; } #endif /* CONFIG_SMP */ @@ -3582,7 +3596,7 @@ int arch_setup_dmar_msi(unsigned int irq) #ifdef CONFIG_HPET_TIMER #ifdef CONFIG_SMP -static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3591,7 +3605,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3603,6 +3617,8 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); hpet_msi_write(irq, &msg); + + return 0; } #endif /* CONFIG_SMP */ @@ -3659,7 +3675,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) write_ht_irq_msg(irq, &msg); } -static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3667,11 +3683,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; target_ht_irq(irq, dest, cfg->vector); + + return 0; } #endif -- cgit v1.2.3 From 85ac16d033370caf6f48d743c8dc8103700f5cc5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 18:00:38 -0700 Subject: x86/irq: change irq_desc_alloc() to take node instead of cpu This simplifies the node awareness of the code. All our allocators only deal with a NUMA node ID locality not with CPU ids anyway - so there's no need to maintain (and transform) a CPU id all across the IRq layer. v2: keep move_irq_desc related [ Impact: cleanup, prepare IRQ code to be NUMA-aware ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell Cc: Jeremy Fitzhardinge LKML-Reference: <49F65536.2020300@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 58 +++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 32 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5c7630b40a5..560b887ba27 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -129,12 +129,9 @@ struct irq_pin_list { struct irq_pin_list *next; }; -static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) +static struct irq_pin_list *get_one_free_irq_2_pin(int node) { struct irq_pin_list *pin; - int node; - - node = cpu_to_node(cpu); pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); @@ -209,12 +206,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq) return cfg; } -static struct irq_cfg *get_one_free_irq_cfg(int cpu) +static struct irq_cfg *get_one_free_irq_cfg(int node) { struct irq_cfg *cfg; - int node; - - node = cpu_to_node(cpu); cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); if (cfg) { @@ -235,13 +229,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu) return cfg; } -int arch_init_chip_data(struct irq_desc *desc, int cpu) +int arch_init_chip_data(struct irq_desc *desc, int node) { struct irq_cfg *cfg; cfg = desc->chip_data; if (!cfg) { - desc->chip_data = get_one_free_irq_cfg(cpu); + desc->chip_data = get_one_free_irq_cfg(node); if (!desc->chip_data) { printk(KERN_ERR "can not alloc irq_cfg\n"); BUG_ON(1); @@ -253,7 +247,7 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu) /* for move_irq_desc */ static void -init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) +init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node) { struct irq_pin_list *old_entry, *head, *tail, *entry; @@ -262,7 +256,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) if (!old_entry) return; - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) return; @@ -272,7 +266,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) tail = entry; old_entry = old_entry->next; while (old_entry) { - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) { entry = head; while (entry) { @@ -312,12 +306,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) } void arch_init_copy_chip_data(struct irq_desc *old_desc, - struct irq_desc *desc, int cpu) + struct irq_desc *desc, int node) { struct irq_cfg *cfg; struct irq_cfg *old_cfg; - cfg = get_one_free_irq_cfg(cpu); + cfg = get_one_free_irq_cfg(node); if (!cfg) return; @@ -328,7 +322,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc, memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); - init_copy_irq_2_pin(old_cfg, cfg, cpu); + init_copy_irq_2_pin(old_cfg, cfg, node); } static void free_irq_cfg(struct irq_cfg *old_cfg) @@ -615,13 +609,13 @@ set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) +static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { struct irq_pin_list *entry; entry = cfg->irq_2_pin; if (!entry) { - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) { printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", apic, pin); @@ -641,7 +635,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) entry = entry->next; } - entry->next = get_one_free_irq_2_pin(cpu); + entry->next = get_one_free_irq_2_pin(node); entry = entry->next; entry->apic = apic; entry->pin = pin; @@ -650,7 +644,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) /* * Reroute an IRQ to a different pin. */ -static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, +static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, int oldapic, int oldpin, int newapic, int newpin) { @@ -670,7 +664,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, /* why? call replace before add? */ if (!replaced) - add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); + add_pin_to_irq_node(cfg, node, newapic, newpin); } static inline void io_apic_modify_irq(struct irq_cfg *cfg, @@ -1612,7 +1606,7 @@ static void __init setup_IO_APIC_irqs(void) int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1647,13 +1641,13 @@ static void __init setup_IO_APIC_irqs(void) apic->multi_timer_check(apic_id, irq)) continue; - desc = irq_to_desc_alloc_cpu(irq, cpu); + desc = irq_to_desc_alloc_node(irq, node); if (!desc) { printk(KERN_INFO "can not get irq_desc for %d\n", irq); continue; } cfg = desc->chip_data; - add_pin_to_irq_cpu(cfg, cpu, apic_id, pin); + add_pin_to_irq_node(cfg, node, apic_id, pin); setup_IO_APIC_irq(apic_id, pin, irq, desc, irq_trigger(idx), irq_polarity(idx)); @@ -2863,7 +2857,7 @@ static inline void __init check_timer(void) { struct irq_desc *desc = irq_to_desc(0); struct irq_cfg *cfg = desc->chip_data; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); int apic1, pin1, apic2, pin2; unsigned long flags; int no_pin1 = 0; @@ -2929,7 +2923,7 @@ static inline void __init check_timer(void) * Ok, does IRQ0 through the IOAPIC work? */ if (no_pin1) { - add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); + add_pin_to_irq_node(cfg, node, apic1, pin1); setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } else { /* for edge trigger, setup_IO_APIC_irq already @@ -2966,7 +2960,7 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); + replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); enable_8259A_irq(0); if (timer_irq_works()) { @@ -3185,7 +3179,7 @@ unsigned int create_irq_nr(unsigned int irq_want) unsigned int new; unsigned long flags; struct irq_cfg *cfg_new = NULL; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); struct irq_desc *desc_new = NULL; irq = 0; @@ -3194,7 +3188,7 @@ unsigned int create_irq_nr(unsigned int irq_want) spin_lock_irqsave(&vector_lock, flags); for (new = irq_want; new < nr_irqs; new++) { - desc_new = irq_to_desc_alloc_cpu(new, cpu); + desc_new = irq_to_desc_alloc_node(new, node); if (!desc_new) { printk(KERN_INFO "can not get irq_desc for %d\n", new); continue; @@ -3968,7 +3962,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p { struct irq_desc *desc; struct irq_cfg *cfg; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", @@ -3976,7 +3970,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p return -EINVAL; } - desc = irq_to_desc_alloc_cpu(irq, cpu); + desc = irq_to_desc_alloc_node(irq, node); if (!desc) { printk(KERN_INFO "can not get irq_desc %d\n", irq); return 0; @@ -3987,7 +3981,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p */ if (irq >= NR_IRQS_LEGACY) { cfg = desc->chip_data; - add_pin_to_irq_cpu(cfg, cpu, ioapic, pin); + add_pin_to_irq_node(cfg, node, ioapic, pin); } setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); -- cgit v1.2.3 From a2f809b08ae4dddc1015c7dcd8659e5729e45b3e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 18:01:20 -0700 Subject: irq: change ACPI GSI APIs to also take a device argument We want to use dev_to_node() later on, to be aware of the 'home node' of the GSI in question. [ Impact: cleanup, prepare the IRQ code to be more NUMA aware ] Signed-off-by: Yinghai Lu Acked-by: Len Brown Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell Cc: Len Brown Cc: Bjorn Helgaas Cc: Tony Luck Cc: linux-acpi@vger.kernel.org Cc: linux-ia64@vger.kernel.org LKML-Reference: <49F65560.20904@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 8 ++++---- arch/x86/kernel/apic/io_apic.c | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 723989d7f80..6ee96b5530f 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -522,7 +522,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) * success: return IRQ number (>=0) * failure: return < 0 */ -int acpi_register_gsi(u32 gsi, int triggering, int polarity) +int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) { unsigned int irq; unsigned int plat_gsi = gsi; @@ -539,7 +539,7 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity) #ifdef CONFIG_X86_IO_APIC if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { - plat_gsi = mp_register_gsi(gsi, triggering, polarity); + plat_gsi = mp_register_gsi(dev, gsi, triggering, polarity); } #endif acpi_gsi_to_irq(plat_gsi, &irq); @@ -1158,7 +1158,7 @@ void __init mp_config_acpi_legacy_irqs(void) } } -int mp_register_gsi(u32 gsi, int triggering, int polarity) +int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) { int ioapic; int ioapic_pin; @@ -1253,7 +1253,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) } } #endif - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, + io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi, triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, polarity == ACPI_ACTIVE_HIGH ? 0 : 1); return gsi; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 560b887ba27..d9346622601 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3958,7 +3958,8 @@ int __init io_apic_get_version(int ioapic) } #endif -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) +int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, + int triggering, int polarity) { struct irq_desc *desc; struct irq_cfg *cfg; -- cgit v1.2.3 From 024154cfdd802654cb236a18c78b6e37351e2c49 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 18:01:50 -0700 Subject: irq: change io_apic_set_pci_routing() to use device parameter Make actual use of the device parameter passed down to io_apic_set_pci_routing() - to have the IRQ descriptor on the home node of the device. If no device has been passed down, we assume it's a platform device and use the boot node ID for the IRQ descriptor. [ Impact: optimization, make IO-APIC code more NUMA aware ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49F6557E.3080101@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d9346622601..82376e021b5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3963,7 +3963,7 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, { struct irq_desc *desc; struct irq_cfg *cfg; - int node = cpu_to_node(boot_cpu_id); + int node; if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", @@ -3971,6 +3971,11 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, return -EINVAL; } + if (dev) + node = dev_to_node(dev); + else + node = cpu_to_node(boot_cpu_id); + desc = irq_to_desc_alloc_node(irq, node); if (!desc) { printk(KERN_INFO "can not get irq_desc %d\n", irq); -- cgit v1.2.3 From d047f53a2ecce37e3bdf79eac5a326fbaadb3628 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 18:02:23 -0700 Subject: x86/irq: change MSI irq_desc to be more numa aware Try to get irq_desc on the home node in create_irq_nr(). v2: don't check if we can move it when sparse_irq is not used v3: use move_irq_des, if that node is not what we want [ Impact: optimization, make MSI IRQ descriptors more NUMA aware ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49F6559F.7070005@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 82376e021b5..9cd4806cdf5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3172,14 +3172,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY; /* * Dynamic irq allocate and deallocation */ -unsigned int create_irq_nr(unsigned int irq_want) +unsigned int create_irq_nr(unsigned int irq_want, int node) { /* Allocate an unused irq */ unsigned int irq; unsigned int new; unsigned long flags; struct irq_cfg *cfg_new = NULL; - int node = cpu_to_node(boot_cpu_id); struct irq_desc *desc_new = NULL; irq = 0; @@ -3197,6 +3196,13 @@ unsigned int create_irq_nr(unsigned int irq_want) if (cfg_new->vector != 0) continue; + +#ifdef CONFIG_NUMA_IRQ_DESC + /* different node ?*/ + if (desc_new->node != node) + desc = move_irq_desc(desc, node); +#endif + if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; break; @@ -3214,11 +3220,12 @@ unsigned int create_irq_nr(unsigned int irq_want) int create_irq(void) { + int node = cpu_to_node(boot_cpu_id); unsigned int irq_want; int irq; irq_want = nr_irqs_gsi; - irq = create_irq_nr(irq_want); + irq = create_irq_nr(irq_want, node); if (irq == 0) irq = -1; @@ -3476,15 +3483,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) unsigned int irq_want; struct intel_iommu *iommu = NULL; int index = 0; + int node; /* x86 doesn't support multiple MSI yet */ if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; + node = dev_to_node(&dev->dev); irq_want = nr_irqs_gsi; sub_handle = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want); + irq = create_irq_nr(irq_want, node); if (irq == 0) return -1; irq_want = irq + 1; -- cgit v1.2.3 From aee6a166a5401dcfcb17fcdc055e5edf2a4f4042 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:17 +0200 Subject: x86: beautify vmlinux_32.lds.S Beautify vmlinux_32.lds.S: - Use tabs for indent - Located curly braces like in C code - Rearranged a few comments To see actual differences use "git diff -b" which ignore 'whitespace' changes. The beautification is done to prepare a unification of the _32 and _64 variants of the linker scripts. [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-1-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux_32.lds.S | 376 +++++++++++++++++++++------------------ 1 file changed, 200 insertions(+), 176 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 62ad500d55f..fffa45a1036 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -22,196 +22,220 @@ ENTRY(phys_startup_32) jiffies = jiffies_64; PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + note PT_NOTE FLAGS(0); /* ___ */ } SECTIONS { - . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; - phys_startup_32 = startup_32 - LOAD_OFFSET; - - .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { - _text = .; /* Text and read-only data */ - *(.text.head) - } :text = 0x9090 - - /* read-only */ - .text : AT(ADDR(.text) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */ - *(.text.page_aligned) - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - _etext = .; /* End of text section */ - } :text = 0x9090 - - NOTES :text :note - - . = ALIGN(16); /* Exception table */ - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 - - RODATA - - /* writeable */ - . = ALIGN(PAGE_SIZE); - .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ - DATA_DATA - CONSTRUCTORS + . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; + phys_startup_32 = startup_32 - LOAD_OFFSET; + + /* Text and read-only data */ + .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { + _text = .; + *(.text.head) + } :text = 0x9090 + + /* read-only */ + .text : AT(ADDR(.text) - LOAD_OFFSET) { + /* not really needed, already page aligned */ + . = ALIGN(PAGE_SIZE); + *(.text.page_aligned) + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) + /* End of text section */ + _etext = .; + } :text = 0x9090 + + NOTES :text :note + + /* Exception table */ + . = ALIGN(16); + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } :text = 0x9090 + + RODATA + + /* writeable */ + . = ALIGN(PAGE_SIZE); + /* Data */ + .data : AT(ADDR(.data) - LOAD_OFFSET) { + DATA_DATA + CONSTRUCTORS } :data - . = ALIGN(PAGE_SIZE); - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } - - . = ALIGN(PAGE_SIZE); - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - *(.data.page_aligned) - *(.data.idt) - } - - . = ALIGN(32); - .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - *(.data.cacheline_aligned) - } - - /* rarely changed data like cpu maps */ - . = ALIGN(32); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - _edata = .; /* End of data section */ - } - - . = ALIGN(THREAD_SIZE); /* init_task */ - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - *(.data.init_task) - } - - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - } - /* will be freed after init - * Following ALIGN() is required to make sure no other data falls on the - * same page where __smp_alt_end is pointing as that page might be freed - * after boot. Always make sure that ALIGN() directive is present after - * the section which contains __smp_alt_end. - */ - . = ALIGN(PAGE_SIZE); - - /* will be freed after init */ - . = ALIGN(PAGE_SIZE); /* Init code and data */ - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - __init_begin = .; - _sinittext = .; - INIT_TEXT - _einittext = .; - } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - INIT_DATA - } - . = ALIGN(16); - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - SECURITY_INIT - . = ALIGN(4); - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - . = ALIGN(4); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - /* .exit.text is discard at runtime, not link time, to deal with references - from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } + . = ALIGN(PAGE_SIZE); + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; + } + + . = ALIGN(PAGE_SIZE); + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) + *(.data.idt) + } + + . = ALIGN(32); + .data.cacheline_aligned : + AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + *(.data.cacheline_aligned) + } + + /* rarely changed data like cpu maps */ + . = ALIGN(32); + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { + *(.data.read_mostly) + + /* End of data section */ + _edata = .; + } + + /* init_task */ + . = ALIGN(THREAD_SIZE); + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + *(.data.init_task) + } + + . = ALIGN(PAGE_SIZE); + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + /* might get freed after init */ + __smp_locks = .; + *(.smp_locks) + __smp_locks_end = .; + } + /* will be freed after init + * Following ALIGN() is required to make sure no other data falls on the + * same page where __smp_alt_end is pointing as that page might be freed + * after boot. Always make sure that ALIGN() directive is present after + * the section which contains __smp_alt_end. + */ + . = ALIGN(PAGE_SIZE); + + /* Init code and data - will be freed after init */ + . = ALIGN(PAGE_SIZE); + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + __init_begin = .; + _sinittext = .; + INIT_TEXT + _einittext = .; + } + + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + INIT_DATA + } + + . = ALIGN(16); + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + __setup_start = .; + *(.init.setup) + __setup_end = .; + } + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; + INITCALLS + __initcall_end = .; + } + + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; + *(.con_initcall.init) + __con_initcall_end = .; + } + + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; + } + + SECURITY_INIT + + . = ALIGN(4); + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } + + . = ALIGN(4); + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + + /* + * .exit.text is discard at runtime, not link time, to deal with + * references from .altinstructions and .eh_frame + */ + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } + #if defined(CONFIG_BLK_DEV_INITRD) - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } + . = ALIGN(PAGE_SIZE); + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } #endif - PERCPU(PAGE_SIZE) - . = ALIGN(PAGE_SIZE); - /* freed after init ends here */ - - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - __init_end = .; - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - . = ALIGN(4); - __bss_stop = .; - } - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + PERCPU(PAGE_SIZE) + . = ALIGN(PAGE_SIZE); - __brk_base = . ; - . += 64 * 1024 ; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = . ; - } + /* freed after init ends here */ + + /* BSS */ + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + __init_end = .; + __bss_start = .; + *(.bss.page_aligned) + *(.bss) + . = ALIGN(4); + __bss_stop = .; + } - .end : AT(ADDR(.end) - LOAD_OFFSET) { - _end = . ; - } + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __brk_base = .; + . += 64 * 1024; /* 64k alignment slop space */ + *(.brk_reservation) /* areas brk users have reserved */ + __brk_limit = .; + } - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) + .end : AT(ADDR(.end) - LOAD_OFFSET) { + _end = . ; } - STABS_DEBUG + /* Sections to be discarded */ + /DISCARD/ : { + *(.exitcall.exit) + *(.discard) + } - DWARF_DEBUG + STABS_DEBUG + DWARF_DEBUG } /* -- cgit v1.2.3 From 17ce265d6a1789eae5eb739a3bb7fcffdb3e87c5 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:18 +0200 Subject: x86, vmlinux.lds: unify header/footer Merge everything except PHDRS and SECTIONS into vmlinux.lds.S. [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-2-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 77 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 37 ------------------- arch/x86/kernel/vmlinux_64.lds.S | 42 ---------------------- 3 files changed, 77 insertions(+), 79 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 849ee611f01..d113642c134 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -1,5 +1,82 @@ +/* + * ld script for the x86 kernel + * + * Historic 32-bit version written by Martin Mares + * + * Modernisation and unification done by Sam Ravnborg + * + * + * Don't define absolute symbols until and unless you know that symbol + * value is should remain constant even if kernel image is relocated + * at run time. Absolute symbols are not relocated. If symbol value should + * change if kernel is relocated, make the symbol section relative and + * put it inside the section definition. + */ + +#ifdef CONFIG_X86_32 +#define LOAD_OFFSET __PAGE_OFFSET +#else +#define LOAD_OFFSET __START_KERNEL_map +#endif + +#include +#include +#include +#include +#include +#include + +#undef i386 /* in case the preprocessor is a 32bit one */ + +OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) + +#ifdef CONFIG_X86_32 +OUTPUT_ARCH(i386) +ENTRY(phys_startup_32) +jiffies = jiffies_64; +#else +OUTPUT_ARCH(i386:x86-64) +ENTRY(phys_startup_64) +jiffies_64 = jiffies; +#endif + + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else # include "vmlinux_64.lds.S" #endif + + +#ifdef CONFIG_X86_32 +ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE") +#else +/* + * Per-cpu symbols which need to be offset from __per_cpu_load + * for the boot processor. + */ +#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load +INIT_PER_CPU(gdt_page); +INIT_PER_CPU(irq_stack_union); + +/* + * Build-time check on the image size: + */ +ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE") + +#ifdef CONFIG_SMP +ASSERT((per_cpu__irq_stack_union == 0), + "irq_stack_union is not at start of per-cpu area"); +#endif + +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_KEXEC +#include + +ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big") +#endif + diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index fffa45a1036..4c985fcd9ab 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,26 +1,3 @@ -/* ld script to make i386 Linux kernel - * Written by Martin Mares ; - * - * Don't define absolute symbols until and unless you know that symbol - * value is should remain constant even if kernel image is relocated - * at run time. Absolute symbols are not relocated. If symbol value should - * change if kernel is relocated, make the symbol section relative and - * put it inside the section definition. - */ - -#define LOAD_OFFSET __PAGE_OFFSET - -#include -#include -#include -#include -#include - -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(phys_startup_32) -jiffies = jiffies_64; - PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ @@ -237,17 +214,3 @@ SECTIONS STABS_DEBUG DWARF_DEBUG } - -/* - * Build-time check on the image size: - */ -ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") - -#ifdef CONFIG_KEXEC -/* Link time checks */ -#include - -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big") -#endif diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 6d5a5b05eaa..7f1cc3d5fef 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,19 +1,3 @@ -/* ld script to make x86-64 Linux kernel - * Written by Martin Mares ; - */ - -#define LOAD_OFFSET __START_KERNEL_map - -#include -#include -#include - -#undef i386 /* in case the preprocessor is a 32bit one */ - -OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") -OUTPUT_ARCH(i386:x86-64) -ENTRY(phys_startup_64) -jiffies_64 = jiffies; PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ @@ -308,29 +292,3 @@ SECTIONS STABS_DEBUG DWARF_DEBUG } - -/* - * Per-cpu symbols which need to be offset from __per_cpu_load - * for the boot processor. - */ -#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load -INIT_PER_CPU(gdt_page); -INIT_PER_CPU(irq_stack_union); - -/* - * Build-time check on the image size: - */ -ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") - -#ifdef CONFIG_SMP -ASSERT((per_cpu__irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); -#endif - -#ifdef CONFIG_KEXEC -#include - -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big") -#endif -- cgit v1.2.3 From afb8095a7eab32e5760613fa73d2f80a39cc45bf Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:19 +0200 Subject: x86, vmlinux.lds: unify PHDRS PHDRS are not equal for the two - so use ifdefs to cover up for that. On the assumption that they may become equal the ifdef is inside the PHDRS definiton. [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-3-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 13 +++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 5 ----- arch/x86/kernel/vmlinux_64.lds.S | 11 ----------- 3 files changed, 13 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index d113642c134..1a1b303a427 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -40,6 +40,19 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; #endif +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ +#ifdef CONFIG_X86_64 + user PT_LOAD FLAGS(7); /* RWE */ + data.init PT_LOAD FLAGS(7); /* RWE */ +#ifdef CONFIG_SMP + percpu PT_LOAD FLAGS(7); /* RWE */ +#endif + data.init2 PT_LOAD FLAGS(7); /* RWE */ +#endif + note PT_NOTE FLAGS(0); /* ___ */ +} #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 4c985fcd9ab..4fd40dc5017 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,8 +1,3 @@ -PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ -} SECTIONS { . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 7f1cc3d5fef..6e7cbee0e87 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,14 +1,3 @@ -PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - user PT_LOAD FLAGS(7); /* RWE */ - data.init PT_LOAD FLAGS(7); /* RWE */ -#ifdef CONFIG_SMP - percpu PT_LOAD FLAGS(7); /* RWE */ -#endif - data.init2 PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ -} SECTIONS { . = __START_KERNEL; -- cgit v1.2.3 From 444e0ae4831f99ba25062d9a5ccb7117c62841a0 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:20 +0200 Subject: x86, vmlinux.lds: unify start/end of SECTIONS [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-4-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 14 ++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 9 --------- arch/x86/kernel/vmlinux_64.lds.S | 9 --------- 3 files changed, 14 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1a1b303a427..845776fe529 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -54,12 +54,26 @@ PHDRS { note PT_NOTE FLAGS(0); /* ___ */ } +SECTIONS +{ +#ifdef CONFIG_X86_32 + . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; + phys_startup_32 = startup_32 - LOAD_OFFSET; +#else + . = __START_KERNEL; + phys_startup_64 = startup_64 - LOAD_OFFSET; +#endif + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else # include "vmlinux_64.lds.S" #endif + STABS_DEBUG + DWARF_DEBUG +} + #ifdef CONFIG_X86_32 ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 4fd40dc5017..3d3d49c31b0 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,8 +1,3 @@ -SECTIONS -{ - . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; - phys_startup_32 = startup_32 - LOAD_OFFSET; - /* Text and read-only data */ .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { _text = .; @@ -205,7 +200,3 @@ SECTIONS *(.exitcall.exit) *(.discard) } - - STABS_DEBUG - DWARF_DEBUG -} diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 6e7cbee0e87..2d7fa2016c3 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,8 +1,3 @@ -SECTIONS -{ - . = __START_KERNEL; - phys_startup_64 = startup_64 - LOAD_OFFSET; - /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { _text = .; @@ -277,7 +272,3 @@ SECTIONS *(.eh_frame) *(.discard) } - - STABS_DEBUG - DWARF_DEBUG -} -- cgit v1.2.3 From dfc20895d944cfa81d8ff00809b68ecb8f72cbb0 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:21 +0200 Subject: x86, vmlinux.lds: unify .text output sections 32 bit x86 had a dedicated .text.head output section, whereas 64 bit had it all in a single output section. In the unified version the dedicated .text.head output section was kept to have full control over the head code. 32 bit: - Moved definition of _stext to the linker script. The definition is located _after_ .text.page_aligned as this is what 32 bit did before. The ALIGN(8) was introduced so we hit the exact same address (on the tested config) before and after the move. I assume that it is a bug that _stext did not cover the .text.page_aligned section - if this is true it can be fixed in a follow-up patch (and the ugly ALIGN() can be dropped). [ Impact: 64-bit: cleanup, 32-bit: use the 64-bit linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-5-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 7 ------- arch/x86/kernel/vmlinux.lds.S | 31 +++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 24 ------------------------ arch/x86/kernel/vmlinux_64.lds.S | 20 -------------------- 4 files changed, 31 insertions(+), 51 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 30683883e0c..dc5ed4bdd88 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -608,13 +608,6 @@ ignore_int: ENTRY(initial_code) .long i386_start_kernel -.section .text -/* - * Real beginning of normal "text" segment - */ -ENTRY(stext) -ENTRY(_stext) - /* * BSS section */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 845776fe529..a7c88bb4365 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -64,6 +64,37 @@ SECTIONS phys_startup_64 = startup_64 - LOAD_OFFSET; #endif + /* Text and read-only data */ + + /* bootstrapping code */ + .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { + _text = .; + *(.text.head) + } :text = 0x9090 + + /* The rest of the text */ + .text : AT(ADDR(.text) - LOAD_OFFSET) { +#ifdef CONFIG_X86_32 + /* not really needed, already page aligned */ + . = ALIGN(PAGE_SIZE); + *(.text.page_aligned) +#endif + . = ALIGN(8); + _stext = .; + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) + /* End of text section */ + _etext = .; + } :text = 0x9090 + + NOTES :text :note + + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 3d3d49c31b0..854009288ec 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,27 +1,3 @@ - /* Text and read-only data */ - .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { - _text = .; - *(.text.head) - } :text = 0x9090 - - /* read-only */ - .text : AT(ADDR(.text) - LOAD_OFFSET) { - /* not really needed, already page aligned */ - . = ALIGN(PAGE_SIZE); - *(.text.page_aligned) - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - /* End of text section */ - _etext = .; - } :text = 0x9090 - - NOTES :text :note - /* Exception table */ . = ALIGN(16); __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 2d7fa2016c3..b5d43670d80 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,23 +1,3 @@ - /* Text and read-only data */ - .text : AT(ADDR(.text) - LOAD_OFFSET) { - _text = .; - /* First the code that has to be first for bootstrapping */ - *(.text.head) - _stext = .; - /* Then the rest */ - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - /* End of text section */ - _etext = .; - } :text = 0x9090 - - NOTES :text :note - /* Exception table */ . = ALIGN(16); __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { -- cgit v1.2.3 From 448bc3ab0d03e77fee8e4264de0d001fc87bc164 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:22 +0200 Subject: x86, vmlinux.lds: unify exception table [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-6-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 10 ++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 10 ---------- arch/x86/kernel/vmlinux_64.lds.S | 10 ---------- 3 files changed, 10 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index a7c88bb4365..67164f6f092 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -94,6 +94,16 @@ SECTIONS NOTES :text :note + /* Exception table */ + . = ALIGN(16); + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } :text = 0x9090 + + RODATA + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 854009288ec..920cc6989cc 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,13 +1,3 @@ - /* Exception table */ - . = ALIGN(16); - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 - - RODATA - /* writeable */ . = ALIGN(PAGE_SIZE); /* Data */ diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index b5d43670d80..641f3f991a0 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,13 +1,3 @@ - /* Exception table */ - . = ALIGN(16); - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 - - RODATA - /* Align data segment to page size boundary */ . = ALIGN(PAGE_SIZE); /* Data */ -- cgit v1.2.3 From 1f6397bac55040cd520d9eaf299e155a7aa01d5f Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:23 +0200 Subject: x86, vmlinux.lds: unify data output sections For 64 bit the following functional changes are introduced: - .data.page_aligned has moved - .data.cacheline_aligned has moved - .data.read_mostly has moved - ALIGN() moved out of output section for .data.cacheline_aligned - ALIGN() moved out of output section for .data.page_aligned Notice that 32 bit and 64 bit has different location of _edata. .data_nosave is 32 bit only as 64 bit is special due to PERCPU. [ Impact: 32-bit: cleanup, 64-bit: use 32-bit linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-7-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 55 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 37 --------------------------- arch/x86/kernel/vmlinux_64.lds.S | 28 -------------------- 3 files changed, 55 insertions(+), 65 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 67164f6f092..067bdb012da 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -104,6 +104,61 @@ SECTIONS RODATA + /* Data */ + . = ALIGN(PAGE_SIZE); + .data : AT(ADDR(.data) - LOAD_OFFSET) { + DATA_DATA + CONSTRUCTORS + +#ifdef CONFIG_X86_64 + /* End of data section */ + _edata = .; +#endif + } :data + +#ifdef CONFIG_X86_32 + /* 32 bit has nosave before _edata */ + . = ALIGN(PAGE_SIZE); + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; + } +#endif + + . = ALIGN(PAGE_SIZE); + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) + *(.data.idt) + } + +#ifdef CONFIG_X86_32 + . = ALIGN(32); +#else + . = ALIGN(PAGE_SIZE); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); +#endif + .data.cacheline_aligned : + AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + *(.data.cacheline_aligned) + } + + /* rarely changed data like cpu maps */ +#ifdef CONFIG_X86_32 + . = ALIGN(32); +#else + . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); +#endif + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { + *(.data.read_mostly) + +#ifdef CONFIG_X86_32 + /* End of data section */ + _edata = .; +#endif + } + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 920cc6989cc..8ade84687b2 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,40 +1,3 @@ - /* writeable */ - . = ALIGN(PAGE_SIZE); - /* Data */ - .data : AT(ADDR(.data) - LOAD_OFFSET) { - DATA_DATA - CONSTRUCTORS - } :data - - . = ALIGN(PAGE_SIZE); - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } - - . = ALIGN(PAGE_SIZE); - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - *(.data.page_aligned) - *(.data.idt) - } - - . = ALIGN(32); - .data.cacheline_aligned : - AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - *(.data.cacheline_aligned) - } - - /* rarely changed data like cpu maps */ - . = ALIGN(32); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - - /* End of data section */ - _edata = .; - } - /* init_task */ . = ALIGN(THREAD_SIZE); .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 641f3f991a0..826270147b5 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,26 +1,3 @@ - /* Align data segment to page size boundary */ - . = ALIGN(PAGE_SIZE); - /* Data */ - .data : AT(ADDR(.data) - LOAD_OFFSET) { - DATA_DATA - CONSTRUCTORS - /* End of data section */ - _edata = .; - } :data - - - .data.cacheline_aligned : - AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - *(.data.cacheline_aligned) - } - - . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - } - #define VSYSCALL_ADDR (-10*1024*1024) #define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ SIZEOF(.data.read_mostly) + 4095) & ~(4095)) @@ -95,11 +72,6 @@ *(.data.init_task) } :data.init - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - *(.data.page_aligned) - } - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { /* might get freed after init */ . = ALIGN(PAGE_SIZE); -- cgit v1.2.3 From ff6f87e1626e10beef675084c9b5384a9477e3d5 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:24 +0200 Subject: x86, vmlinux.lds: move vsyscall output sections [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-8-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 71 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_64.lds.S | 68 -------------------------------------- 2 files changed, 71 insertions(+), 68 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 067bdb012da..b3106c2a037 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -159,6 +159,77 @@ SECTIONS #endif } +#ifdef CONFIG_X86_64 + +#define VSYSCALL_ADDR (-10*1024*1024) +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) + +#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) + +#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) + + . = VSYSCALL_ADDR; + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { + *(.vsyscall_0) + } :user + + __vsyscall_0 = VSYSCALL_VIRT_ADDR; + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { + *(.vsyscall_fn) + } + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { + *(.vsyscall_gtod_data) + } + + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { + *(.vsyscall_clock) + } + vsyscall_clock = VVIRT(.vsyscall_clock); + + + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { + *(.vsyscall_1) + } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { + *(.vsyscall_2) + } + + .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { + *(.vgetcpu_mode) + } + vgetcpu_mode = VVIRT(.vgetcpu_mode); + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .jiffies : AT(VLOAD(.jiffies)) { + *(.jiffies) + } + jiffies = VVIRT(.jiffies); + + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { + *(.vsyscall_3) + } + + . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; + +#undef VSYSCALL_ADDR +#undef VSYSCALL_PHYS_ADDR +#undef VSYSCALL_VIRT_ADDR +#undef VLOAD_OFFSET +#undef VLOAD +#undef VVIRT_OFFSET +#undef VVIRT + +#endif /* CONFIG_X86_64 */ #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 826270147b5..013aa0e1dd3 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,71 +1,3 @@ -#define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ - SIZEOF(.data.read_mostly) + 4095) & ~(4095)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ - SIZEOF(.data.read_mostly) + 4095) & ~(4095)) - -#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) -#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) - -#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) -#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - - . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { - *(.vsyscall_0) - } :user - - __vsyscall_0 = VSYSCALL_VIRT_ADDR; - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { - *(.vsyscall_fn) - } - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { - *(.vsyscall_gtod_data) - } - - vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); - .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { - *(.vsyscall_clock) - } - vsyscall_clock = VVIRT(.vsyscall_clock); - - - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { - *(.vsyscall_1) - } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { - *(.vsyscall_2) - } - - .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { - *(.vgetcpu_mode) - } - vgetcpu_mode = VVIRT(.vgetcpu_mode); - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT(VLOAD(.jiffies)) { - *(.jiffies) - } - jiffies = VVIRT(.jiffies); - - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { - *(.vsyscall_3) - } - - . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; - -#undef VSYSCALL_ADDR -#undef VSYSCALL_PHYS_ADDR -#undef VSYSCALL_VIRT_ADDR -#undef VLOAD_OFFSET -#undef VLOAD -#undef VVIRT_OFFSET -#undef VVIRT - /* init_task */ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { . = ALIGN(THREAD_SIZE); -- cgit v1.2.3 From e58bdaa8f810332e5c1760ce496b01e07d51642c Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:25 +0200 Subject: x86, vmlinux.lds: unify first part of initdata 32-bit: - Move definition of __init_begin outside output_section because it covers more than one section - Move ALIGN() for end-of-section inside .smp_locks output section. Same effect but the intent is better documented that we need both start and end aligned. 64-bit: - Move ALIGN() outside output section in .init.setup - Deleted unused __smp_alt_* symbols None of the above should result in any functional change. [ Impact: refactor and unify linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-9-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 61 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 60 --------------------------------------- arch/x86/kernel/vmlinux_64.lds.S | 59 -------------------------------------- 3 files changed, 61 insertions(+), 119 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index b3106c2a037..8b203c4ced9 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -231,6 +231,67 @@ SECTIONS #endif /* CONFIG_X86_64 */ + /* init_task */ + . = ALIGN(THREAD_SIZE); + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + *(.data.init_task) + } +#ifdef CONFIG_X86_64 + :data.init +#endif + + /* + * smp_locks might be freed after init + * start/end must be page aligned + */ + . = ALIGN(PAGE_SIZE); + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + __smp_locks = .; + *(.smp_locks) + __smp_locks_end = .; + . = ALIGN(PAGE_SIZE); + } + + /* Init code and data - will be freed after init */ + . = ALIGN(PAGE_SIZE); + __init_begin = .; /* paired with __init_end */ + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + _sinittext = .; + INIT_TEXT + _einittext = .; + } + + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + INIT_DATA + } + + . = ALIGN(16); + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + __setup_start = .; + *(.init.setup) + __setup_end = .; + } + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; + INITCALLS + __initcall_end = .; + } + + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; + *(.con_initcall.init) + __con_initcall_end = .; + } + + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; + } + + SECURITY_INIT + + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 8ade84687b2..d8ba5394af0 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,63 +1,3 @@ - /* init_task */ - . = ALIGN(THREAD_SIZE); - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - *(.data.init_task) - } - - . = ALIGN(PAGE_SIZE); - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - /* might get freed after init */ - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - } - /* will be freed after init - * Following ALIGN() is required to make sure no other data falls on the - * same page where __smp_alt_end is pointing as that page might be freed - * after boot. Always make sure that ALIGN() directive is present after - * the section which contains __smp_alt_end. - */ - . = ALIGN(PAGE_SIZE); - - /* Init code and data - will be freed after init */ - . = ALIGN(PAGE_SIZE); - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - __init_begin = .; - _sinittext = .; - INIT_TEXT - _einittext = .; - } - - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - INIT_DATA - } - - . = ALIGN(16); - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - - SECURITY_INIT - . = ALIGN(4); .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { __alt_instructions = .; diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 013aa0e1dd3..0e8054e0c5c 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,62 +1,3 @@ - /* init_task */ - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - . = ALIGN(THREAD_SIZE); - *(.data.init_task) - } :data.init - - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - __smp_alt_begin = .; - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - . = ALIGN(PAGE_SIZE); - __smp_alt_end = .; - } - - /* Init code and data */ - . = ALIGN(PAGE_SIZE); - __init_begin = .; /* paired with __init_end */ - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - _sinittext = .; - INIT_TEXT - _einittext = .; - } - - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - __initdata_begin = .; - INIT_DATA - __initdata_end = .; - } - - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - . = ALIGN(16); - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - - SECURITY_INIT - . = ALIGN(8); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { __parainstructions = .; -- cgit v1.2.3 From ae61836289a415351caa524d328110aaeae100d4 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:26 +0200 Subject: x86, vmlinux.lds: unify parainstructions 32 bit: - increase alignment from 4 to 8 for .parainstructions - increase alignment from 4 to 8 for .altinstructions 64 bit: - move ALIGN() outside output section for .altinstructions None of the above should result in any functional change. [ Impact: refactor and unify linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-10-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 18 ++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 18 ------------------ arch/x86/kernel/vmlinux_64.lds.S | 18 ------------------ 3 files changed, 18 insertions(+), 36 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 8b203c4ced9..c8dd71ecb56 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -291,6 +291,24 @@ SECTIONS SECURITY_INIT + . = ALIGN(8); + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + + . = ALIGN(8); + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index d8ba5394af0..5df9647bb5d 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,21 +1,3 @@ - . = ALIGN(4); - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - - . = ALIGN(4); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - /* * .exit.text is discard at runtime, not link time, to deal with * references from .altinstructions and .eh_frame diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 0e8054e0c5c..9ef70966985 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,21 +1,3 @@ - . = ALIGN(8); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - . = ALIGN(8); - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - /* * .exit.text is discard at runtime, not link time, to deal with * references from .altinstructions and .eh_frame -- cgit v1.2.3 From bf6a57418d5445c98047edbec022c9e54d1526e6 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:27 +0200 Subject: x86, vmlinux.lds: unify .exit.* and .init.ramfs [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-11-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 20 ++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 21 --------------------- arch/x86/kernel/vmlinux_64.lds.S | 21 --------------------- 3 files changed, 20 insertions(+), 42 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index c8dd71ecb56..1ab62a5fa1a 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -309,6 +309,26 @@ SECTIONS *(.altinstr_replacement) } + /* + * .exit.text is discard at runtime, not link time, to deal with + * references from .altinstructions and .eh_frame + */ + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } + +#ifdef CONFIG_BLK_DEV_INITRD + . = ALIGN(PAGE_SIZE); + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } +#endif #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 5df9647bb5d..36c8fbd3c76 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,24 +1,3 @@ - /* - * .exit.text is discard at runtime, not link time, to deal with - * references from .altinstructions and .eh_frame - */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } - -#if defined(CONFIG_BLK_DEV_INITRD) - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - PERCPU(PAGE_SIZE) . = ALIGN(PAGE_SIZE); diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 9ef70966985..1aa53622333 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,24 +1,3 @@ - /* - * .exit.text is discard at runtime, not link time, to deal with - * references from .altinstructions and .eh_frame - */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } - -#ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - #ifdef CONFIG_SMP /* * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the -- cgit v1.2.3 From 9d16e78318f174fd4b07916a93e41749d5199267 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:28 +0200 Subject: x86, vmlinux.lds: unify percpu 32 bit: - move __init_end outside the .bss output section It really did not belong in there [ Impact: 64-bit: cleanup, 32-bit: refactor linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-12-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 30 ++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 6 ------ arch/x86/kernel/vmlinux_64.lds.S | 26 -------------------------- 3 files changed, 30 insertions(+), 32 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1ab62a5fa1a..1ea2b8571e1 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -330,6 +330,36 @@ SECTIONS } #endif +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) + /* + * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the + * output PHDR, so the next output section - __data_nosave - should + * start another section data.init2. Also, pda should be at the head of + * percpu area. Preallocate it and define the percpu offset symbol + * so that it can be accessed as a percpu variable. + */ + . = ALIGN(PAGE_SIZE); + PERCPU_VADDR(0, :percpu) +#else + PERCPU(PAGE_SIZE) +#endif + + . = ALIGN(PAGE_SIZE); + /* freed after init ends here */ + __init_end = .; + +#ifdef CONFIG_X86_64 + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; + } :data.init2 + /* use another section data.init2, see PERCPU_VADDR() above */ +#endif + + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 36c8fbd3c76..d23ee2c15c2 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,11 +1,5 @@ - PERCPU(PAGE_SIZE) - - . = ALIGN(PAGE_SIZE); - /* freed after init ends here */ - /* BSS */ .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - __init_end = .; __bss_start = .; *(.bss.page_aligned) *(.bss) diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 1aa53622333..a53936696a0 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,29 +1,3 @@ -#ifdef CONFIG_SMP - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - __data_nosave - should - * start another section data.init2. Also, pda should be at the head of - * percpu area. Preallocate it and define the percpu offset symbol - * so that it can be accessed as a percpu variable. - */ - . = ALIGN(PAGE_SIZE); - PERCPU_VADDR(0, :percpu) -#else - PERCPU(PAGE_SIZE) -#endif - - . = ALIGN(PAGE_SIZE); - __init_end = .; - - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } :data.init2 - /* use another section data.init2, see PERCPU_VADDR() above */ - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { . = ALIGN(PAGE_SIZE); __bss_start = .; /* BSS */ -- cgit v1.2.3 From 091e52c3551d3031343df24b573b770b4c6c72b6 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:29 +0200 Subject: x86, vmlinux.lds: unify remaining parts 32 bit: - explicit page align .bss - move ALING() out of .brk output section - discard *(.eh_frame) 64 bit: - move ALIGN() out of .bss output section - move ALIGN() out of .brk output section - use a dedicated section to define _end [ Impact: unify and fix section alignments in linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-13-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 32 +++++++++++++++++++++++++++----- arch/x86/kernel/vmlinux_32.lds.S | 26 -------------------------- arch/x86/kernel/vmlinux_64.lds.S | 24 ------------------------ 3 files changed, 27 insertions(+), 55 deletions(-) delete mode 100644 arch/x86/kernel/vmlinux_32.lds.S delete mode 100644 arch/x86/kernel/vmlinux_64.lds.S (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1ea2b8571e1..ef3e4f1042b 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -359,12 +359,34 @@ SECTIONS /* use another section data.init2, see PERCPU_VADDR() above */ #endif + /* BSS */ + . = ALIGN(PAGE_SIZE); + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + __bss_start = .; + *(.bss.page_aligned) + *(.bss) + . = ALIGN(4); + __bss_stop = .; + } -#ifdef CONFIG_X86_32 -# include "vmlinux_32.lds.S" -#else -# include "vmlinux_64.lds.S" -#endif + . = ALIGN(PAGE_SIZE); + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + __brk_base = .; + . += 64 * 1024; /* 64k alignment slop space */ + *(.brk_reservation) /* areas brk users have reserved */ + __brk_limit = .; + } + + .end : AT(ADDR(.end) - LOAD_OFFSET) { + _end = .; + } + + /* Sections to be discarded */ + /DISCARD/ : { + *(.exitcall.exit) + *(.eh_frame) + *(.discard) + } STABS_DEBUG DWARF_DEBUG diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S deleted file mode 100644 index d23ee2c15c2..00000000000 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ /dev/null @@ -1,26 +0,0 @@ - /* BSS */ - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - __bss_start = .; - *(.bss.page_aligned) - *(.bss) - . = ALIGN(4); - __bss_stop = .; - } - - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __brk_base = .; - . += 64 * 1024; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = .; - } - - .end : AT(ADDR(.end) - LOAD_OFFSET) { - _end = . ; - } - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - } diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S deleted file mode 100644 index a53936696a0..00000000000 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ /dev/null @@ -1,24 +0,0 @@ - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - __bss_stop = .; - } - - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __brk_base = .; - . += 64 * 1024; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = .; - } - - _end = . ; - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) - } -- cgit v1.2.3 From 91fd7fe809bdf4d8aa56559d17b9f25a1a6fe732 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2009 10:58:38 +0200 Subject: x86, vmlinux.lds: add copyright Acked-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-2-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index ef3e4f1042b..0bdbaa57969 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -3,7 +3,8 @@ * * Historic 32-bit version written by Martin Mares * - * Modernisation and unification done by Sam Ravnborg + * Modernisation, unification and other changes and fixes: + * Copyright (C) 2007-2009 Sam Ravnborg * * * Don't define absolute symbols until and unless you know that symbol -- cgit v1.2.3 From fd0731944333db6e9e91b6954c6ef95f4b71ab04 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2009 12:56:58 +0200 Subject: x86, vmlinux.lds: fix relocatable symbols __init_begin/_end symbols should be inside sections as well, otherwise the relocatable kernel gets confused when freeing init sections in the wrong place. [ Impact: fix bootup crash ] Cc: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <20090429105056.GA28720@uranus.ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0bdbaa57969..4c85b2e2bb6 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -255,8 +255,8 @@ SECTIONS /* Init code and data - will be freed after init */ . = ALIGN(PAGE_SIZE); - __init_begin = .; /* paired with __init_end */ .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + __init_begin = .; /* paired with __init_end */ _sinittext = .; INIT_TEXT _einittext = .; @@ -346,8 +346,11 @@ SECTIONS #endif . = ALIGN(PAGE_SIZE); + /* freed after init ends here */ - __init_end = .; + .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) { + __init_end = .; + } #ifdef CONFIG_X86_64 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { -- cgit v1.2.3 From da1a776be1ac7f78bb30ececbec4c1383163b079 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:46:58 +0200 Subject: perf_counter, x86: remove X86_FEATURE_ARCH_PERFMON flag for AMD cpus X86_FEATURE_ARCH_PERFMON is an Intel hardware feature that does not work on AMD CPUs. The flag is now only used in Intel specific code (especially initialization). [ Impact: refactor code ] Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1241002046-8832-2-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 4 ---- arch/x86/kernel/cpu/perf_counter.c | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index fd69c514ca2..7e4a459daa6 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -420,10 +420,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 6) set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); - /* Enable Performance counter for K7 and later */ - if (c->x86 > 6 && c->x86 <= 0x11) - set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); - if (!c->x86_model_id[0]) { switch (c->x86) { case 0xf: diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 0fcbaab83f9..7d0f81dcb52 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -949,6 +949,9 @@ static struct pmc_x86_ops *pmc_intel_init(void) unsigned int unused; unsigned int ebx; + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return NULL; + /* * Check whether the Architectural PerfMon supports * Branch Misses Retired Event or not. @@ -987,9 +990,6 @@ static struct pmc_x86_ops *pmc_amd_init(void) void __init init_hw_perf_counters(void) { - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return; - switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: pmc_ops = pmc_intel_init(); -- cgit v1.2.3 From 4138960a9251a265002b5cf07e671a49f8495381 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:00 +0200 Subject: perf_counter, x86: add default path to cpu detection This quits hw counter initialization immediately if no cpu is detected. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-4-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7d0f81dcb52..d6d6529349d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -997,6 +997,8 @@ void __init init_hw_perf_counters(void) case X86_VENDOR_AMD: pmc_ops = pmc_amd_init(); break; + default: + return; } if (!pmc_ops) return; -- cgit v1.2.3 From 4295ee62660b13ddb87d41539f49b239e6e7d56f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:01 +0200 Subject: perf_counter, x86: rework pmc_amd_save_disable_all() and pmc_amd_restore_all() MSR reads and writes are expensive. This patch adds checks to avoid its usage where possible. [ Impact: micro-optimization on AMD CPUs ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-5-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d6d6529349d..75a090394b6 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -334,11 +334,13 @@ static u64 pmc_amd_save_disable_all(void) for (idx = 0; idx < nr_counters_generic; idx++) { u64 val; + if (!test_bit(idx, cpuc->active_mask)) + continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) { - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); - } + if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) + continue; + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } return enabled; @@ -372,13 +374,15 @@ static void pmc_amd_restore_all(u64 ctrl) return; for (idx = 0; idx < nr_counters_generic; idx++) { - if (test_bit(idx, cpuc->active_mask)) { - u64 val; + u64 val; - rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); - } + if (!test_bit(idx, cpuc->active_mask)) + continue; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + continue; + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } } -- cgit v1.2.3 From 527e26af3741a2168986d8b82653ffe173891324 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:02 +0200 Subject: perf_counter, x86: protect per-cpu variables with compile barriers only Per-cpu variables needn't to be protected with cpu barriers (smp_wmb()). Protection is only needed for preemption on the same cpu (rescheduling or the nmi handler). This can be done using a compiler barrier only. [ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-6-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 75a090394b6..ad663d5ad2d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -673,7 +673,7 @@ try_generic: /* * Make it visible before enabling the hw: */ - smp_wmb(); + barrier(); __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); @@ -745,7 +745,7 @@ static void pmc_generic_disable(struct perf_counter *counter) * Make sure the cleared pointer becomes visible before we * (potentially) free the counter: */ - smp_wmb(); + barrier(); /* * Drain the remaining delta count out of a counter -- cgit v1.2.3 From 4aeb0b4239bb3b67ed402cb9cef3e000c892cadf Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:03 +0200 Subject: perfcounters: rename struct hw_perf_counter_ops into struct pmu This patch renames struct hw_perf_counter_ops into struct pmu. It introduces a structure to describe a cpu specific pmu (performance monitoring unit). It may contain ops and data. The new name of the structure fits better, is shorter, and thus better to handle. Where it was appropriate, names of function and variable have been changed too. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-7-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ad663d5ad2d..95de980c74a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -515,8 +515,8 @@ __pmc_fixed_disable(struct perf_counter *counter, } static inline void -__pmc_generic_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int idx) +__x86_pmu_disable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_disable(counter, hwc, idx); @@ -591,8 +591,8 @@ __pmc_fixed_enable(struct perf_counter *counter, } static void -__pmc_generic_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +__x86_pmu_enable(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_enable(counter, hwc, idx); @@ -626,7 +626,7 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static int pmc_generic_enable(struct perf_counter *counter) +static int x86_pmu_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -667,7 +667,7 @@ try_generic: perf_counters_lapic_init(hwc->nmi); - __pmc_generic_disable(counter, hwc, idx); + __x86_pmu_disable(counter, hwc, idx); cpuc->counters[idx] = counter; /* @@ -676,7 +676,7 @@ try_generic: barrier(); __hw_perf_counter_set_period(counter, hwc, idx); - __pmc_generic_enable(counter, hwc, idx); + __x86_pmu_enable(counter, hwc, idx); return 0; } @@ -731,13 +731,13 @@ void perf_counter_print_debug(void) local_irq_enable(); } -static void pmc_generic_disable(struct perf_counter *counter) +static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __pmc_generic_disable(counter, hwc, idx); + __x86_pmu_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; @@ -767,7 +767,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) - __pmc_generic_enable(counter, hwc, idx); + __x86_pmu_enable(counter, hwc, idx); } /* @@ -805,7 +805,7 @@ again: perf_save_and_restart(counter); if (perf_counter_overflow(counter, nmi, regs, 0)) - __pmc_generic_disable(counter, &counter->hw, bit); + __x86_pmu_disable(counter, &counter->hw, bit); } hw_perf_ack_status(ack); @@ -1034,19 +1034,18 @@ void __init init_hw_perf_counters(void) register_die_notifier(&perf_counter_nmi_notifier); } -static void pmc_generic_read(struct perf_counter *counter) +static void x86_pmu_read(struct perf_counter *counter) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); } -static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .enable = pmc_generic_enable, - .disable = pmc_generic_disable, - .read = pmc_generic_read, +static const struct pmu pmu = { + .enable = x86_pmu_enable, + .disable = x86_pmu_disable, + .read = x86_pmu_read, }; -const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter) +const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { int err; @@ -1054,7 +1053,7 @@ hw_perf_counter_init(struct perf_counter *counter) if (err) return ERR_PTR(err); - return &x86_perf_counter_ops; + return &pmu; } /* -- cgit v1.2.3 From 5f4ec28ffe77c840354cce1820a3436106e9e0f1 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:04 +0200 Subject: perf_counter, x86: rename struct pmc_x86_ops into struct x86_pmu This patch renames struct pmc_x86_ops into struct x86_pmu. It introduces a structure to describe an x86 model specific pmu (performance monitoring unit). It may contain ops and data. The new name of the structure fits better, is shorter, and thus better to handle. Where it was appropriate, names of function and variable have been changed too. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-8-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 135 +++++++++++++++++++------------------ 1 file changed, 68 insertions(+), 67 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 95de980c74a..808a1a11346 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -44,9 +44,9 @@ struct cpu_hw_counters { }; /* - * struct pmc_x86_ops - performance counter x86 ops + * struct x86_pmu - generic x86 pmu */ -struct pmc_x86_ops { +struct x86_pmu { u64 (*save_disable_all)(void); void (*restore_all)(u64); u64 (*get_status)(u64); @@ -60,7 +60,7 @@ struct pmc_x86_ops { int max_events; }; -static struct pmc_x86_ops *pmc_ops __read_mostly; +static struct x86_pmu *x86_pmu __read_mostly; static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, @@ -82,12 +82,12 @@ static const u64 intel_perfmon_event_map[] = [PERF_COUNT_BUS_CYCLES] = 0x013c, }; -static u64 pmc_intel_event_map(int event) +static u64 intel_pmu_event_map(int event) { return intel_perfmon_event_map[event]; } -static u64 pmc_intel_raw_event(u64 event) +static u64 intel_pmu_raw_event(u64 event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL @@ -114,12 +114,12 @@ static const u64 amd_perfmon_event_map[] = [PERF_COUNT_BRANCH_MISSES] = 0x00c5, }; -static u64 pmc_amd_event_map(int event) +static u64 amd_pmu_event_map(int event) { return amd_perfmon_event_map[event]; } -static u64 pmc_amd_raw_event(u64 event) +static u64 amd_pmu_raw_event(u64 event) { #define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL @@ -184,12 +184,12 @@ static bool reserve_pmc_hardware(void) disable_lapic_nmi_watchdog(); for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_perfctr_nmi(pmc_ops->perfctr + i)) + if (!reserve_perfctr_nmi(x86_pmu->perfctr + i)) goto perfctr_fail; } for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_evntsel_nmi(pmc_ops->eventsel + i)) + if (!reserve_evntsel_nmi(x86_pmu->eventsel + i)) goto eventsel_fail; } @@ -197,13 +197,13 @@ static bool reserve_pmc_hardware(void) eventsel_fail: for (i--; i >= 0; i--) - release_evntsel_nmi(pmc_ops->eventsel + i); + release_evntsel_nmi(x86_pmu->eventsel + i); i = nr_counters_generic; perfctr_fail: for (i--; i >= 0; i--) - release_perfctr_nmi(pmc_ops->perfctr + i); + release_perfctr_nmi(x86_pmu->perfctr + i); if (nmi_watchdog == NMI_LOCAL_APIC) enable_lapic_nmi_watchdog(); @@ -216,8 +216,8 @@ static void release_pmc_hardware(void) int i; for (i = 0; i < nr_counters_generic; i++) { - release_perfctr_nmi(pmc_ops->perfctr + i); - release_evntsel_nmi(pmc_ops->eventsel + i); + release_perfctr_nmi(x86_pmu->perfctr + i); + release_evntsel_nmi(x86_pmu->eventsel + i); } if (nmi_watchdog == NMI_LOCAL_APIC) @@ -293,14 +293,14 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * Raw event type provide the config in the event structure */ if (perf_event_raw(hw_event)) { - hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event)); + hwc->config |= x86_pmu->raw_event(perf_event_config(hw_event)); } else { - if (perf_event_id(hw_event) >= pmc_ops->max_events) + if (perf_event_id(hw_event) >= x86_pmu->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); + hwc->config |= x86_pmu->event_map(perf_event_id(hw_event)); } counter->destroy = hw_perf_counter_destroy; @@ -308,7 +308,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return 0; } -static u64 pmc_intel_save_disable_all(void) +static u64 intel_pmu_save_disable_all(void) { u64 ctrl; @@ -318,7 +318,7 @@ static u64 pmc_intel_save_disable_all(void) return ctrl; } -static u64 pmc_amd_save_disable_all(void) +static u64 amd_pmu_save_disable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int enabled, idx; @@ -327,7 +327,8 @@ static u64 pmc_amd_save_disable_all(void) cpuc->enabled = 0; /* * ensure we write the disable before we start disabling the - * counters proper, so that pcm_amd_enable() does the right thing. + * counters proper, so that amd_pmu_enable_counter() does the + * right thing. */ barrier(); @@ -351,19 +352,19 @@ u64 hw_perf_save_disable(void) if (unlikely(!perf_counters_initialized)) return 0; - return pmc_ops->save_disable_all(); + return x86_pmu->save_disable_all(); } /* * Exported because of ACPI idle */ EXPORT_SYMBOL_GPL(hw_perf_save_disable); -static void pmc_intel_restore_all(u64 ctrl) +static void intel_pmu_restore_all(u64 ctrl) { wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); } -static void pmc_amd_restore_all(u64 ctrl) +static void amd_pmu_restore_all(u64 ctrl) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int idx; @@ -391,14 +392,14 @@ void hw_perf_restore(u64 ctrl) if (unlikely(!perf_counters_initialized)) return; - pmc_ops->restore_all(ctrl); + x86_pmu->restore_all(ctrl); } /* * Exported because of ACPI idle */ EXPORT_SYMBOL_GPL(hw_perf_restore); -static u64 pmc_intel_get_status(u64 mask) +static u64 intel_pmu_get_status(u64 mask) { u64 status; @@ -407,7 +408,7 @@ static u64 pmc_intel_get_status(u64 mask) return status; } -static u64 pmc_amd_get_status(u64 mask) +static u64 amd_pmu_get_status(u64 mask) { u64 status = 0; int idx; @@ -432,15 +433,15 @@ static u64 hw_perf_get_status(u64 mask) if (unlikely(!perf_counters_initialized)) return 0; - return pmc_ops->get_status(mask); + return x86_pmu->get_status(mask); } -static void pmc_intel_ack_status(u64 ack) +static void intel_pmu_ack_status(u64 ack) { wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static void pmc_amd_ack_status(u64 ack) +static void amd_pmu_ack_status(u64 ack) { } @@ -449,16 +450,16 @@ static void hw_perf_ack_status(u64 ack) if (unlikely(!perf_counters_initialized)) return; - pmc_ops->ack_status(ack); + x86_pmu->ack_status(ack); } -static void pmc_intel_enable(int idx, u64 config) +static void intel_pmu_enable_counter(int idx, u64 config) { wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config | ARCH_PERFMON_EVENTSEL0_ENABLE); } -static void pmc_amd_enable(int idx, u64 config) +static void amd_pmu_enable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -474,15 +475,15 @@ static void hw_perf_enable(int idx, u64 config) if (unlikely(!perf_counters_initialized)) return; - pmc_ops->enable(idx, config); + x86_pmu->enable(idx, config); } -static void pmc_intel_disable(int idx, u64 config) +static void intel_pmu_disable_counter(int idx, u64 config) { wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config); } -static void pmc_amd_disable(int idx, u64 config) +static void amd_pmu_disable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -496,7 +497,7 @@ static void hw_perf_disable(int idx, u64 config) if (unlikely(!perf_counters_initialized)) return; - pmc_ops->disable(idx, config); + x86_pmu->disable(idx, config); } static inline void @@ -613,11 +614,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS))) + if (unlikely(event == x86_pmu->event_map(PERF_COUNT_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES))) + if (unlikely(event == x86_pmu->event_map(PERF_COUNT_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES))) + if (unlikely(event == x86_pmu->event_map(PERF_COUNT_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; @@ -661,8 +662,8 @@ try_generic: set_bit(idx, cpuc->used); hwc->idx = idx; } - hwc->config_base = pmc_ops->eventsel; - hwc->counter_base = pmc_ops->perfctr; + hwc->config_base = x86_pmu->eventsel; + hwc->counter_base = x86_pmu->perfctr; } perf_counters_lapic_init(hwc->nmi); @@ -710,8 +711,8 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { - rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl); - rdmsrl(pmc_ops->perfctr + idx, pmc_count); + rdmsrl(x86_pmu->eventsel + idx, pmc_ctrl); + rdmsrl(x86_pmu->perfctr + idx, pmc_count); prev_left = per_cpu(prev_left[idx], cpu); @@ -918,35 +919,35 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { .priority = 1 }; -static struct pmc_x86_ops pmc_intel_ops = { - .save_disable_all = pmc_intel_save_disable_all, - .restore_all = pmc_intel_restore_all, - .get_status = pmc_intel_get_status, - .ack_status = pmc_intel_ack_status, - .enable = pmc_intel_enable, - .disable = pmc_intel_disable, +static struct x86_pmu intel_pmu = { + .save_disable_all = intel_pmu_save_disable_all, + .restore_all = intel_pmu_restore_all, + .get_status = intel_pmu_get_status, + .ack_status = intel_pmu_ack_status, + .enable = intel_pmu_enable_counter, + .disable = intel_pmu_disable_counter, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = pmc_intel_event_map, - .raw_event = pmc_intel_raw_event, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), }; -static struct pmc_x86_ops pmc_amd_ops = { - .save_disable_all = pmc_amd_save_disable_all, - .restore_all = pmc_amd_restore_all, - .get_status = pmc_amd_get_status, - .ack_status = pmc_amd_ack_status, - .enable = pmc_amd_enable, - .disable = pmc_amd_disable, +static struct x86_pmu amd_pmu = { + .save_disable_all = amd_pmu_save_disable_all, + .restore_all = amd_pmu_restore_all, + .get_status = amd_pmu_get_status, + .ack_status = amd_pmu_ack_status, + .enable = amd_pmu_enable_counter, + .disable = amd_pmu_disable_counter, .eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, - .event_map = pmc_amd_event_map, - .raw_event = pmc_amd_raw_event, + .event_map = amd_pmu_event_map, + .raw_event = amd_pmu_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), }; -static struct pmc_x86_ops *pmc_intel_init(void) +static struct x86_pmu *intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; @@ -977,10 +978,10 @@ static struct pmc_x86_ops *pmc_intel_init(void) nr_counters_fixed = edx.split.num_counters_fixed; counter_value_mask = (1ULL << eax.split.bit_width) - 1; - return &pmc_intel_ops; + return &intel_pmu; } -static struct pmc_x86_ops *pmc_amd_init(void) +static struct x86_pmu *amd_pmu_init(void) { nr_counters_generic = 4; nr_counters_fixed = 0; @@ -989,22 +990,22 @@ static struct pmc_x86_ops *pmc_amd_init(void) pr_info("AMD Performance Monitoring support detected.\n"); - return &pmc_amd_ops; + return &amd_pmu; } void __init init_hw_perf_counters(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: - pmc_ops = pmc_intel_init(); + x86_pmu = intel_pmu_init(); break; case X86_VENDOR_AMD: - pmc_ops = pmc_amd_init(); + x86_pmu = amd_pmu_init(); break; default: return; } - if (!pmc_ops) + if (!x86_pmu) return; pr_info("... num counters: %d\n", nr_counters_generic); -- cgit v1.2.3 From 39d81eab2374d71b2d9c82f66258a1a4f57ddd2e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:05 +0200 Subject: perf_counter, x86: make interrupt handler model specific This separates the perfcounter interrupt handler for AMD and Intel cpus. The AMD interrupt handler implementation is a follow-on patch. [ Impact: refactor and clean up code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-9-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 808a1a11346..9d90de0bd0b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -4,6 +4,7 @@ * Copyright(C) 2008 Thomas Gleixner * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar * Copyright(C) 2009 Jaswinder Singh Rajput + * Copyright(C) 2009 Advanced Micro Devices, Inc., Robert Richter * * For licencing details see kernel-base/COPYING */ @@ -47,6 +48,7 @@ struct cpu_hw_counters { * struct x86_pmu - generic x86 pmu */ struct x86_pmu { + int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); u64 (*get_status)(u64); @@ -241,6 +243,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; int err; + /* disable temporarily */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return -ENOSYS; + if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -780,7 +786,7 @@ static void perf_save_and_restart(struct perf_counter *counter) * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: */ -static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) +static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); u64 ack, status; @@ -827,6 +833,8 @@ out: return ret; } +static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { return 0; } + void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; @@ -851,7 +859,7 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_enter(); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); ack_APIC_irq(); - __smp_perf_counter_interrupt(regs, 0); + x86_pmu->handle_irq(regs, 0); irq_exit(); } @@ -908,7 +916,7 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - ret = __smp_perf_counter_interrupt(regs, 1); + ret = x86_pmu->handle_irq(regs, 1); return ret ? NOTIFY_STOP : NOTIFY_OK; } @@ -920,6 +928,7 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { }; static struct x86_pmu intel_pmu = { + .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, .get_status = intel_pmu_get_status, @@ -934,6 +943,7 @@ static struct x86_pmu intel_pmu = { }; static struct x86_pmu amd_pmu = { + .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, .get_status = amd_pmu_get_status, -- cgit v1.2.3 From b7f8859a8ed1937e2139c17b84878f1d413fa659 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:06 +0200 Subject: perf_counter, x86: remove get_status() from struct x86_pmu This function is Intel only and not necessary for AMD cpus. [ Impact: simplify code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-10-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 39 +++++--------------------------------- 1 file changed, 5 insertions(+), 34 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9d90de0bd0b..d0bb02919c6 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -51,7 +51,6 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); - u64 (*get_status)(u64); void (*ack_status)(u64); void (*enable)(int, u64); void (*disable)(int, u64); @@ -405,41 +404,15 @@ void hw_perf_restore(u64 ctrl) */ EXPORT_SYMBOL_GPL(hw_perf_restore); -static u64 intel_pmu_get_status(u64 mask) +static inline u64 intel_pmu_get_status(u64 mask) { u64 status; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - - return status; -} - -static u64 amd_pmu_get_status(u64 mask) -{ - u64 status = 0; - int idx; - - for (idx = 0; idx < nr_counters_generic; idx++) { - s64 val; - - if (!(mask & (1 << idx))) - continue; - - rdmsrl(MSR_K7_PERFCTR0 + idx, val); - val <<= (64 - counter_value_bits); - if (val >= 0) - status |= (1 << idx); - } - - return status; -} - -static u64 hw_perf_get_status(u64 mask) -{ if (unlikely(!perf_counters_initialized)) return 0; + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - return x86_pmu->get_status(mask); + return status; } static void intel_pmu_ack_status(u64 ack) @@ -795,7 +768,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) cpuc->throttle_ctrl = hw_perf_save_disable(); - status = hw_perf_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(cpuc->throttle_ctrl); if (!status) goto out; @@ -820,7 +793,7 @@ again: /* * Repeat if there is more work to be done: */ - status = hw_perf_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(cpuc->throttle_ctrl); if (status) goto again; out: @@ -931,7 +904,6 @@ static struct x86_pmu intel_pmu = { .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, - .get_status = intel_pmu_get_status, .ack_status = intel_pmu_ack_status, .enable = intel_pmu_enable_counter, .disable = intel_pmu_disable_counter, @@ -946,7 +918,6 @@ static struct x86_pmu amd_pmu = { .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, - .get_status = amd_pmu_get_status, .ack_status = amd_pmu_ack_status, .enable = amd_pmu_enable_counter, .disable = amd_pmu_disable_counter, -- cgit v1.2.3 From dee5d9067ca78b317538fd67930be4e09a83dbc5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:07 +0200 Subject: perf_counter, x86: remove ack_status() from struct x86_pmu This function is Intel only and not necessary for AMD cpus. [ Impact: simplify code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-11-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d0bb02919c6..6bbdc16cc69 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -51,7 +51,6 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); - void (*ack_status)(u64); void (*enable)(int, u64); void (*disable)(int, u64); unsigned eventsel; @@ -415,23 +414,11 @@ static inline u64 intel_pmu_get_status(u64 mask) return status; } -static void intel_pmu_ack_status(u64 ack) +static inline void intel_pmu_ack_status(u64 ack) { wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static void amd_pmu_ack_status(u64 ack) -{ -} - -static void hw_perf_ack_status(u64 ack) -{ - if (unlikely(!perf_counters_initialized)) - return; - - x86_pmu->ack_status(ack); -} - static void intel_pmu_enable_counter(int idx, u64 config) { wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, @@ -788,7 +775,7 @@ again: __x86_pmu_disable(counter, &counter->hw, bit); } - hw_perf_ack_status(ack); + intel_pmu_ack_status(ack); /* * Repeat if there is more work to be done: @@ -904,7 +891,6 @@ static struct x86_pmu intel_pmu = { .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, - .ack_status = intel_pmu_ack_status, .enable = intel_pmu_enable_counter, .disable = intel_pmu_disable_counter, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, @@ -918,7 +904,6 @@ static struct x86_pmu amd_pmu = { .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, - .ack_status = amd_pmu_ack_status, .enable = amd_pmu_enable_counter, .disable = amd_pmu_disable_counter, .eventsel = MSR_K7_EVNTSEL0, -- cgit v1.2.3 From 26816c287e13eedc67bc4ed0cd40c138314b7c7d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:08 +0200 Subject: perf_counter, x86: rename __hw_perf_counter_set_period into x86_perf_counter_set_period [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-12-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6bbdc16cc69..fa6541d781b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -498,7 +498,7 @@ static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); * To be called with the counter disabled in hw: */ static void -__hw_perf_counter_set_period(struct perf_counter *counter, +x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); @@ -642,7 +642,7 @@ try_generic: */ barrier(); - __hw_perf_counter_set_period(counter, hwc, idx); + x86_perf_counter_set_period(counter, hwc, idx); __x86_pmu_enable(counter, hwc, idx); return 0; @@ -731,7 +731,7 @@ static void perf_save_and_restart(struct perf_counter *counter) int idx = hwc->idx; x86_perf_counter_update(counter, hwc, idx); - __hw_perf_counter_set_period(counter, hwc, idx); + x86_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) __x86_pmu_enable(counter, hwc, idx); -- cgit v1.2.3 From 55de0f2e57994b525324bf0d04d242d9358a2417 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:09 +0200 Subject: perf_counter, x86: rename intel only functions [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-13-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fa6541d781b..5a52d73ccfa 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -725,7 +725,7 @@ static void x86_pmu_disable(struct perf_counter *counter) * Save and restart an expired counter. Called by NMI contexts, * so it has to be careful about preempting normal counter ops: */ -static void perf_save_and_restart(struct perf_counter *counter) +static void intel_pmu_save_and_restart(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; @@ -753,7 +753,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); int ret = 0; - cpuc->throttle_ctrl = hw_perf_save_disable(); + cpuc->throttle_ctrl = intel_pmu_save_disable_all(); status = intel_pmu_get_status(cpuc->throttle_ctrl); if (!status) @@ -770,7 +770,7 @@ again: if (!counter) continue; - perf_save_and_restart(counter); + intel_pmu_save_and_restart(counter); if (perf_counter_overflow(counter, nmi, regs, 0)) __x86_pmu_disable(counter, &counter->hw, bit); } @@ -788,7 +788,7 @@ out: * Restore - do not reenable when global enable is off or throttled: */ if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) - hw_perf_restore(cpuc->throttle_ctrl); + intel_pmu_restore_all(cpuc->throttle_ctrl); return ret; } -- cgit v1.2.3 From 72eae04d3a3075c26d39e1e685acfc8e8c29db64 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:10 +0200 Subject: perf_counter, x86: modify initialization of struct x86_pmu This patch adds an error handler and changes initialization of struct x86_pmu. No functional changes. Needed for follow-on patches. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-14-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5a52d73ccfa..7c72a942363 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -913,7 +913,7 @@ static struct x86_pmu amd_pmu = { .max_events = ARRAY_SIZE(amd_perfmon_event_map), }; -static struct x86_pmu *intel_pmu_init(void) +static int intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; @@ -921,7 +921,7 @@ static struct x86_pmu *intel_pmu_init(void) unsigned int ebx; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return NULL; + return -ENODEV; /* * Check whether the Architectural PerfMon supports @@ -929,49 +929,54 @@ static struct x86_pmu *intel_pmu_init(void) */ cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) - return NULL; + return -ENODEV; intel_perfmon_version = eax.split.version_id; if (intel_perfmon_version < 2) - return NULL; + return -ENODEV; pr_info("Intel Performance Monitoring support detected.\n"); pr_info("... version: %d\n", intel_perfmon_version); pr_info("... bit width: %d\n", eax.split.bit_width); pr_info("... mask length: %d\n", eax.split.mask_length); + x86_pmu = &intel_pmu; + nr_counters_generic = eax.split.num_counters; nr_counters_fixed = edx.split.num_counters_fixed; counter_value_mask = (1ULL << eax.split.bit_width) - 1; - return &intel_pmu; + return 0; } -static struct x86_pmu *amd_pmu_init(void) +static int amd_pmu_init(void) { + x86_pmu = &amd_pmu; + nr_counters_generic = 4; nr_counters_fixed = 0; counter_value_mask = 0x0000FFFFFFFFFFFFULL; counter_value_bits = 48; pr_info("AMD Performance Monitoring support detected.\n"); - - return &amd_pmu; + return 0; } void __init init_hw_perf_counters(void) { + int err; + switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: - x86_pmu = intel_pmu_init(); + err = intel_pmu_init(); break; case X86_VENDOR_AMD: - x86_pmu = amd_pmu_init(); + err = amd_pmu_init(); break; default: return; } - if (!x86_pmu) + if (err != 0) return; pr_info("... num counters: %d\n", nr_counters_generic); -- cgit v1.2.3 From 4a06bd8508f65ad1dd5cd2046b85694813fa36a2 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:11 +0200 Subject: perf_counter, x86: make x86_pmu data a static struct Instead of using a pointer to reference to the x86 pmu we now have one single data structure that is initialized at the beginning. This saves the pointer access when using this memory. [ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-15-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 50 +++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7c72a942363..68597d76338 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -60,7 +60,7 @@ struct x86_pmu { int max_events; }; -static struct x86_pmu *x86_pmu __read_mostly; +static struct x86_pmu x86_pmu __read_mostly; static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, @@ -184,12 +184,12 @@ static bool reserve_pmc_hardware(void) disable_lapic_nmi_watchdog(); for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_perfctr_nmi(x86_pmu->perfctr + i)) + if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) goto perfctr_fail; } for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_evntsel_nmi(x86_pmu->eventsel + i)) + if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } @@ -197,13 +197,13 @@ static bool reserve_pmc_hardware(void) eventsel_fail: for (i--; i >= 0; i--) - release_evntsel_nmi(x86_pmu->eventsel + i); + release_evntsel_nmi(x86_pmu.eventsel + i); i = nr_counters_generic; perfctr_fail: for (i--; i >= 0; i--) - release_perfctr_nmi(x86_pmu->perfctr + i); + release_perfctr_nmi(x86_pmu.perfctr + i); if (nmi_watchdog == NMI_LOCAL_APIC) enable_lapic_nmi_watchdog(); @@ -216,8 +216,8 @@ static void release_pmc_hardware(void) int i; for (i = 0; i < nr_counters_generic; i++) { - release_perfctr_nmi(x86_pmu->perfctr + i); - release_evntsel_nmi(x86_pmu->eventsel + i); + release_perfctr_nmi(x86_pmu.perfctr + i); + release_evntsel_nmi(x86_pmu.eventsel + i); } if (nmi_watchdog == NMI_LOCAL_APIC) @@ -297,14 +297,14 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * Raw event type provide the config in the event structure */ if (perf_event_raw(hw_event)) { - hwc->config |= x86_pmu->raw_event(perf_event_config(hw_event)); + hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event)); } else { - if (perf_event_id(hw_event) >= x86_pmu->max_events) + if (perf_event_id(hw_event) >= x86_pmu.max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= x86_pmu->event_map(perf_event_id(hw_event)); + hwc->config |= x86_pmu.event_map(perf_event_id(hw_event)); } counter->destroy = hw_perf_counter_destroy; @@ -356,7 +356,7 @@ u64 hw_perf_save_disable(void) if (unlikely(!perf_counters_initialized)) return 0; - return x86_pmu->save_disable_all(); + return x86_pmu.save_disable_all(); } /* * Exported because of ACPI idle @@ -396,7 +396,7 @@ void hw_perf_restore(u64 ctrl) if (unlikely(!perf_counters_initialized)) return; - x86_pmu->restore_all(ctrl); + x86_pmu.restore_all(ctrl); } /* * Exported because of ACPI idle @@ -441,7 +441,7 @@ static void hw_perf_enable(int idx, u64 config) if (unlikely(!perf_counters_initialized)) return; - x86_pmu->enable(idx, config); + x86_pmu.enable(idx, config); } static void intel_pmu_disable_counter(int idx, u64 config) @@ -463,7 +463,7 @@ static void hw_perf_disable(int idx, u64 config) if (unlikely(!perf_counters_initialized)) return; - x86_pmu->disable(idx, config); + x86_pmu.disable(idx, config); } static inline void @@ -580,11 +580,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == x86_pmu->event_map(PERF_COUNT_INSTRUCTIONS))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == x86_pmu->event_map(PERF_COUNT_CPU_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == x86_pmu->event_map(PERF_COUNT_BUS_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; @@ -628,8 +628,8 @@ try_generic: set_bit(idx, cpuc->used); hwc->idx = idx; } - hwc->config_base = x86_pmu->eventsel; - hwc->counter_base = x86_pmu->perfctr; + hwc->config_base = x86_pmu.eventsel; + hwc->counter_base = x86_pmu.perfctr; } perf_counters_lapic_init(hwc->nmi); @@ -677,8 +677,8 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { - rdmsrl(x86_pmu->eventsel + idx, pmc_ctrl); - rdmsrl(x86_pmu->perfctr + idx, pmc_count); + rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); + rdmsrl(x86_pmu.perfctr + idx, pmc_count); prev_left = per_cpu(prev_left[idx], cpu); @@ -819,7 +819,7 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_enter(); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); ack_APIC_irq(); - x86_pmu->handle_irq(regs, 0); + x86_pmu.handle_irq(regs, 0); irq_exit(); } @@ -876,7 +876,7 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - ret = x86_pmu->handle_irq(regs, 1); + ret = x86_pmu.handle_irq(regs, 1); return ret ? NOTIFY_STOP : NOTIFY_OK; } @@ -940,7 +940,7 @@ static int intel_pmu_init(void) pr_info("... bit width: %d\n", eax.split.bit_width); pr_info("... mask length: %d\n", eax.split.mask_length); - x86_pmu = &intel_pmu; + x86_pmu = intel_pmu; nr_counters_generic = eax.split.num_counters; nr_counters_fixed = edx.split.num_counters_fixed; @@ -951,7 +951,7 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { - x86_pmu = &amd_pmu; + x86_pmu = amd_pmu; nr_counters_generic = 4; nr_counters_fixed = 0; -- cgit v1.2.3 From 0933e5c6a680ba8d8d786a6f7fa377b7ec0d1e49 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:12 +0200 Subject: perf_counter, x86: move counter parameters to struct x86_pmu [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-16-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 80 ++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 43 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 68597d76338..75dbb1f0900 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -24,16 +24,7 @@ #include static bool perf_counters_initialized __read_mostly; - -/* - * Number of (generic) HW counters: - */ -static int nr_counters_generic __read_mostly; static u64 perf_counter_mask __read_mostly; -static u64 counter_value_mask __read_mostly; -static int counter_value_bits __read_mostly; - -static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; @@ -58,6 +49,10 @@ struct x86_pmu { u64 (*event_map)(int); u64 (*raw_event)(u64); int max_events; + int num_counters; + int num_counters_fixed; + int counter_bits; + u64 counter_mask; }; static struct x86_pmu x86_pmu __read_mostly; @@ -183,12 +178,12 @@ static bool reserve_pmc_hardware(void) if (nmi_watchdog == NMI_LOCAL_APIC) disable_lapic_nmi_watchdog(); - for (i = 0; i < nr_counters_generic; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) goto perfctr_fail; } - for (i = 0; i < nr_counters_generic; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } @@ -199,7 +194,7 @@ eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu.eventsel + i); - i = nr_counters_generic; + i = x86_pmu.num_counters; perfctr_fail: for (i--; i >= 0; i--) @@ -215,7 +210,7 @@ static void release_pmc_hardware(void) { int i; - for (i = 0; i < nr_counters_generic; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { release_perfctr_nmi(x86_pmu.perfctr + i); release_evntsel_nmi(x86_pmu.eventsel + i); } @@ -336,7 +331,7 @@ static u64 amd_pmu_save_disable_all(void) */ barrier(); - for (idx = 0; idx < nr_counters_generic; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; if (!test_bit(idx, cpuc->active_mask)) @@ -378,7 +373,7 @@ static void amd_pmu_restore_all(u64 ctrl) if (!ctrl) return; - for (idx = 0; idx < nr_counters_generic; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; if (!test_bit(idx, cpuc->active_mask)) @@ -527,7 +522,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, atomic64_set(&hwc->prev_count, (u64)-left); err = checking_wrmsrl(hwc->counter_base + idx, - (u64)(-left) & counter_value_mask); + (u64)(-left) & x86_pmu.counter_mask); } static inline void @@ -621,8 +616,9 @@ static int x86_pmu_enable(struct perf_counter *counter) /* Try to get the previous generic counter again */ if (test_and_set_bit(idx, cpuc->used)) { try_generic: - idx = find_first_zero_bit(cpuc->used, nr_counters_generic); - if (idx == nr_counters_generic) + idx = find_first_zero_bit(cpuc->used, + x86_pmu.num_counters); + if (idx == x86_pmu.num_counters) return -EAGAIN; set_bit(idx, cpuc->used); @@ -654,7 +650,7 @@ void perf_counter_print_debug(void) struct cpu_hw_counters *cpuc; int cpu, idx; - if (!nr_counters_generic) + if (!x86_pmu.num_counters) return; local_irq_disable(); @@ -676,7 +672,7 @@ void perf_counter_print_debug(void) } pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); - for (idx = 0; idx < nr_counters_generic; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); rdmsrl(x86_pmu.perfctr + idx, pmc_count); @@ -689,7 +685,7 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } - for (idx = 0; idx < nr_counters_fixed; idx++) { + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", @@ -911,6 +907,9 @@ static struct x86_pmu amd_pmu = { .event_map = amd_pmu_event_map, .raw_event = amd_pmu_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_counters = 4, + .counter_bits = 48, + .counter_mask = (1ULL << 48) - 1, }; static int intel_pmu_init(void) @@ -941,10 +940,10 @@ static int intel_pmu_init(void) pr_info("... mask length: %d\n", eax.split.mask_length); x86_pmu = intel_pmu; - - nr_counters_generic = eax.split.num_counters; - nr_counters_fixed = edx.split.num_counters_fixed; - counter_value_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; + x86_pmu.counter_bits = eax.split.bit_width; + x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; return 0; } @@ -952,12 +951,6 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { x86_pmu = amd_pmu; - - nr_counters_generic = 4; - nr_counters_fixed = 0; - counter_value_mask = 0x0000FFFFFFFFFFFFULL; - counter_value_bits = 48; - pr_info("AMD Performance Monitoring support detected.\n"); return 0; } @@ -979,25 +972,26 @@ void __init init_hw_perf_counters(void) if (err != 0) return; - pr_info("... num counters: %d\n", nr_counters_generic); - if (nr_counters_generic > X86_PMC_MAX_GENERIC) { - nr_counters_generic = X86_PMC_MAX_GENERIC; + pr_info("... num counters: %d\n", x86_pmu.num_counters); + if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { + x86_pmu.num_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - nr_counters_generic, X86_PMC_MAX_GENERIC); + x86_pmu.num_counters, X86_PMC_MAX_GENERIC); } - perf_counter_mask = (1 << nr_counters_generic) - 1; - perf_max_counters = nr_counters_generic; + perf_counter_mask = (1 << x86_pmu.num_counters) - 1; + perf_max_counters = x86_pmu.num_counters; - pr_info("... value mask: %016Lx\n", counter_value_mask); + pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); - if (nr_counters_fixed > X86_PMC_MAX_FIXED) { - nr_counters_fixed = X86_PMC_MAX_FIXED; + if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { + x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", - nr_counters_fixed, X86_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); } - pr_info("... fixed counters: %d\n", nr_counters_fixed); + pr_info("... fixed counters: %d\n", x86_pmu.num_counters_fixed); - perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED; + perf_counter_mask |= + ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; pr_info("... counter mask: %016Lx\n", perf_counter_mask); perf_counters_initialized = true; -- cgit v1.2.3 From faa28ae018ed004a22aa4a7704e04ccdde4a941e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:13 +0200 Subject: perf_counter, x86: make pmu version generic This makes the use of the version variable generic. Also, some debug messages have been generalized. [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-17-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 75dbb1f0900..15d2c03e16f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -39,6 +39,8 @@ struct cpu_hw_counters { * struct x86_pmu - generic x86 pmu */ struct x86_pmu { + const char *name; + int version; int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); @@ -61,8 +63,6 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, }; -static __read_mostly int intel_perfmon_version; - /* * Intel PerfMon v3. Used on Core2 and later. */ @@ -658,7 +658,7 @@ void perf_counter_print_debug(void) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - if (intel_perfmon_version >= 2) { + if (x86_pmu.version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); @@ -884,6 +884,7 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { }; static struct x86_pmu intel_pmu = { + .name = "Intel", .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, @@ -897,6 +898,7 @@ static struct x86_pmu intel_pmu = { }; static struct x86_pmu amd_pmu = { + .name = "AMD", .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, @@ -918,6 +920,7 @@ static int intel_pmu_init(void) union cpuid10_eax eax; unsigned int unused; unsigned int ebx; + int version; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return -ENODEV; @@ -930,16 +933,12 @@ static int intel_pmu_init(void) if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return -ENODEV; - intel_perfmon_version = eax.split.version_id; - if (intel_perfmon_version < 2) + version = eax.split.version_id; + if (version < 2) return -ENODEV; - pr_info("Intel Performance Monitoring support detected.\n"); - pr_info("... version: %d\n", intel_perfmon_version); - pr_info("... bit width: %d\n", eax.split.bit_width); - pr_info("... mask length: %d\n", eax.split.mask_length); - x86_pmu = intel_pmu; + x86_pmu.version = version; x86_pmu.num_counters = eax.split.num_counters; x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; x86_pmu.counter_bits = eax.split.bit_width; @@ -951,7 +950,6 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { x86_pmu = amd_pmu; - pr_info("AMD Performance Monitoring support detected.\n"); return 0; } @@ -972,6 +970,10 @@ void __init init_hw_perf_counters(void) if (err != 0) return; + pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name); + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.counter_bits); + pr_info("... num counters: %d\n", x86_pmu.num_counters); if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { x86_pmu.num_counters = X86_PMC_MAX_GENERIC; -- cgit v1.2.3 From bb775fc2d1dcd1aa6eafde37a8289ba2d80783aa Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:14 +0200 Subject: perf_counter, x86: make x86_pmu_read() static inline [ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-18-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 15d2c03e16f..3f3ae477a7d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1002,7 +1002,7 @@ void __init init_hw_perf_counters(void) register_die_notifier(&perf_counter_nmi_notifier); } -static void x86_pmu_read(struct perf_counter *counter) +static inline void x86_pmu_read(struct perf_counter *counter) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); } -- cgit v1.2.3 From 93904966934193204ad08e951f806d5631c29eb3 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:15 +0200 Subject: perf_counter, x86: rename cpuc->active_mask This is to have a consistent naming scheme with cpuc->used. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-19-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3f3ae477a7d..9ec51a662db 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -29,9 +29,9 @@ static u64 perf_counter_mask __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long active[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; u64 throttle_ctrl; - unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; int enabled; }; @@ -334,7 +334,7 @@ static u64 amd_pmu_save_disable_all(void) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active_mask)) + if (!test_bit(idx, cpuc->active)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) @@ -376,7 +376,7 @@ static void amd_pmu_restore_all(u64 ctrl) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active_mask)) + if (!test_bit(idx, cpuc->active)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) @@ -424,7 +424,7 @@ static void amd_pmu_enable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - set_bit(idx, cpuc->active_mask); + set_bit(idx, cpuc->active); if (cpuc->enabled) config |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -448,7 +448,7 @@ static void amd_pmu_disable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - clear_bit(idx, cpuc->active_mask); + clear_bit(idx, cpuc->active); wrmsrl(MSR_K7_EVNTSEL0 + idx, config); } -- cgit v1.2.3 From 095342389e2ed8deed07b3076f990260ce3c7c9f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:16 +0200 Subject: perf_counter, x86: generic use of cpuc->active cpuc->active will now be used to indicate an enabled counter which implies also valid pointers of cpuc->counters[]. In contrast, cpuc->used only locks the counter, but it can be still uninitialized. [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-20-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9ec51a662db..f7fd4a35515 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -424,7 +424,6 @@ static void amd_pmu_enable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - set_bit(idx, cpuc->active); if (cpuc->enabled) config |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -446,9 +445,6 @@ static void intel_pmu_disable_counter(int idx, u64 config) static void amd_pmu_disable_counter(int idx, u64 config) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - clear_bit(idx, cpuc->active); wrmsrl(MSR_K7_EVNTSEL0 + idx, config); } @@ -633,10 +629,7 @@ try_generic: __x86_pmu_disable(counter, hwc, idx); cpuc->counters[idx] = counter; - /* - * Make it visible before enabling the hw: - */ - barrier(); + set_bit(idx, cpuc->active); x86_perf_counter_set_period(counter, hwc, idx); __x86_pmu_enable(counter, hwc, idx); @@ -700,10 +693,13 @@ static void x86_pmu_disable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; + /* + * Must be done before we disable, otherwise the nmi handler + * could reenable again: + */ + clear_bit(idx, cpuc->active); __x86_pmu_disable(counter, hwc, idx); - clear_bit(idx, cpuc->used); - cpuc->counters[idx] = NULL; /* * Make sure the cleared pointer becomes visible before we * (potentially) free the counter: @@ -715,6 +711,8 @@ static void x86_pmu_disable(struct perf_counter *counter) * that we are disabling: */ x86_perf_counter_update(counter, hwc, idx); + cpuc->counters[idx] = NULL; + clear_bit(idx, cpuc->used); } /* @@ -763,7 +761,7 @@ again: struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); - if (!counter) + if (!test_bit(bit, cpuc->active)) continue; intel_pmu_save_and_restart(counter); -- cgit v1.2.3 From 6f00cada07bb5da7f751929d3173494dcc5446cc Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:17 +0200 Subject: perf_counter, x86: consistent use of type int for counter index The type of counter index is sometimes implemented as unsigned int. This patch changes this to have a consistent usage of int. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-21-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f7fd4a35515..d8beebeb270 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -459,7 +459,7 @@ static void hw_perf_disable(int idx, u64 config) static inline void __pmc_fixed_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int __idx) + struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; @@ -474,7 +474,7 @@ __pmc_fixed_disable(struct perf_counter *counter, static inline void __x86_pmu_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int idx) + struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_disable(counter, hwc, idx); @@ -523,7 +523,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, static inline void __pmc_fixed_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int __idx) + struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; @@ -691,7 +691,7 @@ static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; - unsigned int idx = hwc->idx; + int idx = hwc->idx; /* * Must be done before we disable, otherwise the nmi handler -- cgit v1.2.3 From 7c90cc45f89af4dd4617f97d452740ad95b800d5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:18 +0200 Subject: perf_counter, x86: rework counter enable functions There is vendor specific code in generic x86 code, and there is vendor specific code that could be generic. This patch introduces x86_pmu_enable_counter() for x86 generic code. Fixed counter code for Intel is moved to Intel only functions. In the end, checks and calls via function pointers were reduced to the necessary. Also, the internal function i/f changed. [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-22-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 52 ++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d8beebeb270..ae55933ce79 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -44,7 +44,7 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); - void (*enable)(int, u64); + void (*enable)(struct hw_perf_counter *, int); void (*disable)(int, u64); unsigned eventsel; unsigned perfctr; @@ -414,28 +414,15 @@ static inline void intel_pmu_ack_status(u64 ack) wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static void intel_pmu_enable_counter(int idx, u64 config) +static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, - config | ARCH_PERFMON_EVENTSEL0_ENABLE); -} - -static void amd_pmu_enable_counter(int idx, u64 config) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - if (cpuc->enabled) - config |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - wrmsrl(MSR_K7_EVNTSEL0 + idx, config); -} + int err; -static void hw_perf_enable(int idx, u64 config) -{ if (unlikely(!perf_counters_initialized)) return; - x86_pmu.enable(idx, config); + err = checking_wrmsrl(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } static void intel_pmu_disable_counter(int idx, u64 config) @@ -522,8 +509,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, } static inline void -__pmc_fixed_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int __idx) +intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; @@ -548,14 +534,24 @@ __pmc_fixed_enable(struct perf_counter *counter, err = checking_wrmsrl(hwc->config_base, ctrl_val); } -static void -__x86_pmu_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - __pmc_fixed_enable(counter, hwc, idx); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_enable_fixed(hwc, idx); + return; + } + + x86_pmu_enable_counter(hwc, idx); +} + +static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (cpuc->enabled) + x86_pmu_enable_counter(hwc, idx); else - hw_perf_enable(idx, hwc->config); + amd_pmu_disable_counter(idx, hwc->config); } static int @@ -632,7 +628,7 @@ try_generic: set_bit(idx, cpuc->active); x86_perf_counter_set_period(counter, hwc, idx); - __x86_pmu_enable(counter, hwc, idx); + x86_pmu.enable(hwc, idx); return 0; } @@ -728,7 +724,7 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) x86_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) - __x86_pmu_enable(counter, hwc, idx); + intel_pmu_enable_counter(hwc, idx); } /* -- cgit v1.2.3 From d43698918bd46c71d494555fb92195fbea1fcb6c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:19 +0200 Subject: perf_counter, x86: rework counter disable functions As for the enable function, this patch reworks the disable functions and introduces x86_pmu_disable_counter(). The internal function i/f in struct x86_pmu changed too. [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-23-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 48 ++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ae55933ce79..df9012bbd21 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -45,7 +45,7 @@ struct x86_pmu { u64 (*save_disable_all)(void); void (*restore_all)(u64); void (*enable)(struct hw_perf_counter *, int); - void (*disable)(int, u64); + void (*disable)(struct hw_perf_counter *, int); unsigned eventsel; unsigned perfctr; u64 (*event_map)(int); @@ -425,28 +425,19 @@ static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } -static void intel_pmu_disable_counter(int idx, u64 config) +static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config); -} - -static void amd_pmu_disable_counter(int idx, u64 config) -{ - wrmsrl(MSR_K7_EVNTSEL0 + idx, config); - -} + int err; -static void hw_perf_disable(int idx, u64 config) -{ if (unlikely(!perf_counters_initialized)) return; - x86_pmu.disable(idx, config); + err = checking_wrmsrl(hwc->config_base + idx, + hwc->config); } static inline void -__pmc_fixed_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int __idx) +intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; @@ -460,13 +451,20 @@ __pmc_fixed_disable(struct perf_counter *counter, } static inline void -__x86_pmu_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - __pmc_fixed_disable(counter, hwc, idx); - else - hw_perf_disable(idx, hwc->config); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_disable_fixed(hwc, idx); + return; + } + + x86_pmu_disable_counter(hwc, idx); +} + +static inline void +amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ + x86_pmu_disable_counter(hwc, idx); } static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); @@ -551,7 +549,7 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) if (cpuc->enabled) x86_pmu_enable_counter(hwc, idx); else - amd_pmu_disable_counter(idx, hwc->config); + x86_pmu_disable_counter(hwc, idx); } static int @@ -622,7 +620,7 @@ try_generic: perf_counters_lapic_init(hwc->nmi); - __x86_pmu_disable(counter, hwc, idx); + x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; set_bit(idx, cpuc->active); @@ -694,7 +692,7 @@ static void x86_pmu_disable(struct perf_counter *counter) * could reenable again: */ clear_bit(idx, cpuc->active); - __x86_pmu_disable(counter, hwc, idx); + x86_pmu.disable(hwc, idx); /* * Make sure the cleared pointer becomes visible before we @@ -762,7 +760,7 @@ again: intel_pmu_save_and_restart(counter); if (perf_counter_overflow(counter, nmi, regs, 0)) - __x86_pmu_disable(counter, &counter->hw, bit); + intel_pmu_disable_counter(&counter->hw, bit); } intel_pmu_ack_status(ack); -- cgit v1.2.3 From 85cf9dba92152bb4edec118b2f4f0be1ae7fdcab Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:20 +0200 Subject: perf_counter, x86: change and remove pmu initialization checks Some functions are only called if the pmu was proper initialized. That initalization checks can be removed. The way to check initialization changed too. Now, the pointer to the interrupt handler is checked. If it exists the pmu is initialized. This also removes a static variable and uses struct x86_pmu as only data source for the check. [ Impact: simplify code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-24-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index df9012bbd21..2d3681bbb52 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -23,7 +23,6 @@ #include #include -static bool perf_counters_initialized __read_mostly; static u64 perf_counter_mask __read_mostly; struct cpu_hw_counters { @@ -227,6 +226,11 @@ static void hw_perf_counter_destroy(struct perf_counter *counter) } } +static inline int x86_pmu_initialized(void) +{ + return x86_pmu.handle_irq != NULL; +} + /* * Setup the hardware configuration for a given hw_event_type */ @@ -240,8 +244,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return -ENOSYS; - if (unlikely(!perf_counters_initialized)) - return -EINVAL; + if (!x86_pmu_initialized()) + return -ENODEV; err = 0; if (atomic_inc_not_zero(&num_counters)) { @@ -348,9 +352,8 @@ static u64 amd_pmu_save_disable_all(void) u64 hw_perf_save_disable(void) { - if (unlikely(!perf_counters_initialized)) + if (!x86_pmu_initialized()) return 0; - return x86_pmu.save_disable_all(); } /* @@ -388,9 +391,8 @@ static void amd_pmu_restore_all(u64 ctrl) void hw_perf_restore(u64 ctrl) { - if (unlikely(!perf_counters_initialized)) + if (!x86_pmu_initialized()) return; - x86_pmu.restore_all(ctrl); } /* @@ -402,8 +404,6 @@ static inline u64 intel_pmu_get_status(u64 mask) { u64 status; - if (unlikely(!perf_counters_initialized)) - return 0; rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); return status; @@ -417,10 +417,6 @@ static inline void intel_pmu_ack_status(u64 ack) static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { int err; - - if (unlikely(!perf_counters_initialized)) - return; - err = checking_wrmsrl(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } @@ -428,10 +424,6 @@ static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { int err; - - if (unlikely(!perf_counters_initialized)) - return; - err = checking_wrmsrl(hwc->config_base + idx, hwc->config); } @@ -787,10 +779,10 @@ void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + if (!x86_pmu_initialized()) return; - if (unlikely(!perf_counters_initialized)) + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; cpuc = &__get_cpu_var(cpu_hw_counters); @@ -829,8 +821,9 @@ void perf_counters_lapic_init(int nmi) { u32 apic_val; - if (!perf_counters_initialized) + if (!x86_pmu_initialized()) return; + /* * Enable the performance counter vector in the APIC LVT: */ @@ -988,7 +981,6 @@ void __init init_hw_perf_counters(void) ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_initialized = true; perf_counters_lapic_init(0); register_die_notifier(&perf_counter_nmi_notifier); -- cgit v1.2.3 From a29aa8a7ff93e4196d558036928597e68337dd8d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:21 +0200 Subject: perf_counter, x86: implement the interrupt handler for AMD cpus This patch implements the interrupt handler for AMD performance counters. In difference to the Intel pmu, there is no single status register and also there are no fixed counters. This makes the handler very different and it is useful to make the handler vendor specific. To check if a counter is overflowed the upper bit of the counter is checked. Only counters where the active bit is set are checked. With this patch throttling is enabled for AMD performance counters. This patch also reenables Linux performance counters on AMD cpus. [ Impact: re-enable perfcounters on AMD CPUs ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-25-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 45 +++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2d3681bbb52..f4d59d4cf3f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -240,10 +240,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; int err; - /* disable temporarily */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return -ENOSYS; - if (!x86_pmu_initialized()) return -ENODEV; @@ -773,7 +769,43 @@ out: return ret; } -static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { return 0; } +static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) +{ + int cpu = smp_processor_id(); + struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); + u64 val; + int handled = 0; + struct perf_counter *counter; + struct hw_perf_counter *hwc; + int idx; + + ++cpuc->interrupts; + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active)) + continue; + counter = cpuc->counters[idx]; + hwc = &counter->hw; + x86_perf_counter_update(counter, hwc, idx); + val = atomic64_read(&hwc->prev_count); + if (val & (1ULL << (x86_pmu.counter_bits - 1))) + continue; + /* counter overflow */ + x86_perf_counter_set_period(counter, hwc, idx); + handled = 1; + inc_irq_stat(apic_perf_irqs); + if (perf_counter_overflow(counter, nmi, regs, 0)) + amd_pmu_disable_counter(hwc, idx); + else if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) + /* + * do not reenable when throttled, but reload + * the register + */ + amd_pmu_disable_counter(hwc, idx); + else if (counter->state == PERF_COUNTER_STATE_ACTIVE) + amd_pmu_enable_counter(hwc, idx); + } + return handled; +} void perf_counter_unthrottle(void) { @@ -782,9 +814,6 @@ void perf_counter_unthrottle(void) if (!x86_pmu_initialized()) return; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return; - cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) -- cgit v1.2.3 From 4b7bfd0d276da3a006d37e85d3cf900d7a14ae2a Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:22 +0200 Subject: perf_counter, x86: return raw count with x86_perf_counter_update() To check on AMD cpus if a counter overflows, the upper bit of the raw counter value must be checked. This value is already internally available in x86_perf_counter_update(). Now, the value is returned so that it can be used directly to check for overflows. [ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-26-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f4d59d4cf3f..a8a53abd706 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -132,7 +132,7 @@ static u64 amd_pmu_raw_event(u64 event) * Can only be executed on the CPU where the counter is active. * Returns the delta events processed. */ -static void +static u64 x86_perf_counter_update(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { @@ -165,6 +165,8 @@ again: atomic64_add(delta, &counter->count); atomic64_sub(delta, &hwc->period_left); + + return new_raw_count; } static atomic_t num_counters; @@ -785,8 +787,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) continue; counter = cpuc->counters[idx]; hwc = &counter->hw; - x86_perf_counter_update(counter, hwc, idx); - val = atomic64_read(&hwc->prev_count); + val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) continue; /* counter overflow */ -- cgit v1.2.3 From c619b8ffb1cec6a431687a35695dc6fd292a79e6 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:23 +0200 Subject: perf_counter, x86: introduce max_period variable In x86 pmus the allowed counter period to programm differs. This introduces a max_period value and allows the generic implementation for all models to check the max period. [ Impact: generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-27-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a8a53abd706..4b8715b34f8 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -54,6 +54,7 @@ struct x86_pmu { int num_counters_fixed; int counter_bits; u64 counter_mask; + u64 max_period; }; static struct x86_pmu x86_pmu __read_mostly; @@ -279,14 +280,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; hwc->irq_period = hw_event->irq_period; - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic counter period: - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) - hwc->irq_period = 0x7FFFFFFF; + if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) + hwc->irq_period = x86_pmu.max_period; atomic64_set(&hwc->period_left, hwc->irq_period); @@ -910,6 +905,12 @@ static struct x86_pmu intel_pmu = { .event_map = intel_pmu_event_map, .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic counter period: + */ + .max_period = (1ULL << 31) - 1, }; static struct x86_pmu amd_pmu = { @@ -927,6 +928,8 @@ static struct x86_pmu amd_pmu = { .num_counters = 4, .counter_bits = 48, .counter_mask = (1ULL << 48) - 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, }; static int intel_pmu_init(void) @@ -999,6 +1002,7 @@ void __init init_hw_perf_counters(void) perf_max_counters = x86_pmu.num_counters; pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; -- cgit v1.2.3 From ef7b3e09ffdcd5200aea9523f6b56d331d1c4fc0 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:24 +0200 Subject: perf_counter, x86: remove vendor check in fixed_mode_idx() The function fixed_mode_idx() is used generically. Now it checks the num_counters_fixed value instead of the vendor to decide if fixed counters are present. [ Impact: generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-28-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 4b8715b34f8..d1c8036dcbd 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -542,7 +542,7 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { unsigned int event; - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + if (!x86_pmu.num_counters_fixed) return -1; if (unlikely(hwc->nmi)) -- cgit v1.2.3 From 19d84dab55a383d75c885b5c1a618f5ead96f2f6 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:25 +0200 Subject: perf_counter, x86: remove unused function argument in intel_pmu_get_status() The mask argument is unused and thus can be removed. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-29-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d1c8036dcbd..856b0b85219 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -393,7 +393,7 @@ void hw_perf_restore(u64 ctrl) */ EXPORT_SYMBOL_GPL(hw_perf_restore); -static inline u64 intel_pmu_get_status(u64 mask) +static inline u64 intel_pmu_get_status(void) { u64 status; @@ -728,7 +728,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) cpuc->throttle_ctrl = intel_pmu_save_disable_all(); - status = intel_pmu_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(); if (!status) goto out; @@ -753,7 +753,7 @@ again: /* * Repeat if there is more work to be done: */ - status = intel_pmu_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(); if (status) goto again; out: -- cgit v1.2.3 From 98144511427c192e4249ff66a3f9debc55c59411 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2009 14:52:50 +0200 Subject: perf_counter: add/update copyrights Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 856b0b85219..47e563bfd4c 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1,10 +1,11 @@ /* * Performance counter x86 architecture code * - * Copyright(C) 2008 Thomas Gleixner - * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar - * Copyright(C) 2009 Jaswinder Singh Rajput - * Copyright(C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * * For licencing details see kernel-base/COPYING */ -- cgit v1.2.3 From 43f6201a22dbf1c5abe1cab96b49bd56fa9df8f4 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 16:55:56 +0200 Subject: perf_counter, x86: rename bitmasks to ->used_mask and ->active_mask Standardize on explicitly mentioning '_mask' in fields that are not plain flags but masks. This avoids typos like: if (cpuc->used) (which could easily slip through review unnoticed), while if a typo looks like this: if (cpuc->used_mask) it might get noticed during review. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1241016956-24648-1-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 47e563bfd4c..fc06f4d3264 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -28,8 +28,8 @@ static u64 perf_counter_mask __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; - unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long active[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; u64 throttle_ctrl; int enabled; @@ -332,7 +332,7 @@ static u64 amd_pmu_save_disable_all(void) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active)) + if (!test_bit(idx, cpuc->active_mask)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) @@ -373,7 +373,7 @@ static void amd_pmu_restore_all(u64 ctrl) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active)) + if (!test_bit(idx, cpuc->active_mask)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) @@ -576,7 +576,7 @@ static int x86_pmu_enable(struct perf_counter *counter) * Try to get the fixed counter, if that is already taken * then try to get a generic counter: */ - if (test_and_set_bit(idx, cpuc->used)) + if (test_and_set_bit(idx, cpuc->used_mask)) goto try_generic; hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; @@ -590,14 +590,14 @@ static int x86_pmu_enable(struct perf_counter *counter) } else { idx = hwc->idx; /* Try to get the previous generic counter again */ - if (test_and_set_bit(idx, cpuc->used)) { + if (test_and_set_bit(idx, cpuc->used_mask)) { try_generic: - idx = find_first_zero_bit(cpuc->used, + idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_counters); if (idx == x86_pmu.num_counters) return -EAGAIN; - set_bit(idx, cpuc->used); + set_bit(idx, cpuc->used_mask); hwc->idx = idx; } hwc->config_base = x86_pmu.eventsel; @@ -609,7 +609,7 @@ try_generic: x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; - set_bit(idx, cpuc->active); + set_bit(idx, cpuc->active_mask); x86_perf_counter_set_period(counter, hwc, idx); x86_pmu.enable(hwc, idx); @@ -643,7 +643,7 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); } - pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); + pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); for (idx = 0; idx < x86_pmu.num_counters; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); @@ -677,7 +677,7 @@ static void x86_pmu_disable(struct perf_counter *counter) * Must be done before we disable, otherwise the nmi handler * could reenable again: */ - clear_bit(idx, cpuc->active); + clear_bit(idx, cpuc->active_mask); x86_pmu.disable(hwc, idx); /* @@ -692,7 +692,7 @@ static void x86_pmu_disable(struct perf_counter *counter) */ x86_perf_counter_update(counter, hwc, idx); cpuc->counters[idx] = NULL; - clear_bit(idx, cpuc->used); + clear_bit(idx, cpuc->used_mask); } /* @@ -741,7 +741,7 @@ again: struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); - if (!test_bit(bit, cpuc->active)) + if (!test_bit(bit, cpuc->active_mask)) continue; intel_pmu_save_and_restart(counter); @@ -779,7 +779,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) ++cpuc->interrupts; for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active)) + if (!test_bit(idx, cpuc->active_mask)) continue; counter = cpuc->counters[idx]; hwc = &counter->hw; -- cgit v1.2.3 From 2b72394e4089643f11669d9610907a1442fe044a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 28 Apr 2009 16:00:49 +0300 Subject: x86: move max_pfn_mapped and max_low_pfn_mapped to setup.c This patch moves the max_pfn_mapped and max_low_pfn_mapped global variables to kernel/setup.c where they're initialized. [ Impact: cleanup ] Signed-off-by: Pekka Enberg LKML-Reference: <1240923649.1982.21.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4158439bf6..0d77e56e821 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -112,6 +112,14 @@ #define ARCH_SETUP #endif +/* + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. + * The direct mapping extends to max_pfn_mapped, so that we can directly access + * apertures, ACPI and other tables without having to play with fixmaps. + */ +unsigned long max_low_pfn_mapped; +unsigned long max_pfn_mapped; + RESERVE_BRK(dmi_alloc, 65536); unsigned int boot_cpu_id __read_mostly; -- cgit v1.2.3 From 12d161147f828192b5bcc33166f468a827832767 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 4 Apr 2009 21:01:10 +0000 Subject: x86: hookup sys_rt_tgsigqueueinfo Make the new sys_rt_tgsigqueueinfo available for x86. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/syscall_table_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index ff5c8736b49..734f92c02dd 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -334,3 +334,4 @@ ENTRY(sys_call_table) .long sys_inotify_init1 .long sys_preadv .long sys_pwritev + .long sys_rt_tgsigqueueinfo /* 335 */ -- cgit v1.2.3 From 63a809a2dc53b91268dd915bbcbd425063893676 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 May 2009 12:23:17 +0200 Subject: perf_counter: fix nmi-watchdog interaction When we don't have any perf-counters active, don't act like we know what the NMI is for. [ Impact: fix hard hang with nmi_watchdog=2 ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090501102533.109867793@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fc06f4d3264..d4c0cc9d326 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -871,6 +871,9 @@ perf_counter_nmi_handler(struct notifier_block *self, struct pt_regs *regs; int ret; + if (!atomic_read(&num_counters)) + return NOTIFY_DONE; + switch (cmd) { case DIE_NMI: case DIE_NMI_IPI: -- cgit v1.2.3 From 15e957d08dd4a841359cfec59ecb74041e0097aa Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 30 Apr 2009 01:17:50 -0700 Subject: x86/irq: use move_irq_desc() in create_irq_nr() move_irq_desc() will try to move irq_desc to the home node if the allocated one is not correct, in create_irq_nr(). ( This can happen on devices that are on different nodes that are using MSI, when drivers are loaded and unloaded randomly. ) v2: fix non-smp build v3: add NUMA_IRQ_DESC to eliminate #ifdefs [ Impact: improve irq descriptor locality on NUMA systems ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49F95EAE.2050903@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9cd4806cdf5..e583291fe6c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3197,11 +3197,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) if (cfg_new->vector != 0) continue; -#ifdef CONFIG_NUMA_IRQ_DESC - /* different node ?*/ - if (desc_new->node != node) - desc = move_irq_desc(desc, node); -#endif + desc_new = move_irq_desc(desc_new, node); if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; -- cgit v1.2.3 From 6f0aced639d346e5f54eea9fcb2784b633493d09 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 1 May 2009 23:54:25 +0400 Subject: x86, apic: use pr_ macro Replace recenly appeared printk with pr_ macro (the file already use a lot of them). [ Impact: cleanup ] Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090501195425.GB4633@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 28f747d61d7..e258bedce7c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2191,7 +2191,7 @@ static int __cpuinit set_multi(const struct dmi_system_id *d) { if (multi) return 0; - printk(KERN_INFO "APIC: %s detected, Multi Chassis\n", d->ident); + pr_info("APIC: %s detected, Multi Chassis\n", d->ident); multi = 1; return 0; } -- cgit v1.2.3 From 1cbac972ba28e706fa9ce4d4c81830040bc811ee Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 2 May 2009 13:39:56 +0400 Subject: x86: uv io-apic - use BUILD_BUG_ON instead of BUG_ON The expression is known to be true/false at compilation time so we're allowed to use build-time instead of run-time check. Also align 'entry' items assignment. [ Impact: shrink kernel a bit, cleanup ] Signed-off-by: Cyrill Gorcunov Cc: Jack Steiner LKML-Reference: <20090502093956.GB4791@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8aef5f9d947..a80335ba12c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3749,6 +3749,8 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long flags; int err; + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + cfg = irq_cfg(irq); err = assign_irq_vector(irq, cfg, eligible_cpu); @@ -3762,15 +3764,13 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@ -3788,10 +3788,10 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) struct uv_IO_APIC_route_entry *entry; int mmr_pnode; + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - entry->mask = 1; mmr_pnode = uv_blade_to_pnode(mmr_blade); -- cgit v1.2.3 From 9a8709d44139748fe2e0ab56d20d8c384c8b65ad Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 2 May 2009 00:25:11 +0400 Subject: x86: uv - prevent NULL dereference in uv_system_init() We may reach NULL dereference oops if kmalloc failed. Prevent it with explicit BUG_ON. [ Impact: more controlled assert in 'impossible' scenario ] Signed-off-by: Cyrill Gorcunov Acked-by: Jack Steiner LKML-Reference: <20090501202511.GE4633@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 873bf7121e8..9d9e2281a82 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -569,15 +569,18 @@ void __init uv_system_init(void) bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); uv_blade_info = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_blade_info); get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); uv_node_to_blade = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_node_to_blade); memset(uv_node_to_blade, 255, bytes); bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus(); uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_cpu_to_blade); memset(uv_cpu_to_blade, 255, bytes); blade = 0; -- cgit v1.2.3 From ba77813a2a22d631fe5bc0bf1ec0d11350544b70 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 May 2009 18:47:44 +0200 Subject: perf_counter: x86: fixup nmi_watchdog vs perf_counter boo-boo Invert the atomic_inc_not_zero() test so that we will indeed detect the first activation. Also rename the global num_counters, since its easy to confuse with x86_pmu.num_counters. [ Impact: fix non-working perfcounters on AMD CPUs, cleanup ] Signed-off-by: Peter Zijlstra LKML-Reference: <1241455664.7620.4938.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d4c0cc9d326..196b58f0444 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -171,7 +171,7 @@ again: return new_raw_count; } -static atomic_t num_counters; +static atomic_t active_counters; static DEFINE_MUTEX(pmc_reserve_mutex); static bool reserve_pmc_hardware(void) @@ -224,7 +224,7 @@ static void release_pmc_hardware(void) static void hw_perf_counter_destroy(struct perf_counter *counter) { - if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) { + if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { release_pmc_hardware(); mutex_unlock(&pmc_reserve_mutex); } @@ -248,12 +248,12 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -ENODEV; err = 0; - if (atomic_inc_not_zero(&num_counters)) { + if (!atomic_inc_not_zero(&active_counters)) { mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware()) + if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) err = -EBUSY; else - atomic_inc(&num_counters); + atomic_inc(&active_counters); mutex_unlock(&pmc_reserve_mutex); } if (err) @@ -280,7 +280,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (capable(CAP_SYS_ADMIN) && hw_event->nmi) hwc->nmi = 1; - hwc->irq_period = hw_event->irq_period; + hwc->irq_period = hw_event->irq_period; if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) hwc->irq_period = x86_pmu.max_period; @@ -871,7 +871,7 @@ perf_counter_nmi_handler(struct notifier_block *self, struct pt_regs *regs; int ret; - if (!atomic_read(&num_counters)) + if (!atomic_read(&active_counters)) return NOTIFY_DONE; switch (cmd) { -- cgit v1.2.3 From 066d7dea32c9bffe6decc0abe465627656cdd84e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2009 19:04:09 +0200 Subject: perf_counter: fix fixed-purpose counter support on v2 Intel-PERFMON Fixed-purpose counters stopped working in a simple 'perf stat ls' run: cache references cache misses Due to: ef7b3e0: perf_counter, x86: remove vendor check in fixed_mode_idx() Which made x86_pmu.num_counters_fixed matter: if it's nonzero, the fixed-purpose counters are utilized. But on v2 perfmon this field is not set (despite there being fixed-purpose PMCs). So add a quirk to set the number of fixed-purpose counters to at least three. [ Impact: add quirk for three fixed-purpose counters on certain Intel CPUs ] Cc: Robert Richter Cc: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <1241002046-8832-28-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 196b58f0444..a6878b0798e 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -962,7 +962,13 @@ static int intel_pmu_init(void) x86_pmu = intel_pmu; x86_pmu.version = version; x86_pmu.num_counters = eax.split.num_counters; - x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; + + /* + * Quirk: v2 perfmon does not report fixed-purpose counters, so + * assume at least 3 counters: + */ + x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); + x86_pmu.counter_bits = eax.split.bit_width; x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; -- cgit v1.2.3 From 975e5f45500dff6d15c0001bb662e9aac0ce0076 Mon Sep 17 00:00:00 2001 From: Samuel Bronson Date: Wed, 6 May 2009 22:27:55 -0400 Subject: x86: use symbolic name for VM86_SIGNAL when used as vm86 default return This code has apparently used "0" and not VM86_SIGNAL since Linux 1.1.9, when Linus added VM86_SIGNAL to vm86.h. This patch changes the code to use the symbolic name. The magic 0 tripped me up in trying to extend the vm86(2) manpage to actually explain vm86()'s interface -- my greps for VM86_SIGNAL came up fruitless. [ Impact: cleanup; no object code change ] Signed-off-by: Samuel Bronson Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vm86_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index d7ac84e7fc1..b8035a0f404 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -318,9 +318,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk } /* - * Save old state, set default return value (%ax) to 0 + * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL) */ - info->regs32->ax = 0; + info->regs32->ax = VM86_SIGNAL; tsk->thread.saved_sp0 = tsk->thread.sp0; tsk->thread.saved_fs = info->regs32->fs; tsk->thread.saved_gs = get_user_gs(info->regs32); -- cgit v1.2.3 From 643bec956544d376b7c2a80a3d5c3d0bf94da8d3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 7 May 2009 09:12:50 +0200 Subject: x86: clean up arch/x86/kernel/tsc_sync.c a bit - remove unused define - make the lock variable definition stand out some more - convert KERN_* to pr_info() / pr_warning() [ Impact: cleanup ] LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_sync.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index bf36328f6ef..027b5b49899 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count; * of a critical section, to be able to prove TSC time-warps: */ static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; + static __cpuinitdata cycles_t last_tsc; static __cpuinitdata cycles_t max_warp; static __cpuinitdata int nr_warps; @@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu) return; if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { - printk(KERN_INFO - "Skipping synchronization checks as TSC is reliable.\n"); + pr_info("Skipping synchronization checks as TSC is reliable.\n"); return; } - printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", - smp_processor_id(), cpu); + pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:", + smp_processor_id(), cpu); /* * Reset it - in case this is a second bootup: @@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu) if (nr_warps) { printk("\n"); - printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," - " turning off TSC clock.\n", max_warp); + pr_warning("Measured %Ld cycles TSC warp between CPUs, " + "turning off TSC clock.\n", max_warp); mark_tsc_unstable("check_tsc_sync_source failed"); } else { printk(" passed.\n"); @@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void) while (atomic_read(&stop_count) != cpus) cpu_relax(); } -#undef NR_LOOPS - -- cgit v1.2.3 From 6cac5a924668a56c7ccefc345805f1fe0536a90e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 29 Mar 2009 19:56:29 -0700 Subject: xen/x86-64: fix breakpoints and hardware watchpoints Native x86-64 uses the IST mechanism to run int3 and debug traps on an alternative stack. Xen does not do this, and so the frames were being misinterpreted by the ptrace code. This change special-cases these two exceptions by using Xen variants which run on the normal kernel stack properly. Impact: avoid crash or bad data when IST trap is invoked under Xen Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/entry_64.S | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 38946c6e843..bb01ce080b8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1379,6 +1379,11 @@ END(xen_failsafe_callback) paranoidzeroentry_ist debug do_debug DEBUG_STACK paranoidzeroentry_ist int3 do_int3 DEBUG_STACK paranoiderrorentry stack_segment do_stack_segment +#ifdef CONFIG_XEN +zeroentry xen_debug do_debug +zeroentry xen_int3 do_int3 +errorentry xen_stack_segment do_stack_segment +#endif errorentry general_protection do_general_protection errorentry page_fault do_page_fault #ifdef CONFIG_X86_MCE -- cgit v1.2.3 From 778dedae0cb76a441145f3a0c5d59fcb3ba296d5 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Sat, 9 May 2009 12:54:34 +0800 Subject: x86: mce: remove duplicated #include Remove duplicated #include in arch/x86/kernel/cpu/mcheck/mce_intel_64.c. [ Impact: cleanup ] Signed-off-by: Huang Weiyi Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index d6b72df89d6..aedf9766ee9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -15,7 +15,6 @@ #include #include #include -#include asmlinkage void smp_thermal_interrupt(void) { -- cgit v1.2.3 From b74d446f1f337e3fe906169a3266cb65ffa4179e Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sat, 9 May 2009 15:35:10 +0600 Subject: x86: Fix false positive section mismatch warnings in the apic code [ Impact: reduce kernel image size a bit, annotate away warnings ] Signed-off-by: Sam Ravnborg [ modified and tested it ] Signed-off-by: Rakib Mullick Cc: Marcin Slusarz LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/es7000_32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 1c11b819f24..30294777557 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -254,7 +254,7 @@ static int parse_unisys_oem(char *oemptr) } #ifdef CONFIG_ACPI -static int find_unisys_acpi_oem_table(unsigned long *oem_addr) +static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr) { struct acpi_table_header *header = NULL; struct es7000_oem_table *table; @@ -285,7 +285,7 @@ static int find_unisys_acpi_oem_table(unsigned long *oem_addr) return 0; } -static void unmap_unisys_acpi_oem_table(unsigned long oem_addr) +static void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr) { if (!oem_addr) return; @@ -306,7 +306,7 @@ static int es7000_check_dsdt(void) static int es7000_acpi_ret; /* Hook from generic ACPI tables.c */ -static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { unsigned long oem_addr = 0; int check_dsdt; @@ -717,7 +717,7 @@ struct apic apic_es7000_cluster = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, }; -struct apic apic_es7000 = { +struct apic __refdata apic_es7000 = { .name = "es7000", .probe = probe_es7000, -- cgit v1.2.3 From 45fbe3ee01b8e463b28c2751b5dcc0cbdc142d90 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 6 May 2009 08:06:44 -0700 Subject: x86, e820, pci: reserve extra free space near end of RAM The point is to take all RAM resources we have, and _after_ we've added all the resources we've seen in the E820 tree, we then _also_ try to add fake reserved entries for any "round up to X" at the end of the RAM resources. [ Impact: improve PCI mem-resource allocation robustness, protect "stolen RAM" ] Reported-by: Yannick Roehlly Acked-by: Jesse Barnes Signed-off-by: Yinghai Lu Cc: Ivan Kokshaysky Cc: Andrew Morton Cc: yannick.roehlly@free.fr LKML-Reference: <4A01A784.2050407@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 00628130292..a2335d9de05 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1371,6 +1371,23 @@ void __init e820_reserve_resources(void) } } +/* How much should we pad RAM ending depending on where it is? */ +static unsigned long ram_alignment(resource_size_t pos) +{ + unsigned long mb = pos >> 20; + + /* To 64kB in the first megabyte */ + if (!mb) + return 64*1024; + + /* To 1MB in the first 16MB */ + if (mb < 16) + return 1024*1024; + + /* To 32MB for anything above that */ + return 32*1024*1024; +} + void __init e820_reserve_resources_late(void) { int i; @@ -1382,6 +1399,24 @@ void __init e820_reserve_resources_late(void) insert_resource_expand_to_fit(&iomem_resource, res); res++; } + + /* + * Try to bump up RAM regions to reasonable boundaries to + * avoid stolen RAM: + */ + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *entry = &e820_saved.map[i]; + resource_size_t start, end; + + if (entry->type != E820_RAM) + continue; + start = entry->addr + entry->size; + end = round_up(start, ram_alignment(start)); + if (start == end) + continue; + reserve_region_with_split(&iomem_resource, start, + end - 1, "RAM buffer"); + } } char *__init default_machine_specific_memory_setup(void) -- cgit v1.2.3 From 5d423ccd7ba4285f1084e91b26805e1d0ae978ed Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 08:07:52 -0700 Subject: x86/pci: remove rounding quirk from e820_setup_gap() Now that the e820 code explicitly reserves 'potentially dangerous' free physical memory address space to protect ACPI stolen RAM, there's no need for the rounding quirk in the PCI allocator anymore. Also, this quirk was open-ended iteration that could end up reserving a lot of free space and potentially breaking drivers - such as the one reported by Yannick Roehlly where there's a PCI device with a large memory resource. So remove it. [ Impact: make more of the PCI hole available for assigning pci devices ] Reported-by: Yannick Roehlly Signed-off-by: Yinghai Lu Acked-by: Jesse Barnes Cc: Ivan Kokshaysky Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <4A01A7C8.5090701@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a2335d9de05..7271fa33d79 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, */ __init void e820_setup_gap(void) { - unsigned long gapstart, gapsize, round; + unsigned long gapstart, gapsize; int found; gapstart = 0x10000000; @@ -635,14 +635,9 @@ __init void e820_setup_gap(void) #endif /* - * See how much we want to round up: start off with - * rounding to the next 1MB area. + * e820_reserve_resources_late protect stolen RAM already */ - round = 0x100000; - while ((gapsize >> 4) > round) - round += round; - /* Fun with two's complement */ - pci_mem_start = (gapstart + round) & -round; + pci_mem_start = gapstart; printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", -- cgit v1.2.3 From b9e0353fc85dab4ef5ebcef2bd09ebc4ce6d5a7b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:05:32 -0700 Subject: x86/acpi: remove irq-compression trick on 32-bit We already have a per cpu vector on 32-bit via recent changes, and don't need this trick any more (which trick obfuscates the real GSI mappings and which only triggers on larger systems to begin with): On 3 ioapic system (24 per ioapic) before patch I got: ACPI: PCI Interrupt Link [ILSB] enabled at IRQ 71 IOAPIC[2]: Set routing entry (10-23 -> 0xa9 -> IRQ 64 Mode:1 Active:1) pci 0000:80:01.1: PCI INT A -> Link[ILSB] -> GSI 71 (level, low) -> IRQ 64 ACPI: PCI Interrupt Link [LE5B] enabled at IRQ 67 IOAPIC[2]: Set routing entry (10-19 -> 0xb1 -> IRQ 65 Mode:1 Active:1) pci 0000:83:00.0: PCI INT B -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 65 ACPI: PCI Interrupt Link [LE5A] enabled at IRQ 66 IOAPIC[2]: Set routing entry (10-18 -> 0xb9 -> IRQ 66 Mode:1 Active:1) pci 0000:83:00.1: PCI INT A -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66 ACPI: PCI Interrupt Link [LE5D] enabled at IRQ 65 IOAPIC[2]: Set routing entry (10-17 -> 0xc1 -> IRQ 67 Mode:1 Active:1) pci 0000:84:00.0: PCI INT B -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 67 ACPI: PCI Interrupt Link [LE5C] enabled at IRQ 64 IOAPIC[2]: Set routing entry (10-16 -> 0xc9 -> IRQ 68 Mode:1 Active:1) pci 0000:84:00.1: PCI INT A -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 68 pci 0000:87:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66 pci 0000:87:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 67 pci 0000:88:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 68 pci 0000:88:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 65 pci 0000:8b:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66 pci 0000:8b:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 67 pci 0000:8c:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 68 pci 0000:8c:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 65 after the patch we get: ACPI: PCI Interrupt Link [ILSB] enabled at IRQ 71 IOAPIC[2]: Set routing entry (10-23 -> 0xa9 -> IRQ 71 Mode:1 Active:1) pci 0000:80:01.1: PCI INT A -> Link[ILSB] -> GSI 71 (level, low) -> IRQ 71 ACPI: PCI Interrupt Link [LE5B] enabled at IRQ 67 IOAPIC[2]: Set routing entry (10-19 -> 0xb1 -> IRQ 67 Mode:1 Active:1) pci 0000:83:00.0: PCI INT B -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 67 ACPI: PCI Interrupt Link [LE5A] enabled at IRQ 66 IOAPIC[2]: Set routing entry (10-18 -> 0xb9 -> IRQ 66 Mode:1 Active:1) pci 0000:83:00.1: PCI INT A -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66 ACPI: PCI Interrupt Link [LE5D] enabled at IRQ 65 IOAPIC[2]: Set routing entry (10-17 -> 0xc1 -> IRQ 65 Mode:1 Active:1) pci 0000:84:00.0: PCI INT B -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 65 ACPI: PCI Interrupt Link [LE5C] enabled at IRQ 64 IOAPIC[2]: Set routing entry (10-16 -> 0xc9 -> IRQ 64 Mode:1 Active:1) pci 0000:84:00.1: PCI INT A -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 64 pci 0000:87:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66 pci 0000:87:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 65 pci 0000:88:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 64 pci 0000:88:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 67 pci 0000:8b:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66 pci 0000:8b:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 65 pci 0000:8c:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 64 pci 0000:8c:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 67 As it can be seen that GSIs now get mapped lineary. [ Impact: simplify irq number mapping on bigger 32-bit systems ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jesse Barnes Cc: Len Brown LKML-Reference: <4A01C35C.7060207@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 65 +++++---------------------------------------- 1 file changed, 7 insertions(+), 58 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6ee96b5530f..fb5e88262d2 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1162,22 +1162,9 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) { int ioapic; int ioapic_pin; -#ifdef CONFIG_X86_32 -#define MAX_GSI_NUM 4096 -#define IRQ_COMPRESSION_START 64 - - static int pci_irq = IRQ_COMPRESSION_START; - /* - * Mapping between Global System Interrupts, which - * represent all possible interrupts, and IRQs - * assigned to actual devices. - */ - static int gsi_to_irq[MAX_GSI_NUM]; -#else if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; -#endif /* Don't set up the ACPI SCI because it's already set up */ if (acpi_gbl_FADT.sci_interrupt == gsi) @@ -1196,66 +1183,28 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) gsi = ioapic_renumber_irq(ioapic, gsi); #endif - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ if (ioapic_pin > MP_MAX_IOAPIC_PIN) { printk(KERN_ERR "Invalid reference to IOAPIC pin " "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, ioapic_pin); return gsi; } + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { pr_debug("Pin %d-%d already programmed\n", mp_ioapic_routing[ioapic].apic_id, ioapic_pin); -#ifdef CONFIG_X86_32 - return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); -#else return gsi; -#endif } - set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); -#ifdef CONFIG_X86_32 - /* - * For GSI >= 64, use IRQ compression - */ - if ((gsi >= IRQ_COMPRESSION_START) - && (triggering == ACPI_LEVEL_SENSITIVE)) { - /* - * For PCI devices assign IRQs in order, avoiding gaps - * due to unused I/O APIC pins. - */ - int irq = gsi; - if (gsi < MAX_GSI_NUM) { - /* - * Retain the VIA chipset work-around (gsi > 15), but - * avoid a problem where the 8254 timer (IRQ0) is setup - * via an override (so it's not on pin 0 of the ioapic), - * and at the same time, the pin 0 interrupt is a PCI - * type. The gsi > 15 test could cause these two pins - * to be shared as IRQ0, and they are not shareable. - * So test for this condition, and if necessary, avoid - * the pin collision. - */ - gsi = pci_irq++; - /* - * Don't assign IRQ used by ACPI SCI - */ - if (gsi == acpi_gbl_FADT.sci_interrupt) - gsi = pci_irq++; - gsi_to_irq[irq] = gsi; - } else { - printk(KERN_ERR "GSI %u is too high\n", gsi); - return gsi; - } - } -#endif io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi, triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + return gsi; } -- cgit v1.2.3 From ee214558c2e959781a406e76c5b34364da638e1d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:07:07 -0700 Subject: x86: fix alloc_mptable() Fix the conditions when we stop updating the mptable due to running out of slots. [ Impact: fix memory corruption / non-working update_mptable boot parameter ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jesse Barnes Cc: Len Brown LKML-Reference: <4A01C3BB.1000609@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 70fd7e414c1..cd2a41a7c45 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -870,24 +870,17 @@ static inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} #endif /* CONFIG_X86_IO_APIC */ -static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, - int count) +static int +check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) { - if (!mpc_new_phys) { - pr_info("No spare slots, try to append...take your risk, " - "new mpc_length %x\n", count); - } else { - if (count <= mpc_new_length) - pr_info("No spare slots, try to append..., " - "new mpc_length %x\n", count); - else { - pr_err("mpc_new_length %lx is too small\n", - mpc_new_length); - return -1; - } + int ret = 0; + + if (!mpc_new_phys || count <= mpc_new_length) { + WARN(1, "update_mptable: No spare slots (length: %x)\n", count); + return -1; } - return 0; + return ret; } static int __init replace_intsrc_all(struct mpc_table *mpc, @@ -946,7 +939,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, } else { struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; count += sizeof(struct mpc_intsrc); - if (!check_slot(mpc_new_phys, mpc_new_length, count)) + if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) goto out; assign_to_mpc_intsrc(&mp_irqs[i], m); mpc->length = count; -- cgit v1.2.3 From a31f82057ce6f7ced578d64c07a72ccbdc7336e4 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:06:15 -0700 Subject: x86/acpi: call mp_config_acpi_gsi() in mp_register_gsi() The patch to call mp_config_acpi_gsi() from the ACPI IRQ registration code never got mainline because there were open discussions about it. This call is needed to properly update the kernel's copy of the mptable, when the update_mptable boot parameter is needed. Now that the dust has settled with the APIC unification, and since there were no objections when the patch was re-submitted, try this again. [ Impact: fix the update_mptable boot parameter ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jesse Barnes Cc: Len Brown LKML-Reference: <4A01C387.7090103@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 66 +++++++++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 26 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index fb5e88262d2..8019ecf66e9 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -1158,6 +1159,44 @@ void __init mp_config_acpi_legacy_irqs(void) } } +static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering, + int polarity) +{ +#ifdef CONFIG_X86_MPPARSE + struct mpc_intsrc mp_irq; + struct pci_dev *pdev; + unsigned char number; + unsigned int devfn; + int ioapic; + u8 pin; + + if (!acpi_ioapic) + return 0; + if (!dev) + return 0; + if (dev->bus != &pci_bus_type) + return 0; + + pdev = to_pci_dev(dev); + number = pdev->bus->number; + devfn = pdev->devfn; + pin = pdev->pin; + /* print the entry should happen on mptable identically */ + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | + (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); + mp_irq.srcbus = number; + mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); + ioapic = mp_find_ioapic(gsi); + mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id; + mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); + + save_mp_irq(&mp_irq); +#endif + return 0; +} + int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) { int ioapic; @@ -1189,6 +1228,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) ioapic_pin); return gsi; } + mp_config_acpi_gsi(dev, gsi, triggering, polarity); /* * Avoid pin reprogramming. PRTs typically include entries @@ -1208,32 +1248,6 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) return gsi; } -int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, - u32 gsi, int triggering, int polarity) -{ -#ifdef CONFIG_X86_MPPARSE - struct mpc_intsrc mp_irq; - int ioapic; - - if (!acpi_ioapic) - return 0; - - /* print the entry should happen on mptable identically */ - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; - mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | - (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); - mp_irq.srcbus = number; - mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); - ioapic = mp_find_ioapic(gsi); - mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id; - mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); - - save_mp_irq(&mp_irq); -#endif - return 0; -} - /* * Parse IOAPIC related entries in MADT * returns 0 on success, < 0 on error -- cgit v1.2.3 From bdfe8ac153546537ed24de69610ea781a734f785 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:07:41 -0700 Subject: x86/acpi: move pin_programmed bit map to io_apic.c Prepare to call setup_io_apic_routing() in pcibios_irq_enable() also remove not needed member apic_id. [ Impact: clean up, prepare for future change ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jesse Barnes Cc: Len Brown LKML-Reference: <4A01C3DD.3050104@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 18 ++---------------- arch/x86/kernel/apic/io_apic.c | 25 ++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8019ecf66e9..dcfbc3ab9e4 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -904,10 +904,8 @@ extern int es7000_plat; #endif static struct { - int apic_id; int gsi_base; int gsi_end; - DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); } mp_ioapic_routing[MAX_IO_APICS]; int mp_find_ioapic(int gsi) @@ -996,7 +994,6 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) * Build basic GSI lookup table to facilitate gsi->io_apic lookups * and to prevent reprogramming of IOAPIC pins (PCI GSIs). */ - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid; mp_ioapic_routing[idx].gsi_base = gsi_base; mp_ioapic_routing[idx].gsi_end = gsi_base + io_apic_get_redir_entries(idx); @@ -1189,7 +1186,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering, mp_irq.srcbus = number; mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); ioapic = mp_find_ioapic(gsi); - mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id; + mp_irq.dstapic = mp_ioapics[ioapic].apicid; mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); save_mp_irq(&mp_irq); @@ -1224,23 +1221,12 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) if (ioapic_pin > MP_MAX_IOAPIC_PIN) { printk(KERN_ERR "Invalid reference to IOAPIC pin " - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, + "%d-%d\n", mp_ioapics[ioapic].apicid, ioapic_pin); return gsi; } mp_config_acpi_gsi(dev, gsi, triggering, polarity); - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ - if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mp_ioapic_routing[ioapic].apic_id, ioapic_pin); - return gsi; - } - set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi, triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, polarity == ACPI_ACTIVE_HIGH ? 0 : 1); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 21c30e1121e..e279ae33928 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3922,7 +3922,7 @@ int __init io_apic_get_version(int ioapic) } #endif -int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, +static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, int triggering, int polarity) { struct irq_desc *desc; @@ -3959,6 +3959,29 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, return 0; } +static struct { + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} mp_ioapic_routing[MAX_IO_APICS]; + +int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, + int triggering, int polarity) +{ + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mp_ioapics[ioapic].apicid, pin); + return 0; + } + set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); + + return __io_apic_set_pci_routing(dev, ioapic, pin, irq, + triggering, polarity); +} int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) { -- cgit v1.2.3 From e20c06fd6950265a899edd96a02dc2e6ae2d1ce5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:08:22 -0700 Subject: x86/pci: add 4 more return parameters to IO_APIC_get_PCI_irq_vector() To prepare those params for pcibios_irq_enable() to call setup_io_apic_routing(). [ Impact: extend function call API to prepare for new functionality ] Signed-off-by: Yinghai Lu Acked-by: Jesse Barnes Cc: Len Brown Cc: Andrew Morton LKML-Reference: <4A01C406.2040303@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 107 +++++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 48 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e279ae33928..caf9dbdde05 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -873,54 +873,6 @@ static int __init find_isa_irq_apic(int irq, int type) return -1; } -/* - * Find a specific PCI IRQ entry. - * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ - int apic, i, best_guess = -1; - - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", - bus, slot, pin); - if (test_bit(bus, mp_bus_not_pci)) { - apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); - return -1; - } - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].srcbus; - - for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || - mp_irqs[i].dstapic == MP_APIC_ALL) - break; - - if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); - - if (!(apic || IO_APIC_IRQ(irq))) - continue; - - if (pin == (mp_irqs[i].srcbusirq & 3)) - return irq; - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) - best_guess = irq; - } - } - return best_guess; -} - -EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); - #if defined(CONFIG_EISA) || defined(CONFIG_MCA) /* * EISA Edge/Level control register, ELCR @@ -1139,6 +1091,65 @@ static int pin_2_irq(int idx, int apic, int pin) return irq; } +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, + int *ioapic, int *ioapic_pin, + int *trigger, int *polarity) +{ + int apic, i, best_guess = -1; + + apic_printk(APIC_DEBUG, + "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); + if (test_bit(bus, mp_bus_not_pci)) { + apic_printk(APIC_VERBOSE, + "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || + mp_irqs[i].dstapic == MP_APIC_ALL) + break; + + if (!test_bit(lbus, mp_bus_not_pci) && + !mp_irqs[i].irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); + + if (!(apic || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].srcbusirq & 3)) { + *ioapic = apic; + *ioapic_pin = mp_irqs[i].dstirq; + *trigger = irq_trigger(i); + *polarity = irq_polarity(i); + return irq; + } + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_guess < 0) { + *ioapic = apic; + *ioapic_pin = mp_irqs[i].dstirq; + *trigger = irq_trigger(i); + *polarity = irq_polarity(i); + best_guess = irq; + } + } + } + return best_guess; +} +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + void lock_vector_lock(void) { /* Used to the online set of cpus does not change -- cgit v1.2.3 From 5ef2183768bb7d64b85eccbfa1537a61cbefa97c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:08:50 -0700 Subject: x86/acpi: move setup io apic routing out of CONFIG_ACPI scope So we could set io apic routing when ACPI is not enabled. [ Impact: prepare for new functionality ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jesse Barnes Cc: Len Brown LKML-Reference: <4A01C422.5070400@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 122 ++++++++++++++++++++--------------------- 1 file changed, 61 insertions(+), 61 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index caf9dbdde05..3a68daee0d9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3839,6 +3839,67 @@ int __init arch_probe_nr_irqs(void) } #endif +static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, + int triggering, int polarity) +{ + struct irq_desc *desc; + struct irq_cfg *cfg; + int node; + + if (!IO_APIC_IRQ(irq)) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + ioapic); + return -EINVAL; + } + + if (dev) + node = dev_to_node(dev); + else + node = cpu_to_node(boot_cpu_id); + + desc = irq_to_desc_alloc_node(irq, node); + if (!desc) { + printk(KERN_INFO "can not get irq_desc %d\n", irq); + return 0; + } + + /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= NR_IRQS_LEGACY) { + cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, ioapic, pin); + } + + setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); + + return 0; +} + +static struct { + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} mp_ioapic_routing[MAX_IO_APICS]; + +int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, + int triggering, int polarity) +{ + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mp_ioapics[ioapic].apicid, pin); + return 0; + } + set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); + + return __io_apic_set_pci_routing(dev, ioapic, pin, irq, + triggering, polarity); +} + /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ @@ -3933,67 +3994,6 @@ int __init io_apic_get_version(int ioapic) } #endif -static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, - int triggering, int polarity) -{ - struct irq_desc *desc; - struct irq_cfg *cfg; - int node; - - if (!IO_APIC_IRQ(irq)) { - apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); - return -EINVAL; - } - - if (dev) - node = dev_to_node(dev); - else - node = cpu_to_node(boot_cpu_id); - - desc = irq_to_desc_alloc_node(irq, node); - if (!desc) { - printk(KERN_INFO "can not get irq_desc %d\n", irq); - return 0; - } - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= NR_IRQS_LEGACY) { - cfg = desc->chip_data; - add_pin_to_irq_node(cfg, node, ioapic, pin); - } - - setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); - - return 0; -} - -static struct { - DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); -} mp_ioapic_routing[MAX_IO_APICS]; - -int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, - int triggering, int polarity) -{ - - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ - if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mp_ioapics[ioapic].apicid, pin); - return 0; - } - set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); - - return __io_apic_set_pci_routing(dev, ioapic, pin, irq, - triggering, polarity); -} - int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) { int i; -- cgit v1.2.3 From b9c61b70075c87a8612624736faf4a2de5b1ed30 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:10:06 -0700 Subject: x86/pci: update pirq_enable_irq() to setup io apic routing So we can set io apic routing only when enabling the device irq. This is advantageous for IRQ descriptor allocation affinity: if we set up the IO-APIC entry later, we have a chance to allocate the IRQ descriptor later and know which device it is on and can set affinity accordingly. [ Impact: standardize/enhance irq-enabling sequence for mptable irqs ] Signed-off-by: Yinghai Lu Acked-by: Jesse Barnes Cc: Len Brown Cc: Andrew Morton LKML-Reference: <4A01C46E.8000501@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 150 ++++++++++++++++++++--------------------- 1 file changed, 74 insertions(+), 76 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3a68daee0d9..5d5f4120c74 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1480,9 +1480,13 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq ioapic_write_entry(apic_id, pin, entry); } +static struct { + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} mp_ioapic_routing[MAX_IO_APICS]; + static void __init setup_IO_APIC_irqs(void) { - int apic_id, pin, idx, irq; + int apic_id = 0, pin, idx, irq; int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; @@ -1490,48 +1494,53 @@ static void __init setup_IO_APIC_irqs(void) apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { - for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { - - idx = find_irq_entry(apic_id, pin, mp_INT); - if (idx == -1) { - if (!notcon) { - notcon = 1; - apic_printk(APIC_VERBOSE, - KERN_DEBUG " %d-%d", - mp_ioapics[apic_id].apicid, pin); - } else - apic_printk(APIC_VERBOSE, " %d-%d", - mp_ioapics[apic_id].apicid, pin); - continue; - } - if (notcon) { - apic_printk(APIC_VERBOSE, - " (apicid-pin) not connected\n"); - notcon = 0; - } +#ifdef CONFIG_ACPI + if (!acpi_disabled && acpi_ioapic) { + apic_id = mp_find_ioapic(0); + if (apic_id < 0) + apic_id = 0; + } +#endif - irq = pin_2_irq(idx, apic_id, pin); + for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { + idx = find_irq_entry(apic_id, pin, mp_INT); + if (idx == -1) { + if (!notcon) { + notcon = 1; + apic_printk(APIC_VERBOSE, + KERN_DEBUG " %d-%d", + mp_ioapics[apic_id].apicid, pin); + } else + apic_printk(APIC_VERBOSE, " %d-%d", + mp_ioapics[apic_id].apicid, pin); + continue; + } + if (notcon) { + apic_printk(APIC_VERBOSE, + " (apicid-pin) not connected\n"); + notcon = 0; + } - /* - * Skip the timer IRQ if there's a quirk handler - * installed and if it returns 1: - */ - if (apic->multi_timer_check && - apic->multi_timer_check(apic_id, irq)) - continue; + irq = pin_2_irq(idx, apic_id, pin); - desc = irq_to_desc_alloc_node(irq, node); - if (!desc) { - printk(KERN_INFO "can not get irq_desc for %d\n", irq); - continue; - } - cfg = desc->chip_data; - add_pin_to_irq_node(cfg, node, apic_id, pin); + /* + * Skip the timer IRQ if there's a quirk handler + * installed and if it returns 1: + */ + if (apic->multi_timer_check && + apic->multi_timer_check(apic_id, irq)) + continue; - setup_IO_APIC_irq(apic_id, pin, irq, desc, - irq_trigger(idx), irq_polarity(idx)); + desc = irq_to_desc_alloc_node(irq, node); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + continue; } + cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, apic_id, pin); + set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); + setup_IO_APIC_irq(apic_id, pin, irq, desc, + irq_trigger(idx), irq_polarity(idx)); } if (notcon) @@ -3876,10 +3885,6 @@ static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, in return 0; } -static struct { - DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); -} mp_ioapic_routing[MAX_IO_APICS]; - int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, int triggering, int polarity) { @@ -4023,51 +4028,44 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) #ifdef CONFIG_SMP void __init setup_ioapic_dest(void) { - int pin, ioapic, irq, irq_entry; + int pin, ioapic = 0, irq, irq_entry; struct irq_desc *desc; - struct irq_cfg *cfg; const struct cpumask *mask; if (skip_ioapic_setup == 1) return; - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - - /* setup_IO_APIC_irqs could fail to get vector for some device - * when you have too many devices, because at that time only boot - * cpu is online. - */ - desc = irq_to_desc(irq); - cfg = desc->chip_data; - if (!cfg->vector) { - setup_IO_APIC_irq(ioapic, pin, irq, desc, - irq_trigger(irq_entry), - irq_polarity(irq_entry)); - continue; +#ifdef CONFIG_ACPI + if (!acpi_disabled && acpi_ioapic) { + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + ioapic = 0; + } +#endif - } + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); - /* - * Honour affinities which have been set in early boot - */ - if (desc->status & - (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->affinity; - else - mask = apic->target_cpus(); + desc = irq_to_desc(irq); - if (intr_remapping_enabled) - set_ir_ioapic_affinity_irq_desc(desc, mask); - else - set_ioapic_affinity_irq_desc(desc, mask); - } + /* + * Honour affinities which have been set in early boot + */ + if (desc->status & + (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) + mask = desc->affinity; + else + mask = apic->target_cpus(); + if (intr_remapping_enabled) + set_ir_ioapic_affinity_irq_desc(desc, mask); + else + set_ioapic_affinity_irq_desc(desc, mask); } + } #endif -- cgit v1.2.3 From 61fe91e1319556f32bebfd7ed2c68ef02e2c17f7 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 9 May 2009 23:47:42 -0700 Subject: x86: apic: Check rev 3 fadt correctly for physical_apic bit Impact: fix fadt version checking FADT2_REVISION_ID has value 3 aka rev 3 FADT. So need to use >= instead of >, as other places in the code do. [ Impact: extend scope of APIC boot quirk ] Signed-off-by: Yinghai Lu LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_flat_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 306e5e88fb6..744e6d8af27 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -235,7 +235,7 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) * regardless of how many processors are present (x86_64 ES7000 * is an example). */ - if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { printk(KERN_DEBUG "system APIC only can use physical flat"); return 1; -- cgit v1.2.3 From 3e0c373749d7eb5b354ac0b043f2b2cdf84eefef Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 9 May 2009 23:47:42 -0700 Subject: x86: clean up and fix setup_clear/force_cpu_cap handling setup_force_cpu_cap() only have one user (Xen guest code), but it should not reuse cleared_cpu_cpus, otherwise it will have problems on SMP. Need to have a separate cpu_cpus_set array too, for forced-on flags, beyond the forced-off flags. Also need to setup handling before all cpus caps are combined. [ Impact: fix the forced-set CPU feature flag logic ] Cc: H. Peter Anvin Cc: Jeremy Fitzhardinge Cc: Rusty Russell Signed-off-by: Yinghai Lu LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c4f667896c2..e7fd5c4935a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -292,7 +292,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_set[NCAPINTS] __cpuinitdata; void load_percpu_segment(int cpu) { @@ -806,6 +807,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #endif init_hypervisor(c); + + /* + * Clear/Set all flags overriden by options, need do it + * before following smp all cpus cap AND. + */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -818,10 +829,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } - /* Clear all flags overriden by options */ - for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] &= ~cleared_cpu_caps[i]; - #ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ mcheck_init(c); -- cgit v1.2.3 From 80989ce0643c1034822f3e339ed8d790b649abe1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 9 May 2009 23:47:42 -0700 Subject: x86: clean up and and print out initial max_pfn_mapped Do this so we can check the range that is mapped before init_memory_mapping(). To be able to print out meaningful info, we first have to fix 64-bit to have max_pfn_mapped assigned before that call. This also unifies the code-path a bit. [ Impact: print more debug info, cleanup ] Signed-off-by: Yinghai Lu LKML-Reference: <49BF0978.40605@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0d77e56e821..4031d6cb3ff 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -862,12 +862,16 @@ void __init setup_arch(char **cmdline_p) max_low_pfn = max_pfn; high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; #endif #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION setup_bios_corruption_check(); #endif + printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", + max_pfn_mapped< Date: Sat, 2 May 2009 10:40:57 -0700 Subject: x86: read apic ID in the !acpi_lapic case Ed found that on 32-bit, boot_cpu_physical_apicid is not read right, when the mptable is broken. Interestingly, actually three paths use/set it: 1. acpi: at that time that is already read from reg 2. mptable: only read from mptable 3. no madt, and no mptable, that use default apic id 0 for 64-bit, -1 for 32-bit so we could read the apic id for the 2/3 path. We trust the hardware register more than we trust a BIOS data structure (the mptable). We can also avoid the double set_fixmap() when acpi_lapic is used, and also need to move cpu_has_apic earlier and call apic_disable(). Also when need to update the apic id, we'd better read and set the apic version as well - so that quirks are applied precisely. v2: make path 3 with 64bit, use -1 as apic id, so could read it later. v3: fix whitespace problem pointed out by Ed Swierk [ Impact: get correct apic id for bsp other than acpi path ] Reported-by: Ed Swierk Signed-off-by: Yinghai Lu Acked-by: Cyrill Gorcunov LKML-Reference: <49FC85A9.2070702@kernel.org> [ v4: sanity-check in the ACPI case too ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e258bedce7c..1ee966f4ae9 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1456,7 +1456,6 @@ static int __init detect_init_APIC(void) } mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - boot_cpu_physical_apicid = 0; return 0; } #else @@ -1570,6 +1569,8 @@ void __init early_init_lapic_mapping(void) */ void __init init_apic_mappings(void) { + unsigned int new_apicid; + if (x2apic_mode) { boot_cpu_physical_apicid = read_apic_id(); return; @@ -1586,21 +1587,31 @@ void __init init_apic_mappings(void) } else apic_phys = mp_lapic_addr; - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + /* lets check if we may NOP'ify apic operations */ + if (!cpu_has_apic) { + pr_info("APIC: disable apic facility\n"); + apic_disable(); + return; + } + + /* + * acpi lapic path already maps that address in + * acpi_register_lapic_address() + */ + if (!acpi_lapic) + set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", APIC_BASE, apic_phys); - /* * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). */ - if (boot_cpu_physical_apicid == -1U) - boot_cpu_physical_apicid = read_apic_id(); - - /* lets check if we may to NOP'ify apic operations */ - if (!cpu_has_apic) { - pr_info("APIC: disable apic facility\n"); - apic_disable(); + new_apicid = read_apic_id(); + if (boot_cpu_physical_apicid != new_apicid) { + boot_cpu_physical_apicid = new_apicid; + apic_version[new_apicid] = + GET_APIC_VERSION(apic_read(APIC_LVR)); } } -- cgit v1.2.3 From 917a0153621572e88aeb2d5df025ad2e81027287 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 21:36:16 -0700 Subject: x86: mtrr: Fix high_width computation when phys-addr is >= 44bit found one system where cpu address line is 44bits, mtrr printout is not right: [ 0.000000] MTRR variable ranges enabled: [ 0.000000] 0 base 0 00000000 mask FF0 00000000 write-back [ 0.000000] 1 base 10 00000000 mask FFF 80000000 write-back [ 0.000000] 2 base 0 80000000 mask FFF 80000000 uncachable [ 0.000000] 3 base 0 7F800000 mask FFF FF800000 uncachable Li Zefan and Frederic pointed out the high_width could be -4 some how. It turns out when phys_addr is 44bit, size_or_mask will be ffffffff,00000000 so ffs(size_or_mask) will be 0. Try to check low 32 bit, to get correct high_width. Signed-off-by: Yinghai Lu Also-analyzed-by: Frederic Weisbecker Also-analyzed-by: Li Zefan Cc: Jeremy Fitzhardinge Cc: Zhaolei Cc: Steven Rostedt Cc: Vegard Nossum Cc: Andrew Morton LKML-Reference: <4A026540.8060504@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0b776c09aff..d21d4fb161f 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -275,7 +275,11 @@ static void __init print_mtrr_state(void) } printk(KERN_DEBUG "MTRR variable ranges %sabled:\n", mtrr_state.enabled & 2 ? "en" : "dis"); - high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4; + if (size_or_mask & 0xffffffffUL) + high_width = ffs(size_or_mask & 0xffffffffUL) - 1; + else + high_width = ffs(size_or_mask>>32) + 32 - 1; + high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n", -- cgit v1.2.3 From 8823392360dc4992f87bf4c623834d315f297493 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sun, 10 May 2009 10:53:05 +0200 Subject: perf_counter, x86: clean up throttling printk s/PERFMON/perfcounters for perfcounter interrupt throttling warning. 'perfmon' is the CPU feature name that is Intel-only, while we do throttling in a generic way. [ Impact: cleanup ] Signed-off-by: Mike Galbraith Cc: Robert Richter Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a6878b0798e..da27419923a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -814,7 +814,7 @@ void perf_counter_unthrottle(void) cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) - printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n"); + printk(KERN_WARNING "perfcounters: max interrupts exceeded!\n"); hw_perf_restore(cpuc->throttle_ctrl); } cpuc->interrupts = 0; -- cgit v1.2.3 From 97a52714658cd959a3cfa35c5b6f489859f0204b Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 8 May 2009 18:23:50 +0200 Subject: x86: display extended apic registers with print_local_APIC and cpu_debug code Both print_local_APIC (used when apic=debug kernel param is set) and cpu_debug code missed support for some extended APIC registers that I'd like to see. This adds support to show: - extended APIC feature register - extended APIC control register - extended LVT registers [ Impact: print more debug info ] Signed-off-by: Andreas Herrmann Cc: Jaswinder Singh Rajput Cc: Cyrill Gorcunov LKML-Reference: <20090508162350.GO29045@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/io_apic.c | 14 +++++++++++++- arch/x86/kernel/cpu/cpu_debug.c | 14 +++++++++++++- 3 files changed, 27 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 1ee966f4ae9..0e6543fafb5 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -395,7 +395,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) { - unsigned long reg = (lvt_off << 4) + APIC_EILVT0; + unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0); unsigned int v = (mask << 16) | (msg_type << 8) | vector; apic_write(reg, v); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2afe145d277..65b824c9c4f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1739,7 +1739,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base) __apicdebuginit(void) print_local_APIC(void *dummy) { - unsigned int v, ver, maxlvt; + unsigned int i, v, ver, maxlvt; u64 icr; if (apic_verbosity == APIC_QUIET) @@ -1827,6 +1827,18 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); v = apic_read(APIC_TDCR); printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + + if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { + v = apic_read(APIC_EFEAT); + maxlvt = (v >> 16) & 0xff; + printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v); + v = apic_read(APIC_ECTRL); + printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v); + for (i = 0; i < maxlvt; i++) { + v = apic_read(APIC_EILVTn(i)); + printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); + } + } printk("\n"); } diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 46e29ab96c6..2fc4f6bb9ca 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -588,8 +588,20 @@ static void print_apic(void *arg) seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); -#endif /* CONFIG_X86_LOCAL_APIC */ + if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { + unsigned int i, v, maxeilvt; + + v = apic_read(APIC_EFEAT); + maxeilvt = (v >> 16) & 0xff; + seq_printf(seq, " EFEAT\t\t: %08x\n", v); + seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL)); + for (i = 0; i < maxeilvt; i++) { + v = apic_read(APIC_EILVTn(i)); + seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v); + } + } +#endif /* CONFIG_X86_LOCAL_APIC */ seq_printf(seq, "\n MSR\t:\n"); } -- cgit v1.2.3 From cec6be6d1069d697beb490bbb40a290d5ff554a2 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 11 May 2009 17:41:40 +0400 Subject: x86: apic: Fixmap apic address even if apic disabled In case if apic were disabled by boot option we still need read_apic operation. So fixmap a fake apic area if needed. [ Impact: fix boot crash ] Signed-off-by: Cyrill Gorcunov Cc: yinghai@kernel.org Cc: eswierk@aristanetworks.com LKML-Reference: <20090511134140.GH4624@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0e6543fafb5..07cffc1214c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1587,13 +1587,6 @@ void __init init_apic_mappings(void) } else apic_phys = mp_lapic_addr; - /* lets check if we may NOP'ify apic operations */ - if (!cpu_has_apic) { - pr_info("APIC: disable apic facility\n"); - apic_disable(); - return; - } - /* * acpi lapic path already maps that address in * acpi_register_lapic_address() @@ -1602,7 +1595,15 @@ void __init init_apic_mappings(void) set_fixmap_nocache(FIX_APIC_BASE, apic_phys); apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", - APIC_BASE, apic_phys); + APIC_BASE, apic_phys); + + /* lets check if we may NOP'ify apic operations */ + if (!cpu_has_apic) { + pr_info("APIC: disable apic facility\n"); + apic_disable(); + return; + } + /* * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). -- cgit v1.2.3 From 0c23590f00f85467b318ad0c20c36796a5bd4c60 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 4 May 2009 03:30:15 +0400 Subject: x86, 64-bit: ifdef out struct thread_struct::ip struct thread_struct::ip isn't used on x86_64, struct pt_regs::ip is used instead. kgdb should be reading 0 always, but I can't check it. [ Impact: (potentially) reduce thread_struct size on 64-bit ] Signed-off-by: Alexey Dobriyan Cc: containers@lists.linux-foundation.org LKML-Reference: <20090503233015.GJ16631@x200.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kgdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index eedfaebe106..d07706f1976 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -141,7 +141,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); gdb_regs32[GDB_CS] = __KERNEL_CS; gdb_regs32[GDB_SS] = __KERNEL_DS; - gdb_regs[GDB_PC] = p->thread.ip; + gdb_regs[GDB_PC] = 0; gdb_regs[GDB_R8] = 0; gdb_regs[GDB_R9] = 0; gdb_regs[GDB_R10] = 0; -- cgit v1.2.3 From 37ba7ab5e33cebc25c68fffe33e9f21e7c2014e8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 11 May 2009 15:56:08 -0700 Subject: x86, boot: make kernel_alignment adjustable; new bzImage fields Make the kernel_alignment field adjustable; this allows us to set it to a large value (intended to be 16 MB to avoid ZONE_DMA contention, memory holes and other weirdness) while a smart bootloader can still force a loading at a lesser alignment if absolutely necessary. Also export pref_address (preferred loading address, corresponding to the link-time address) and init_size, the total amount of linear memory the kernel will require during initialization. [ Impact: allows better kernel placement, gives bootloader more info ] Signed-off-by: H. Peter Anvin --- arch/x86/kernel/asm-offsets_32.c | 1 + arch/x86/kernel/asm-offsets_64.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 5a6aa1c1162..1a830cbd701 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -146,4 +146,5 @@ void foo(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e72f062fb4b..898ecc47e12 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -125,6 +125,7 @@ int main(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); BLANK(); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); -- cgit v1.2.3 From 5031296c57024a78ddad4edfc993367dbf4abb98 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 7 May 2009 16:54:11 -0700 Subject: x86: add extension fields for bootloader type and version A long ago, in days of yore, it all began with a god named Thor. There were vikings and boats and some plans for a Linux kernel header. Unfortunately, a single 8-bit field was used for bootloader type and version. This has generally worked without *too* much pain, but we're getting close to flat running out of ID fields. Add extension fields for both type and version. The type will be extended if it the old field is 0xE; the version is a simple MSB extension. Keep /proc/sys/kernel/bootloader_type containing (type << 4) + (ver & 0xf) for backwards compatiblity, but also add /proc/sys/kernel/bootloader_version which contains the full version number. [ Impact: new feature to support more bootloaders ] Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4158439bf6..2b093451aec 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -214,8 +214,8 @@ unsigned long mmu_cr4_features; unsigned long mmu_cr4_features = X86_CR4_PAE; #endif -/* Boot loader ID as an integer, for the benefit of proc_dointvec */ -int bootloader_type; +/* Boot loader ID and version as integers, for the benefit of proc_dointvec */ +int bootloader_type, bootloader_version; /* * Setup options @@ -706,6 +706,12 @@ void __init setup_arch(char **cmdline_p) #endif saved_video_mode = boot_params.hdr.vid_mode; bootloader_type = boot_params.hdr.type_of_loader; + if ((bootloader_type >> 4) == 0xe) { + bootloader_type &= 0xf; + bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; + } + bootloader_version = bootloader_type & 0xf; + bootloader_version |= boot_params.hdr.ext_loader_ver << 4; #ifdef CONFIG_BLK_DEV_RAM rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; -- cgit v1.2.3 From 871b72dd1e12afc3f024479531d25a9339d2e3f9 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 11 May 2009 23:48:27 +0200 Subject: x86: microcode: use smp_call_function_single instead of set_cpus_allowed, cleanup of synchronization logic * Solve issues described in 6f66cbc63081fd70e3191b4dbb796746780e5ae1 in a way that doesn't resort to set_cpus_allowed(); * in fact, only collect_cpu_info and apply_microcode callbacks must run on a target cpu, others will do just fine on any other. smp_call_function_single() (as suggested by Ingo) is used to run these callbacks on a target cpu. * cleanup of synchronization logic of the 'microcode_core' part The generic 'microcode_core' part guarantees that only a single cpu (be it a full-fledged cpu, one of the cores or HT) is being updated at any particular moment of time. In general, there is no need for any additional sync. mechanism in arch-specific parts (the patch removes existing spinlocks). See also the "Synchronization" section in microcode_core.c. * return -EINVAL instead of -1 (which is translated into -EPERM) in microcode_write(), reload_cpu() and mc_sysdev_add(). Other suggestions for an error code? * use 'enum ucode_state' as return value of request_microcode_{fw, user} to gain more flexibility by distinguishing between real error cases and situations when an appropriate ucode was not found (which is not an error per-se). * some minor cleanups Thanks a lot to Hugh Dickins for review/suggestions/testing! Reference: http://marc.info/?l=linux-kernel&m=124025889012541&w=2 [ Impact: refactor and clean up microcode driver locking code ] Signed-off-by: Dmitry Adamushko Acked-by: Hugh Dickins Cc: Andrew Morton Cc: Rusty Russell Cc: Andreas Herrmann Cc: Peter Oruba Cc: Arjan van de Ven LKML-Reference: <1242078507.5560.9.camel@earth> [ did some more cleanups ] Signed-off-by: Ingo Molnar arch/x86/include/asm/microcode.h | 25 ++ arch/x86/kernel/microcode_amd.c | 58 ++---- arch/x86/kernel/microcode_core.c | 326 +++++++++++++++++++++----------------- arch/x86/kernel/microcode_intel.c | 92 +++------- 4 files changed, 261 insertions(+), 240 deletions(-) (~20 new comment lines) --- arch/x86/kernel/microcode_amd.c | 58 +++---- arch/x86/kernel/microcode_core.c | 329 ++++++++++++++++++++++---------------- arch/x86/kernel/microcode_intel.c | 90 ++++------- 3 files changed, 244 insertions(+), 233 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 453b5795a5c..c8be20f1644 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -13,25 +13,13 @@ * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. */ -#include -#include -#include #include -#include -#include #include #include #include #include #include -#include -#include -#include -#include -#include #include -#include -#include #include #include @@ -79,9 +67,6 @@ struct microcode_amd { #define UCODE_CONTAINER_SECTION_HDR 8 #define UCODE_CONTAINER_HEADER_SIZE 12 -/* serialize access to the physical write */ -static DEFINE_SPINLOCK(microcode_update_lock); - static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) @@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev) return 1; } -static void apply_microcode_amd(int cpu) +static int apply_microcode_amd(int cpu) { - unsigned long flags; u32 rev, dummy; int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; @@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu) BUG_ON(cpu_num != cpu); if (mc_amd == NULL) - return; + return 0; - spin_lock_irqsave(µcode_update_lock, flags); wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); /* get patch id after patching */ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - spin_unlock_irqrestore(µcode_update_lock, flags); /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { printk(KERN_ERR "microcode: CPU%d: update failed " "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); - return; + return -1; } printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); uci->cpu_sig.rev = rev; + + return 0; } static int get_ucode_data(void *to, const u8 *from, size_t n) @@ -263,7 +247,8 @@ static void free_equiv_cpu_table(void) } } -static int generic_load_microcode(int cpu, const u8 *data, size_t size) +static enum ucode_state +generic_load_microcode(int cpu, const u8 *data, size_t size) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; const u8 *ucode_ptr = data; @@ -272,12 +257,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) int new_rev = uci->cpu_sig.rev; unsigned int leftover; unsigned long offset; + enum ucode_state state = UCODE_OK; offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { printk(KERN_ERR "microcode: failed to create " "equivalent cpu table\n"); - return -EINVAL; + return UCODE_ERROR; } ucode_ptr += offset; @@ -312,28 +298,27 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) pr_debug("microcode: CPU%d found a matching microcode " "update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); - } else + } else { vfree(new_mc); - } + state = UCODE_ERROR; + } + } else + state = UCODE_NFOUND; free_equiv_cpu_table(); - return (int)leftover; + return state; } -static int request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_fw(int cpu, struct device *device) { const char *fw_name = "amd-ucode/microcode_amd.bin"; const struct firmware *firmware; - int ret; - - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); + enum ucode_state ret; - ret = request_firmware(&firmware, fw_name, device); - if (ret) { + if (request_firmware(&firmware, fw_name, device)) { printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); - return ret; + return UCODE_NFOUND; } ret = generic_load_microcode(cpu, firmware->data, firmware->size); @@ -343,11 +328,12 @@ static int request_microcode_fw(int cpu, struct device *device) return ret; } -static int request_microcode_user(int cpu, const void __user *buf, size_t size) +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) { printk(KERN_INFO "microcode: AMD microcode update via " "/dev/cpu/microcode not supported\n"); - return -1; + return UCODE_ERROR; } static void microcode_fini_cpu_amd(int cpu) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 98c470c069d..9c4461501fc 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -71,27 +71,18 @@ * Thanks to Stuart Swales for pointing out this bug. */ #include -#include #include -#include +#include #include -#include -#include -#include -#include #include #include #include -#include -#include -#include #include #include #include #include #include -#include MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); @@ -101,36 +92,110 @@ MODULE_LICENSE("GPL"); static struct microcode_ops *microcode_ops; -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +/* + * Synchronization. + * + * All non cpu-hotplug-callback call sites use: + * + * - microcode_mutex to synchronize with each other; + * - get/put_online_cpus() to synchronize with + * the cpu-hotplug-callback call sites. + * + * We guarantee that only a single cpu is being + * updated at any particular moment of time. + */ static DEFINE_MUTEX(microcode_mutex); struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; EXPORT_SYMBOL_GPL(ucode_cpu_info); +/* + * Operations that are run on a target cpu: + */ + +struct cpu_info_ctx { + struct cpu_signature *cpu_sig; + int err; +}; + +static void collect_cpu_info_local(void *arg) +{ + struct cpu_info_ctx *ctx = arg; + + ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(), + ctx->cpu_sig); +} + +static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig) +{ + struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + +static int collect_cpu_info(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + int ret; + + memset(uci, 0, sizeof(*uci)); + + ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig); + if (!ret) + uci->valid = 1; + + return ret; +} + +struct apply_microcode_ctx { + int err; +}; + +static void apply_microcode_local(void *arg) +{ + struct apply_microcode_ctx *ctx = arg; + + ctx->err = microcode_ops->apply_microcode(smp_processor_id()); +} + +static int apply_microcode_on_target(int cpu) +{ + struct apply_microcode_ctx ctx = { .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + #ifdef CONFIG_MICROCODE_OLD_INTERFACE static int do_microcode_update(const void __user *buf, size_t size) { - cpumask_t old; int error = 0; int cpu; - old = current->cpus_allowed; - for_each_online_cpu(cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; if (!uci->valid) continue; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - error = microcode_ops->request_microcode_user(cpu, buf, size); - if (error < 0) - goto out; - if (!error) - microcode_ops->apply_microcode(cpu); + ustate = microcode_ops->request_microcode_user(cpu, buf, size); + if (ustate == UCODE_ERROR) { + error = -1; + break; + } else if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); } -out: - set_cpus_allowed_ptr(current, &old); + return error; } @@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2) static ssize_t microcode_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { - ssize_t ret; + ssize_t ret = -EINVAL; if ((len >> PAGE_SHIFT) > num_physpages) { - printk(KERN_ERR "microcode: too much data (max %ld pages)\n", - num_physpages); - return -EINVAL; + pr_err("microcode: too much data (max %ld pages)\n", num_physpages); + return ret; } get_online_cpus(); mutex_lock(µcode_mutex); - ret = do_microcode_update(buf, len); - if (!ret) + if (do_microcode_update(buf, len) == 0) ret = (ssize_t)len; mutex_unlock(µcode_mutex); @@ -165,15 +228,15 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, } static const struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .open = microcode_open, + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, }; static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .fops = µcode_fops, + .minor = MICROCODE_MINOR, + .name = "microcode", + .fops = µcode_fops, }; static int __init microcode_dev_init(void) @@ -182,9 +245,7 @@ static int __init microcode_dev_init(void) error = misc_register(µcode_dev); if (error) { - printk(KERN_ERR - "microcode: can't misc_register on minor=%d\n", - MICROCODE_MINOR); + pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); return error; } @@ -205,42 +266,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); /* fake device for request_firmware */ static struct platform_device *microcode_pdev; -static long reload_for_cpu(void *unused) +static int reload_for_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; int err = 0; mutex_lock(µcode_mutex); if (uci->valid) { - err = microcode_ops->request_microcode_fw(smp_processor_id(), - µcode_pdev->dev); - if (!err) - microcode_ops->apply_microcode(smp_processor_id()); + enum ucode_state ustate; + + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); + if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); + else + if (ustate == UCODE_ERROR) + err = -EINVAL; } mutex_unlock(µcode_mutex); + return err; } static ssize_t reload_store(struct sys_device *dev, struct sysdev_attribute *attr, - const char *buf, size_t sz) + const char *buf, size_t size) { - char *end; - unsigned long val = simple_strtoul(buf, &end, 0); - int err = 0; + unsigned long val; int cpu = dev->id; + int ret = 0; + char *end; + val = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; + if (val == 1) { get_online_cpus(); if (cpu_online(cpu)) - err = work_on_cpu(cpu, reload_for_cpu, NULL); + ret = reload_for_cpu(cpu); put_online_cpus(); } - if (err) - return err; - return sz; + + if (!ret) + ret = size; + + return ret; } static ssize_t version_show(struct sys_device *dev, @@ -271,11 +341,11 @@ static struct attribute *mc_default_attrs[] = { }; static struct attribute_group mc_attr_group = { - .attrs = mc_default_attrs, - .name = "microcode", + .attrs = mc_default_attrs, + .name = "microcode", }; -static void __microcode_fini_cpu(int cpu) +static void microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -283,103 +353,68 @@ static void __microcode_fini_cpu(int cpu) uci->valid = 0; } -static void microcode_fini_cpu(int cpu) -{ - mutex_lock(µcode_mutex); - __microcode_fini_cpu(cpu); - mutex_unlock(µcode_mutex); -} - -static void collect_cpu_info(int cpu) +static enum ucode_state microcode_resume_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - memset(uci, 0, sizeof(*uci)); - if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) - uci->valid = 1; + if (!uci->mc) + return UCODE_NFOUND; + + pr_debug("microcode: CPU%d updated upon resume\n", cpu); + apply_microcode_on_target(cpu); + + return UCODE_OK; } -static int microcode_resume_cpu(int cpu) +static enum ucode_state microcode_init_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - struct cpu_signature nsig; + enum ucode_state ustate; - pr_debug("microcode: CPU%d resumed\n", cpu); + if (collect_cpu_info(cpu)) + return UCODE_ERROR; - if (!uci->mc) - return 1; + /* --dimm. Trigger a delayed update? */ + if (system_state != SYSTEM_RUNNING) + return UCODE_NFOUND; - /* - * Let's verify that the 'cached' ucode does belong - * to this cpu (a bit of paranoia): - */ - if (microcode_ops->collect_cpu_info(cpu, &nsig)) { - __microcode_fini_cpu(cpu); - printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n", - cpu); - return -1; - } + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); - if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) { - __microcode_fini_cpu(cpu); - printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n", - cpu); - /* Should we look for a new ucode here? */ - return 1; + if (ustate == UCODE_OK) { + pr_debug("microcode: CPU%d updated upon init\n", cpu); + apply_microcode_on_target(cpu); } - return 0; + return ustate; } -static long microcode_update_cpu(void *unused) +static enum ucode_state microcode_update_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); - int err = 0; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; - /* - * Check if the system resume is in progress (uci->valid != NULL), - * otherwise just request a firmware: - */ - if (uci->valid) { - err = microcode_resume_cpu(smp_processor_id()); - } else { - collect_cpu_info(smp_processor_id()); - if (uci->valid && system_state == SYSTEM_RUNNING) - err = microcode_ops->request_microcode_fw( - smp_processor_id(), - µcode_pdev->dev); - } - if (!err) - microcode_ops->apply_microcode(smp_processor_id()); - return err; -} + if (uci->valid) + ustate = microcode_resume_cpu(cpu); + else + ustate = microcode_init_cpu(cpu); -static int microcode_init_cpu(int cpu) -{ - int err; - mutex_lock(µcode_mutex); - err = work_on_cpu(cpu, microcode_update_cpu, NULL); - mutex_unlock(µcode_mutex); - - return err; + return ustate; } static int mc_sysdev_add(struct sys_device *sys_dev) { int err, cpu = sys_dev->id; - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; if (!cpu_online(cpu)) return 0; pr_debug("microcode: CPU%d added\n", cpu); - memset(uci, 0, sizeof(*uci)); err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); if (err) return err; - err = microcode_init_cpu(cpu); + if (microcode_init_cpu(cpu) == UCODE_ERROR) + err = -EINVAL; return err; } @@ -400,19 +435,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) static int mc_sysdev_resume(struct sys_device *dev) { int cpu = dev->id; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; if (!cpu_online(cpu)) return 0; - /* only CPU 0 will apply ucode here */ - microcode_update_cpu(NULL); + /* + * All non-bootup cpus are still disabled, + * so only CPU 0 will apply ucode here. + * + * Moreover, there can be no concurrent + * updates from any other places at this point. + */ + WARN_ON(cpu != 0); + + if (uci->valid && uci->mc) + microcode_ops->apply_microcode(cpu); + return 0; } static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, - .resume = mc_sysdev_resume, + .add = mc_sysdev_add, + .remove = mc_sysdev_remove, + .resume = mc_sysdev_resume, }; static __cpuinit int @@ -425,15 +471,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - if (microcode_init_cpu(cpu)) - printk(KERN_ERR "microcode: failed to init CPU%d\n", - cpu); + microcode_update_cpu(cpu); case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: pr_debug("microcode: CPU%d added\n", cpu); if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) - printk(KERN_ERR "microcode: Failed to create the sysfs " - "group for CPU%d\n", cpu); + pr_err("microcode: Failed to create group for CPU%d\n", cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: @@ -465,13 +508,10 @@ static int __init microcode_init(void) microcode_ops = init_amd_microcode(); if (!microcode_ops) { - printk(KERN_ERR "microcode: no support for this CPU vendor\n"); + pr_err("microcode: no support for this CPU vendor\n"); return -ENODEV; } - error = microcode_dev_init(); - if (error) - return error; microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0); if (IS_ERR(microcode_pdev)) { @@ -480,23 +520,31 @@ static int __init microcode_init(void) } get_online_cpus(); + mutex_lock(µcode_mutex); + error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + + mutex_unlock(µcode_mutex); put_online_cpus(); + if (error) { - microcode_dev_exit(); platform_device_unregister(microcode_pdev); return error; } + error = microcode_dev_init(); + if (error) + return error; + register_hotcpu_notifier(&mc_cpu_notifier); - printk(KERN_INFO - "Microcode Update Driver: v" MICROCODE_VERSION + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " ," " Peter Oruba\n"); return 0; } +module_init(microcode_init); static void __exit microcode_exit(void) { @@ -505,16 +553,17 @@ static void __exit microcode_exit(void) unregister_hotcpu_notifier(&mc_cpu_notifier); get_online_cpus(); + mutex_lock(µcode_mutex); + sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + + mutex_unlock(µcode_mutex); put_online_cpus(); platform_device_unregister(microcode_pdev); microcode_ops = NULL; - printk(KERN_INFO - "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); } - -module_init(microcode_init); module_exit(microcode_exit); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 149b9ec7c1a..0d334ddd0a9 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -70,24 +70,11 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ -#include -#include -#include #include -#include -#include -#include #include -#include #include #include -#include -#include -#include -#include -#include -#include -#include +#include #include #include @@ -150,13 +137,9 @@ struct extended_sigtable { #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) -/* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); - static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); - unsigned long flags; unsigned int val[2]; memset(csig, 0, sizeof(*csig)); @@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) csig->pf = 1 << ((val[1] >> 18) & 7); } - /* serialize access to the physical write to MSR 0x79 */ - spin_lock_irqsave(µcode_update_lock, flags); - wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* see notes above for revision 1.07. Apparent chip bug */ sync_core(); /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); - spin_unlock_irqrestore(µcode_update_lock, flags); - pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", - csig->sig, csig->pf, csig->rev); + printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", + cpu_num, csig->sig, csig->pf, csig->rev); return 0; } @@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev) return 0; } -static void apply_microcode(int cpu) +static int apply_microcode(int cpu) { struct microcode_intel *mc_intel; struct ucode_cpu_info *uci; - unsigned long flags; unsigned int val[2]; int cpu_num; @@ -334,10 +312,7 @@ static void apply_microcode(int cpu) BUG_ON(cpu_num != cpu); if (mc_intel == NULL) - return; - - /* serialize access to the physical write to MSR 0x79 */ - spin_lock_irqsave(µcode_update_lock, flags); + return 0; /* write microcode via MSR 0x79 */ wrmsr(MSR_IA32_UCODE_WRITE, @@ -351,30 +326,32 @@ static void apply_microcode(int cpu) /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - spin_unlock_irqrestore(µcode_update_lock, flags); if (val[1] != mc_intel->hdr.rev) { - printk(KERN_ERR "microcode: CPU%d update from revision " - "0x%x to 0x%x failed\n", - cpu_num, uci->cpu_sig.rev, val[1]); - return; + printk(KERN_ERR "microcode: CPU%d update " + "to revision 0x%x failed\n", + cpu_num, mc_intel->hdr.rev); + return -1; } - printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x, date = %04x-%02x-%02x \n", - cpu_num, uci->cpu_sig.rev, val[1], + printk(KERN_INFO "microcode: CPU%d updated to revision " + "0x%x, date = %04x-%02x-%02x \n", + cpu_num, val[1], mc_intel->hdr.date & 0xffff, mc_intel->hdr.date >> 24, (mc_intel->hdr.date >> 16) & 0xff); uci->cpu_sig.rev = val[1]; + + return 0; } -static int generic_load_microcode(int cpu, void *data, size_t size, - int (*get_ucode_data)(void *, const void *, size_t)) +static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, + int (*get_ucode_data)(void *, const void *, size_t)) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; u8 *ucode_ptr = data, *new_mc = NULL, *mc; int new_rev = uci->cpu_sig.rev; unsigned int leftover = size; + enum ucode_state state = UCODE_OK; while (leftover) { struct microcode_header_intel mc_header; @@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size, leftover -= mc_size; } - if (!new_mc) + if (leftover) { + if (new_mc) + vfree(new_mc); + state = UCODE_ERROR; goto out; + } - if (leftover) { - vfree(new_mc); + if (!new_mc) { + state = UCODE_NFOUND; goto out; } @@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size, pr_debug("microcode: CPU%d found a matching microcode update with" " version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); - - out: - return (int)leftover; +out: + return state; } static int get_ucode_fw(void *to, const void *from, size_t n) @@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n) return 0; } -static int request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_fw(int cpu, struct device *device) { char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; - int ret; + enum ucode_state ret; - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); sprintf(name, "intel-ucode/%02x-%02x-%02x", c->x86, c->x86_model, c->x86_mask); - ret = request_firmware(&firmware, name, device); - if (ret) { + + if (request_firmware(&firmware, name, device)) { pr_debug("microcode: data file %s load failed\n", name); - return ret; + return UCODE_NFOUND; } ret = generic_load_microcode(cpu, (void *)firmware->data, @@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n) return copy_from_user(to, from, n); } -static int request_microcode_user(int cpu, const void __user *buf, size_t size) +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) { - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); - return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); } -- cgit v1.2.3 From 9d62dcdfa6f6fc843f7d9b494bcd48f00b94f883 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Mon, 11 May 2009 22:05:28 -0400 Subject: x86: merge process.c a bit Merge arch_align_stack() and arch_randomize_brk(), since they are the same. Tested on x86_64. [ Impact: cleanup ] Signed-off-by: Amerigo Wang Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 14 ++++++++++++++ arch/x86/kernel/process_32.c | 13 ------------- arch/x86/kernel/process_64.c | 13 ------------- 3 files changed, 14 insertions(+), 26 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ca989158e84..2b9a8d0fb47 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -613,3 +614,16 @@ static int __init idle_setup(char *str) } early_param("idle", idle_setup); +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} + diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 76f8f84043a..a3bb049ad08 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -497,15 +496,3 @@ unsigned long get_wchan(struct task_struct *p) return 0; } -unsigned long arch_align_stack(unsigned long sp) -{ - if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; - return sp & ~0xf; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; -} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b751a41392b..34386f72bdc 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -660,15 +659,3 @@ long sys_arch_prctl(int code, unsigned long addr) return do_arch_prctl(current, code, addr); } -unsigned long arch_align_stack(unsigned long sp) -{ - if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; - return sp & ~0xf; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; -} -- cgit v1.2.3 From bf78ad69cd351798b9447a269c6bd41ce1f111f4 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Mon, 11 May 2009 23:29:09 -0400 Subject: x86: process.c, remove useless headers is not needed by these files, remove them. [ Impact: cleanup ] Signed-off-by: WANG Cong Cc: akpm@linux-foundation.org LKML-Reference: <20090512032956.5040.77055.sendpatchset@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 2 -- arch/x86/kernel/process_64.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index a3bb049ad08..56d50b7d71d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -9,8 +9,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include - #include #include #include diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 34386f72bdc..9d6b20e6cd8 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -14,8 +14,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include - #include #include #include -- cgit v1.2.3 From 4797f6b021a3fa399942245d07a1feb30df81bb8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 2 May 2009 10:40:57 -0700 Subject: x86: read apic ID in the !acpi_lapic case Ed found that on 32-bit, boot_cpu_physical_apicid is not read right, when the mptable is broken. Interestingly, actually three paths use/set it: 1. acpi: at that time that is already read from reg 2. mptable: only read from mptable 3. no madt, and no mptable, that use default apic id 0 for 64-bit, -1 for 32-bit so we could read the apic id for the 2/3 path. We trust the hardware register more than we trust a BIOS data structure (the mptable). We can also avoid the double set_fixmap() when acpi_lapic is used, and also need to move cpu_has_apic earlier and call apic_disable(). Also when need to update the apic id, we'd better read and set the apic version as well - so that quirks are applied precisely. v2: make path 3 with 64bit, use -1 as apic id, so could read it later. v3: fix whitespace problem pointed out by Ed Swierk v5: fix boot crash [ Impact: get correct apic id for bsp other than acpi path ] Reported-by: Ed Swierk Signed-off-by: Yinghai Lu Acked-by: Cyrill Gorcunov LKML-Reference: <49FC85A9.2070702@kernel.org> [ v4: sanity-check in the ACPI case too ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 46 ++++++++++++++++++++---------------------- arch/x86/kernel/apic/io_apic.c | 5 +++++ 2 files changed, 27 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 07cffc1214c..b0fd26442c4 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -242,17 +242,24 @@ static int modern_apic(void) * bare function to substitute write operation * and it's _that_ fast :) */ -void native_apic_write_dummy(u32 reg, u32 v) +static void native_apic_write_dummy(u32 reg, u32 v) { WARN_ON_ONCE((cpu_has_apic || !disable_apic)); } +static u32 native_apic_read_dummy(u32 reg) +{ + WARN_ON_ONCE((cpu_has_apic || !disable_apic)); + return 0; +} + /* - * right after this call apic->write doesn't do anything + * right after this call apic->write/read doesn't do anything * note that there is no restore operation it works one way */ void apic_disable(void) { + apic->read = native_apic_read_dummy; apic->write = native_apic_write_dummy; } @@ -1576,32 +1583,23 @@ void __init init_apic_mappings(void) return; } - /* - * If no local APIC can be found then set up a fake all - * zeroes page to simulate the local APIC and another - * one for the IO-APIC. - */ + /* If no local APIC can be found return early */ if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - apic_phys = __pa(apic_phys); - } else + /* lets NOP'ify apic operations */ + pr_info("APIC: disable apic facility\n"); + apic_disable(); + } else { apic_phys = mp_lapic_addr; - /* - * acpi lapic path already maps that address in - * acpi_register_lapic_address() - */ - if (!acpi_lapic) - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - - apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", - APIC_BASE, apic_phys); + /* + * acpi lapic path already maps that address in + * acpi_register_lapic_address() + */ + if (!acpi_lapic) + set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - /* lets check if we may NOP'ify apic operations */ - if (!cpu_has_apic) { - pr_info("APIC: disable apic facility\n"); - apic_disable(); - return; + apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", + APIC_BASE, apic_phys); } /* diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1f3d3669dae..74d2b480a20 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1878,6 +1878,11 @@ __apicdebuginit(void) print_PIC(void) __apicdebuginit(int) print_all_ICs(void) { print_PIC(); + + /* don't print out if apic is not there */ + if (!cpu_has_apic || disable_apic) + return 0; + print_all_local_APICs(); print_IO_APIC(); -- cgit v1.2.3 From 5bb9efe33ea4001a17ab98186a40a134a3061d67 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 08:12:51 +0200 Subject: perf_counter: fix print debug irq disable inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. bash/15802 [HC0[0]:SC0[0]:HE1:SE1] takes: (sysrq_key_table_lock){?.....}, Don't unconditionally enable interrupts in the perf_counter_print_debug() path. [ Impact: fix potential deadlock pointed out by lockdep ] LKML-Reference: Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- arch/x86/kernel/cpu/perf_counter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index da27419923a..f7772ff7936 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -621,12 +621,13 @@ void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; struct cpu_hw_counters *cpuc; + unsigned long flags; int cpu, idx; if (!x86_pmu.num_counters) return; - local_irq_disable(); + local_irq_save(flags); cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -664,7 +665,7 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); } - local_irq_enable(); + local_irq_restore(flags); } static void x86_pmu_disable(struct perf_counter *counter) -- cgit v1.2.3 From 5c333864a6ba811052d52ef14fbed056b9ac3512 Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Thu, 16 Apr 2009 12:14:37 +0530 Subject: timers: Identifying the existing pinned timers * Arun R Bharadwaj [2009-04-16 12:11:36]: The following pinned hrtimers have been identified and marked: 1)sched_rt_period_timer 2)tick_sched_timer 3)stack_trace_timer_fn [ tglx: fixup the hrtimer pinned mode ] Signed-off-by: Arun R Bharadwaj Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/x2apic_uv_x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 2bda6935297..a9cad1b00d6 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -463,7 +463,7 @@ static void uv_heartbeat(unsigned long ignored) uv_set_scir_bits(bits); /* enable next timer period */ - mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); + mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL); } static void __cpuinit uv_heartbeat_enable(int cpu) -- cgit v1.2.3 From aa512a27e9e8ed32f31b15eec67ab1ceca33839b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 May 2009 13:52:19 -0400 Subject: x86/function-graph: fix constraint for recording old return value After upgrading from gcc 4.2.2 to 4.4.0, the function graph tracer broke. Investigating, I found that in the asm that replaces the return value, gcc was using the same register for the old value as it was for the new value. mov (addr), old mov new, (addr) But if old and new are the same register, we clobber new with old! I first thought this was a bug in gcc 4.4.0 and reported it: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40132 Andrew Pinski responded (quickly), saying that it was correct gcc behavior and the code needed to denote old as an "early clobber". Instead of "=r"(old), we need "=&r"(old). [Impact: keep function graph tracer from breaking with gcc 4.4.0 ] Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 18dfa30795c..b79c5533c42 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -442,7 +442,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) _ASM_EXTABLE(1b, 4b) _ASM_EXTABLE(2b, 4b) - : [old] "=r" (old), [faulted] "=r" (faulted) + : [old] "=&r" (old), [faulted] "=r" (faulted) : [parent] "r" (parent), [return_hooker] "r" (return_hooker) : "memory" ); -- cgit v1.2.3 From ec3232bdf8518bea8410f0027f870b24d3aa8753 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 09:45:19 +0200 Subject: perf_counter: x86: More accurate counter update Take the counter width into account instead of assuming 32 bits. In particular Nehalem has 44 bit wide counters, and all arithmetics should happen on a 44-bit signed integer basis. [ Impact: fix rare event imprecision, warning message on Nehalem ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f7772ff7936..3a92a2b2a80 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -138,7 +138,9 @@ static u64 x86_perf_counter_update(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { - u64 prev_raw_count, new_raw_count, delta; + int shift = 64 - x86_pmu.counter_bits; + u64 prev_raw_count, new_raw_count; + s64 delta; /* * Careful: an NMI might modify the previous counter value. @@ -161,9 +163,10 @@ again: * (counter-)time and add that to the generic counter. * * Careful, not all hw sign-extends above the physical width - * of the count, so we do that by clipping the delta to 32 bits: + * of the count. */ - delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; atomic64_add(delta, &counter->count); atomic64_sub(delta, &hwc->period_left); -- cgit v1.2.3 From f5a5a2f6e69e88647ae12da39f0ff3a510bcf0a6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 13 May 2009 12:54:01 +0200 Subject: perf_counter: x86: Fix throttling If counters are disabled globally when a perfcounter IRQ/NMI hits, and if we throttle in that case, we'll promote the '0' value to the next lapic IRQ and disable all perfcounters at that point, permanently ... Fix it. [ Impact: fix hung perfcounters under load ] Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3a92a2b2a80..88ae8cebf3c 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -765,8 +765,13 @@ out: /* * Restore - do not reenable when global enable is off or throttled: */ - if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) - intel_pmu_restore_all(cpuc->throttle_ctrl); + if (cpuc->throttle_ctrl) { + if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) { + intel_pmu_restore_all(cpuc->throttle_ctrl); + } else { + pr_info("CPU#%d: perfcounters: max interrupt rate exceeded! Throttle on.\n", smp_processor_id()); + } + } return ret; } @@ -817,11 +822,16 @@ void perf_counter_unthrottle(void) cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - if (printk_ratelimit()) - printk(KERN_WARNING "perfcounters: max interrupts exceeded!\n"); + pr_info("CPU#%d: perfcounters: throttle off.\n", smp_processor_id()); + + /* + * Clear them before re-enabling irqs/NMIs again: + */ + cpuc->interrupts = 0; hw_perf_restore(cpuc->throttle_ctrl); + } else { + cpuc->interrupts = 0; } - cpuc->interrupts = 0; } void smp_perf_counter_interrupt(struct pt_regs *regs) -- cgit v1.2.3 From a026dfecc035f213c1cfa0bf6407ce3155f6a9df Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 10:02:57 +0200 Subject: perf_counter: x86: Allow unpriviliged use of NMIs Apply sysctl_perf_counter_priv to NMIs. Also, fail the counter creation instead of silently down-grading to regular interrupts. [ Impact: allow wider perf-counter usage ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 88ae8cebf3c..c19e927b697 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -280,8 +280,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * If privileged enough, allow NMI events: */ hwc->nmi = 0; - if (capable(CAP_SYS_ADMIN) && hw_event->nmi) + if (hw_event->nmi) { + if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) + return -EACCES; hwc->nmi = 1; + } hwc->irq_period = hw_event->irq_period; if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) -- cgit v1.2.3 From 962bf7a66edca4d36a730a38ff8410a67f560e40 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 13:21:36 +0200 Subject: perf_counter: x86: Fix up the amd NMI/INT throttle perf_counter_unthrottle() restores throttle_ctrl, buts its never set. Also, we fail to disable all counters when throttling. [ Impact: fix rare stuck perf-counters when they are throttled ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c19e927b697..7601c014f8f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -334,6 +334,8 @@ static u64 amd_pmu_save_disable_all(void) * right thing. */ barrier(); + if (!enabled) + goto out; for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; @@ -347,6 +349,7 @@ static u64 amd_pmu_save_disable_all(void) wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } +out: return enabled; } @@ -787,32 +790,43 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) int handled = 0; struct perf_counter *counter; struct hw_perf_counter *hwc; - int idx; + int idx, throttle = 0; + + cpuc->throttle_ctrl = cpuc->enabled; + cpuc->enabled = 0; + barrier(); + + if (cpuc->throttle_ctrl) { + if (++cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) + throttle = 1; + } - ++cpuc->interrupts; for (idx = 0; idx < x86_pmu.num_counters; idx++) { + int disable = 0; + if (!test_bit(idx, cpuc->active_mask)) continue; + counter = cpuc->counters[idx]; hwc = &counter->hw; val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) - continue; + goto next; + /* counter overflow */ x86_perf_counter_set_period(counter, hwc, idx); handled = 1; inc_irq_stat(apic_perf_irqs); - if (perf_counter_overflow(counter, nmi, regs, 0)) - amd_pmu_disable_counter(hwc, idx); - else if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) - /* - * do not reenable when throttled, but reload - * the register - */ + disable = perf_counter_overflow(counter, nmi, regs, 0); + +next: + if (disable || throttle) amd_pmu_disable_counter(hwc, idx); - else if (counter->state == PERF_COUNTER_STATE_ACTIVE) - amd_pmu_enable_counter(hwc, idx); } + + if (cpuc->throttle_ctrl && !throttle) + cpuc->enabled = 1; + return handled; } -- cgit v1.2.3 From 9e35ad388bea89f7d6f375af4c0ae98803688666 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 16:21:38 +0200 Subject: perf_counter: Rework the perf counter disable/enable The current disable/enable mechanism is: token = hw_perf_save_disable(); ... /* do bits */ ... hw_perf_restore(token); This works well, provided that the use nests properly. Except we don't. x86 NMI/INT throttling has non-nested use of this, breaking things. Therefore provide a reference counter disable/enable interface, where the first disable disables the hardware, and the last enable enables the hardware again. [ Impact: refactor, simplify the PMU disable/enable logic ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 113 ++++++++++++++----------------------- 1 file changed, 42 insertions(+), 71 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7601c014f8f..313638cecbb 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -31,7 +31,6 @@ struct cpu_hw_counters { unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; - u64 throttle_ctrl; int enabled; }; @@ -42,8 +41,8 @@ struct x86_pmu { const char *name; int version; int (*handle_irq)(struct pt_regs *, int); - u64 (*save_disable_all)(void); - void (*restore_all)(u64); + void (*disable_all)(void); + void (*enable_all)(void); void (*enable)(struct hw_perf_counter *, int); void (*disable)(struct hw_perf_counter *, int); unsigned eventsel; @@ -56,6 +55,7 @@ struct x86_pmu { int counter_bits; u64 counter_mask; u64 max_period; + u64 intel_ctrl; }; static struct x86_pmu x86_pmu __read_mostly; @@ -311,22 +311,19 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return 0; } -static u64 intel_pmu_save_disable_all(void) +static void intel_pmu_disable_all(void) { - u64 ctrl; - - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - return ctrl; } -static u64 amd_pmu_save_disable_all(void) +static void amd_pmu_disable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - int enabled, idx; + int idx; + + if (!cpuc->enabled) + return; - enabled = cpuc->enabled; cpuc->enabled = 0; /* * ensure we write the disable before we start disabling the @@ -334,8 +331,6 @@ static u64 amd_pmu_save_disable_all(void) * right thing. */ barrier(); - if (!enabled) - goto out; for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; @@ -348,37 +343,31 @@ static u64 amd_pmu_save_disable_all(void) val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } - -out: - return enabled; } -u64 hw_perf_save_disable(void) +void hw_perf_disable(void) { if (!x86_pmu_initialized()) - return 0; - return x86_pmu.save_disable_all(); + return; + return x86_pmu.disable_all(); } -/* - * Exported because of ACPI idle - */ -EXPORT_SYMBOL_GPL(hw_perf_save_disable); -static void intel_pmu_restore_all(u64 ctrl) +static void intel_pmu_enable_all(void) { - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); } -static void amd_pmu_restore_all(u64 ctrl) +static void amd_pmu_enable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int idx; - cpuc->enabled = ctrl; - barrier(); - if (!ctrl) + if (cpuc->enabled) return; + cpuc->enabled = 1; + barrier(); + for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; @@ -392,16 +381,12 @@ static void amd_pmu_restore_all(u64 ctrl) } } -void hw_perf_restore(u64 ctrl) +void hw_perf_enable(void) { if (!x86_pmu_initialized()) return; - x86_pmu.restore_all(ctrl); + x86_pmu.enable_all(); } -/* - * Exported because of ACPI idle - */ -EXPORT_SYMBOL_GPL(hw_perf_restore); static inline u64 intel_pmu_get_status(void) { @@ -735,15 +720,14 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) int bit, cpu = smp_processor_id(); u64 ack, status; struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); - int ret = 0; - - cpuc->throttle_ctrl = intel_pmu_save_disable_all(); + perf_disable(); status = intel_pmu_get_status(); - if (!status) - goto out; + if (!status) { + perf_enable(); + return 0; + } - ret = 1; again: inc_irq_stat(apic_perf_irqs); ack = status; @@ -767,19 +751,11 @@ again: status = intel_pmu_get_status(); if (status) goto again; -out: - /* - * Restore - do not reenable when global enable is off or throttled: - */ - if (cpuc->throttle_ctrl) { - if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) { - intel_pmu_restore_all(cpuc->throttle_ctrl); - } else { - pr_info("CPU#%d: perfcounters: max interrupt rate exceeded! Throttle on.\n", smp_processor_id()); - } - } - return ret; + if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS) + perf_enable(); + + return 1; } static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) @@ -792,13 +768,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) struct hw_perf_counter *hwc; int idx, throttle = 0; - cpuc->throttle_ctrl = cpuc->enabled; - cpuc->enabled = 0; - barrier(); - - if (cpuc->throttle_ctrl) { - if (++cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) - throttle = 1; + if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { + throttle = 1; + __perf_disable(); + cpuc->enabled = 0; + barrier(); } for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -824,9 +798,6 @@ next: amd_pmu_disable_counter(hwc, idx); } - if (cpuc->throttle_ctrl && !throttle) - cpuc->enabled = 1; - return handled; } @@ -839,13 +810,11 @@ void perf_counter_unthrottle(void) cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - pr_info("CPU#%d: perfcounters: throttle off.\n", smp_processor_id()); - /* * Clear them before re-enabling irqs/NMIs again: */ cpuc->interrupts = 0; - hw_perf_restore(cpuc->throttle_ctrl); + perf_enable(); } else { cpuc->interrupts = 0; } @@ -931,8 +900,8 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { static struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, - .save_disable_all = intel_pmu_save_disable_all, - .restore_all = intel_pmu_restore_all, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, .enable = intel_pmu_enable_counter, .disable = intel_pmu_disable_counter, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, @@ -951,8 +920,8 @@ static struct x86_pmu intel_pmu = { static struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, - .save_disable_all = amd_pmu_save_disable_all, - .restore_all = amd_pmu_restore_all, + .disable_all = amd_pmu_disable_all, + .enable_all = amd_pmu_enable_all, .enable = amd_pmu_enable_counter, .disable = amd_pmu_disable_counter, .eventsel = MSR_K7_EVNTSEL0, @@ -1003,6 +972,8 @@ static int intel_pmu_init(void) x86_pmu.counter_bits = eax.split.bit_width; x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + return 0; } -- cgit v1.2.3 From a4016a79fcbd139e7378944c0d86a39fdbc70ecc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 May 2009 14:52:17 +0200 Subject: perf_counter: x86: Robustify interrupt handling Two consecutive NMIs could daze and confuse the machine when the first would handle the overflow of both counters. [ Impact: fix false-positive syslog messages under multi-session profiling ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 313638cecbb..1dcf67057f1 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -783,6 +783,10 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) counter = cpuc->counters[idx]; hwc = &counter->hw; + + if (counter->hw_event.nmi != nmi) + goto next; + val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) goto next; @@ -869,7 +873,6 @@ perf_counter_nmi_handler(struct notifier_block *self, { struct die_args *args = __args; struct pt_regs *regs; - int ret; if (!atomic_read(&active_counters)) return NOTIFY_DONE; @@ -886,9 +889,16 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - ret = x86_pmu.handle_irq(regs, 1); + /* + * Can't rely on the handled return value to say it was our NMI, two + * counters could trigger 'simultaneously' raising two back-to-back NMIs. + * + * If the first NMI handles both, the latter will be empty and daze + * the CPU. + */ + x86_pmu.handle_irq(regs, 1); - return ret ? NOTIFY_STOP : NOTIFY_OK; + return NOTIFY_STOP; } static __read_mostly struct notifier_block perf_counter_nmi_notifier = { -- cgit v1.2.3 From 1c80f4b598d9b075a2a0be694e28be93a6702bcc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 15 May 2009 08:25:22 +0200 Subject: perf_counter: x86: Disallow interval of 1 On certain CPUs i have observed a stuck PMU if interval was set to 1 and NMIs were used. The PMU had PMC0 set in MSR_CORE_PERF_GLOBAL_STATUS, but it was not possible to ack it via MSR_CORE_PERF_GLOBAL_OVF_CTRL, and the NMI loop got stuck infinitely. [ Impact: fix rare hangs during high perfcounter load ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1dcf67057f1..46a82d1e4cb 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -473,6 +473,11 @@ x86_perf_counter_set_period(struct perf_counter *counter, left += period; atomic64_set(&hwc->period_left, left); } + /* + * Quirk: certain CPUs dont like it if just 1 event is left: + */ + if (unlikely(left < 2)) + left = 2; per_cpu(prev_left[idx], smp_processor_id()) = left; -- cgit v1.2.3 From 9029a5e3801f1cc7cdaab80169d82427acf928d8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 15 May 2009 08:26:20 +0200 Subject: perf_counter: x86: Protect against infinite loops in intel_pmu_handle_irq() intel_pmu_handle_irq() can lock up in an infinite loop if the hardware does not allow the acking of irqs. Alas, this happened in testing so make this robust and emit a warning if it happens in the future. Also, clean up the IRQ handlers a bit. [ Impact: improve perfcounter irq/nmi handling robustness ] Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 46a82d1e4cb..5a7f718eb1e 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -722,9 +722,13 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) */ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) { - int bit, cpu = smp_processor_id(); + struct cpu_hw_counters *cpuc; + struct cpu_hw_counters; + int bit, cpu, loops; u64 ack, status; - struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); perf_disable(); status = intel_pmu_get_status(); @@ -733,7 +737,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) return 0; } + loops = 0; again: + if (++loops > 100) { + WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); + return 1; + } + inc_irq_stat(apic_perf_irqs); ack = status; for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { @@ -765,13 +775,14 @@ again: static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { - int cpu = smp_processor_id(); - struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); - u64 val; - int handled = 0; + int cpu, idx, throttle = 0, handled = 0; + struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; - int idx, throttle = 0; + u64 val; + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { throttle = 1; -- cgit v1.2.3 From 60db5e09c13109b13830cc9dcae688003fd39e79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:19:28 +0200 Subject: perf_counter: frequency based adaptive irq_period Instead of specifying the irq_period for a counter, provide a target interrupt frequency and dynamically adapt the irq_period to match this frequency. [ Impact: new perf-counter attribute/feature ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090515132018.646195868@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5a7f718eb1e..886dcf334bc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -286,11 +286,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; } - hwc->irq_period = hw_event->irq_period; - if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) - hwc->irq_period = x86_pmu.max_period; - - atomic64_set(&hwc->period_left, hwc->irq_period); + atomic64_set(&hwc->period_left, + min(x86_pmu.max_period, hwc->irq_period)); /* * Raw event type provide the config in the event structure @@ -458,7 +455,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->irq_period; + s64 period = min(x86_pmu.max_period, hwc->irq_period); int err; /* -- cgit v1.2.3 From d9bcc01d58d18ed287091707b0b45c6ac888a11a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:06:12 +0530 Subject: x86, mtrr: replace MTRRcap_MSR with msr-index's MSR_MTRRcap Use standard msr-index.h's MSR declaration and no need to declare again. [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/cpu/mtrr/main.c | 2 +- arch/x86/kernel/cpu/mtrr/mtrr.h | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0b776c09aff..de9c20ffa0d 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -306,7 +306,7 @@ void __init get_mtrr_state(void) vrs = mtrr_state.var_ranges; - rdmsr(MTRRcap_MSR, lo, dummy); + rdmsr(MSR_MTRRcap, lo, dummy); mtrr_state.have_fixed = (lo >> 8) & 1; for (i = 0; i < num_var_ranges; i++) @@ -703,7 +703,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i static int generic_have_wrcomb(void) { unsigned long config, dummy; - rdmsr(MTRRcap_MSR, config, dummy); + rdmsr(MSR_MTRRcap, config, dummy); return (config & (1 << 10)); } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 03cda01f57c..8fc248b5aea 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void) unsigned long config = 0, dummy; if (use_intel()) { - rdmsr(MTRRcap_MSR, config, dummy); + rdmsr(MSR_MTRRcap, config, dummy); } else if (is_cpu(AMD)) config = 2; else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 77f67f7b347..5d37fb14523 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -5,7 +5,6 @@ #include #include -#define MTRRcap_MSR 0x0fe #define MTRRdefType_MSR 0x2ff #define MTRRfix64K_00000_MSR 0x250 -- cgit v1.2.3 From a036c7a358cc9d7ed28a188480b9a4d709e09b24 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:10:43 +0530 Subject: x86, mtrr: replace MTRRfix64K_00000_MSR with msr-index's MSR_MTRRfix64K_00000 Use standard msr-index.h's MSR declaration and no need to declare again. [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/cpu/mtrr/mtrr.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index de9c20ffa0d..8b115c0e590 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -20,7 +20,7 @@ struct fixed_range_block { }; static struct fixed_range_block fixed_range_blocks[] = { - { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */ + { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ {} @@ -194,7 +194,7 @@ get_fixed_ranges(mtrr_type * frs) k8_check_syscfg_dram_mod_en(); - rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); + rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]); for (i = 0; i < 2; i++) rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 5d37fb14523..7f23caede71 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -7,7 +7,6 @@ #define MTRRdefType_MSR 0x2ff -#define MTRRfix64K_00000_MSR 0x250 #define MTRRfix16K_80000_MSR 0x258 #define MTRRfix16K_A0000_MSR 0x259 #define MTRRfix4K_C0000_MSR 0x268 -- cgit v1.2.3 From 7d9d55e449089df8463bca2045d702ae6cda64a2 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:15:32 +0530 Subject: x86, mtrr: replace MTRRfix16K_80000_MSR with msr-index's MSR_MTRRfix16K_80000 Use standard msr-index.h's MSR declaration and no need to declare again [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/cpu/mtrr/mtrr.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 8b115c0e590..00437c2e8dd 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -21,7 +21,7 @@ struct fixed_range_block { static struct fixed_range_block fixed_range_blocks[] = { { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ - { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ + { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ {} }; @@ -197,7 +197,7 @@ get_fixed_ranges(mtrr_type * frs) rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]); for (i = 0; i < 2; i++) - rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); + rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]); for (i = 0; i < 8; i++) rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); } diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 7f23caede71..712b60524e6 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -7,7 +7,6 @@ #define MTRRdefType_MSR 0x2ff -#define MTRRfix16K_80000_MSR 0x258 #define MTRRfix16K_A0000_MSR 0x259 #define MTRRfix4K_C0000_MSR 0x268 #define MTRRfix4K_C8000_MSR 0x269 -- cgit v1.2.3 From 654ac05801ae806661c8fdeb3b5c420a31cbc5b1 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:21:54 +0530 Subject: x86, mtrr: remove mtrr MSRs double declaration Removed MTRR MSR from mtrr/mtrr.h as these are already declared in msr-index.h and nobody is using them: MTRRfix16K_A0000_MSR MTRRfix4K_C8000_MSR MTRRfix4K_D0000_MSR MTRRfix4K_D8000_MSR MTRRfix4K_E0000_MSR MTRRfix4K_E8000_MSR MTRRfix4K_F0000_MSR MTRRfix4K_F8000_MSR Use standard msr-index.h's MSR declaration and no need to declare again [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/mtrr.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 712b60524e6..5053793f912 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -7,15 +7,7 @@ #define MTRRdefType_MSR 0x2ff -#define MTRRfix16K_A0000_MSR 0x259 #define MTRRfix4K_C0000_MSR 0x268 -#define MTRRfix4K_C8000_MSR 0x269 -#define MTRRfix4K_D0000_MSR 0x26a -#define MTRRfix4K_D8000_MSR 0x26b -#define MTRRfix4K_E0000_MSR 0x26c -#define MTRRfix4K_E8000_MSR 0x26d -#define MTRRfix4K_F0000_MSR 0x26e -#define MTRRfix4K_F8000_MSR 0x26f #define MTRR_CHANGE_MASK_FIXED 0x01 #define MTRR_CHANGE_MASK_VARIABLE 0x02 -- cgit v1.2.3 From ba5673ff1ff5f428256db4cedd4b05b7be008bb6 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:29:25 +0530 Subject: x86, mtrr: replace MTRRfix4K_C0000_MSR with msr-index's MSR_MTRRfix4K_C0000 Use standard msr-index.h's MSR declaration and no need to declare again. [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/cpu/mtrr/mtrr.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 00437c2e8dd..3cf58e26534 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -22,7 +22,7 @@ struct fixed_range_block { static struct fixed_range_block fixed_range_blocks[] = { { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ - { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ + { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ {} }; @@ -199,7 +199,7 @@ get_fixed_ranges(mtrr_type * frs) for (i = 0; i < 2; i++) rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]); for (i = 0; i < 8; i++) - rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); + rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]); } void mtrr_save_fixed_ranges(void *info) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 5053793f912..e5ee686d2c3 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -7,8 +7,6 @@ #define MTRRdefType_MSR 0x2ff -#define MTRRfix4K_C0000_MSR 0x268 - #define MTRR_CHANGE_MASK_FIXED 0x01 #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 -- cgit v1.2.3 From 52650257ea06bb15c2e2bbe854cbdf463726141a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:35:46 +0530 Subject: x86, mtrr: replace MTRRdefType_MSR with msr-index's MSR_MTRRdefType Use standard msr-index.h's MSR declaration and no need to declare again. [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/cleanup.c | 4 ++-- arch/x86/kernel/cpu/mtrr/generic.c | 8 ++++---- arch/x86/kernel/cpu/mtrr/mtrr.h | 2 -- arch/x86/kernel/cpu/mtrr/state.c | 6 +++--- 4 files changed, 9 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index ce0fe4b5c04..1d584a18a50 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits) if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) return 0; - rdmsr(MTRRdefType_MSR, def, dummy); + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; @@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) */ if (!is_cpu(INTEL) || disable_mtrr_trim) return 0; - rdmsr(MTRRdefType_MSR, def, dummy); + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 3cf58e26534..e930a311770 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -314,7 +314,7 @@ void __init get_mtrr_state(void) if (mtrr_state.have_fixed) get_fixed_ranges(mtrr_state.fixed_ranges); - rdmsr(MTRRdefType_MSR, lo, dummy); + rdmsr(MSR_MTRRdefType, lo, dummy); mtrr_state.def_type = (lo & 0xff); mtrr_state.enabled = (lo & 0xc00) >> 10; @@ -579,10 +579,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock) __flush_tlb(); /* Save MTRR state */ - rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); } static void post_set(void) __releases(set_atomicity_lock) @@ -591,7 +591,7 @@ static void post_set(void) __releases(set_atomicity_lock) __flush_tlb(); /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index e5ee686d2c3..7538b767f20 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -5,8 +5,6 @@ #include #include -#define MTRRdefType_MSR 0x2ff - #define MTRR_CHANGE_MASK_FIXED 0x01 #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c index 7f7e2753685..1f5fb1588d1 100644 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ b/arch/x86/kernel/cpu/mtrr/state.c @@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) if (use_intel()) /* Save MTRR state */ - rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); + rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); else /* Cyrix ARRs - everything else were excluded at the top */ ctxt->ccr3 = getCx86(CX86_CCR3); @@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) { if (use_intel()) /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, + mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, ctxt->deftype_hi); else if (is_cpu(CYRIX)) /* Cyrix ARRs - everything else were excluded at the top */ @@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt) /* Restore MTRRdefType */ if (use_intel()) /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); else /* Cyrix ARRs - everything else was excluded at the top */ setCx86(CX86_CCR3, ctxt->ccr3); -- cgit v1.2.3 From b4ecc126991b30fe5f9a59dfacda046aeac124b2 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 13 May 2009 17:16:55 -0700 Subject: x86: Fix performance regression caused by paravirt_ops on native kernels Xiaohui Xin and some other folks at Intel have been looking into what's behind the performance hit of paravirt_ops when running native. It appears that the hit is entirely due to the paravirtualized spinlocks introduced by: | commit 8efcbab674de2bee45a2e4cdf97de16b8e609ac8 | Date: Mon Jul 7 12:07:51 2008 -0700 | | paravirt: introduce a "lock-byte" spinlock implementation The extra call/return in the spinlock path is somehow causing an increase in the cycles/instruction of somewhere around 2-7% (seems to vary quite a lot from test to test). The working theory is that the CPU's pipeline is getting upset about the call->call->locked-op->return->return, and seems to be failing to speculate (though I haven't seen anything definitive about the precise reasons). This doesn't entirely make sense, because the performance hit is also visible on unlock and other operations which don't involve locked instructions. But spinlock operations clearly swamp all the other pvops operations, even though I can't imagine that they're nearly as common (there's only a .05% increase in instructions executed). If I disable just the pv-spinlock calls, my tests show that pvops is identical to non-pvops performance on native (my measurements show that it is actually about .1% faster, but Xiaohui shows a .05% slowdown). Summary of results, averaging 10 runs of the "mmperf" test, using a no-pvops build as baseline: nopv Pv-nospin Pv-spin CPU cycles 100.00% 99.89% 102.18% instructions 100.00% 100.10% 100.15% CPI 100.00% 99.79% 102.03% cache ref 100.00% 100.84% 100.28% cache miss 100.00% 90.47% 88.56% cache miss rate 100.00% 89.72% 88.31% branches 100.00% 99.93% 100.04% branch miss 100.00% 103.66% 107.72% branch miss rt 100.00% 103.73% 107.67% wallclock 100.00% 99.90% 102.20% The clear effect here is that the 2% increase in CPI is directly reflected in the final wallclock time. (The other interesting effect is that the more ops are out of line calls via pvops, the lower the cache access and miss rates. Not too surprising, but it suggests that the non-pvops kernel is over-inlined. On the flipside, the branch misses go up correspondingly...) So, what's the fix? Paravirt patching turns all the pvops calls into direct calls, so _spin_lock etc do end up having direct calls. For example, the compiler generated code for paravirtualized _spin_lock is: <_spin_lock+0>: mov %gs:0xb4c8,%rax <_spin_lock+9>: incl 0xffffffffffffe044(%rax) <_spin_lock+15>: callq *0xffffffff805a5b30 <_spin_lock+22>: retq The indirect call will get patched to: <_spin_lock+0>: mov %gs:0xb4c8,%rax <_spin_lock+9>: incl 0xffffffffffffe044(%rax) <_spin_lock+15>: callq <__ticket_spin_lock> <_spin_lock+20>: nop; nop /* or whatever 2-byte nop */ <_spin_lock+22>: retq One possibility is to inline _spin_lock, etc, when building an optimised kernel (ie, when there's no spinlock/preempt instrumentation/debugging enabled). That will remove the outer call/return pair, returning the instruction stream to a single call/return, which will presumably execute the same as the non-pvops case. The downsides arel 1) it will replicate the preempt_disable/enable code at eack lock/unlock callsite; this code is fairly small, but not nothing; and 2) the spinlock definitions are already a very heavily tangled mass of #ifdefs and other preprocessor magic, and making any changes will be non-trivial. The other obvious answer is to disable pv-spinlocks. Making them a separate config option is fairly easy, and it would be trivial to enable them only when Xen is enabled (as the only non-default user). But it doesn't really address the common case of a distro build which is going to have Xen support enabled, and leaves the open question of whether the native performance cost of pv-spinlocks is worth the performance improvement on a loaded Xen system (10% saving of overall system CPU when guests block rather than spin). Still it is a reasonable short-term workaround. [ Impact: fix pvops performance regression when running native ] Analysed-by: "Xin Xiaohui" Analysed-by: "Li Xin" Analysed-by: "Nakajima Jun" Signed-off-by: Jeremy Fitzhardinge Acked-by: H. Peter Anvin Cc: Nick Piggin Cc: Xen-devel LKML-Reference: <4A0B62F7.5030802@goop.org> [ fixed the help text ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 3 ++- arch/x86/kernel/paravirt.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 145cce75cda..88d1bfc847d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -89,7 +89,8 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o -obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o +obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o +obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 8e45f446488..9faf43bea33 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -134,7 +134,9 @@ static void *get_call_destination(u8 type) .pv_irq_ops = pv_irq_ops, .pv_apic_ops = pv_apic_ops, .pv_mmu_ops = pv_mmu_ops, +#ifdef CONFIG_PARAVIRT_SPINLOCKS .pv_lock_ops = pv_lock_ops, +#endif }; return *((void **)&tmpl + type); } -- cgit v1.2.3 From d2517a49d55536b38c7a87e5289550cfedaa4dcc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 10:04:45 +0200 Subject: perf_counter, x86: fix zero irq_period counters The quirk to irq_period unearthed an unrobustness we had in the hw_counter initialization sequence: we left irq_period at 0, which was then quirked up to 2 ... which then generated a _lot_ of interrupts during 'perf stat' runs, slowed them down and skewed the counter results in general. Initialize irq_period to the maximum instead. [ Impact: fix perf stat results ] Cc: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 886dcf334bc..5bfd30ab392 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -286,6 +286,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; } + if (!hwc->irq_period) + hwc->irq_period = x86_pmu.max_period; + atomic64_set(&hwc->period_left, min(x86_pmu.max_period, hwc->irq_period)); -- cgit v1.2.3 From e5198075c67a22ec9a09565b1ce88d3d3f5ba855 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:05:16 -0700 Subject: x86, apic: introduce io_apic_irq_attr according to Ingo, io_apic irq-setup related functions have too many parameters with a repetitive signature. So reduce related funcs to get less params by passing a pointer to a newly defined io_apic_irq_attr structure. v2: io_apic_irq ==> irq_attr triggering ==> trigger v3: add set_io_apic_irq_attr [ Impact: cleanup ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jesse Barnes Cc: Len Brown LKML-Reference: <4A08ACD3.2070401@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 22 +++++++++++---------- arch/x86/kernel/apic/io_apic.c | 43 ++++++++++++++++++++++++------------------ 2 files changed, 37 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index dcfbc3ab9e4..4af63dfb0f0 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -523,7 +523,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) * success: return IRQ number (>=0) * failure: return < 0 */ -int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) +int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { unsigned int irq; unsigned int plat_gsi = gsi; @@ -533,14 +533,14 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) * Make sure all (legacy) PCI IRQs are set as level-triggered. */ if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { - if (triggering == ACPI_LEVEL_SENSITIVE) + if (trigger == ACPI_LEVEL_SENSITIVE) eisa_set_level_irq(gsi); } #endif #ifdef CONFIG_X86_IO_APIC if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { - plat_gsi = mp_register_gsi(dev, gsi, triggering, polarity); + plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); } #endif acpi_gsi_to_irq(plat_gsi, &irq); @@ -1156,7 +1156,7 @@ void __init mp_config_acpi_legacy_irqs(void) } } -static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering, +static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { #ifdef CONFIG_X86_MPPARSE @@ -1181,7 +1181,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering, /* print the entry should happen on mptable identically */ mp_irq.type = MP_INTSRC; mp_irq.irqtype = mp_INT; - mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | + mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); mp_irq.srcbus = number; mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); @@ -1194,10 +1194,11 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering, return 0; } -int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) +int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { int ioapic; int ioapic_pin; + struct io_apic_irq_attr irq_attr; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -1225,11 +1226,12 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) ioapic_pin); return gsi; } - mp_config_acpi_gsi(dev, gsi, triggering, polarity); + mp_config_acpi_gsi(dev, gsi, trigger, polarity); - io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi, - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, - polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, + trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, + polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + io_apic_set_pci_routing(dev, gsi, &irq_attr); return gsi; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 74d2b480a20..ce1ac74baa7 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1096,8 +1096,7 @@ static int pin_2_irq(int idx, int apic, int pin) * Not an __init, possibly needed by modules */ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, - int *ioapic, int *ioapic_pin, - int *trigger, int *polarity) + struct io_apic_irq_attr *irq_attr) { int apic, i, best_guess = -1; @@ -1127,10 +1126,10 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, continue; if (pin == (mp_irqs[i].srcbusirq & 3)) { - *ioapic = apic; - *ioapic_pin = mp_irqs[i].dstirq; - *trigger = irq_trigger(i); - *polarity = irq_polarity(i); + set_io_apic_irq_attr(irq_attr, apic, + mp_irqs[i].dstirq, + irq_trigger(i), + irq_polarity(i)); return irq; } /* @@ -1138,10 +1137,10 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, * best-guess fuzzy result for broken mptables. */ if (best_guess < 0) { - *ioapic = apic; - *ioapic_pin = mp_irqs[i].dstirq; - *trigger = irq_trigger(i); - *polarity = irq_polarity(i); + set_io_apic_irq_attr(irq_attr, apic, + mp_irqs[i].dstirq, + irq_trigger(i), + irq_polarity(i)); best_guess = irq; } } @@ -3865,13 +3864,16 @@ int __init arch_probe_nr_irqs(void) } #endif -static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, - int triggering, int polarity) +static int __io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) { struct irq_desc *desc; struct irq_cfg *cfg; int node; + int ioapic, pin; + int trigger, polarity; + ioapic = irq_attr->ioapic; if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", ioapic); @@ -3889,6 +3891,10 @@ static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, in return 0; } + pin = irq_attr->ioapic_pin; + trigger = irq_attr->trigger; + polarity = irq_attr->polarity; + /* * IRQs < 16 are already in the irq_2_pin[] map */ @@ -3897,20 +3903,22 @@ static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, in add_pin_to_irq_node(cfg, node, ioapic, pin); } - setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); + setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); return 0; } -int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, - int triggering, int polarity) +int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) { - + int ioapic, pin; /* * Avoid pin reprogramming. PRTs typically include entries * with redundant pin->gsi mappings (but unique PCI devices); * we only program the IOAPIC on the first. */ + ioapic = irq_attr->ioapic; + pin = irq_attr->ioapic_pin; if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { pr_debug("Pin %d-%d already programmed\n", mp_ioapics[ioapic].apicid, pin); @@ -3918,8 +3926,7 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, } set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); - return __io_apic_set_pci_routing(dev, ioapic, pin, irq, - triggering, polarity); + return __io_apic_set_pci_routing(dev, irq, irq_attr); } /* -------------------------------------------------------------------------- -- cgit v1.2.3 From 2759c3287de27266e06f1f4e82cbd2d65f6a044c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:05:16 -0700 Subject: x86: don't call read_apic_id if !cpu_has_apic should not call that if apic is disabled. [ Impact: fix crash on certain UP configs ] Signed-off-by: Yinghai Lu Cc: Cyrill Gorcunov LKML-Reference: <4A09CCBB.2000306@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_flat_64.c | 2 +- arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/common.c | 6 ++++++ arch/x86/kernel/cpu/intel.c | 6 +++--- 4 files changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 744e6d8af27..d0c99abc26c 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -161,7 +161,7 @@ static int flat_apic_id_registered(void) static int flat_phys_pkg_id(int initial_apic_id, int index_msb) { - return hard_smp_processor_id() >> index_msb; + return initial_apic_id >> index_msb; } struct apic apic_flat = { diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7e4a459daa6..728b3750a3e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -272,7 +272,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) int cpu = smp_processor_id(); int node; - unsigned apicid = hard_smp_processor_id(); + unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; node = c->phys_proc_id; if (apicid_to_node[apicid] != NUMA_NO_NODE) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c1caefc82e6..017c600e05a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -761,6 +761,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_identify) this_cpu->c_identify(c); + /* Clear/Set all flags overriden by options, after probe */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + #ifdef CONFIG_X86_64 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 7437fa133c0..daed39ba261 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -229,12 +229,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) } #endif -static void __cpuinit srat_detect_node(void) +static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) unsigned node; int cpu = smp_processor_id(); - int apicid = hard_smp_processor_id(); + int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; /* Don't do the funky fallback heuristics the AMD version employs for now. */ @@ -400,7 +400,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } /* Work around errata */ - srat_detect_node(); + srat_detect_node(c); if (cpu_has(c, X86_FEATURE_VMX)) detect_vmx_virtcap(c); -- cgit v1.2.3 From 35d5a9a61490bf39d2e48d7f499c8c801a39ebe9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:59:37 -0700 Subject: x86: fix system without memory on node0 Jack found a boot crash on a system which doesn't have memory on node0. It turns out with recent per_cpu changes, node_number for BSP will always be 0, and it is not consistent to cpu_to_node() that might set it to a different (nearer) node already. aka when numa_set_node() for node0 is called early before per_cpu area is setup: two places touched that per_cpu(node_number,): 1. in cpu/common.c::cpu_init() and it is not for BP | #ifdef CONFIG_NUMA | if (cpu != 0 && percpu_read(node_number) == 0 && | cpu_to_node(cpu) != NUMA_NO_NODE) | percpu_write(node_number, cpu_to_node(cpu)); | #endif for BP: traps_init ==> cpu_init for AP: start_secondary ==> cpu_init 2. cpu/intel.c or amd.c::srat_detect_node via numa_set_node() for BP: check_bugs ==> identify_boot_cpu ==> identify_cpu() that is rather later before numa_node_id() is used for BP... for AP: start_secondary => smp_callin => smp_store_cpu_info() => => identify_secondary_cpu => identify_cpu() so try to set that for BP earlier in setup_per_cpu_areas(), and don't bother to set that for APs there (it will be updated later and will be used later) (and don't mess the 0 before the copying BP per_cpu data to APs) [ Impact: fix boot crash on memoryless node-0 ] Reported-and-tested-by: Jack Steiner Cc: Tejun Heo Signed-off-by: Yinghai Lu LKML-Reference: <4A0C4A02.7050401@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 3a97a4cf187..3b5f3271e73 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -423,6 +423,14 @@ void __init setup_per_cpu_areas(void) early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif +#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) + /* + * make sure boot cpu node_number is right, when boot cpu is on the + * node that doesn't have mem installed + */ + per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id); +#endif + /* Setup node to cpumask map */ setup_node_to_cpumask_map(); -- cgit v1.2.3 From 629e15d245f46bef9d26199b450f882f9437a8fe Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:05:16 -0700 Subject: x86, irq: update_mptable needs pci_routeirq To get all device irq routing and to save them. This is basically an implicit pci=routeirq enablement if (and on if) the update_mptable boot option (which is off by default) has been specified. [ Impact: extend the update_mptable boot opion's scope ] Signed-off-by: Yinghai Lu Cc: Jesse Barnes LKML-Reference: <4A0DB7B4.4060702@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index cd2a41a7c45..e6bf9d08e50 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -961,6 +962,9 @@ static int __initdata enable_update_mptable; static int __init update_mptable_setup(char *str) { enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif return 0; } early_param("update_mptable", update_mptable_setup); @@ -973,6 +977,9 @@ static int __initdata alloc_mptable; static int __init parse_alloc_mptable_opt(char *p) { enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif alloc_mptable = 1; if (!p) return 0; -- cgit v1.2.3 From f1bdb523880c7f6990e9e8e50b0fc972ca475e84 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:05:16 -0700 Subject: x86, irq: don't call mp_config_acpi_gsi() if update_mptable is not enabled Len expressed concern that the update_mptable feature has side-effects on the ACPI code. Make it sure explicitly that the code only ever gets called if the (default disabled) update_mptable boot quirk option is disabled. [ Impact: isolate the update_mptable feature from ACPI code more ] Signed-off-by: Yinghai Lu Cc: Len Brown LKML-Reference: <4A0DC832.5090200@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 4 +++- arch/x86/kernel/mpparse.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4af63dfb0f0..844e5e25213 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1226,7 +1226,9 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) ioapic_pin); return gsi; } - mp_config_acpi_gsi(dev, gsi, trigger, polarity); + + if (enable_update_mptable) + mp_config_acpi_gsi(dev, gsi, trigger, polarity); set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index e6bf9d08e50..651c93b2886 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -957,7 +957,7 @@ out: return 0; } -static int __initdata enable_update_mptable; +int enable_update_mptable; static int __init update_mptable_setup(char *str) { -- cgit v1.2.3 From b68f1d2e7aa21029d73c7d453a8046e95d351740 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 19:37:25 +0200 Subject: perf_counter, x86: speed up the scheduling fast-path We have to set up the LVT entry only at counter init time, not at every switch-in time. There's friction between NMI and non-NMI use here - we'll probably remove the per counter configurability of it - but until then, dont slow down things ... [ Impact: micro-optimization ] Cc: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5bfd30ab392..c109819c2cb 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -285,6 +285,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -EACCES; hwc->nmi = 1; } + perf_counters_lapic_init(hwc->nmi); if (!hwc->irq_period) hwc->irq_period = x86_pmu.max_period; @@ -603,8 +604,6 @@ try_generic: hwc->counter_base = x86_pmu.perfctr; } - perf_counters_lapic_init(hwc->nmi); - x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; @@ -1054,7 +1053,7 @@ void __init init_hw_perf_counters(void) pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_lapic_init(0); + perf_counters_lapic_init(1); register_die_notifier(&perf_counter_nmi_notifier); } -- cgit v1.2.3 From 4c6f18fc81565967da20f2d4a3922cdba33f8e2b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 18 May 2009 10:23:28 -0700 Subject: x86, io-apic: Don't mark pin_programmed early Peter bisected that: | commit b9c61b70075c87a8612624736faf4a2de5b1ed30 | Date: Wed May 6 10:10:06 2009 -0700 | | x86/pci: update pirq_enable_irq() to setup io apic routing | | So we can set io apic routing only when enabling the device irq. wrecked his opteron box, ata1 interrupts fail to get through. ata1 is using irq 11: [ 1.451839] sata_svw 0000:01:0e.0: version 2.3 [ 1.456333] sata_svw 0000:01:0e.0: PCI INT A -> GSI 11 (level, low) -> IRQ 11 [ 1.463639] scsi0 : sata_svw [ 1.466949] scsi1 : sata_svw [ 1.470022] scsi2 : sata_svw [ 1.473090] scsi3 : sata_svw [ 1.476112] ata1: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe000 irq 11 [ 1.483490] ata2: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe100 irq 11 [ 1.490870] ata3: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe200 irq 11 [ 1.498247] ata4: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe300 irq 11 that pin is overlapped with pin with legacy ones. We should not set bits in pin_programmed here, so that those bit could be set later via io_apic_set_pci_routing(). [ Impact: fix boot hang on certain systems ] Reported-by: Peter Zijlstra Signed-off-by: Yinghai Lu Tested-by: Peter Zijlstra Cc: Jack Steiner LKML-Reference: <4A119990.9020606@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ce1ac74baa7..ac7f3b6ad58 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1537,7 +1537,10 @@ static void __init setup_IO_APIC_irqs(void) } cfg = desc->chip_data; add_pin_to_irq_node(cfg, node, apic_id, pin); - set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); + /* + * don't mark it in pin_programmed, so later acpi could + * set it correctly when irq < 16 + */ setup_IO_APIC_irq(apic_id, pin, irq, desc, irq_trigger(idx), irq_polarity(idx)); } -- cgit v1.2.3 From 34adc8062227f41b04ade0ff3fbd1dbe3002669e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 May 2009 20:13:28 +0200 Subject: perf_counter: Fix context removal deadlock Disable the PMU globally before removing a counter from a context. This fixes the following lockup: [22081.741922] ------------[ cut here ]------------ [22081.746668] WARNING: at arch/x86/kernel/cpu/perf_counter.c:803 intel_pmu_handle_irq+0x9b/0x24e() [22081.755624] Hardware name: X8DTN [22081.758903] perfcounters: irq loop stuck! [22081.762985] Modules linked in: [22081.766136] Pid: 11082, comm: perf Not tainted 2.6.30-rc6-tip #226 [22081.772432] Call Trace: [22081.774940] [] ? intel_pmu_handle_irq+0x9b/0x24e [22081.781993] [] ? intel_pmu_handle_irq+0x9b/0x24e [22081.788368] [] ? warn_slowpath_common+0x77/0xa3 [22081.794649] [] ? warn_slowpath_fmt+0x40/0x45 [22081.800696] [] ? intel_pmu_handle_irq+0x9b/0x24e [22081.807080] [] ? perf_counter_nmi_handler+0x3f/0x4a [22081.813751] [] ? notifier_call_chain+0x58/0x86 [22081.819951] [] ? notify_die+0x2d/0x32 [22081.825392] [] ? do_nmi+0x8e/0x242 [22081.830538] [] ? nmi+0x1a/0x20 [22081.835342] [] ? selinux_file_free_security+0x0/0x1a [22081.842105] [] ? x86_pmu_disable_counter+0x15/0x41 [22081.848673] <> [] ? x86_pmu_disable+0x86/0x103 [22081.855512] [] ? __perf_counter_remove_from_context+0x0/0xfe [22081.862926] [] ? counter_sched_out+0x30/0xce [22081.868909] [] ? __perf_counter_remove_from_context+0x59/0xfe [22081.876382] [] ? smp_call_function_single+0x6c/0xe6 [22081.882955] [] ? perf_release+0x86/0x14c [22081.888600] [] ? __fput+0xe7/0x195 [22081.893718] [] ? filp_close+0x5b/0x62 [22081.899107] [] ? put_files_struct+0x64/0xc2 [22081.905031] [] ? do_exit+0x1e2/0x6ef [22081.910360] [] ? _spin_lock_irqsave+0x9/0xe [22081.916292] [] ? do_group_exit+0x67/0x93 [22081.921953] [] ? sys_exit_group+0x12/0x16 [22081.927759] [] ? system_call_fastpath+0x16/0x1b [22081.934076] ---[ end trace 3a3936ce3e1b4505 ]--- And could potentially also fix the lockup reported by Marcelo Tosatti. Also, print more debug info in case of a detected lockup. [ Impact: fix lockup ] Reported-by: Marcelo Tosatti Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Thomas Gleixner LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c109819c2cb..6cc1660db8d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -740,6 +740,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) again: if (++loops > 100) { WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); + perf_counter_print_debug(); return 1; } -- cgit v1.2.3 From 88dff4936c0a5fa53080cca68dc963a8a2a674b0 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 22 May 2009 11:35:50 +0800 Subject: x86: DMI match for the Sony VGN-Z540N as it needs BIOS reboot x86: DMI match for the Sony VGN-Z540N as it needs BIOS reboot, see: http://bugzilla.kernel.org/show_bug.cgi?id=12901 [ Impact: fix hung reboot on certain systems ] Signed-off-by: Zhang Rui Cc: Len Brown LKML-Reference: <1242963350.32574.53.camel@rzhang-dt> Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 1340dad417f..667188e0b5a 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -232,6 +232,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"), }, }, + { /* Handle problems with rebooting on Sony VGN-Z540N */ + .callback = set_bios_reboot, + .ident = "Sony VGN-Z540N", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), + }, + }, { } }; -- cgit v1.2.3 From a63eaf34ae60bdb067a354cc8def2e8f4a01f5f4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 May 2009 14:17:31 +1000 Subject: perf_counter: Dynamically allocate tasks' perf_counter_context struct This replaces the struct perf_counter_context in the task_struct with a pointer to a dynamically allocated perf_counter_context struct. The main reason for doing is this is to allow us to transfer a perf_counter_context from one task to another when we do lazy PMU switching in a later patch. This has a few side-benefits: the task_struct becomes a little smaller, we save some memory because only tasks that have perf_counters attached get a perf_counter_context allocated for them, and we can remove the inclusion of in sched.h, meaning that we don't end up recompiling nearly everything whenever perf_counter.h changes. The perf_counter_context structures are reference-counted and freed when the last reference is dropped. A context can have references from its task and the counters on its task. Counters can outlive the task so it is possible that a context will be freed well after its task has exited. Contexts are allocated on fork if the parent had a context, or otherwise the first time that a per-task counter is created on a task. In the latter case, we set the context pointer in the task struct locklessly using an atomic compare-and-exchange operation in case we raced with some other task in creating a context for the subject task. This also removes the task pointer from the perf_counter struct. The task pointer was not used anywhere and would make it harder to move a context from one task to another. Anything that needed to know which task a counter was attached to was already using counter->ctx->task. The __perf_counter_init_context function moves up in perf_counter.c so that it can be called from find_get_context, and now initializes the refcount, but is otherwise unchanged. We were potentially calling list_del_counter twice: once from __perf_counter_exit_task when the task exits and once from __perf_counter_remove_from_context when the counter's fd gets closed. This adds a check in list_del_counter so it doesn't do anything if the counter has already been removed from the lists. Since perf_counter_task_sched_in doesn't do anything if the task doesn't have a context, and leaves cpuctx->task_ctx = NULL, this adds code to __perf_install_in_context to set cpuctx->task_ctx if necessary, i.e. in the case where the current task adds the first counter to itself and thus creates a context for itself. This also adds similar code to __perf_counter_enable to handle a similar situation which can arise when the counters have been disabled using prctl; that also leaves cpuctx->task_ctx = NULL. [ Impact: refactor counter context management to prepare for new feature ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10075.781053.231153@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e9021a90802..b4f64402a82 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,6 +14,7 @@ * Mikael Pettersson : PM converted to driver model. */ +#include #include #include #include -- cgit v1.2.3 From 0c752a93353d9b17dbe148312d732fbe06d235e1 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 22 May 2009 12:17:45 -0700 Subject: x86: introduce noxsave boot parameter Introduce "noxsave" boot parameter which will disable the cpu's xsave/xrstor capabilities. Useful for debugging and working around xsave related issues. [ Impact: make it possible to debug problems in the field ] Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c1caefc82e6..77848d9fca6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -114,6 +114,13 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +static int __init x86_xsave_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + return 1; +} +__setup("noxsave", x86_xsave_setup); + #ifdef CONFIG_X86_32 static int cachesize_override __cpuinitdata = -1; static int disable_x86_serial_nr __cpuinitdata = 1; -- cgit v1.2.3 From 71c9d8b68b299bef614afc7907393564a9f1476f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 May 2009 12:01:59 +0900 Subject: x86: Remove remap percpu allocator for the time being Remap percpu allocator has subtle bug when combined with page attribute changing. Remap percpu allocator aliases PMD pages for the first chunk and as pageattr doesn't know about the alias it ends up updating page attributes of the original mapping thus leaving the alises in inconsistent state which might lead to subtle data corruption. Please read the following threads for more information: http://thread.gmane.org/gmane.linux.kernel/835783 The following is the proposed fix which teaches pageattr about percpu aliases. http://thread.gmane.org/gmane.linux.kernel/837157 However, the above changes are deemed too pervasive for upstream inclusion for 2.6.30 release, so this patch essentially disables the remap allocator for the time being. Signed-off-by: Tejun Heo LKML-Reference: <4A1A0A27.4050301@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 3a97a4cf187..8f0e13be36b 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -160,8 +160,10 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) /* * If large page isn't supported, there's no benefit in doing * this. Also, on non-NUMA, embedding is better. + * + * NOTE: disabled for now. */ - if (!cpu_has_pse || !pcpu_need_numa()) + if (true || !cpu_has_pse || !pcpu_need_numa()) return -EINVAL; /* -- cgit v1.2.3 From ff99be573e02e9f7edc23b472c7f9a5ddba12795 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:03 +0200 Subject: perf_counter: x86: Expose INV and EDGE bits Expose the INV and EDGE bits of the PMU to raw configs. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.494709027@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6cc1660db8d..c14437faf5d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -87,11 +87,15 @@ static u64 intel_pmu_raw_event(u64 event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL +#define CORE_EVNTSEL_INV_MASK 0x00800000ULL #define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ CORE_EVNTSEL_UNIT_MASK | \ + CORE_EVNTSEL_EDGE_MASK | \ + CORE_EVNTSEL_INV_MASK | \ CORE_EVNTSEL_COUNTER_MASK) return event & CORE_EVNTSEL_MASK; @@ -119,11 +123,15 @@ static u64 amd_pmu_raw_event(u64 event) { #define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL +#define K7_EVNTSEL_INV_MASK 0x000800000ULL #define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL #define K7_EVNTSEL_MASK \ (K7_EVNTSEL_EVENT_MASK | \ K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_EDGE_MASK | \ + K7_EVNTSEL_INV_MASK | \ K7_EVNTSEL_COUNTER_MASK) return event & K7_EVNTSEL_MASK; -- cgit v1.2.3 From 48e22d56ecdeddd1ffb42a02fccba5c6ef42b133 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:04 +0200 Subject: perf_counter: x86: Remove interrupt throttle remove the x86 specific interrupt throttle Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.616671838@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 -- arch/x86/kernel/cpu/perf_counter.c | 47 ++++---------------------------------- 2 files changed, 5 insertions(+), 44 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b4f64402a82..89b63b5fad3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -763,8 +763,6 @@ static void local_apic_timer_interrupt(void) inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); - - perf_counter_unthrottle(); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c14437faf5d..8c8177f859f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -718,11 +718,6 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) intel_pmu_enable_counter(hwc, idx); } -/* - * Maximum interrupt frequency of 100KHz per CPU - */ -#define PERFMON_MAX_INTERRUPTS (100000/HZ) - /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -775,15 +770,14 @@ again: if (status) goto again; - if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS) - perf_enable(); + perf_enable(); return 1; } static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { - int cpu, idx, throttle = 0, handled = 0; + int cpu, idx, handled = 0; struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; @@ -792,16 +786,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { - throttle = 1; - __perf_disable(); - cpuc->enabled = 0; - barrier(); - } - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - int disable = 0; - if (!test_bit(idx, cpuc->active_mask)) continue; @@ -809,45 +794,23 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) hwc = &counter->hw; if (counter->hw_event.nmi != nmi) - goto next; + continue; val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) - goto next; + continue; /* counter overflow */ x86_perf_counter_set_period(counter, hwc, idx); handled = 1; inc_irq_stat(apic_perf_irqs); - disable = perf_counter_overflow(counter, nmi, regs, 0); - -next: - if (disable || throttle) + if (perf_counter_overflow(counter, nmi, regs, 0)) amd_pmu_disable_counter(hwc, idx); } return handled; } -void perf_counter_unthrottle(void) -{ - struct cpu_hw_counters *cpuc; - - if (!x86_pmu_initialized()) - return; - - cpuc = &__get_cpu_var(cpu_hw_counters); - if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - /* - * Clear them before re-enabling irqs/NMIs again: - */ - cpuc->interrupts = 0; - perf_enable(); - } else { - cpuc->interrupts = 0; - } -} - void smp_perf_counter_interrupt(struct pt_regs *regs) { irq_enter(); -- cgit v1.2.3 From a78ac3258782f3e64cb40beb5990808e1febcc0c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:05 +0200 Subject: perf_counter: Generic per counter interrupt throttle Introduce a generic per counter interrupt throttle. This uses the perf_counter_overflow() quick disable to throttle a specific counter when its going too fast when a pmu->unthrottle() method is provided which can undo the quick disable. Power needs to implement both the quick disable and the unthrottle method. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.703093461@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8c8177f859f..c4b543d1a86 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -623,6 +623,18 @@ try_generic: return 0; } +static void x86_pmu_unthrottle(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + + if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || + cpuc->counters[hwc->idx] != counter)) + return; + + x86_pmu.enable(hwc, hwc->idx); +} + void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; @@ -1038,6 +1050,7 @@ static const struct pmu pmu = { .enable = x86_pmu_enable, .disable = x86_pmu_disable, .read = x86_pmu_read, + .unthrottle = x86_pmu_unthrottle, }; const struct pmu *hw_perf_counter_init(struct perf_counter *counter) -- cgit v1.2.3 From 53b441a565bf4036ab49c8ea04c5ad06ace7dd6b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2009 21:41:28 +0200 Subject: Revert "perf_counter, x86: speed up the scheduling fast-path" This reverts commit b68f1d2e7aa21029d73c7d453a8046e95d351740. It is causing problems (stuck/stuttering profiling) - when mixed NMI and non-NMI counters are used. Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.703093461@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c4b543d1a86..189bf9d7cda 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -293,7 +293,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -EACCES; hwc->nmi = 1; } - perf_counters_lapic_init(hwc->nmi); if (!hwc->irq_period) hwc->irq_period = x86_pmu.max_period; @@ -612,6 +611,8 @@ try_generic: hwc->counter_base = x86_pmu.perfctr; } + perf_counters_lapic_init(hwc->nmi); + x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; @@ -1037,7 +1038,7 @@ void __init init_hw_perf_counters(void) pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_lapic_init(1); + perf_counters_lapic_init(0); register_die_notifier(&perf_counter_nmi_notifier); } -- cgit v1.2.3 From 79202ba9ff8cf570a75596f42e011167734d1c4b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 May 2009 08:10:00 +0200 Subject: perf_counter, x86: Fix APIC NMI programming My Nehalem box locks up in certain situations (with an always-asserted NMI causing a lockup) if the PMU LVT entry is programmed between NMI and IRQ mode with a high frequency. Standardize exlusively on NMIs instead. [ Impact: fix lockup ] Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 189bf9d7cda..ece3813c7a3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -285,14 +285,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; /* - * If privileged enough, allow NMI events: + * Use NMI events all the time: */ - hwc->nmi = 0; - if (hw_event->nmi) { - if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) - return -EACCES; - hwc->nmi = 1; - } + hwc->nmi = 1; + hw_event->nmi = 1; if (!hwc->irq_period) hwc->irq_period = x86_pmu.max_period; @@ -553,9 +549,6 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) if (!x86_pmu.num_counters_fixed) return -1; - if (unlikely(hwc->nmi)) - return -1; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS))) @@ -806,9 +799,6 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) counter = cpuc->counters[idx]; hwc = &counter->hw; - if (counter->hw_event.nmi != nmi) - continue; - val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) continue; -- cgit v1.2.3 From aaba98018b8295dfa2119345d17f833d74448cd0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 May 2009 08:10:00 +0200 Subject: perf_counter, x86: Make NMI lockups more robust We have a debug check that detects stuck NMIs and returns with the PMU disabled in the global ctrl MSR - but i managed to trigger a situation where this was not enough to deassert the NMI. So clear/reset the full PMU and keep the disable count balanced when exiting from here. This way the box produces a debug warning but stays up and is more debuggable. [ Impact: in case of PMU related bugs, recover more gracefully ] Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ece3813c7a3..2eeaa99add1 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -724,6 +724,30 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) intel_pmu_enable_counter(hwc, idx); } +static void intel_pmu_reset(void) +{ + unsigned long flags; + int idx; + + if (!x86_pmu.num_counters) + return; + + local_irq_save(flags); + + printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); + checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + } + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } + + local_irq_restore(flags); +} + + /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -750,6 +774,8 @@ again: if (++loops > 100) { WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); perf_counter_print_debug(); + intel_pmu_reset(); + perf_enable(); return 1; } -- cgit v1.2.3 From 4319503779060120fa5de9b8fde056603bb6e0fd Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Fri, 6 Mar 2009 20:24:57 +0000 Subject: [CPUFREQ] add atom family to p4-clockmod Some atom procs don't do freq scaling (such as the atom 330 on my own littlefalls2 board). By adding the atom family here, we at least get the benefit of passive cooling in a thermal emergency. Not sure how to see that its actually helping any, but the driver does bind and claim its functioning on my atom 330. Signed-off-by: Jarod Wilson Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 6ac55bd341a..86961519372 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -168,6 +168,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) case 0x0E: /* Core */ case 0x0F: /* Core Duo */ case 0x16: /* Celeron Core */ + case 0x1C: /* Atom */ p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE); case 0x0D: /* Pentium M (Dothan) */ -- cgit v1.2.3 From d38e73e8dad454a5916f446b0d3523c1161ae95a Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 23 Apr 2009 13:36:12 -0400 Subject: [CPUFREQ] powernow-k7 build fix when ACPI=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit arch/x86/kernel/cpu/cpufreq/powernow-k7.c:172: warning: 'invalidate_entry' defined but not used Reported-by: Toralf Förster Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 3c28ccd4974..a8363e5be4e 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -168,10 +168,12 @@ static int check_powernow(void) return 1; } +#ifdef CONFIG_X86_POWERNOW_K7_ACPI static void invalidate_entry(unsigned int entry) { powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; } +#endif static int get_ranges(unsigned char *pst) { -- cgit v1.2.3 From df1829770db415dc5a5ed5ada3bd70176c6f0a01 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Wed, 22 Apr 2009 13:48:32 +0200 Subject: [CPUFREQ] powernow-k8 cleanup msg if BIOS does not export ACPI _PSS cpufreq data - Make the message shorter and easier to grep for - Use printk_once instead of WARN_ONCE (functionality of these was mixed) Signed-off-by: Thomas Renninger Cc: Langsdorf, Mark Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 4709ead2db5..feef10c085a 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1215,13 +1215,16 @@ static int powernowk8_verify(struct cpufreq_policy *pol) return cpufreq_frequency_table_verify(pol, data->powernow_table); } +static const char ACPI_PSS_BIOS_BUG_MSG[] = + KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" + KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; + /* per CPU init entry point to the driver */ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { struct powernow_k8_data *data; cpumask_t oldmask; int rc; - static int print_once; if (!cpu_online(pol->cpu)) return -ENODEV; @@ -1244,19 +1247,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) * an UP version, and is deprecated by AMD. */ if (num_online_cpus() != 1) { - /* - * Replace this one with print_once as soon as such a - * thing gets introduced - */ - if (!print_once) { - WARN_ONCE(1, KERN_ERR FW_BUG PFX "Your BIOS " - "does not provide ACPI _PSS objects " - "in a way that Linux understands. " - "Please report this to the Linux ACPI" - " maintainers and complain to your " - "BIOS vendor.\n"); - print_once++; - } + printk_once(ACPI_PSS_BIOS_BUG_MSG); goto err_out; } if (pol->cpu != 0) { -- cgit v1.2.3 From ca446d06351992e4f1a7c1e5e99870ab4ec5188f Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 22 Apr 2009 13:48:33 +0200 Subject: [CPUFREQ] powernow-k8: determine exact CPU frequency for HW Pstates Slightly modified by trenn@suse.de -> only do this on fam 10h and fam 11h. Currently powernow-k8 determines CPU frequency from ACPI PSS objects, but according to AMD family 11h BKDG this frequency is just a rounded value: "CoreFreq (MHz) = The CPU COF specified by MSRC001_00[6B:64][CpuFid] rounded to the nearest 100 Mhz." As a consequnce powernow-k8 reports wrong CPU frequency on some systems, e.g. on Turion X2 Ultra: powernow-k8: Found 1 AMD Turion(tm)X2 Ultra DualCore Mobile ZM-82 processors (2 cpu cores) (version 2.20.00) powernow-k8: 0 : pstate 0 (2200 MHz) powernow-k8: 1 : pstate 1 (1100 MHz) powernow-k8: 2 : pstate 2 (600 MHz) But this is wrong as frequency for Pstate2 is 550 MHz. x86info reports it correctly: #x86info -a |grep Pstate ... Pstate-0: fid=e, did=0, vid=24 (2200MHz) Pstate-1: fid=e, did=1, vid=30 (1100MHz) Pstate-2: fid=e, did=2, vid=3c (550MHz) (current) Solution is to determine the frequency directly from Pstate MSRs instead of using rounded values from ACPI table. Signed-off-by: Andreas Herrmann Signed-off-by: Thomas Renninger Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index feef10c085a..f6b32d11235 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -649,6 +649,20 @@ static void print_basics(struct powernow_k8_data *data) data->batps); } +static u32 freq_from_fid_did(u32 fid, u32 did) +{ + u32 mhz = 0; + + if (boot_cpu_data.x86 == 0x10) + mhz = (100 * (fid + 0x10)) >> did; + else if (boot_cpu_data.x86 == 0x11) + mhz = (100 * (fid + 8)) >> did; + else + BUG(); + + return mhz * 1000; +} + static int fill_powernow_table(struct powernow_k8_data *data, struct pst_s *pst, u8 maxvid) { @@ -923,8 +937,13 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, powernow_table[i].index = index; - powernow_table[i].frequency = - data->acpi_data.states[i].core_frequency * 1000; + /* Frequency may be rounded for these */ + if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) { + powernow_table[i].frequency = + freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7); + } else + powernow_table[i].frequency = + data->acpi_data.states[i].core_frequency * 1000; } return 0; } -- cgit v1.2.3 From 7d96fd41cadc55f4e00231c8c71b8e25c779f122 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Mon, 25 May 2009 11:02:02 +0200 Subject: x86: move rdtsc_barrier() into the TSC vread method The *fence instructions were moved to vsyscall_64.c by commit cb9e35dce94a1b9c59d46224e8a94377d673e204. But this breaks the vDSO, because vread methods are also called from there. Besides, the synchronization might be unnecessary for other time sources than TSC. [ Impact: fix potential time warp in VDSO ] Signed-off-by: Petr Tesarik LKML-Reference: <9d0ea9ea0f866bdc1f4d76831221ae117f11ea67.1243241859.git.ptesarik@suse.cz> Signed-off-by: Thomas Gleixner Cc: --- arch/x86/kernel/tsc.c | 11 ++++++++++- arch/x86/kernel/vsyscall_64.c | 8 -------- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index d57de05dc43..cf8611d991e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -710,7 +710,16 @@ static cycle_t read_tsc(struct clocksource *cs) #ifdef CONFIG_X86_64 static cycle_t __vsyscall_fn vread_tsc(void) { - cycle_t ret = (cycle_t)vget_cycles(); + cycle_t ret; + + /* + * Surround the RDTSC by barriers, to make sure it's not + * speculated to outside the seqlock critical section and + * does not cause time warps: + */ + rdtsc_barrier(); + ret = (cycle_t)vget_cycles(); + rdtsc_barrier(); return ret >= __vsyscall_gtod_data.clock.cycle_last ? ret : __vsyscall_gtod_data.clock.cycle_last; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 44153afc906..25ee06a80aa 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) return; } - /* - * Surround the RDTSC by barriers, to make sure it's not - * speculated to outside the seqlock critical section and - * does not cause time warps: - */ - rdtsc_barrier(); now = vread(); - rdtsc_barrier(); - base = __vsyscall_gtod_data.clock.cycle_last; mask = __vsyscall_gtod_data.clock.mask; mult = __vsyscall_gtod_data.clock.mult; -- cgit v1.2.3 From fefda117ddb324b872312f1f061230e627c9f5ee Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 20 May 2009 12:21:42 +0200 Subject: amd-iommu: add amd_iommu_dump parameter This kernel parameter will be useful to get some AMD IOMMU related information in dmesg that is not necessary for the default user but may be helpful in debug situations. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8c0be0902da..57fb7a7cb6e 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -115,6 +115,8 @@ struct ivmd_header { u64 range_length; } __attribute__((packed)); +bool amd_iommu_dump; + static int __initdata amd_iommu_detected; u16 amd_iommu_last_bdf; /* largest PCI device id we have @@ -1211,6 +1213,13 @@ void __init amd_iommu_detect(void) * ****************************************************************************/ +static int __init parse_amd_iommu_dump(char *str) +{ + amd_iommu_dump = true; + + return 1; +} + static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { @@ -1235,5 +1244,6 @@ static int __init parse_amd_iommu_size_options(char *str) return 1; } +__setup("amd_iommu_dump", parse_amd_iommu_dump); __setup("amd_iommu=", parse_amd_iommu_options); __setup("amd_iommu_size=", parse_amd_iommu_size_options); -- cgit v1.2.3 From 9c72041f719e2864d4208a89341c36b316dbf893 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 20 May 2009 13:53:57 +0200 Subject: amd-iommu: add dump for iommus described in ivrs table Add information about IOMMU devices described in the IVRS ACPI table to the kernel log if amd_iommu_dump was specified on the kernel command line. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 57fb7a7cb6e..28165902ae2 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -748,6 +748,15 @@ static int __init init_iommu_all(struct acpi_table_header *table) h = (struct ivhd_header *)p; switch (*p) { case ACPI_IVHD_TYPE: + + DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x " + "seg: %d flags: %01x info %04x\n", + PCI_BUS(h->devid), PCI_SLOT(h->devid), + PCI_FUNC(h->devid), h->cap_ptr, + h->pci_seg, h->flags, h->info); + DUMP_printk(" mmio-addr: %016llx\n", + h->mmio_phys); + iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); if (iommu == NULL) return -ENOMEM; -- cgit v1.2.3 From 42a698f40a0946f5517308411b9e003ae031414d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 20 May 2009 15:41:28 +0200 Subject: amd-iommu: print ivhd information to dmesg when requested Add information about devices belonging to an IOMMU as described in the IVRS ACPI table to the kernel log if amd_iommu_dump was specified on the kernel command line. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 73 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 28165902ae2..fe3e6453cbf 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -598,32 +598,83 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, p += sizeof(struct ivhd_header); end += h->length; + while (p < end) { e = (struct ivhd_entry *)p; switch (e->type) { case IVHD_DEV_ALL: + + DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x" + " last device %02x:%02x.%x flags: %02x\n", + PCI_BUS(iommu->first_device), + PCI_SLOT(iommu->first_device), + PCI_FUNC(iommu->first_device), + PCI_BUS(iommu->last_device), + PCI_SLOT(iommu->last_device), + PCI_FUNC(iommu->last_device), + e->flags); + for (dev_i = iommu->first_device; dev_i <= iommu->last_device; ++dev_i) set_dev_entry_from_acpi(iommu, dev_i, e->flags, 0); break; case IVHD_DEV_SELECT: + + DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x " + "flags: %02x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags); + devid = e->devid; set_dev_entry_from_acpi(iommu, devid, e->flags, 0); break; case IVHD_DEV_SELECT_RANGE_START: + + DUMP_printk(" DEV_SELECT_RANGE_START\t " + "devid: %02x:%02x.%x flags: %02x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags); + devid_start = e->devid; flags = e->flags; ext_flags = 0; alias = false; break; case IVHD_DEV_ALIAS: + + DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x " + "flags: %02x devid_to: %02x:%02x.%x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, + PCI_BUS(e->ext >> 8), + PCI_SLOT(e->ext >> 8), + PCI_FUNC(e->ext >> 8)); + devid = e->devid; devid_to = e->ext >> 8; set_dev_entry_from_acpi(iommu, devid, e->flags, 0); amd_iommu_alias_table[devid] = devid_to; break; case IVHD_DEV_ALIAS_RANGE: + + DUMP_printk(" DEV_ALIAS_RANGE\t\t " + "devid: %02x:%02x.%x flags: %02x " + "devid_to: %02x:%02x.%x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, + PCI_BUS(e->ext >> 8), + PCI_SLOT(e->ext >> 8), + PCI_FUNC(e->ext >> 8)); + devid_start = e->devid; flags = e->flags; devid_to = e->ext >> 8; @@ -631,17 +682,39 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, alias = true; break; case IVHD_DEV_EXT_SELECT: + + DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x " + "flags: %02x ext: %08x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, e->ext); + devid = e->devid; set_dev_entry_from_acpi(iommu, devid, e->flags, e->ext); break; case IVHD_DEV_EXT_SELECT_RANGE: + + DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: " + "%02x:%02x.%x flags: %02x ext: %08x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, e->ext); + devid_start = e->devid; flags = e->flags; ext_flags = e->ext; alias = false; break; case IVHD_DEV_RANGE_END: + + DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid)); + devid = e->devid; for (dev_i = devid_start; dev_i <= devid; ++dev_i) { if (alias) -- cgit v1.2.3 From 02acc43a294098c2a4cd22cf24e9c988644f9f7f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 20 May 2009 16:24:21 +0200 Subject: amd-iommu: print ivmd information to dmesg when requested Add information about device memory mapping requirements for the IOMMU as described in the IVRS ACPI table to the kernel log if amd_iommu_dump was specified on the kernel command line. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index fe3e6453cbf..b90a78cfdcb 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -983,6 +983,7 @@ static int __init init_exclusion_range(struct ivmd_header *m) static int __init init_unity_map_range(struct ivmd_header *m) { struct unity_map_entry *e = 0; + char *s; e = kzalloc(sizeof(*e), GFP_KERNEL); if (e == NULL) @@ -991,13 +992,16 @@ static int __init init_unity_map_range(struct ivmd_header *m) switch (m->type) { default: case ACPI_IVMD_TYPE: + s = "IVMD_TYPEi\t\t\t"; e->devid_start = e->devid_end = m->devid; break; case ACPI_IVMD_TYPE_ALL: + s = "IVMD_TYPE_ALL\t\t"; e->devid_start = 0; e->devid_end = amd_iommu_last_bdf; break; case ACPI_IVMD_TYPE_RANGE: + s = "IVMD_TYPE_RANGE\t\t"; e->devid_start = m->devid; e->devid_end = m->aux; break; @@ -1006,6 +1010,13 @@ static int __init init_unity_map_range(struct ivmd_header *m) e->address_end = e->address_start + PAGE_ALIGN(m->range_length); e->prot = m->flags >> 1; + DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x" + " range_start: %016llx range_end: %016llx flags: %x\n", s, + PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start), + PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end), + PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end), + e->address_start, e->address_end, m->flags); + list_add_tail(&e->list, &amd_iommu_unity_map); return 0; -- cgit v1.2.3 From b3b99ef8b4f80f3f093a72110e7697c2281ae45d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 12:02:48 +0200 Subject: amd-iommu: move protection domain printk to dump code This information is only helpful for debugging. Don't print it anymore unless explicitly requested. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a97db99dad5..33565990164 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1009,8 +1009,9 @@ static int device_change_notifier(struct notifier_block *nb, if (!dma_domain) dma_domain = iommu->default_dom; attach_device(iommu, &dma_domain->domain, devid); - printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " - "device %s\n", dma_domain->domain.id, dev_name(dev)); + DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain " + "%d for device %s\n", + dma_domain->domain.id, dev_name(dev)); break; case BUS_NOTIFY_UNBIND_DRIVER: if (!domain) @@ -1133,8 +1134,9 @@ static int get_device_resources(struct device *dev, dma_dom = (*iommu)->default_dom; *domain = &dma_dom->domain; attach_device(*iommu, *domain, *bdf); - printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " - "device %s\n", (*domain)->id, dev_name(dev)); + DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain " + "%d for device %s\n", + (*domain)->id, dev_name(dev)); } if (domain_for_device(_bdf) == NULL) -- cgit v1.2.3 From 2e8b569614b89c9b1b85cba37db36daeeeff744e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 12:44:03 +0200 Subject: amd-iommu: disable device isolation with CONFIG_IOMMU_STRESS With device isolation disabled we can test better for race conditions in dma_ops related code. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index b90a78cfdcb..66941129e9c 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -124,8 +124,14 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ + +#ifdef CONFIG_IOMMU_STRESS +bool amd_iommu_isolate = false; +#else bool amd_iommu_isolate = true; /* if true, device isolation is enabled */ +#endif + bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the -- cgit v1.2.3 From 421f909c803d1c397f6c66b75653f238696c39ee Mon Sep 17 00:00:00 2001 From: Neil Turton Date: Thu, 14 May 2009 14:00:35 +0100 Subject: amd-iommu: fix an off-by-one error in the AMD IOMMU driver. The variable amd_iommu_last_bdf holds the maximum bdf of any device controlled by an IOMMU, so the number of device entries needed is amd_iommu_last_bdf+1. The function tbl_size used amd_iommu_last_bdf instead. This would be a problem if the last device were a large enough power of 2. [ Impact: fix amd_iommu_last_bdf off-by-one error ] Signed-off-by: Neil Turton Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8c0be0902da..35fc9654c7a 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -175,7 +175,7 @@ static inline void update_last_devid(u16 devid) static inline unsigned long tbl_size(int entry_size) { unsigned shift = PAGE_SHIFT + - get_order(amd_iommu_last_bdf * entry_size); + get_order(((int)amd_iommu_last_bdf + 1) * entry_size); return 1UL << shift; } -- cgit v1.2.3 From 7455aab1f95f6464c5af3fbdee28744e73f38564 Mon Sep 17 00:00:00 2001 From: Neil Turton Date: Thu, 14 May 2009 14:08:11 +0100 Subject: amd-iommu: fix the handling of device aliases in the AMD IOMMU driver. The devid parameter to set_dev_entry_from_acpi is the requester ID rather than the device ID since it is used to index the IOMMU device table. The handling of IVHD_DEV_ALIAS used to pass the device ID. This patch fixes it to pass the requester ID. [ Impact: fix setting the wrong req-id in acpi-table parsing ] Signed-off-by: Neil Turton Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 35fc9654c7a..53f93db54c4 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -618,7 +618,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, case IVHD_DEV_ALIAS: devid = e->devid; devid_to = e->ext >> 8; - set_dev_entry_from_acpi(iommu, devid, e->flags, 0); + set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0); amd_iommu_alias_table[devid] = devid_to; break; case IVHD_DEV_ALIAS_RANGE: -- cgit v1.2.3 From 0bc252f430d6a3ac7836d40f00d0ae020593b11b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 12:48:05 +0200 Subject: amd-iommu: make sure only ivmd entries are parsed The bug never triggered. But it should be fixed to protect against broken ACPI tables in the future. [ Impact: protect against broken ivrs acpi table ] Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 53f93db54c4..a3a2b98bb39 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -906,6 +906,8 @@ static int __init init_unity_map_range(struct ivmd_header *m) switch (m->type) { default: + kfree(e); + return 0; case ACPI_IVMD_TYPE: e->devid_start = e->devid_end = m->devid; break; -- cgit v1.2.3 From c1eee67b2d8464781f5868a34168df61e40e85a6 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Thu, 21 May 2009 00:56:58 -0700 Subject: amd iommu: properly detach from protection domain on ->remove Some drivers may use the dma api during ->remove which will cause a protection domain to get reattached to a device. Delay the detach until after the driver is completely unbound. [ joro: added a little merge helper ] [ Impact: fix too early device<->domain removal ] Signed-off-by: Chris Wright Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a97db99dad5..d6898833c36 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -57,6 +57,10 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, static struct dma_ops_domain *find_protection_domain(u16 devid); +#ifndef BUS_NOTIFY_UNBOUND_DRIVER +#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005 +#endif + #ifdef CONFIG_AMD_IOMMU_STATS /* @@ -1012,7 +1016,7 @@ static int device_change_notifier(struct notifier_block *nb, printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " "device %s\n", dma_domain->domain.id, dev_name(dev)); break; - case BUS_NOTIFY_UNBIND_DRIVER: + case BUS_NOTIFY_UNBOUND_DRIVER: if (!domain) goto out; detach_device(domain, devid); -- cgit v1.2.3 From 3bd221724adb9d642270df0e78b0105fb61e4a1c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 May 2009 15:06:20 +0200 Subject: amd-iommu: introduce for_each_iommu* macros This patch introduces the for_each_iommu and for_each_iommu_safe macros to simplify the developers life when having to iterate over all AMD IOMMUs in the system. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 8 ++++---- arch/x86/kernel/amd_iommu_init.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a97db99dad5..d9e9dc141a1 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -213,7 +213,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) { struct amd_iommu *iommu; - list_for_each_entry(iommu, &amd_iommu_list, list) + for_each_iommu(iommu) iommu_poll_events(iommu); return IRQ_HANDLED; @@ -440,7 +440,7 @@ static void iommu_flush_domain(u16 domid) __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, domid, 1, 1); - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { spin_lock_irqsave(&iommu->lock, flags); __iommu_queue_command(iommu, &cmd); __iommu_completion_wait(iommu); @@ -1672,7 +1672,7 @@ int __init amd_iommu_init_dma_ops(void) * found in the system. Devices not assigned to any other * protection domain will be assigned to the default one. */ - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { iommu->default_dom = dma_ops_domain_alloc(iommu, order); if (iommu->default_dom == NULL) return -ENOMEM; @@ -1710,7 +1710,7 @@ int __init amd_iommu_init_dma_ops(void) free_domains: - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { if (iommu->default_dom) dma_ops_domain_free(iommu->default_dom); } diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8c0be0902da..675a4b642f7 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -679,7 +679,7 @@ static void __init free_iommu_all(void) { struct amd_iommu *iommu, *next; - list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) { + for_each_iommu_safe(iommu, next) { list_del(&iommu->list); free_iommu_one(iommu); kfree(iommu); @@ -779,7 +779,7 @@ static int __init iommu_setup_msix(struct amd_iommu *iommu) struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */ int nvec = 0, i; - list_for_each_entry(curr, &amd_iommu_list, list) { + for_each_iommu(curr) { if (curr->dev == iommu->dev) { entries[nvec].entry = curr->evt_msi_num; entries[nvec].vector = 0; @@ -818,7 +818,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) int r; struct amd_iommu *curr; - list_for_each_entry(curr, &amd_iommu_list, list) { + for_each_iommu(curr) { if (curr->dev == iommu->dev) curr->int_enabled = true; } @@ -971,7 +971,7 @@ static void __init enable_iommus(void) { struct amd_iommu *iommu; - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { iommu_set_exclusion_range(iommu); iommu_init_msi(iommu); iommu_enable_event_logging(iommu); -- cgit v1.2.3 From 58492e128892e3b55f1a6ef0cf3c3ab4ce7cc214 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 May 2009 18:41:16 +0200 Subject: amd-iommu: consolidate hardware initialization to one function This patch restructures the AMD IOMMU initialization code to initialize all hardware registers with one single function call. This is helpful for suspend/resume support. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 50 +++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 675a4b642f7..74f4f1fea93 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -252,13 +252,6 @@ static void __init iommu_enable(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } -/* Function to enable IOMMU event logging and event interrupts */ -static void __init iommu_enable_event_logging(struct amd_iommu *iommu) -{ - iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); - iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); -} - /* * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in * the system has one. @@ -413,25 +406,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) { u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(CMD_BUFFER_SIZE)); - u64 entry; if (cmd_buf == NULL) return NULL; iommu->cmd_buf_size = CMD_BUFFER_SIZE; - entry = (u64)virt_to_phys(cmd_buf); + return cmd_buf; +} + +/* + * This function writes the command buffer address to the hardware and + * enables it. + */ +static void iommu_enable_command_buffer(struct amd_iommu *iommu) +{ + u64 entry; + + BUG_ON(iommu->cmd_buf == NULL); + + entry = (u64)virt_to_phys(iommu->cmd_buf); entry |= MMIO_CMD_SIZE_512; + memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, - &entry, sizeof(entry)); + &entry, sizeof(entry)); /* set head and tail to zero manually */ writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); - - return cmd_buf; } static void __init free_command_buffer(struct amd_iommu *iommu) @@ -443,20 +447,27 @@ static void __init free_command_buffer(struct amd_iommu *iommu) /* allocates the memory where the IOMMU will log its events to */ static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) { - u64 entry; iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(EVT_BUFFER_SIZE)); if (iommu->evt_buf == NULL) return NULL; + return iommu->evt_buf; +} + +static void iommu_enable_event_buffer(struct amd_iommu *iommu) +{ + u64 entry; + + BUG_ON(iommu->evt_buf == NULL); + entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; + memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, &entry, sizeof(entry)); - iommu->evt_buf_size = EVT_BUFFER_SIZE; - - return iommu->evt_buf; + iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); } static void __init free_event_buffer(struct amd_iommu *iommu) @@ -710,7 +721,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) if (!iommu->mmio_base) return -ENOMEM; - iommu_set_device_table(iommu); iommu->cmd_buf = alloc_command_buffer(iommu); if (!iommu->cmd_buf) return -ENOMEM; @@ -837,6 +847,8 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) return 1; } + iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); + return 0; } @@ -972,9 +984,11 @@ static void __init enable_iommus(void) struct amd_iommu *iommu; for_each_iommu(iommu) { + iommu_set_device_table(iommu); + iommu_enable_command_buffer(iommu); + iommu_enable_event_buffer(iommu); iommu_set_exclusion_range(iommu); iommu_init_msi(iommu); - iommu_enable_event_logging(iommu); iommu_enable(iommu); } } -- cgit v1.2.3 From fab6afa30954a0684ef8ac1d9a606e74a6215ab6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 May 2009 18:46:34 +0200 Subject: amd-iommu: drop pointless iommu-loop in msi setup code It is not necessary to loop again over all IOMMUs in this code. So drop the loop. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 74f4f1fea93..cc99f609223 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -826,13 +826,6 @@ out_free: static int __init iommu_setup_msi(struct amd_iommu *iommu) { int r; - struct amd_iommu *curr; - - for_each_iommu(curr) { - if (curr->dev == iommu->dev) - curr->int_enabled = true; - } - if (pci_enable_msi(iommu->dev)) return 1; @@ -847,6 +840,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) return 1; } + iommu->int_enabled = true; iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); return 0; -- cgit v1.2.3 From d91cecdd796c27df46339e80ed436a980c56fcad Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 May 2009 18:51:00 +0200 Subject: amd-iommu: remove support for msi-x Current hardware uses msi instead of msi-x so this code it not necessary and can not be tested. The best thing is to drop this code. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 44 +--------------------------------------- 1 file changed, 1 insertion(+), 43 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index cc99f609223..feee475e626 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -783,46 +783,6 @@ static int __init init_iommu_all(struct acpi_table_header *table) * ****************************************************************************/ -static int __init iommu_setup_msix(struct amd_iommu *iommu) -{ - struct amd_iommu *curr; - struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */ - int nvec = 0, i; - - for_each_iommu(curr) { - if (curr->dev == iommu->dev) { - entries[nvec].entry = curr->evt_msi_num; - entries[nvec].vector = 0; - curr->int_enabled = true; - nvec++; - } - } - - if (pci_enable_msix(iommu->dev, entries, nvec)) { - pci_disable_msix(iommu->dev); - return 1; - } - - for (i = 0; i < nvec; ++i) { - int r = request_irq(entries->vector, amd_iommu_int_handler, - IRQF_SAMPLE_RANDOM, - "AMD IOMMU", - NULL); - if (r) - goto out_free; - } - - return 0; - -out_free: - for (i -= 1; i >= 0; --i) - free_irq(entries->vector, NULL); - - pci_disable_msix(iommu->dev); - - return 1; -} - static int __init iommu_setup_msi(struct amd_iommu *iommu) { int r; @@ -851,9 +811,7 @@ static int __init iommu_init_msi(struct amd_iommu *iommu) if (iommu->int_enabled) return 0; - if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX)) - return iommu_setup_msix(iommu); - else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) + if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) return iommu_setup_msi(iommu); return 1; -- cgit v1.2.3 From 92ac4320af6ed4294c2c221dd4ccbfd9026a3aa7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 May 2009 19:06:27 +0200 Subject: amd-iommu: add function to disable all iommus This function is required for suspend/resume support with AMD IOMMU enabled. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index feee475e626..ed10c0f5ff7 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -252,6 +252,11 @@ static void __init iommu_enable(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } +static void iommu_disable(struct amd_iommu *iommu) +{ + iommu_feature_disable(iommu, CONTROL_IOMMU_EN); +} + /* * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in * the system has one. @@ -945,6 +950,14 @@ static void __init enable_iommus(void) } } +static void disable_iommus(void) +{ + struct amd_iommu *iommu; + + for_each_iommu(iommu) + iommu_disable(iommu); +} + /* * Suspend/Resume support * disable suspend until real resume implemented -- cgit v1.2.3 From bfd1be1857e5a3385bf146e02e6dc3dd4241bec1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 5 May 2009 15:33:57 +0200 Subject: amd-iommu: add function to flush tlb for all domains This function is required for suspend/resume support with AMD IOMMU enabled. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index d9e9dc141a1..826ad079efc 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -449,6 +449,17 @@ static void iommu_flush_domain(u16 domid) } } +void amd_iommu_flush_all_domains(void) +{ + int i; + + for (i = 1; i < MAX_DOMAIN_ID; ++i) { + if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) + continue; + iommu_flush_domain(i); + } +} + /**************************************************************************** * * The functions below are used the create the page table mappings for -- cgit v1.2.3 From 7d7a110c6127b7fc683dc6d764555f2dbd22b054 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 5 May 2009 15:48:10 +0200 Subject: amd-iommu: add function to flush tlb for all devices This function is required for suspend/resume support with AMD IOMMU enabled. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 826ad079efc..92b0e1881e0 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -460,6 +460,24 @@ void amd_iommu_flush_all_domains(void) } } +void amd_iommu_flush_all_devices(void) +{ + struct amd_iommu *iommu; + int i; + + for (i = 0; i <= amd_iommu_last_bdf; ++i) { + if (amd_iommu_pd_table[i] == NULL) + continue; + + iommu = amd_iommu_rlookup_table[i]; + if (!iommu) + continue; + + iommu_queue_inv_dev_entry(iommu, i); + iommu_completion_wait(iommu); + } +} + /**************************************************************************** * * The functions below are used the create the page table mappings for -- cgit v1.2.3 From 05f92db9f47f852ff48bbed1b063b8ab8ad00285 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 12 May 2009 09:52:46 +0200 Subject: amd_iommu: un __init functions required for suspend/resume This patch makes sure that no function required for suspend/resume of AMD IOMMU driver is thrown away after boot. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index ed10c0f5ff7..330896ba6a9 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -193,7 +193,7 @@ static inline unsigned long tbl_size(int entry_size) * This function set the exclusion range in the IOMMU. DMA accesses to the * exclusion range are passed through untranslated */ -static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) +static void iommu_set_exclusion_range(struct amd_iommu *iommu) { u64 start = iommu->exclusion_start & PAGE_MASK; u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; @@ -225,7 +225,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu) } /* Generic functions to enable/disable certain features of the IOMMU. */ -static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) +static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) { u32 ctrl; @@ -244,7 +244,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) } /* Function to enable the hardware */ -static void __init iommu_enable(struct amd_iommu *iommu) +static void iommu_enable(struct amd_iommu *iommu) { printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", dev_name(&iommu->dev->dev), iommu->cap_ptr); @@ -811,7 +811,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) return 0; } -static int __init iommu_init_msi(struct amd_iommu *iommu) +static int iommu_init_msi(struct amd_iommu *iommu) { if (iommu->int_enabled) return 0; @@ -936,7 +936,7 @@ static void init_device_table(void) * This function finally enables all IOMMUs found in the system after * they have been initialized */ -static void __init enable_iommus(void) +static void enable_iommus(void) { struct amd_iommu *iommu; -- cgit v1.2.3 From 736501ee000757082a4f0832826ae1eda7ea106e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 12 May 2009 09:56:12 +0200 Subject: amd-iommu: implement suspend/resume This patch puts everything together and enables suspend/resume support in the AMD IOMMU driver. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 330896ba6a9..4ca8fbfb68d 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -965,12 +965,31 @@ static void disable_iommus(void) static int amd_iommu_resume(struct sys_device *dev) { + /* + * Disable IOMMUs before reprogramming the hardware registers. + * IOMMU is still enabled from the resume kernel. + */ + disable_iommus(); + + /* re-load the hardware */ + enable_iommus(); + + /* + * we have to flush after the IOMMUs are enabled because a + * disabled IOMMU will never execute the commands we send + */ + amd_iommu_flush_all_domains(); + amd_iommu_flush_all_devices(); + return 0; } static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) { - return -EINVAL; + /* disable IOMMUs to go out of the way for BIOS */ + disable_iommus(); + + return 0; } static struct sysdev_class amd_iommu_sysdev_class = { -- cgit v1.2.3 From c3239567a20e90e3026ac5453d5267506ef7b030 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 12 May 2009 10:56:44 +0200 Subject: amd-iommu: introduce aperture_range structure This is a preperation for extended address allocator. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 46 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a97db99dad5..62acd09cd19 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -595,7 +595,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, * as allocated in the aperture */ if (addr < dma_dom->aperture_size) - __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap); + __set_bit(addr >> PAGE_SHIFT, + dma_dom->aperture.bitmap); } return 0; @@ -656,11 +657,12 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, dom->need_flush = true; } - address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, - 0 , boundary_size, align_mask); + address = iommu_area_alloc(dom->aperture.bitmap, limit, dom->next_bit, + pages, 0 , boundary_size, align_mask); if (address == -1) { - address = iommu_area_alloc(dom->bitmap, limit, 0, pages, - 0, boundary_size, align_mask); + address = iommu_area_alloc(dom->aperture.bitmap, limit, 0, + pages, 0, boundary_size, + align_mask); dom->need_flush = true; } @@ -685,7 +687,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, unsigned int pages) { address >>= PAGE_SHIFT; - iommu_area_free(dom->bitmap, address, pages); + iommu_area_free(dom->aperture.bitmap, address, pages); if (address >= dom->next_bit) dom->need_flush = true; @@ -741,7 +743,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, if (start_page + pages > last_page) pages = last_page - start_page; - iommu_area_reserve(dom->bitmap, start_page, pages); + iommu_area_reserve(dom->aperture.bitmap, start_page, pages); } static void free_pagetable(struct protection_domain *domain) @@ -785,9 +787,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) free_pagetable(&dom->domain); - kfree(dom->pte_pages); - - kfree(dom->bitmap); + free_page((unsigned long)dom->aperture.bitmap); kfree(dom); } @@ -826,16 +826,15 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->domain.priv = dma_dom; if (!dma_dom->domain.pt_root) goto free_dma_dom; - dma_dom->aperture_size = (1ULL << order); - dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8), - GFP_KERNEL); - if (!dma_dom->bitmap) + dma_dom->aperture_size = APERTURE_RANGE_SIZE; + dma_dom->aperture.bitmap = (void *)get_zeroed_page(GFP_KERNEL); + if (!dma_dom->aperture.bitmap) goto free_dma_dom; /* * mark the first page as allocated so we never return 0 as * a valid dma-address. So we can use 0 as error value */ - dma_dom->bitmap[0] = 1; + dma_dom->aperture.bitmap[0] = 1; dma_dom->next_bit = 0; dma_dom->need_flush = false; @@ -854,13 +853,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, /* * At the last step, build the page tables so we don't need to * allocate page table pages in the dma_ops mapping/unmapping - * path. + * path for the first 128MB of dma address space. */ num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); - dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), - GFP_KERNEL); - if (!dma_dom->pte_pages) - goto free_dma_dom; l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL); if (l2_pde == NULL) @@ -869,10 +864,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde)); for (i = 0; i < num_pte_pages; ++i) { - dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!dma_dom->pte_pages[i]) + u64 **pte_page = &dma_dom->aperture.pte_pages[i]; + *pte_page = (u64 *)get_zeroed_page(GFP_KERNEL); + if (!*pte_page) goto free_dma_dom; - address = virt_to_phys(dma_dom->pte_pages[i]); + address = virt_to_phys(*pte_page); l2_pde[i] = IOMMU_L1_PDE(address); } @@ -1159,7 +1155,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, paddr &= PAGE_MASK; - pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; + pte = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)]; pte += IOMMU_PTE_L0_INDEX(address); __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; @@ -1192,7 +1188,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size); - pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; + pte = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)]; pte += IOMMU_PTE_L0_INDEX(address); WARN_ON(!*pte); -- cgit v1.2.3 From 8bda3092bcfa68f786d94549ae026e8db1eff041 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 12 May 2009 12:02:46 +0200 Subject: amd-iommu: move page table allocation code to seperate function This patch makes page table allocation usable for dma_ops code. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 86 ++++++++++++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 62acd09cd19..ded79f7747c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -55,7 +55,9 @@ struct iommu_cmd { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); static struct dma_ops_domain *find_protection_domain(u16 devid); - +static u64* alloc_pte(struct protection_domain *dom, + unsigned long address, u64 + **pte_page, gfp_t gfp); #ifdef CONFIG_AMD_IOMMU_STATS @@ -468,7 +470,7 @@ static int iommu_map_page(struct protection_domain *dom, unsigned long phys_addr, int prot) { - u64 __pte, *pte, *page; + u64 __pte, *pte; bus_addr = PAGE_ALIGN(bus_addr); phys_addr = PAGE_ALIGN(phys_addr); @@ -477,27 +479,7 @@ static int iommu_map_page(struct protection_domain *dom, if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) return -EINVAL; - pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; - - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - *pte = IOMMU_L2_PDE(virt_to_phys(page)); - } - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; - - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - *pte = IOMMU_L1_PDE(virt_to_phys(page)); - } - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)]; + pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL); if (IOMMU_PTE_PRESENT(*pte)) return -EBUSY; @@ -1139,6 +1121,61 @@ static int get_device_resources(struct device *dev, return 1; } +/* + * If the pte_page is not yet allocated this function is called + */ +static u64* alloc_pte(struct protection_domain *dom, + unsigned long address, u64 **pte_page, gfp_t gfp) +{ + u64 *pte, *page; + + pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = IOMMU_L2_PDE(virt_to_phys(page)); + } + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = IOMMU_L1_PDE(virt_to_phys(page)); + } + + pte = IOMMU_PTE_PAGE(*pte); + + if (pte_page) + *pte_page = pte; + + pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + + return pte; +} + +/* + * This function fetches the PTE for a given address in the aperture + */ +static u64* dma_ops_get_pte(struct dma_ops_domain *dom, + unsigned long address) +{ + struct aperture_range *aperture = &dom->aperture; + u64 *pte, *pte_page; + + pte = aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)]; + if (!pte) { + pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC); + aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)] = pte_page; + } + + return pte; +} + /* * This is the generic map function. It maps one 4kb page at paddr to * the given address in the DMA address space for the domain. @@ -1155,8 +1192,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, paddr &= PAGE_MASK; - pte = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)]; - pte += IOMMU_PTE_L0_INDEX(address); + pte = dma_ops_get_pte(dom, address); __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; -- cgit v1.2.3 From 53812c115cda1f660b286c939669154a56976f6b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 12 May 2009 12:17:38 +0200 Subject: amd-iommu: handle page table allocation failures in dma_ops code The code will be required when the aperture size increases dynamically in the extended address allocator. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index ded79f7747c..a467addb44b 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1193,6 +1193,8 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, paddr &= PAGE_MASK; pte = dma_ops_get_pte(dom, address); + if (!pte) + return bad_dma_address; __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; @@ -1248,7 +1250,7 @@ static dma_addr_t __map_single(struct device *dev, u64 dma_mask) { dma_addr_t offset = paddr & ~PAGE_MASK; - dma_addr_t address, start; + dma_addr_t address, start, ret; unsigned int pages; unsigned long align_mask = 0; int i; @@ -1271,7 +1273,10 @@ static dma_addr_t __map_single(struct device *dev, start = address; for (i = 0; i < pages; ++i) { - dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); + ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); + if (ret == bad_dma_address) + goto out_unmap; + paddr += PAGE_SIZE; start += PAGE_SIZE; } @@ -1287,6 +1292,17 @@ static dma_addr_t __map_single(struct device *dev, out: return address; + +out_unmap: + + for (--i; i >= 0; --i) { + start -= PAGE_SIZE; + dma_ops_domain_unmap(iommu, dma_dom, start); + } + + dma_ops_free_addresses(dma_dom, address, pages); + + return bad_dma_address; } /* -- cgit v1.2.3 From 384de72910a7bf96a02a6d8023fe9e16d872beb2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 15 May 2009 12:30:05 +0200 Subject: amd-iommu: make address allocator aware of multiple aperture ranges This patch changes the AMD IOMMU address allocator to allow up to 32 aperture ranges per dma_ops domain. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 138 ++++++++++++++++++++++++++++++++------------ 1 file changed, 101 insertions(+), 37 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a467addb44b..794163ae97b 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -578,7 +578,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, */ if (addr < dma_dom->aperture_size) __set_bit(addr >> PAGE_SHIFT, - dma_dom->aperture.bitmap); + dma_dom->aperture[0]->bitmap); } return 0; @@ -615,43 +615,74 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, ****************************************************************************/ /* - * The address allocator core function. + * The address allocator core functions. * * called with domain->lock held */ + +static unsigned long dma_ops_area_alloc(struct device *dev, + struct dma_ops_domain *dom, + unsigned int pages, + unsigned long align_mask, + u64 dma_mask, + unsigned long start) +{ + unsigned long next_bit = dom->next_bit % APERTURE_RANGE_PAGES; + int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT; + int i = start >> APERTURE_RANGE_SHIFT; + unsigned long boundary_size; + unsigned long address = -1; + unsigned long limit; + + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + PAGE_SIZE) >> PAGE_SHIFT; + + for (;i < max_index; ++i) { + unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT; + + if (dom->aperture[i]->offset >= dma_mask) + break; + + limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset, + dma_mask >> PAGE_SHIFT); + + address = iommu_area_alloc(dom->aperture[i]->bitmap, + limit, next_bit, pages, 0, + boundary_size, align_mask); + if (address != -1) { + address = dom->aperture[i]->offset + + (address << PAGE_SHIFT); + dom->next_bit = (address >> PAGE_SHIFT) + pages; + break; + } + + next_bit = 0; + } + + return address; +} + static unsigned long dma_ops_alloc_addresses(struct device *dev, struct dma_ops_domain *dom, unsigned int pages, unsigned long align_mask, u64 dma_mask) { - unsigned long limit; unsigned long address; - unsigned long boundary_size; + unsigned long start = dom->next_bit << PAGE_SHIFT; - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - PAGE_SIZE) >> PAGE_SHIFT; - limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0, - dma_mask >> PAGE_SHIFT); - if (dom->next_bit >= limit) { - dom->next_bit = 0; - dom->need_flush = true; - } + address = dma_ops_area_alloc(dev, dom, pages, align_mask, + dma_mask, start); - address = iommu_area_alloc(dom->aperture.bitmap, limit, dom->next_bit, - pages, 0 , boundary_size, align_mask); if (address == -1) { - address = iommu_area_alloc(dom->aperture.bitmap, limit, 0, - pages, 0, boundary_size, - align_mask); + dom->next_bit = 0; + address = dma_ops_area_alloc(dev, dom, pages, align_mask, + dma_mask, 0); dom->need_flush = true; } - if (likely(address != -1)) { - dom->next_bit = address + pages; - address <<= PAGE_SHIFT; - } else + if (unlikely(address == -1)) address = bad_dma_address; WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); @@ -668,11 +699,17 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, unsigned long address, unsigned int pages) { - address >>= PAGE_SHIFT; - iommu_area_free(dom->aperture.bitmap, address, pages); + unsigned i = address >> APERTURE_RANGE_SHIFT; + struct aperture_range *range = dom->aperture[i]; + + BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL); - if (address >= dom->next_bit) + if ((address >> PAGE_SHIFT) >= dom->next_bit) dom->need_flush = true; + + address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; + iommu_area_free(range->bitmap, address, pages); + } /**************************************************************************** @@ -720,12 +757,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, unsigned long start_page, unsigned int pages) { - unsigned int last_page = dom->aperture_size >> PAGE_SHIFT; + unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; if (start_page + pages > last_page) pages = last_page - start_page; - iommu_area_reserve(dom->aperture.bitmap, start_page, pages); + for (i = start_page; i < start_page + pages; ++i) { + int index = i / APERTURE_RANGE_PAGES; + int page = i % APERTURE_RANGE_PAGES; + __set_bit(page, dom->aperture[index]->bitmap); + } } static void free_pagetable(struct protection_domain *domain) @@ -764,12 +805,19 @@ static void free_pagetable(struct protection_domain *domain) */ static void dma_ops_domain_free(struct dma_ops_domain *dom) { + int i; + if (!dom) return; free_pagetable(&dom->domain); - free_page((unsigned long)dom->aperture.bitmap); + for (i = 0; i < APERTURE_MAX_RANGES; ++i) { + if (!dom->aperture[i]) + continue; + free_page((unsigned long)dom->aperture[i]->bitmap); + kfree(dom->aperture[i]); + } kfree(dom); } @@ -797,6 +845,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, if (!dma_dom) return NULL; + dma_dom->aperture[0] = kzalloc(sizeof(struct aperture_range), + GFP_KERNEL); + if (!dma_dom->aperture[0]) + goto free_dma_dom; + spin_lock_init(&dma_dom->domain.lock); dma_dom->domain.id = domain_id_alloc(); @@ -809,14 +862,14 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, if (!dma_dom->domain.pt_root) goto free_dma_dom; dma_dom->aperture_size = APERTURE_RANGE_SIZE; - dma_dom->aperture.bitmap = (void *)get_zeroed_page(GFP_KERNEL); - if (!dma_dom->aperture.bitmap) + dma_dom->aperture[0]->bitmap = (void *)get_zeroed_page(GFP_KERNEL); + if (!dma_dom->aperture[0]->bitmap) goto free_dma_dom; /* * mark the first page as allocated so we never return 0 as * a valid dma-address. So we can use 0 as error value */ - dma_dom->aperture.bitmap[0] = 1; + dma_dom->aperture[0]->bitmap[0] = 1; dma_dom->next_bit = 0; dma_dom->need_flush = false; @@ -846,7 +899,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde)); for (i = 0; i < num_pte_pages; ++i) { - u64 **pte_page = &dma_dom->aperture.pte_pages[i]; + u64 **pte_page = &dma_dom->aperture[0]->pte_pages[i]; *pte_page = (u64 *)get_zeroed_page(GFP_KERNEL); if (!*pte_page) goto free_dma_dom; @@ -1164,14 +1217,19 @@ static u64* alloc_pte(struct protection_domain *dom, static u64* dma_ops_get_pte(struct dma_ops_domain *dom, unsigned long address) { - struct aperture_range *aperture = &dom->aperture; + struct aperture_range *aperture; u64 *pte, *pte_page; - pte = aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)]; + aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; + if (!aperture) + return NULL; + + pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; if (!pte) { pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC); - aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)] = pte_page; - } + aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; + } else + pte += IOMMU_PTE_L0_INDEX(address); return pte; } @@ -1219,14 +1277,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, struct dma_ops_domain *dom, unsigned long address) { + struct aperture_range *aperture; u64 *pte; if (address >= dom->aperture_size) return; - WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size); + aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; + if (!aperture) + return; + + pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; + if (!pte) + return; - pte = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)]; pte += IOMMU_PTE_L0_INDEX(address); WARN_ON(!*pte); -- cgit v1.2.3 From 803b8cb4d9a93b90c67aba2aab7f2c54d595b5b9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 18 May 2009 15:32:48 +0200 Subject: amd-iommu: change dma_dom->next_bit to dma_dom->next_address Simplify the code a little bit by using the same unit for all address space related state in the dma_ops domain structure. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 794163ae97b..c1a08b9119c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -627,13 +627,15 @@ static unsigned long dma_ops_area_alloc(struct device *dev, u64 dma_mask, unsigned long start) { - unsigned long next_bit = dom->next_bit % APERTURE_RANGE_PAGES; + unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE; int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT; int i = start >> APERTURE_RANGE_SHIFT; unsigned long boundary_size; unsigned long address = -1; unsigned long limit; + next_bit >>= PAGE_SHIFT; + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, PAGE_SIZE) >> PAGE_SHIFT; @@ -652,7 +654,7 @@ static unsigned long dma_ops_area_alloc(struct device *dev, if (address != -1) { address = dom->aperture[i]->offset + (address << PAGE_SHIFT); - dom->next_bit = (address >> PAGE_SHIFT) + pages; + dom->next_address = address + (pages << PAGE_SHIFT); break; } @@ -669,14 +671,12 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, u64 dma_mask) { unsigned long address; - unsigned long start = dom->next_bit << PAGE_SHIFT; - address = dma_ops_area_alloc(dev, dom, pages, align_mask, - dma_mask, start); + dma_mask, dom->next_address); if (address == -1) { - dom->next_bit = 0; + dom->next_address = 0; address = dma_ops_area_alloc(dev, dom, pages, align_mask, dma_mask, 0); dom->need_flush = true; @@ -704,10 +704,11 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL); - if ((address >> PAGE_SHIFT) >= dom->next_bit) + if (address >= dom->next_address) dom->need_flush = true; address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; + iommu_area_free(range->bitmap, address, pages); } @@ -870,7 +871,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, * a valid dma-address. So we can use 0 as error value */ dma_dom->aperture[0]->bitmap[0] = 1; - dma_dom->next_bit = 0; + dma_dom->next_address = 0; dma_dom->need_flush = false; dma_dom->target_dev = 0xffff; -- cgit v1.2.3 From 9cabe89b99773e682538a8809abc7d4000c77083 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 18 May 2009 16:38:55 +0200 Subject: amd-iommu: move aperture_range allocation code to seperate function This patch prepares the dynamic increasement of dma_ops domain apertures. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 95 ++++++++++++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 36 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c1a08b9119c..8ff02ee69e8 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -620,6 +620,59 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, * called with domain->lock held */ +/* + * This function is used to add a new aperture range to an existing + * aperture in case of dma_ops domain allocation or address allocation + * failure. + */ +static int alloc_new_range(struct dma_ops_domain *dma_dom, + bool populate, gfp_t gfp) +{ + int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; + + if (index >= APERTURE_MAX_RANGES) + return -ENOMEM; + + dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp); + if (!dma_dom->aperture[index]) + return -ENOMEM; + + dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp); + if (!dma_dom->aperture[index]->bitmap) + goto out_free; + + dma_dom->aperture[index]->offset = dma_dom->aperture_size; + + if (populate) { + unsigned long address = dma_dom->aperture_size; + int i, num_ptes = APERTURE_RANGE_PAGES / 512; + u64 *pte, *pte_page; + + for (i = 0; i < num_ptes; ++i) { + pte = alloc_pte(&dma_dom->domain, address, + &pte_page, gfp); + if (!pte) + goto out_free; + + dma_dom->aperture[index]->pte_pages[i] = pte_page; + + address += APERTURE_RANGE_SIZE / 64; + } + } + + dma_dom->aperture_size += APERTURE_RANGE_SIZE; + + return 0; + +out_free: + free_page((unsigned long)dma_dom->aperture[index]->bitmap); + + kfree(dma_dom->aperture[index]); + dma_dom->aperture[index] = NULL; + + return -ENOMEM; +} + static unsigned long dma_ops_area_alloc(struct device *dev, struct dma_ops_domain *dom, unsigned int pages, @@ -832,9 +885,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, unsigned order) { struct dma_ops_domain *dma_dom; - unsigned i, num_pte_pages; - u64 *l2_pde; - u64 address; /* * Currently the DMA aperture must be between 32 MB and 1GB in size @@ -846,11 +896,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, if (!dma_dom) return NULL; - dma_dom->aperture[0] = kzalloc(sizeof(struct aperture_range), - GFP_KERNEL); - if (!dma_dom->aperture[0]) - goto free_dma_dom; - spin_lock_init(&dma_dom->domain.lock); dma_dom->domain.id = domain_id_alloc(); @@ -862,10 +907,13 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->domain.priv = dma_dom; if (!dma_dom->domain.pt_root) goto free_dma_dom; - dma_dom->aperture_size = APERTURE_RANGE_SIZE; - dma_dom->aperture[0]->bitmap = (void *)get_zeroed_page(GFP_KERNEL); - if (!dma_dom->aperture[0]->bitmap) + + dma_dom->need_flush = false; + dma_dom->target_dev = 0xffff; + + if (alloc_new_range(dma_dom, true, GFP_KERNEL)) goto free_dma_dom; + /* * mark the first page as allocated so we never return 0 as * a valid dma-address. So we can use 0 as error value @@ -873,9 +921,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->aperture[0]->bitmap[0] = 1; dma_dom->next_address = 0; - dma_dom->need_flush = false; - dma_dom->target_dev = 0xffff; - /* Intialize the exclusion range if necessary */ if (iommu->exclusion_start && iommu->exclusion_start < dma_dom->aperture_size) { @@ -886,28 +931,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_ops_reserve_addresses(dma_dom, startpage, pages); } - /* - * At the last step, build the page tables so we don't need to - * allocate page table pages in the dma_ops mapping/unmapping - * path for the first 128MB of dma address space. - */ - num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); - - l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL); - if (l2_pde == NULL) - goto free_dma_dom; - - dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde)); - - for (i = 0; i < num_pte_pages; ++i) { - u64 **pte_page = &dma_dom->aperture[0]->pte_pages[i]; - *pte_page = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!*pte_page) - goto free_dma_dom; - address = virt_to_phys(*pte_page); - l2_pde[i] = IOMMU_L1_PDE(address); - } - return dma_dom; free_dma_dom: -- cgit v1.2.3 From 00cd122ae5e5e7c60cce2af3c35b190d4c3f2d0d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 May 2009 09:52:40 +0200 Subject: amd-iommu: handle exlusion ranges and unity mappings in alloc_new_range This patch makes sure no reserved addresses are allocated in an dma_ops domain when the aperture is increased dynamically. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 71 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 8ff02ee69e8..59ee1b94a7c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -58,6 +58,9 @@ static struct dma_ops_domain *find_protection_domain(u16 devid); static u64* alloc_pte(struct protection_domain *dom, unsigned long address, u64 **pte_page, gfp_t gfp); +static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, + unsigned long start_page, + unsigned int pages); #ifdef CONFIG_AMD_IOMMU_STATS @@ -620,15 +623,43 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, * called with domain->lock held */ +/* + * This function checks if there is a PTE for a given dma address. If + * there is one, it returns the pointer to it. + */ +static u64* fetch_pte(struct protection_domain *domain, + unsigned long address) +{ + u64 *pte; + + pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + + return pte; +} + /* * This function is used to add a new aperture range to an existing * aperture in case of dma_ops domain allocation or address allocation * failure. */ -static int alloc_new_range(struct dma_ops_domain *dma_dom, +static int alloc_new_range(struct amd_iommu *iommu, + struct dma_ops_domain *dma_dom, bool populate, gfp_t gfp) { int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; + int i; if (index >= APERTURE_MAX_RANGES) return -ENOMEM; @@ -662,6 +693,33 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom, dma_dom->aperture_size += APERTURE_RANGE_SIZE; + /* Intialize the exclusion range if necessary */ + if (iommu->exclusion_start && + iommu->exclusion_start >= dma_dom->aperture[index]->offset && + iommu->exclusion_start < dma_dom->aperture_size) { + unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; + int pages = iommu_num_pages(iommu->exclusion_start, + iommu->exclusion_length, + PAGE_SIZE); + dma_ops_reserve_addresses(dma_dom, startpage, pages); + } + + /* + * Check for areas already mapped as present in the new aperture + * range and mark those pages as reserved in the allocator. Such + * mappings may already exist as a result of requested unity + * mappings for devices. + */ + for (i = dma_dom->aperture[index]->offset; + i < dma_dom->aperture_size; + i += PAGE_SIZE) { + u64 *pte = fetch_pte(&dma_dom->domain, i); + if (!pte || !IOMMU_PTE_PRESENT(*pte)) + continue; + + dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); + } + return 0; out_free: @@ -911,7 +969,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->need_flush = false; dma_dom->target_dev = 0xffff; - if (alloc_new_range(dma_dom, true, GFP_KERNEL)) + if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) goto free_dma_dom; /* @@ -921,15 +979,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->aperture[0]->bitmap[0] = 1; dma_dom->next_address = 0; - /* Intialize the exclusion range if necessary */ - if (iommu->exclusion_start && - iommu->exclusion_start < dma_dom->aperture_size) { - unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = iommu_num_pages(iommu->exclusion_start, - iommu->exclusion_length, - PAGE_SIZE); - dma_ops_reserve_addresses(dma_dom, startpage, pages); - } return dma_dom; -- cgit v1.2.3 From 11b83888ae729457b5cfb936dbd498481f6408df Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 May 2009 10:23:15 +0200 Subject: amd-iommu: enlarge the aperture dynamically By dynamically increasing the aperture the extended allocator is now ready for use. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 59ee1b94a7c..d129d8feba0 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1403,10 +1403,26 @@ static dma_addr_t __map_single(struct device *dev, if (align) align_mask = (1UL << get_order(size)) - 1; +retry: address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, dma_mask); - if (unlikely(address == bad_dma_address)) - goto out; + if (unlikely(address == bad_dma_address)) { + /* + * setting next_address here will let the address + * allocator only scan the new allocated range in the + * first run. This is a small optimization. + */ + dma_dom->next_address = dma_dom->aperture_size; + + if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) + goto out; + + /* + * aperture was sucessfully enlarged by 128 MB, try + * allocation again + */ + goto retry; + } start = address; for (i = 0; i < pages; ++i) { -- cgit v1.2.3 From d9cfed925448f097ec7faab80d903eb7e5f99712 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 May 2009 12:16:29 +0200 Subject: amd-iommu: remove amd_iommu_size kernel parameter This parameter is not longer necessary when aperture increases dynamically. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 18 ++++-------------- arch/x86/kernel/amd_iommu_init.c | 15 --------------- 2 files changed, 4 insertions(+), 29 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index d129d8feba0..31d56c36010 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -939,17 +939,10 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) * It also intializes the page table and the address allocator data * structures required for the dma_ops interface */ -static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, - unsigned order) +static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) { struct dma_ops_domain *dma_dom; - /* - * Currently the DMA aperture must be between 32 MB and 1GB in size - */ - if ((order < 25) || (order > 30)) - return NULL; - dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); if (!dma_dom) return NULL; @@ -1087,7 +1080,6 @@ static int device_change_notifier(struct notifier_block *nb, struct protection_domain *domain; struct dma_ops_domain *dma_domain; struct amd_iommu *iommu; - int order = amd_iommu_aperture_order; unsigned long flags; if (devid > amd_iommu_last_bdf) @@ -1126,7 +1118,7 @@ static int device_change_notifier(struct notifier_block *nb, dma_domain = find_protection_domain(devid); if (dma_domain) goto out; - dma_domain = dma_ops_domain_alloc(iommu, order); + dma_domain = dma_ops_domain_alloc(iommu); if (!dma_domain) goto out; dma_domain->target_dev = devid; @@ -1826,7 +1818,6 @@ static void prealloc_protection_domains(void) struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; struct amd_iommu *iommu; - int order = amd_iommu_aperture_order; u16 devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { @@ -1839,7 +1830,7 @@ static void prealloc_protection_domains(void) iommu = amd_iommu_rlookup_table[devid]; if (!iommu) continue; - dma_dom = dma_ops_domain_alloc(iommu, order); + dma_dom = dma_ops_domain_alloc(iommu); if (!dma_dom) continue; init_unity_mappings_for_device(dma_dom, devid); @@ -1865,7 +1856,6 @@ static struct dma_map_ops amd_iommu_dma_ops = { int __init amd_iommu_init_dma_ops(void) { struct amd_iommu *iommu; - int order = amd_iommu_aperture_order; int ret; /* @@ -1874,7 +1864,7 @@ int __init amd_iommu_init_dma_ops(void) * protection domain will be assigned to the default one. */ list_for_each_entry(iommu, &amd_iommu_list, list) { - iommu->default_dom = dma_ops_domain_alloc(iommu, order); + iommu->default_dom = dma_ops_domain_alloc(iommu); if (iommu->default_dom == NULL) return -ENOMEM; iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8c0be0902da..762a4eefec9 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -121,7 +121,6 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ -unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ bool amd_iommu_isolate = true; /* if true, device isolation is enabled */ bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ @@ -1137,9 +1136,6 @@ int __init amd_iommu_init(void) enable_iommus(); - printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n", - (1 << (amd_iommu_aperture_order-20))); - printk(KERN_INFO "AMD IOMMU: device isolation "); if (amd_iommu_isolate) printk("enabled\n"); @@ -1225,15 +1221,4 @@ static int __init parse_amd_iommu_options(char *str) return 1; } -static int __init parse_amd_iommu_size_options(char *str) -{ - unsigned order = PAGE_SHIFT + get_order(memparse(str, &str)); - - if ((order > 24) && (order < 31)) - amd_iommu_aperture_order = order; - - return 1; -} - __setup("amd_iommu=", parse_amd_iommu_options); -__setup("amd_iommu_size=", parse_amd_iommu_size_options); -- cgit v1.2.3 From fe16f088a88fb73161bba8784375c829f7e87b54 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 12:27:53 +0200 Subject: amd-iommu: disable round-robin allocator for CONFIG_IOMMU_STRESS Disabling the round-robin allocator results in reusing the same dma-addresses again very fast. This is a good test if the iotlb flushing is working correctly. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 31d56c36010..543822b39a8 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -783,6 +783,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, { unsigned long address; +#ifdef CONFIG_IOMMU_STRESS + dom->next_address = 0; + dom->need_flush = true; +#endif + address = dma_ops_area_alloc(dev, dom, pages, align_mask, dma_mask, dom->next_address); -- cgit v1.2.3 From f5e9705c6429d24dee832b2edd7f4848d432ea03 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 12:31:53 +0200 Subject: amd-iommu: don't preallocate page tables with CONFIG_IOMMU_STRESS This forces testing of on-demand page table allocation code. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 543822b39a8..33434c497a6 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -661,6 +661,10 @@ static int alloc_new_range(struct amd_iommu *iommu, int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; int i; +#ifdef CONFIG_IOMMU_STRESS + populate = false; +#endif + if (index >= APERTURE_MAX_RANGES) return -ENOMEM; -- cgit v1.2.3 From 47bccd6bb2b866449d3ecf2ba350ac1c7473b2b8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 12:40:54 +0200 Subject: amd-iommu: don't free dma adresses below 512MB with CONFIG_IOMMU_STRESS This will test the automatic aperture enlargement code. This is important because only very few devices will ever trigger this code path. So force it under CONFIG_IOMMU_STRESS. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 33434c497a6..04ff5ec4ac0 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -824,6 +824,11 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL); +#ifdef CONFIG_IOMMU_STRESS + if (i < 4) + return; +#endif + if (address >= dom->next_address) dom->need_flush = true; -- cgit v1.2.3 From 13503fa9137d9708d52214e9506c671dbf2fbdce Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 26 Mar 2009 17:39:20 +0900 Subject: x86, mce: Cleanup param parser - Fix the comment formatting. - The error path does not return 0, and printk lacks level and "\n". - Move __setup("nomce") next to mcheck_disable(). - Improve readability etc. [ Impact: cleanup ] Signed-off-by: Hidetoshi Seto Acked-by: Andi Kleen LKML-Reference: <49CB3F38.7090703@jp.fujitsu.com> Signed-off-by: Ingo Molnar Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 6fb0b359d2a..77effb55afe 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -839,25 +839,29 @@ static int __init mcheck_disable(char *str) mce_dont_init = 1; return 1; } +__setup("nomce", mcheck_disable); -/* mce=off disables machine check. - mce=TOLERANCELEVEL (number, see above) - mce=bootlog Log MCEs from before booting. Disabled by default on AMD. - mce=nobootlog Don't log MCEs from before booting. */ +/* + * mce=off disables machine check + * mce=TOLERANCELEVEL (number, see above) + * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + * mce=nobootlog Don't log MCEs from before booting. + */ static int __init mcheck_enable(char *str) { if (!strcmp(str, "off")) mce_dont_init = 1; - else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) - mce_bootlog = str[0] == 'b'; + else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) + mce_bootlog = (str[0] == 'b'); else if (isdigit(str[0])) get_option(&str, &tolerant); - else - printk("mce= argument %s ignored. Please use /sys", str); + else { + printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n", + str); + return 0; + } return 1; } - -__setup("nomce", mcheck_disable); __setup("mce=", mcheck_enable); /* -- cgit v1.2.3 From e9eee03e99d519599eb615c3e251d5f6cc4be57d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:17 +0200 Subject: x86, mce: clean up mce_64.c This file has been modified many times along the years, by multiple authors, so the general style and structure has diverged in a number of areas making this file hard to read. So fix the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 247 ++++++++++++++++++++++-------------- 1 file changed, 149 insertions(+), 98 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 77effb55afe..1491246c4d6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -1,46 +1,47 @@ /* * Machine check handler. + * * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. * Rest from unknown author(s). * 2004 Andi Kleen. Rewrote most of it. * Copyright 2008 Intel Corporation * Author: Andi Kleen */ - -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include +#include #include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include +#include #include -#include +#include +#include +#include +#include +#include +#include + #include -#include -#include #include -#include #include +#include +#include +#include -#define MISC_MCELOG_MINOR 227 +#define MISC_MCELOG_MINOR 227 atomic_t mce_entry; -static int mce_dont_init; +static int mce_dont_init; /* * Tolerant levels: @@ -49,16 +50,16 @@ static int mce_dont_init; * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors * 3: never panic or SIGBUS, log all errors (for testing only) */ -static int tolerant = 1; -static int banks; -static u64 *bank; -static unsigned long notify_user; -static int rip_msr; -static int mce_bootlog = -1; -static atomic_t mce_events; +static int tolerant = 1; +static int banks; +static u64 *bank; +static unsigned long notify_user; +static int rip_msr; +static int mce_bootlog = -1; +static atomic_t mce_events; -static char trigger[128]; -static char *trigger_argv[2] = { trigger, NULL }; +static char trigger[128]; +static char *trigger_argv[2] = { trigger, NULL }; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); @@ -89,19 +90,23 @@ static struct mce_log mcelog = { void mce_log(struct mce *mce) { unsigned next, entry; + atomic_inc(&mce_events); mce->finished = 0; wmb(); for (;;) { entry = rcu_dereference(mcelog.next); for (;;) { - /* When the buffer fills up discard new entries. Assume - that the earlier errors are the more interesting. */ + /* + * When the buffer fills up discard new entries. + * Assume that the earlier errors are the more + * interesting ones: + */ if (entry >= MCE_LOG_LEN) { set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); return; } - /* Old left over entry. Skip. */ + /* Old left over entry. Skip: */ if (mcelog.entry[entry].finished) { entry++; continue; @@ -264,12 +269,12 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * implies that most kernel services cannot be safely used. Don't even * think about putting a printk in there! */ -void do_machine_check(struct pt_regs * regs, long error_code) +void do_machine_check(struct pt_regs *regs, long error_code) { struct mce m, panicm; + int panicm_found = 0; u64 mcestart = 0; int i; - int panicm_found = 0; /* * If no_way_out gets set, there is no safe way to recover from this * MCE. If tolerant is cranked up, we'll try anyway. @@ -293,6 +298,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) mce_setup(&m); rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + /* if the restart IP is not valid, we're done for */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) no_way_out = 1; @@ -356,23 +362,29 @@ void do_machine_check(struct pt_regs * regs, long error_code) mce_get_rip(&m, regs); mce_log(&m); - /* Did this bank cause the exception? */ - /* Assume that the bank with uncorrectable errors did it, - and that there is only a single one. */ - if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { + /* + * Did this bank cause the exception? + * + * Assume that the bank with uncorrectable errors did it, + * and that there is only a single one: + */ + if ((m.status & MCI_STATUS_UC) && + (m.status & MCI_STATUS_EN)) { panicm = m; panicm_found = 1; } } - /* If we didn't find an uncorrectable error, pick - the last one (shouldn't happen, just being safe). */ + /* + * If we didn't find an uncorrectable error, pick + * the last one (shouldn't happen, just being safe). + */ if (!panicm_found) panicm = m; /* * If we have decided that we just CAN'T continue, and the user - * has not set tolerant to an insane level, give up and die. + * has not set tolerant to an insane level, give up and die. */ if (no_way_out && tolerant < 3) mce_panic("Machine check", &panicm, mcestart); @@ -451,10 +463,9 @@ void mce_log_therm_throt_event(__u64 status) * poller finds an MCE, poll 2x faster. When the poller finds no more * errors, poll 2x slower (up to check_interval seconds). */ - static int check_interval = 5 * 60; /* 5 minutes */ + static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ -static void mcheck_timer(unsigned long); static DEFINE_PER_CPU(struct timer_list, mce_timer); static void mcheck_timer(unsigned long data) @@ -464,9 +475,10 @@ static void mcheck_timer(unsigned long data) WARN_ON(smp_processor_id() != data); - if (mce_available(¤t_cpu_data)) + if (mce_available(¤t_cpu_data)) { machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks)); + } /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -501,6 +513,7 @@ int mce_notify_user(void) static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); clear_thread_flag(TIF_MCE_NOTIFY); + if (test_and_clear_bit(0, ¬ify_user)) { wake_up_interruptible(&mce_wait); @@ -520,9 +533,10 @@ int mce_notify_user(void) return 0; } -/* see if the idle task needs to notify userspace */ +/* see if the idle task needs to notify userspace: */ static int -mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk) +mce_idle_callback(struct notifier_block *nfb, unsigned long action, + void *unused) { /* IDLE_END should be safe - interrupts are back on */ if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) @@ -532,7 +546,7 @@ mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk) } static struct notifier_block mce_idle_notifier = { - .notifier_call = mce_idle_callback, + .notifier_call = mce_idle_callback, }; static __init int periodic_mcheck_init(void) @@ -547,8 +561,8 @@ __initcall(periodic_mcheck_init); */ static int mce_cap_init(void) { - u64 cap; unsigned b; + u64 cap; rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & 0xff; @@ -578,9 +592,9 @@ static int mce_cap_init(void) static void mce_init(void *dummy) { + mce_banks_t all_banks; u64 cap; int i; - mce_banks_t all_banks; /* * Log the machine checks left over from the previous reset. @@ -605,14 +619,21 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && banks > 4) - /* disable GART TBL walk error reporting, which trips off - incorrectly with the IOMMU & 3ware & Cerberus. */ + if (c->x86 == 15 && banks > 4) { + /* + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: + */ clear_bit(10, (unsigned long *)&bank[4]); - if(c->x86 <= 17 && mce_bootlog < 0) - /* Lots of broken BIOS around that don't clear them - by default and leave crap in there. Don't log. */ + } + if (c->x86 <= 17 && mce_bootlog < 0) { + /* + * Lots of broken BIOS around that don't clear them + * by default and leave crap in there. Don't log: + */ mce_bootlog = 0; + } } } @@ -646,7 +667,7 @@ static void mce_init_timer(void) /* * Called for each booted CPU to set up machine checks. - * Must be called with preempt off. + * Must be called with preempt off: */ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { @@ -669,8 +690,8 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) */ static DEFINE_SPINLOCK(mce_state_lock); -static int open_count; /* #times opened */ -static int open_exclu; /* already open exclusive? */ +static int open_count; /* #times opened */ +static int open_exclu; /* already open exclusive? */ static int mce_open(struct inode *inode, struct file *file) { @@ -680,6 +701,7 @@ static int mce_open(struct inode *inode, struct file *file) if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { spin_unlock(&mce_state_lock); unlock_kernel(); + return -EBUSY; } @@ -712,13 +734,14 @@ static void collect_tscs(void *data) rdtscll(cpu_tsc[smp_processor_id()]); } +static DEFINE_MUTEX(mce_read_mutex); + static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) { + char __user *buf = ubuf; unsigned long *cpu_tsc; - static DEFINE_MUTEX(mce_read_mutex); unsigned prev, next; - char __user *buf = ubuf; int i, err; cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); @@ -732,6 +755,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { mutex_unlock(&mce_read_mutex); kfree(cpu_tsc); + return -EINVAL; } @@ -770,6 +794,7 @@ timeout: * synchronize. */ on_each_cpu(collect_tscs, cpu_tsc, 1); + for (i = next; i < MCE_LOG_LEN; i++) { if (mcelog.entry[i].finished && mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { @@ -782,6 +807,7 @@ timeout: } mutex_unlock(&mce_read_mutex); kfree(cpu_tsc); + return err ? -EFAULT : buf - ubuf; } @@ -799,6 +825,7 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + switch (cmd) { case MCE_GET_RECORD_LEN: return put_user(sizeof(struct mce), p); @@ -810,6 +837,7 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) do { flags = mcelog.flags; } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + return put_user(flags, p); } default: @@ -818,11 +846,11 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) } static const struct file_operations mce_chrdev_ops = { - .open = mce_open, - .release = mce_release, - .read = mce_read, - .poll = mce_poll, - .unlocked_ioctl = mce_ioctl, + .open = mce_open, + .release = mce_release, + .read = mce_read, + .poll = mce_poll, + .unlocked_ioctl = mce_ioctl, }; static struct miscdevice mce_log_device = { @@ -891,13 +919,16 @@ static int mce_shutdown(struct sys_device *dev) return mce_disable(); } -/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. - Only one CPU is active at this time, the others get readded later using - CPU hotplug. */ +/* + * On resume clear all MCE state. Don't want to see leftovers from the BIOS. + * Only one CPU is active at this time, the others get re-added later using + * CPU hotplug: + */ static int mce_resume(struct sys_device *dev) { mce_init(NULL); mce_cpu_features(¤t_cpu_data); + return 0; } @@ -916,14 +947,16 @@ static void mce_restart(void) } static struct sysdev_class mce_sysclass = { - .suspend = mce_suspend, - .shutdown = mce_shutdown, - .resume = mce_resume, - .name = "machinecheck", + .suspend = mce_suspend, + .shutdown = mce_shutdown, + .resume = mce_resume, + .name = "machinecheck", }; DEFINE_PER_CPU(struct sys_device, device_mce); -void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata; + +__cpuinitdata +void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); /* Why are there no generic functions for this? */ #define ACCESSOR(name, var, start) \ @@ -937,9 +970,12 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit const char *buf, size_t siz) { \ char *end; \ unsigned long new = simple_strtoul(buf, &end, 0); \ - if (end == buf) return -EINVAL; \ + \ + if (end == buf) \ + return -EINVAL; \ var = new; \ start; \ + \ return end-buf; \ } \ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); @@ -950,6 +986,7 @@ static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { u64 b = bank[attr - bank_attrs]; + return sprintf(buf, "%llx\n", b); } @@ -958,15 +995,18 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, { char *end; u64 new = simple_strtoull(buf, &end, 0); + if (end == buf) return -EINVAL; + bank[attr - bank_attrs] = new; mce_restart(); + return end-buf; } -static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, - char *buf) +static ssize_t +show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { strcpy(buf, trigger); strcat(buf, "\n"); @@ -974,21 +1014,27 @@ static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, } static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, - const char *buf,size_t siz) + const char *buf, size_t siz) { char *p; int len; + strncpy(trigger, buf, sizeof(trigger)); trigger[sizeof(trigger)-1] = 0; len = strlen(trigger); p = strchr(trigger, '\n'); - if (*p) *p = 0; + + if (*p) + *p = 0; + return len; } static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); -ACCESSOR(check_interval,check_interval,mce_restart()) + +ACCESSOR(check_interval, check_interval, mce_restart()) + static struct sysdev_attribute *mce_attributes[] = { &attr_tolerant.attr, &attr_check_interval, &attr_trigger, NULL @@ -996,7 +1042,7 @@ static struct sysdev_attribute *mce_attributes[] = { static cpumask_var_t mce_device_initialized; -/* Per cpu sysdev init. All of the cpus still share the same ctl bank */ +/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ static __cpuinit int mce_create_device(unsigned int cpu) { int err; @@ -1006,15 +1052,15 @@ static __cpuinit int mce_create_device(unsigned int cpu) return -EIO; memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); - per_cpu(device_mce,cpu).id = cpu; - per_cpu(device_mce,cpu).cls = &mce_sysclass; + per_cpu(device_mce, cpu).id = cpu; + per_cpu(device_mce, cpu).cls = &mce_sysclass; - err = sysdev_register(&per_cpu(device_mce,cpu)); + err = sysdev_register(&per_cpu(device_mce, cpu)); if (err) return err; for (i = 0; mce_attributes[i]; i++) { - err = sysdev_create_file(&per_cpu(device_mce,cpu), + err = sysdev_create_file(&per_cpu(device_mce, cpu), mce_attributes[i]); if (err) goto error; @@ -1035,10 +1081,10 @@ error2: } error: while (--i >= 0) { - sysdev_remove_file(&per_cpu(device_mce,cpu), + sysdev_remove_file(&per_cpu(device_mce, cpu), mce_attributes[i]); } - sysdev_unregister(&per_cpu(device_mce,cpu)); + sysdev_unregister(&per_cpu(device_mce, cpu)); return err; } @@ -1051,12 +1097,12 @@ static __cpuinit void mce_remove_device(unsigned int cpu) return; for (i = 0; mce_attributes[i]; i++) - sysdev_remove_file(&per_cpu(device_mce,cpu), + sysdev_remove_file(&per_cpu(device_mce, cpu), mce_attributes[i]); for (i = 0; i < banks; i++) sysdev_remove_file(&per_cpu(device_mce, cpu), &bank_attrs[i]); - sysdev_unregister(&per_cpu(device_mce,cpu)); + sysdev_unregister(&per_cpu(device_mce, cpu)); cpumask_clear_cpu(cpu, mce_device_initialized); } @@ -1076,11 +1122,12 @@ static void mce_disable_cpu(void *h) static void mce_reenable_cpu(void *h) { - int i; unsigned long action = *(unsigned long *)h; + int i; if (!mce_available(¤t_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); for (i = 0; i < banks; i++) @@ -1088,8 +1135,8 @@ static void mce_reenable_cpu(void *h) } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int __cpuinit +mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct timer_list *t = &per_cpu(mce_timer, cpu); @@ -1142,12 +1189,14 @@ static __init int mce_init_banks(void) for (i = 0; i < banks; i++) { struct sysdev_attribute *a = &bank_attrs[i]; - a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); + + a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); if (!a->attr.name) goto nomem; - a->attr.mode = 0644; - a->show = show_bank; - a->store = set_bank; + + a->attr.mode = 0644; + a->show = show_bank; + a->store = set_bank; } return 0; @@ -1156,6 +1205,7 @@ nomem: kfree(bank_attrs[i].attr.name); kfree(bank_attrs); bank_attrs = NULL; + return -ENOMEM; } @@ -1185,6 +1235,7 @@ static __init int mce_init_device(void) register_hotcpu_notifier(&mce_cpu_notifier); misc_register(&mce_log_device); + return err; } -- cgit v1.2.3 From 3b58dfd04bdfa52e717ead8f3c7622610eb7f950 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:21 +0200 Subject: x86, mce: clean up mce_32.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_32.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c index 3552119b091..05979e7eff1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/arch/x86/kernel/cpu/mcheck/mce_32.c @@ -2,13 +2,12 @@ * mce.c - x86 Machine Check Exception Reporting * (c) 2002 Alan Cox , Dave Jones */ - -#include -#include +#include #include #include +#include +#include #include -#include #include #include @@ -17,18 +16,20 @@ #include "mce.h" int mce_disabled; -int nr_mce_banks; +int nr_mce_banks; EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct pt_regs *regs, long error_code) { - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); } /* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; /* This has to be run for each processor */ void mcheck_init(struct cpuinfo_x86 *c) -- cgit v1.2.3 From c5aaf0e0702513637278ca4e27a156caa9392817 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:18 +0200 Subject: x86, mce: clean up p4.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/p4.c | 73 ++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 30 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index f53bdcbaf38..cb344aba479 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -2,18 +2,17 @@ * P4 specific Machine Check Exception Reporting */ -#include -#include -#include #include +#include +#include +#include #include +#include #include #include -#include #include - -#include +#include #include "mce.h" @@ -36,6 +35,7 @@ static int mce_num_extended_msrs; #ifdef CONFIG_X86_MCE_P4THERMAL + static void unexpected_thermal_interrupt(struct pt_regs *regs) { printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", @@ -43,7 +43,7 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs) add_taint(TAINT_MACHINE_CHECK); } -/* P4/Xeon Thermal transition interrupt handler */ +/* P4/Xeon Thermal transition interrupt handler: */ static void intel_thermal_interrupt(struct pt_regs *regs) { __u64 msr_val; @@ -54,8 +54,9 @@ static void intel_thermal_interrupt(struct pt_regs *regs) therm_throt_process(msr_val & 0x1); } -/* Thermal interrupt handler for this CPU setup */ -static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; +/* Thermal interrupt handler for this CPU setup: */ +static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = + unexpected_thermal_interrupt; void smp_thermal_interrupt(struct pt_regs *regs) { @@ -65,67 +66,76 @@ void smp_thermal_interrupt(struct pt_regs *regs) irq_exit(); } -/* P4/Xeon Thermal regulation detect and init */ +/* P4/Xeon Thermal regulation detect and init: */ static void intel_init_thermal(struct cpuinfo_x86 *c) { - u32 l, h; unsigned int cpu = smp_processor_id(); + u32 l, h; - /* Thermal monitoring */ + /* Thermal monitoring: */ if (!cpu_has(c, X86_FEATURE_ACPI)) return; /* -ENODEV */ - /* Clock modulation */ + /* Clock modulation: */ if (!cpu_has(c, X86_FEATURE_ACC)) return; /* -ENODEV */ - /* first check if its enabled already, in which case there might + /* + * First check if its enabled already, in which case there might * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already -zwanem. + * since it might be delivered via SMI already: */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); h = apic_read(APIC_LVTTHMR); if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", - cpu); + printk(KERN_DEBUG + "CPU%d: Thermal monitoring handled by SMI\n", cpu); + return; /* -EBUSY */ } - /* check whether a vector already exists, temporarily masked? */ + /* Check whether a vector already exists, temporarily masked? */ if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already " - "installed\n", - cpu, (h & APIC_VECTOR_MASK)); + printk(KERN_DEBUG + "CPU%d: Thermal LVT vector (%#x) already installed\n", + cpu, (h & APIC_VECTOR_MASK)); + return; /* -EBUSY */ } - /* The temperature transition interrupt handler setup */ - h = THERMAL_APIC_VECTOR; /* our delivery vector */ - h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ + /* + * The temperature transition interrupt handler setup: + */ + + /* Our delivery vector: */ + h = THERMAL_APIC_VECTOR; + + /* We'll mask the thermal vector in the lapic till we're ready: */ + h |= APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); - /* ok we're good to go... */ + /* Ok, we're good to go... */ vendor_thermal_interrupt = intel_thermal_interrupt; rdmsr(MSR_IA32_MISC_ENABLE, l, h); wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); + /* Unmask the thermal vector: */ l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); /* enable thermal throttle processing */ atomic_set(&therm_throt_en, 1); - return; } #endif /* CONFIG_X86_MCE_P4THERMAL */ - /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) +static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) { u32 h; @@ -143,9 +153,9 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) static void intel_machine_check(struct pt_regs *regs, long error_code) { - int recover = 1; u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; + int recover = 1; int i; rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); @@ -157,7 +167,9 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) if (mce_num_extended_msrs > 0) { struct intel_mce_extended_msrs dbg; + intel_get_extended_msrs(&dbg); + printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", @@ -171,6 +183,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) if (high & (1<<31)) { char misc[20]; char addr[24]; + misc[0] = addr[0] = '\0'; if (high & (1<<29)) recover |= 1; @@ -196,6 +209,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) panic("Unable to continue"); printk(KERN_EMERG "Attempting to continue.\n"); + /* * Do not clear the MSR_IA32_MCi_STATUS if the error is not * recoverable/continuable.This will allow BIOS to look at the MSRs @@ -217,7 +231,6 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); } - void intel_p4_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; -- cgit v1.2.3 From ed8bc7ed9a2ad875617b24d2ba09e49ee886638c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:21 +0200 Subject: x86, mce: clean up p5.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/p5.c | 43 +++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index c9f77ea69ed..8812f544183 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -2,11 +2,10 @@ * P5 specific Machine Check Exception Reporting * (C) Copyright 2002 Alan Cox */ - -#include -#include -#include #include +#include +#include +#include #include #include @@ -15,39 +14,53 @@ #include "mce.h" -/* Machine check handler for Pentium class Intel */ +/* Machine check handler for Pentium class Intel CPUs: */ static void pentium_machine_check(struct pt_regs *regs, long error_code) { u32 loaddr, hi, lotype; + rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); - printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype); - if (lotype&(1<<5)) - printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id()); + + printk(KERN_EMERG + "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", + smp_processor_id(), loaddr, lotype); + + if (lotype & (1<<5)) { + printk(KERN_EMERG + "CPU#%d: Possible thermal failure (CPU on fire ?).\n", + smp_processor_id()); + } + add_taint(TAINT_MACHINE_CHECK); } -/* Set up machine check reporting for processors with Intel style MCE */ +/* Set up machine check reporting for processors with Intel style MCE: */ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; - /*Check for MCE support */ + /* Check for MCE support: */ if (!cpu_has(c, X86_FEATURE_MCE)) return; - /* Default P5 to off as its often misconnected */ + /* Default P5 to off as its often misconnected: */ if (mce_disabled != -1) return; + machine_check_vector = pentium_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ wmb(); - /* Read registers before enabling */ + /* Read registers before enabling: */ rdmsr(MSR_IA32_P5_MC_ADDR, l, h); rdmsr(MSR_IA32_P5_MC_TYPE, l, h); - printk(KERN_INFO "Intel old style machine check architecture supported.\n"); + printk(KERN_INFO + "Intel old style machine check architecture supported.\n"); - /* Enable MCE */ + /* Enable MCE: */ set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); + printk(KERN_INFO + "Intel old style machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); } -- cgit v1.2.3 From ea2566ff80e096eeecc0918fb5d5a4612d8f62ef Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:19 +0200 Subject: x86, mce: clean up p6.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/p6.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c index 2ac52d7b434..43c24e66745 100644 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ b/arch/x86/kernel/cpu/mcheck/p6.c @@ -2,11 +2,10 @@ * P6 specific Machine Check Exception Reporting * (C) Copyright 2002 Alan Cox */ - -#include -#include -#include #include +#include +#include +#include #include #include @@ -18,9 +17,9 @@ /* Machine Check Handler For PII/PIII */ static void intel_machine_check(struct pt_regs *regs, long error_code) { - int recover = 1; u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; + int recover = 1; int i; rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); @@ -35,12 +34,16 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) if (high & (1<<31)) { char misc[20]; char addr[24]; - misc[0] = addr[0] = '\0'; + + misc[0] = '\0'; + addr[0] = '\0'; + if (high & (1<<29)) recover |= 1; if (high & (1<<25)) recover |= 2; high &= ~(1<<31); + if (high & (1<<27)) { rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); snprintf(misc, 20, "[%08x%08x]", ahigh, alow); @@ -49,6 +52,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); snprintf(addr, 24, " at %08x%08x", ahigh, alow); } + printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", smp_processor_id(), i, high, low, misc, addr); } @@ -63,16 +67,17 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) /* * Do not clear the MSR_IA32_MCi_STATUS if the error is not * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error. + * for errors if the OS could not log the error: */ for (i = 0; i < nr_mce_banks; i++) { unsigned int msr; + msr = MSR_IA32_MC0_STATUS+i*4; rdmsr(msr, low, high); if (high & (1<<31)) { - /* Clear it */ + /* Clear it: */ wrmsr(msr, 0UL, 0UL); - /* Serialize */ + /* Serialize: */ wmb(); add_taint(TAINT_MACHINE_CHECK); } @@ -81,7 +86,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); } -/* Set up machine check reporting for processors with Intel style MCE */ +/* Set up machine check reporting for processors with Intel style MCE: */ void intel_p6_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; @@ -97,6 +102,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c) /* Ok machine check is available */ machine_check_vector = intel_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ wmb(); printk(KERN_INFO "Intel machine check architecture supported.\n"); -- cgit v1.2.3 From efee4ca80980f97b60e91e3322c3342f19623eff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:20 +0200 Subject: x86, mce: clean up k7.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/k7.c | 42 ++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index dd3af6e7b39..89e51042415 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c @@ -2,11 +2,10 @@ * Athlon specific Machine Check Exception Reporting * (C) Copyright 2002 Dave Jones */ - -#include -#include -#include #include +#include +#include +#include #include #include @@ -15,12 +14,12 @@ #include "mce.h" -/* Machine Check Handler For AMD Athlon/Duron */ +/* Machine Check Handler For AMD Athlon/Duron: */ static void k7_machine_check(struct pt_regs *regs, long error_code) { - int recover = 1; u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; + int recover = 1; int i; rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); @@ -32,15 +31,19 @@ static void k7_machine_check(struct pt_regs *regs, long error_code) for (i = 1; i < nr_mce_banks; i++) { rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high&(1<<31)) { + if (high & (1<<31)) { char misc[20]; char addr[24]; - misc[0] = addr[0] = '\0'; + + misc[0] = '\0'; + addr[0] = '\0'; + if (high & (1<<29)) recover |= 1; if (high & (1<<25)) recover |= 2; high &= ~(1<<31); + if (high & (1<<27)) { rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); snprintf(misc, 20, "[%08x%08x]", ahigh, alow); @@ -49,27 +52,31 @@ static void k7_machine_check(struct pt_regs *regs, long error_code) rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); snprintf(addr, 24, " at %08x%08x", ahigh, alow); } + printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", smp_processor_id(), i, high, low, misc, addr); - /* Clear it */ + + /* Clear it: */ wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - /* Serialize */ + /* Serialize: */ wmb(); add_taint(TAINT_MACHINE_CHECK); } } - if (recover&2) + if (recover & 2) panic("CPU context corrupt"); - if (recover&1) + if (recover & 1) panic("Unable to continue"); + printk(KERN_EMERG "Attempting to continue.\n"); + mcgstl &= ~(1<<2); wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); } -/* AMD K7 machine check is Intel like */ +/* AMD K7 machine check is Intel like: */ void amd_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; @@ -79,21 +86,26 @@ void amd_mcheck_init(struct cpuinfo_x86 *c) return; machine_check_vector = k7_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ wmb(); printk(KERN_INFO "Intel machine check architecture supported.\n"); + rdmsr(MSR_IA32_MCG_CAP, l, h); if (l & (1<<8)) /* Control register present ? */ wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); nr_mce_banks = l & 0xff; - /* Clear status for MC index 0 separately, we don't touch CTL, - * as some K7 Athlons cause spurious MCEs when its enabled. */ + /* + * Clear status for MC index 0 separately, we don't touch CTL, + * as some K7 Athlons cause spurious MCEs when its enabled: + */ if (boot_cpu_data.x86 == 6) { wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); i = 1; } else i = 0; + for (; i < nr_mce_banks; i++) { wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); -- cgit v1.2.3 From 91425084f74c0ad087b3fb6bdad79a825f952720 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:22 +0200 Subject: x86, mce: clean up winchip.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/winchip.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 2a043d89811..81b02487090 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -2,11 +2,10 @@ * IDT Winchip specific Machine Check Exception Reporting * (C) Copyright 2002 Alan Cox */ - -#include -#include -#include #include +#include +#include +#include #include #include @@ -14,7 +13,7 @@ #include "mce.h" -/* Machine check handler for WinChip C6 */ +/* Machine check handler for WinChip C6: */ static void winchip_machine_check(struct pt_regs *regs, long error_code) { printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); @@ -25,12 +24,18 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code) void winchip_mcheck_init(struct cpuinfo_x86 *c) { u32 lo, hi; + machine_check_vector = winchip_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ wmb(); + rdmsr(MSR_IDT_FCR1, lo, hi); lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ lo &= ~(1<<4); /* Enable MCE */ wrmsr(MSR_IDT_FCR1, lo, hi); + set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); + + printk(KERN_INFO + "Winchip machine check reporting enabled on CPU#0.\n"); } -- cgit v1.2.3 From bdbfbdd5e8f0efb9bfef2e597f8ac673c36317ab Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:20 +0200 Subject: x86, mce: clean up non-fatal.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/non-fatal.c | 57 +++++++++++++++++----------------- 1 file changed, 29 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index a74af128efc..70b710420f7 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c @@ -6,15 +6,14 @@ * This file contains routines to check for non-fatal MCEs every 15s * */ - -#include -#include -#include -#include -#include #include -#include +#include +#include +#include #include +#include +#include +#include #include #include @@ -22,9 +21,9 @@ #include "mce.h" -static int firstbank; +static int firstbank; -#define MCE_RATE 15*HZ /* timer rate is 15s */ +#define MCE_RATE (15*HZ) /* timer rate is 15s */ static void mce_checkregs(void *info) { @@ -34,23 +33,24 @@ static void mce_checkregs(void *info) for (i = firstbank; i < nr_mce_banks; i++) { rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - printk(KERN_INFO "MCE: The hardware reports a non " - "fatal, correctable incident occurred on " - "CPU %d.\n", + if (!(high & (1<<31))) + continue; + + printk(KERN_INFO "MCE: The hardware reports a non fatal, " + "correctable incident occurred on CPU %d.\n", smp_processor_id()); - printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); - - /* - * Scrub the error so we don't pick it up in MCE_RATE - * seconds time. - */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - - /* Serialize */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } + + printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); + + /* + * Scrub the error so we don't pick it up in MCE_RATE + * seconds time: + */ + wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); + + /* Serialize: */ + wmb(); + add_taint(TAINT_MACHINE_CHECK); } } @@ -77,16 +77,17 @@ static int __init init_nonfatal_mce_checker(void) /* Some Athlons misbehave when we frob bank 0 */ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 6) - firstbank = 1; + boot_cpu_data.x86 == 6) + firstbank = 1; else - firstbank = 0; + firstbank = 0; /* * Check for non-fatal errors every MCE_RATE s */ schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); printk(KERN_INFO "Machine check exception polling timer started.\n"); + return 0; } module_init(init_nonfatal_mce_checker); -- cgit v1.2.3 From cb6f3c155b0afabc48667efb9e7b1ce92ccfcab4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:19 +0200 Subject: x86, mce: clean up therm_throt.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 74 +++++++++++++++++--------------- 1 file changed, 39 insertions(+), 35 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d5ae2243f0b..a2b5d7ddb19 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -1,7 +1,7 @@ /* - * * Thermal throttle event support code (such as syslog messaging and rate * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). + * * This allows consistent reporting of CPU thermal throttle events. * * Maintains a counter in /sys that keeps track of the number of thermal @@ -13,43 +13,44 @@ * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. * Inspired by Ross Biro's and Al Borchers' counter code. */ - +#include +#include #include #include #include -#include -#include -#include + #include +#include /* How long to wait between reporting thermal events */ -#define CHECK_INTERVAL (300 * HZ) +#define CHECK_INTERVAL (300 * HZ) static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); -atomic_t therm_throt_en = ATOMIC_INIT(0); + +atomic_t therm_throt_en = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS -#define define_therm_throt_sysdev_one_ro(_name) \ - static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) - -#define define_therm_throt_sysdev_show_func(name) \ -static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ - struct sysdev_attribute *attr, \ - char *buf) \ -{ \ - unsigned int cpu = dev->id; \ - ssize_t ret; \ - \ - preempt_disable(); /* CPU hotplug */ \ - if (cpu_online(cpu)) \ - ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_throttle_##name, cpu)); \ - else \ - ret = 0; \ - preempt_enable(); \ - \ - return ret; \ +#define define_therm_throt_sysdev_one_ro(_name) \ + static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) + +#define define_therm_throt_sysdev_show_func(name) \ +static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ + struct sysdev_attribute *attr, \ + char *buf) \ +{ \ + unsigned int cpu = dev->id; \ + ssize_t ret; \ + \ + preempt_disable(); /* CPU hotplug */ \ + if (cpu_online(cpu)) \ + ret = sprintf(buf, "%lu\n", \ + per_cpu(thermal_throttle_##name, cpu)); \ + else \ + ret = 0; \ + preempt_enable(); \ + \ + return ret; \ } define_therm_throt_sysdev_show_func(count); @@ -61,8 +62,8 @@ static struct attribute *thermal_throttle_attrs[] = { }; static struct attribute_group thermal_throttle_attr_group = { - .attrs = thermal_throttle_attrs, - .name = "thermal_throttle" + .attrs = thermal_throttle_attrs, + .name = "thermal_throttle" }; #endif /* CONFIG_SYSFS */ @@ -110,10 +111,11 @@ int therm_throt_process(int curr) } #ifdef CONFIG_SYSFS -/* Add/Remove thermal_throttle interface for CPU device */ +/* Add/Remove thermal_throttle interface for CPU device: */ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) { - return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); + return sysfs_create_group(&sys_dev->kobj, + &thermal_throttle_attr_group); } static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) @@ -121,19 +123,21 @@ static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); } -/* Mutex protecting device creation against CPU hotplug */ +/* Mutex protecting device creation against CPU hotplug: */ static DEFINE_MUTEX(therm_cpu_lock); /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static __cpuinit int +thermal_throttle_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct sys_device *sys_dev; int err = 0; sys_dev = get_cpu_sysdev(cpu); + switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: -- cgit v1.2.3 From 1cb2a8e1767ab60370ecce90654c0f281c602d95 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:18 +0200 Subject: x86, mce: clean up mce_amd_64.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 188 +++++++++++++++++--------------- 1 file changed, 103 insertions(+), 85 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 56dde9c4bc9..4d90ec3eb51 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -13,22 +13,22 @@ * * All MC4_MISCi registers are shared between multi-cores */ - -#include -#include -#include #include -#include #include -#include -#include +#include #include +#include +#include #include +#include +#include +#include + +#include #include +#include #include #include -#include -#include #define PFX "mce_threshold: " #define VERSION "version 1.1.1" @@ -48,26 +48,26 @@ #define MCG_XBLK_ADDR 0xC0000400 struct threshold_block { - unsigned int block; - unsigned int bank; - unsigned int cpu; - u32 address; - u16 interrupt_enable; - u16 threshold_limit; - struct kobject kobj; - struct list_head miscj; + unsigned int block; + unsigned int bank; + unsigned int cpu; + u32 address; + u16 interrupt_enable; + u16 threshold_limit; + struct kobject kobj; + struct list_head miscj; }; /* defaults used early on boot */ static struct threshold_block threshold_defaults = { - .interrupt_enable = 0, - .threshold_limit = THRESHOLD_MAX, + .interrupt_enable = 0, + .threshold_limit = THRESHOLD_MAX, }; struct threshold_bank { - struct kobject *kobj; - struct threshold_block *blocks; - cpumask_var_t cpus; + struct kobject *kobj; + struct threshold_block *blocks; + cpumask_var_t cpus; }; static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); @@ -86,9 +86,9 @@ static void amd_threshold_interrupt(void); */ struct thresh_restart { - struct threshold_block *b; - int reset; - u16 old_limit; + struct threshold_block *b; + int reset; + u16 old_limit; }; /* must be called with correct cpu affinity */ @@ -110,6 +110,7 @@ static void threshold_restart_bank(void *_tr) } else if (tr->old_limit) { /* change limit w/o reset */ int new_count = (mci_misc_hi & THRESHOLD_MAX) + (tr->old_limit - tr->b->threshold_limit); + mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | (new_count & THRESHOLD_MAX); } @@ -125,11 +126,11 @@ static void threshold_restart_bank(void *_tr) /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { - unsigned int bank, block; unsigned int cpu = smp_processor_id(); - u8 lvt_off; u32 low = 0, high = 0, address = 0; + unsigned int bank, block; struct thresh_restart tr; + u8 lvt_off; for (bank = 0; bank < NR_BANKS; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@ -140,8 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (!address) break; address += MCG_XBLK_ADDR; - } - else + } else ++address; if (rdmsr_safe(address, &low, &high)) @@ -193,9 +193,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) */ static void amd_threshold_interrupt(void) { + u32 low = 0, high = 0, address = 0; unsigned int bank, block; struct mce m; - u32 low = 0, high = 0, address = 0; mce_setup(&m); @@ -204,16 +204,16 @@ static void amd_threshold_interrupt(void) if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) + if (block == 0) { address = MSR_IA32_MC0_MISC + bank * 4; - else if (block == 1) { + } else if (block == 1) { address = (low & MASK_BLKPTR_LO) >> 21; if (!address) break; address += MCG_XBLK_ADDR; - } - else + } else { ++address; + } if (rdmsr_safe(address, &low, &high)) break; @@ -229,8 +229,10 @@ static void amd_threshold_interrupt(void) (high & MASK_LOCKED_HI)) continue; - /* Log the machine check that caused the threshold - event. */ + /* + * Log the machine check that caused the threshold + * event. + */ machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks)); @@ -254,48 +256,56 @@ static void amd_threshold_interrupt(void) struct threshold_attr { struct attribute attr; - ssize_t(*show) (struct threshold_block *, char *); - ssize_t(*store) (struct threshold_block *, const char *, size_t count); + ssize_t (*show) (struct threshold_block *, char *); + ssize_t (*store) (struct threshold_block *, const char *, size_t count); }; -#define SHOW_FIELDS(name) \ -static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ -{ \ - return sprintf(buf, "%lx\n", (unsigned long) b->name); \ +#define SHOW_FIELDS(name) \ +static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ +{ \ + return sprintf(buf, "%lx\n", (unsigned long) b->name); \ } SHOW_FIELDS(interrupt_enable) SHOW_FIELDS(threshold_limit) -static ssize_t store_interrupt_enable(struct threshold_block *b, - const char *buf, size_t count) +static ssize_t +store_interrupt_enable(struct threshold_block *b, const char *buf, size_t count) { - char *end; struct thresh_restart tr; - unsigned long new = simple_strtoul(buf, &end, 0); + unsigned long new; + char *end; + + new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; + b->interrupt_enable = !!new; - tr.b = b; - tr.reset = 0; - tr.old_limit = 0; + tr.b = b; + tr.reset = 0; + tr.old_limit = 0; + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); return end - buf; } -static ssize_t store_threshold_limit(struct threshold_block *b, - const char *buf, size_t count) +static ssize_t +store_threshold_limit(struct threshold_block *b, const char *buf, size_t count) { - char *end; struct thresh_restart tr; - unsigned long new = simple_strtoul(buf, &end, 0); + unsigned long new; + char *end; + + new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; + if (new > THRESHOLD_MAX) new = THRESHOLD_MAX; if (new < 1) new = 1; + tr.old_limit = b->threshold_limit; b->threshold_limit = new; tr.b = b; @@ -307,8 +317,8 @@ static ssize_t store_threshold_limit(struct threshold_block *b, } struct threshold_block_cross_cpu { - struct threshold_block *tb; - long retval; + struct threshold_block *tb; + long retval; }; static void local_error_count_handler(void *_tbcc) @@ -338,15 +348,16 @@ static ssize_t store_error_count(struct threshold_block *b, return 1; } -#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ +#define THRESHOLD_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ }; -#define RW_ATTR(name) \ -static struct threshold_attr name = \ - THRESHOLD_ATTR(name, 0644, show_## name, store_## name) +#define RW_ATTR(name) \ +static struct threshold_attr name = \ + THRESHOLD_ATTR(name, 0644, show_## name, store_## name) RW_ATTR(interrupt_enable); RW_ATTR(threshold_limit); @@ -359,15 +370,17 @@ static struct attribute *default_attrs[] = { NULL }; -#define to_block(k) container_of(k, struct threshold_block, kobj) -#define to_attr(a) container_of(a, struct threshold_attr, attr) +#define to_block(k) container_of(k, struct threshold_block, kobj) +#define to_attr(a) container_of(a, struct threshold_attr, attr) static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { struct threshold_block *b = to_block(kobj); struct threshold_attr *a = to_attr(attr); ssize_t ret; + ret = a->show ? a->show(b, buf) : -EIO; + return ret; } @@ -377,18 +390,20 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, struct threshold_block *b = to_block(kobj); struct threshold_attr *a = to_attr(attr); ssize_t ret; + ret = a->store ? a->store(b, buf, count) : -EIO; + return ret; } static struct sysfs_ops threshold_ops = { - .show = show, - .store = store, + .show = show, + .store = store, }; static struct kobj_type threshold_ktype = { - .sysfs_ops = &threshold_ops, - .default_attrs = default_attrs, + .sysfs_ops = &threshold_ops, + .default_attrs = default_attrs, }; static __cpuinit int allocate_threshold_blocks(unsigned int cpu, @@ -396,9 +411,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, unsigned int block, u32 address) { - int err; - u32 low, high; struct threshold_block *b = NULL; + u32 low, high; + int err; if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) return 0; @@ -421,20 +436,21 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, if (!b) return -ENOMEM; - b->block = block; - b->bank = bank; - b->cpu = cpu; - b->address = address; - b->interrupt_enable = 0; - b->threshold_limit = THRESHOLD_MAX; + b->block = block; + b->bank = bank; + b->cpu = cpu; + b->address = address; + b->interrupt_enable = 0; + b->threshold_limit = THRESHOLD_MAX; INIT_LIST_HEAD(&b->miscj); - if (per_cpu(threshold_banks, cpu)[bank]->blocks) + if (per_cpu(threshold_banks, cpu)[bank]->blocks) { list_add(&b->miscj, &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); - else + } else { per_cpu(threshold_banks, cpu)[bank]->blocks = b; + } err = kobject_init_and_add(&b->kobj, &threshold_ktype, per_cpu(threshold_banks, cpu)[bank]->kobj, @@ -447,8 +463,9 @@ recurse: if (!address) return 0; address += MCG_XBLK_ADDR; - } else + } else { ++address; + } err = allocate_threshold_blocks(cpu, bank, ++block, address); if (err) @@ -507,6 +524,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) cpumask_copy(b->cpus, cpu_core_mask(cpu)); per_cpu(threshold_banks, cpu)[bank] = b; + goto out; } #endif @@ -605,15 +623,13 @@ static void deallocate_threshold_block(unsigned int cpu, static void threshold_remove_bank(unsigned int cpu, int bank) { - int i = 0; struct threshold_bank *b; char name[32]; + int i = 0; b = per_cpu(threshold_banks, cpu)[bank]; - if (!b) return; - if (!b->blocks) goto free_out; @@ -624,6 +640,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (shared_bank[bank] && b->blocks->cpu != cpu) { sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; + return; } #endif @@ -659,8 +676,8 @@ static void threshold_remove_device(unsigned int cpu) } /* get notified when a cpu comes on/off */ -static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, - unsigned int cpu) +static void __cpuinit +amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) { if (cpu >= NR_CPUS) return; @@ -686,11 +703,12 @@ static __init int threshold_init_device(void) /* to hit CPUs online before the notifier is up */ for_each_online_cpu(lcpu) { int err = threshold_create_device(lcpu); + if (err) return err; } threshold_cpu_callback = amd_64_threshold_cpu_callback; + return 0; } - device_initcall(threshold_init_device); -- cgit v1.2.3 From 6cc6f3ebd19fea722c19630af5ad68af7f51d493 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Apr 2009 12:31:23 +0200 Subject: x86, mce: unify Intel thermal init, prepare Prepare for unification, make two intel_init_thermal equal. [ Impact: cleanup ] Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 35 +++++++++++++----------- arch/x86/kernel/cpu/mcheck/p4.c | 44 ++++++++++++++----------------- 2 files changed, 40 insertions(+), 39 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index cef3ee30744..b85d0c107c8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -34,21 +34,22 @@ asmlinkage void smp_thermal_interrupt(void) irq_exit(); } +static inline void intel_set_thermal_handler(void) { } + static void intel_init_thermal(struct cpuinfo_x86 *c) { - u32 l, h; - int tm2 = 0; unsigned int cpu = smp_processor_id(); + int tm2 = 0; + u32 l, h; - if (!cpu_has(c, X86_FEATURE_ACPI)) - return; - - if (!cpu_has(c, X86_FEATURE_ACC)) + /* Thermal monitoring depends on ACPI and clock modulation*/ + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) return; - /* first check if TM1 is already enabled by the BIOS, in which - * case there might be some SMM goo which handles it, so we can't even - * put a handler since it might be delivered via SMI already. + /* + * First check if its enabled already, in which case there might + * be some SMM goo which handles it, so we can't even put a handler + * since it might be delivered via SMI already: */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); h = apic_read(APIC_LVTTHMR); @@ -61,31 +62,35 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) tm2 = 1; + /* Check whether a vector already exists */ if (h & APIC_VECTOR_MASK) { printk(KERN_DEBUG - "CPU%d: Thermal LVT vector (%#x) already " - "installed\n", cpu, (h & APIC_VECTOR_MASK)); + "CPU%d: Thermal LVT vector (%#x) already installed\n", + cpu, (h & APIC_VECTOR_MASK)); return; } - h = THERMAL_APIC_VECTOR; - h |= (APIC_DM_FIXED | APIC_LVT_MASKED); + /* We'll mask the thermal vector in the lapic till we're ready: */ + h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); + intel_set_thermal_handler(); + rdmsr(MSR_IA32_MISC_ENABLE, l, h); wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); + /* Unmask the thermal vector: */ l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? "TM2" : "TM1"); + cpu, tm2 ? "TM2" : "TM1"); /* enable thermal throttle processing */ atomic_set(&therm_throt_en, 1); - return; } /* diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index cb344aba479..f70753a443b 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -66,19 +66,21 @@ void smp_thermal_interrupt(struct pt_regs *regs) irq_exit(); } +static void intel_set_thermal_handler(void) +{ + vendor_thermal_interrupt = intel_thermal_interrupt; +} + /* P4/Xeon Thermal regulation detect and init: */ static void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); + int tm2 = 0; u32 l, h; - /* Thermal monitoring: */ - if (!cpu_has(c, X86_FEATURE_ACPI)) - return; /* -ENODEV */ - - /* Clock modulation: */ - if (!cpu_has(c, X86_FEATURE_ACC)) - return; /* -ENODEV */ + /* Thermal monitoring depends on ACPI and clock modulation*/ + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + return; /* * First check if its enabled already, in which case there might @@ -90,35 +92,28 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); - - return; /* -EBUSY */ + return; } - /* Check whether a vector already exists, temporarily masked? */ + if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) + tm2 = 1; + + /* Check whether a vector already exists */ if (h & APIC_VECTOR_MASK) { printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already installed\n", cpu, (h & APIC_VECTOR_MASK)); - - return; /* -EBUSY */ + return; } - /* - * The temperature transition interrupt handler setup: - */ - - /* Our delivery vector: */ - h = THERMAL_APIC_VECTOR; - /* We'll mask the thermal vector in the lapic till we're ready: */ - h |= APIC_DM_FIXED | APIC_LVT_MASKED; + h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); + wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); - /* Ok, we're good to go... */ - vendor_thermal_interrupt = intel_thermal_interrupt; + intel_set_thermal_handler(); rdmsr(MSR_IA32_MISC_ENABLE, l, h); wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); @@ -127,7 +122,8 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); + printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", + cpu, tm2 ? "TM2" : "TM1"); /* enable thermal throttle processing */ atomic_set(&therm_throt_en, 1); -- cgit v1.2.3 From a65d086235208a3b3546e209d2210048549099b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Apr 2009 12:31:23 +0200 Subject: x86, mce: unify Intel thermal init Mechanic unification. No change in code. [ Impact: cleanup, 32-bit / 64-bit unification ] Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 3 +- arch/x86/kernel/cpu/mcheck/mce.h | 11 +++++ arch/x86/kernel/cpu/mcheck/mce_intel.c | 73 +++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 61 +------------------------- arch/x86/kernel/cpu/mcheck/p4.c | 59 +------------------------ 5 files changed, 89 insertions(+), 118 deletions(-) create mode 100644 arch/x86/kernel/cpu/mcheck/mce_intel.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index b2f89829bbe..6def76942bf 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,7 +1,8 @@ obj-y = mce_$(BITS).o therm_throt.o obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o -obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o +obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o +obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h index ae9f628838f..2d1a54bdadf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ b/arch/x86/kernel/cpu/mcheck/mce.h @@ -1,6 +1,8 @@ #include #include +#ifdef CONFIG_X86_32 + void amd_mcheck_init(struct cpuinfo_x86 *c); void intel_p4_mcheck_init(struct cpuinfo_x86 *c); void intel_p5_mcheck_init(struct cpuinfo_x86 *c); @@ -12,3 +14,12 @@ extern void (*machine_check_vector)(struct pt_regs *, long error_code); extern int nr_mce_banks; +void intel_set_thermal_handler(void); + +#else + +static inline void intel_set_thermal_handler(void) { } + +#endif + +void intel_init_thermal(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c new file mode 100644 index 00000000000..bad3cbb0e56 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -0,0 +1,73 @@ +/* + * Common code for Intel machine checks + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "mce.h" + +void intel_init_thermal(struct cpuinfo_x86 *c) +{ + unsigned int cpu = smp_processor_id(); + int tm2 = 0; + u32 l, h; + + /* Thermal monitoring depends on ACPI and clock modulation*/ + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + return; + + /* + * First check if its enabled already, in which case there might + * be some SMM goo which handles it, so we can't even put a handler + * since it might be delivered via SMI already: + */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = apic_read(APIC_LVTTHMR); + if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { + printk(KERN_DEBUG + "CPU%d: Thermal monitoring handled by SMI\n", cpu); + return; + } + + if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) + tm2 = 1; + + /* Check whether a vector already exists */ + if (h & APIC_VECTOR_MASK) { + printk(KERN_DEBUG + "CPU%d: Thermal LVT vector (%#x) already installed\n", + cpu, (h & APIC_VECTOR_MASK)); + return; + } + + /* We'll mask the thermal vector in the lapic till we're ready: */ + h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; + apic_write(APIC_LVTTHMR, h); + + rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); + wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); + + intel_set_thermal_handler(); + + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); + + /* Unmask the thermal vector: */ + l = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + + printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", + cpu, tm2 ? "TM2" : "TM1"); + + /* enable thermal throttle processing */ + atomic_set(&therm_throt_en, 1); +} diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index b85d0c107c8..38f9632306f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -17,6 +17,8 @@ #include #include +#include "mce.h" + asmlinkage void smp_thermal_interrupt(void) { __u64 msr_val; @@ -34,65 +36,6 @@ asmlinkage void smp_thermal_interrupt(void) irq_exit(); } -static inline void intel_set_thermal_handler(void) { } - -static void intel_init_thermal(struct cpuinfo_x86 *c) -{ - unsigned int cpu = smp_processor_id(); - int tm2 = 0; - u32 l, h; - - /* Thermal monitoring depends on ACPI and clock modulation*/ - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) - return; - - /* - * First check if its enabled already, in which case there might - * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already: - */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); - if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG - "CPU%d: Thermal monitoring handled by SMI\n", cpu); - return; - } - - if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) - tm2 = 1; - - /* Check whether a vector already exists */ - if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG - "CPU%d: Thermal LVT vector (%#x) already installed\n", - cpu, (h & APIC_VECTOR_MASK)); - return; - } - - /* We'll mask the thermal vector in the lapic till we're ready: */ - h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; - apic_write(APIC_LVTTHMR, h); - - rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); - - intel_set_thermal_handler(); - - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - - /* Unmask the thermal vector: */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - - printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? "TM2" : "TM1"); - - /* enable thermal throttle processing */ - atomic_set(&therm_throt_en, 1); -} - /* * Support for Intel Correct Machine Check Interrupts. This allows * the CPU to raise an interrupt when a corrected machine check happened. diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index f70753a443b..f979ffea330 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -66,68 +66,11 @@ void smp_thermal_interrupt(struct pt_regs *regs) irq_exit(); } -static void intel_set_thermal_handler(void) +void intel_set_thermal_handler(void) { vendor_thermal_interrupt = intel_thermal_interrupt; } -/* P4/Xeon Thermal regulation detect and init: */ -static void intel_init_thermal(struct cpuinfo_x86 *c) -{ - unsigned int cpu = smp_processor_id(); - int tm2 = 0; - u32 l, h; - - /* Thermal monitoring depends on ACPI and clock modulation*/ - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) - return; - - /* - * First check if its enabled already, in which case there might - * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already: - */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); - if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG - "CPU%d: Thermal monitoring handled by SMI\n", cpu); - return; - } - - if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) - tm2 = 1; - - /* Check whether a vector already exists */ - if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG - "CPU%d: Thermal LVT vector (%#x) already installed\n", - cpu, (h & APIC_VECTOR_MASK)); - return; - } - - /* We'll mask the thermal vector in the lapic till we're ready: */ - h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; - apic_write(APIC_LVTTHMR, h); - - rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); - - intel_set_thermal_handler(); - - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - - /* Unmask the thermal vector: */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - - printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? "TM2" : "TM1"); - - /* enable thermal throttle processing */ - atomic_set(&therm_throt_en, 1); -} #endif /* CONFIG_X86_MCE_P4THERMAL */ /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -- cgit v1.2.3 From a988d334ae8213c0e0e62327222f6e5e6e52bcf1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:25 +0200 Subject: x86, mce: unify, prepare codes Move current 32-bit mce_32.c code into mce_64.c. [ Remove unused artifact stop/restart_mce pointed by Andi Kleen ] Signed-off-by: Ingo Molnar Cc: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 65 +++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 1491246c4d6..ce48ae75e1d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -1240,3 +1240,68 @@ static __init int mce_init_device(void) } device_initcall(mce_init_device); + +#ifdef CONFIG_X86_32 + +int mce_disabled; + +int nr_mce_banks; +EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ + +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; + +/* This has to be run for each processor */ +void mcheck_init(struct cpuinfo_x86 *c) +{ + if (mce_disabled == 1) + return; + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + amd_mcheck_init(c); + break; + + case X86_VENDOR_INTEL: + if (c->x86 == 5) + intel_p5_mcheck_init(c); + if (c->x86 == 6) + intel_p6_mcheck_init(c); + if (c->x86 == 15) + intel_p4_mcheck_init(c); + break; + + case X86_VENDOR_CENTAUR: + if (c->x86 == 5) + winchip_mcheck_init(c); + break; + + default: + break; + } +} + +static int __init mcheck_disable(char *str) +{ + mce_disabled = 1; + return 1; +} + +static int __init mcheck_enable(char *str) +{ + mce_disabled = -1; + return 1; +} + +__setup("nomce", mcheck_disable); +__setup("mce", mcheck_enable); + +#endif /* CONFIG_X86_32 */ -- cgit v1.2.3 From 711c2e481c9d1113650d09de10f61ee88ab56fda Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:26 +0200 Subject: x86, mce: unify, prepare for 32-bit v2 Prepare the 64-bit mce_64.c code side to be built on 32-bit. [ includes ifdef relocation by Andi Kleen ] Signed-off-by: Ingo Molnar Cc: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.h | 4 ++-- arch/x86/kernel/cpu/mcheck/mce_64.c | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h index 2d1a54bdadf..cd6cffcc2de 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ b/arch/x86/kernel/cpu/mcheck/mce.h @@ -1,14 +1,14 @@ #include #include -#ifdef CONFIG_X86_32 - void amd_mcheck_init(struct cpuinfo_x86 *c); void intel_p4_mcheck_init(struct cpuinfo_x86 *c); void intel_p5_mcheck_init(struct cpuinfo_x86 *c); void intel_p6_mcheck_init(struct cpuinfo_x86 *c); void winchip_mcheck_init(struct cpuinfo_x86 *c); +#ifdef CONFIG_X86_32 + /* Call the installed machine check handler for this CPU setup. */ extern void (*machine_check_vector)(struct pt_regs *, long error_code); diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index ce48ae75e1d..2e2c3d2e958 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -37,6 +37,10 @@ #include #include +#include "mce.h" + +#ifdef CONFIG_X86_64 + #define MISC_MCELOG_MINOR 227 atomic_t mce_entry; @@ -1241,7 +1245,7 @@ static __init int mce_init_device(void) device_initcall(mce_init_device); -#ifdef CONFIG_X86_32 +#else /* CONFIG_X86_32: */ int mce_disabled; -- cgit v1.2.3 From dba3725d44f5dfb5711fd509fca10b5b828c43b7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:26 +0200 Subject: x86, mce: unify move mce_64.c => mce.c and glue it up in the Makefile. Remove mce_32.c Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 1311 +++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/mcheck/mce_32.c | 77 -- arch/x86/kernel/cpu/mcheck/mce_64.c | 1311 ----------------------------------- 4 files changed, 1312 insertions(+), 1389 deletions(-) create mode 100644 arch/x86/kernel/cpu/mcheck/mce.c delete mode 100644 arch/x86/kernel/cpu/mcheck/mce_32.c delete mode 100644 arch/x86/kernel/cpu/mcheck/mce_64.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 6def76942bf..55f01b39a10 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,4 +1,4 @@ -obj-y = mce_$(BITS).o therm_throt.o +obj-y = mce.o therm_throt.o obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c new file mode 100644 index 00000000000..2e2c3d2e958 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -0,0 +1,1311 @@ +/* + * Machine check handler. + * + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "mce.h" + +#ifdef CONFIG_X86_64 + +#define MISC_MCELOG_MINOR 227 + +atomic_t mce_entry; + +static int mce_dont_init; + +/* + * Tolerant levels: + * 0: always panic on uncorrected errors, log corrected errors + * 1: panic or SIGBUS on uncorrected errors, log corrected errors + * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors + * 3: never panic or SIGBUS, log all errors (for testing only) + */ +static int tolerant = 1; +static int banks; +static u64 *bank; +static unsigned long notify_user; +static int rip_msr; +static int mce_bootlog = -1; +static atomic_t mce_events; + +static char trigger[128]; +static char *trigger_argv[2] = { trigger, NULL }; + +static DECLARE_WAIT_QUEUE_HEAD(mce_wait); + +/* MCA banks polled by the period polling timer for corrected events */ +DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { + [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL +}; + +/* Do initial initialization of a struct mce */ +void mce_setup(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + m->cpu = smp_processor_id(); + rdtscll(m->tsc); +} + +/* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also + * separate MCEs from kernel messages to avoid bogus bug reports. + */ + +static struct mce_log mcelog = { + MCE_LOG_SIGNATURE, + MCE_LOG_LEN, +}; + +void mce_log(struct mce *mce) +{ + unsigned next, entry; + + atomic_inc(&mce_events); + mce->finished = 0; + wmb(); + for (;;) { + entry = rcu_dereference(mcelog.next); + for (;;) { + /* + * When the buffer fills up discard new entries. + * Assume that the earlier errors are the more + * interesting ones: + */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); + return; + } + /* Old left over entry. Skip: */ + if (mcelog.entry[entry].finished) { + entry++; + continue; + } + break; + } + smp_rmb(); + next = entry + 1; + if (cmpxchg(&mcelog.next, entry, next) == entry) + break; + } + memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); + wmb(); + mcelog.entry[entry].finished = 1; + wmb(); + + set_bit(0, ¬ify_user); +} + +static void print_mce(struct mce *m) +{ + printk(KERN_EMERG "\n" + KERN_EMERG "HARDWARE ERROR\n" + KERN_EMERG + "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + m->cpu, m->mcgstatus, m->bank, m->status); + if (m->ip) { + printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", + !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", + m->cs, m->ip); + if (m->cs == __KERNEL_CS) + print_symbol("{%s}", m->ip); + printk("\n"); + } + printk(KERN_EMERG "TSC %llx ", m->tsc); + if (m->addr) + printk("ADDR %llx ", m->addr); + if (m->misc) + printk("MISC %llx ", m->misc); + printk("\n"); + printk(KERN_EMERG "This is not a software problem!\n"); + printk(KERN_EMERG "Run through mcelog --ascii to decode " + "and contact your hardware vendor\n"); +} + +static void mce_panic(char *msg, struct mce *backup, unsigned long start) +{ + int i; + + oops_begin(); + for (i = 0; i < MCE_LOG_LEN; i++) { + unsigned long tsc = mcelog.entry[i].tsc; + + if (time_before(tsc, start)) + continue; + print_mce(&mcelog.entry[i]); + if (backup && mcelog.entry[i].tsc == backup->tsc) + backup = NULL; + } + if (backup) + print_mce(backup); + panic(msg); +} + +int mce_available(struct cpuinfo_x86 *c) +{ + if (mce_dont_init) + return 0; + return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); +} + +static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) +{ + if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { + m->ip = regs->ip; + m->cs = regs->cs; + } else { + m->ip = 0; + m->cs = 0; + } + if (rip_msr) { + /* Assume the RIP in the MSR is exact. Is this true? */ + m->mcgstatus |= MCG_STATUS_EIPV; + rdmsrl(rip_msr, m->ip); + m->cs = 0; + } +} + +/* + * Poll for corrected events or events that happened before reset. + * Those are just logged through /dev/mcelog. + * + * This is executed in standard interrupt context. + */ +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) +{ + struct mce m; + int i; + + mce_setup(&m); + + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + for (i = 0; i < banks; i++) { + if (!bank[i] || !test_bit(i, *b)) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + m.tsc = 0; + + barrier(); + rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + if (!(m.status & MCI_STATUS_VAL)) + continue; + + /* + * Uncorrected events are handled by the exception handler + * when it is enabled. But when the exception is disabled log + * everything. + * + * TBD do the same check for MCI_STATUS_EN here? + */ + if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) + continue; + + if (m.status & MCI_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + + if (!(flags & MCP_TIMESTAMP)) + m.tsc = 0; + /* + * Don't get the IP here because it's unlikely to + * have anything to do with the actual error location. + */ + if (!(flags & MCP_DONTLOG)) { + mce_log(&m); + add_taint(TAINT_MACHINE_CHECK); + } + + /* + * Clear state for this bank. + */ + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } + + /* + * Don't clear MCG_STATUS here because it's only defined for + * exceptions. + */ +} + +/* + * The actual machine check handler. This only handles real + * exceptions when something got corrupted coming in through int 18. + * + * This is executed in NMI context not subject to normal locking rules. This + * implies that most kernel services cannot be safely used. Don't even + * think about putting a printk in there! + */ +void do_machine_check(struct pt_regs *regs, long error_code) +{ + struct mce m, panicm; + int panicm_found = 0; + u64 mcestart = 0; + int i; + /* + * If no_way_out gets set, there is no safe way to recover from this + * MCE. If tolerant is cranked up, we'll try anyway. + */ + int no_way_out = 0; + /* + * If kill_it gets set, there might be a way to recover from this + * error. + */ + int kill_it = 0; + DECLARE_BITMAP(toclear, MAX_NR_BANKS); + + atomic_inc(&mce_entry); + + if (notify_die(DIE_NMI, "machine check", regs, error_code, + 18, SIGKILL) == NOTIFY_STOP) + goto out2; + if (!banks) + goto out2; + + mce_setup(&m); + + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + + /* if the restart IP is not valid, we're done for */ + if (!(m.mcgstatus & MCG_STATUS_RIPV)) + no_way_out = 1; + + rdtscll(mcestart); + barrier(); + + for (i = 0; i < banks; i++) { + __clear_bit(i, toclear); + if (!bank[i]) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + + rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + if ((m.status & MCI_STATUS_VAL) == 0) + continue; + + /* + * Non uncorrected errors are handled by machine_check_poll + * Leave them alone. + */ + if ((m.status & MCI_STATUS_UC) == 0) + continue; + + /* + * Set taint even when machine check was not enabled. + */ + add_taint(TAINT_MACHINE_CHECK); + + __set_bit(i, toclear); + + if (m.status & MCI_STATUS_EN) { + /* if PCC was set, there's no way out */ + no_way_out |= !!(m.status & MCI_STATUS_PCC); + /* + * If this error was uncorrectable and there was + * an overflow, we're in trouble. If no overflow, + * we might get away with just killing a task. + */ + if (m.status & MCI_STATUS_UC) { + if (tolerant < 1 || m.status & MCI_STATUS_OVER) + no_way_out = 1; + kill_it = 1; + } + } else { + /* + * Machine check event was not enabled. Clear, but + * ignore. + */ + continue; + } + + if (m.status & MCI_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + + mce_get_rip(&m, regs); + mce_log(&m); + + /* + * Did this bank cause the exception? + * + * Assume that the bank with uncorrectable errors did it, + * and that there is only a single one: + */ + if ((m.status & MCI_STATUS_UC) && + (m.status & MCI_STATUS_EN)) { + panicm = m; + panicm_found = 1; + } + } + + /* + * If we didn't find an uncorrectable error, pick + * the last one (shouldn't happen, just being safe). + */ + if (!panicm_found) + panicm = m; + + /* + * If we have decided that we just CAN'T continue, and the user + * has not set tolerant to an insane level, give up and die. + */ + if (no_way_out && tolerant < 3) + mce_panic("Machine check", &panicm, mcestart); + + /* + * If the error seems to be unrecoverable, something should be + * done. Try to kill as little as possible. If we can kill just + * one task, do that. If the user has set the tolerance very + * high, don't try to do anything at all. + */ + if (kill_it && tolerant < 3) { + int user_space = 0; + + /* + * If the EIPV bit is set, it means the saved IP is the + * instruction which caused the MCE. + */ + if (m.mcgstatus & MCG_STATUS_EIPV) + user_space = panicm.ip && (panicm.cs & 3); + + /* + * If we know that the error was in user space, send a + * SIGBUS. Otherwise, panic if tolerance is low. + * + * force_sig() takes an awful lot of locks and has a slight + * risk of deadlocking. + */ + if (user_space) { + force_sig(SIGBUS, current); + } else if (panic_on_oops || tolerant < 2) { + mce_panic("Uncorrected machine check", + &panicm, mcestart); + } + } + + /* notify userspace ASAP */ + set_thread_flag(TIF_MCE_NOTIFY); + + /* the last thing we do is clear state */ + for (i = 0; i < banks; i++) { + if (test_bit(i, toclear)) + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } + wrmsrl(MSR_IA32_MCG_STATUS, 0); + out2: + atomic_dec(&mce_entry); +} + +#ifdef CONFIG_X86_MCE_INTEL +/*** + * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog + * @cpu: The CPU on which the event occurred. + * @status: Event status information + * + * This function should be called by the thermal interrupt after the + * event has been processed and the decision was made to log the event + * further. + * + * The status parameter will be saved to the 'status' field of 'struct mce' + * and historically has been the register value of the + * MSR_IA32_THERMAL_STATUS (Intel) msr. + */ +void mce_log_therm_throt_event(__u64 status) +{ + struct mce m; + + mce_setup(&m); + m.bank = MCE_THERMAL_BANK; + m.status = status; + mce_log(&m); +} +#endif /* CONFIG_X86_MCE_INTEL */ + +/* + * Periodic polling timer for "silent" machine check errors. If the + * poller finds an MCE, poll 2x faster. When the poller finds no more + * errors, poll 2x slower (up to check_interval seconds). + */ +static int check_interval = 5 * 60; /* 5 minutes */ + +static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ +static DEFINE_PER_CPU(struct timer_list, mce_timer); + +static void mcheck_timer(unsigned long data) +{ + struct timer_list *t = &per_cpu(mce_timer, data); + int *n; + + WARN_ON(smp_processor_id() != data); + + if (mce_available(¤t_cpu_data)) { + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); + } + + /* + * Alert userspace if needed. If we logged an MCE, reduce the + * polling interval, otherwise increase the polling interval. + */ + n = &__get_cpu_var(next_interval); + if (mce_notify_user()) { + *n = max(*n/2, HZ/100); + } else { + *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); + } + + t->expires = jiffies + *n; + add_timer(t); +} + +static void mce_do_trigger(struct work_struct *work) +{ + call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); +} + +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); + +/* + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context. + */ +int mce_notify_user(void) +{ + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + + clear_thread_flag(TIF_MCE_NOTIFY); + + if (test_and_clear_bit(0, ¬ify_user)) { + wake_up_interruptible(&mce_wait); + + /* + * There is no risk of missing notifications because + * work_pending is always cleared before the function is + * executed. + */ + if (trigger[0] && !work_pending(&mce_trigger_work)) + schedule_work(&mce_trigger_work); + + if (__ratelimit(&ratelimit)) + printk(KERN_INFO "Machine check events logged\n"); + + return 1; + } + return 0; +} + +/* see if the idle task needs to notify userspace: */ +static int +mce_idle_callback(struct notifier_block *nfb, unsigned long action, + void *unused) +{ + /* IDLE_END should be safe - interrupts are back on */ + if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) + mce_notify_user(); + + return NOTIFY_OK; +} + +static struct notifier_block mce_idle_notifier = { + .notifier_call = mce_idle_callback, +}; + +static __init int periodic_mcheck_init(void) +{ + idle_notifier_register(&mce_idle_notifier); + return 0; +} +__initcall(periodic_mcheck_init); + +/* + * Initialize Machine Checks for a CPU. + */ +static int mce_cap_init(void) +{ + unsigned b; + u64 cap; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + b = cap & 0xff; + if (b > MAX_NR_BANKS) { + printk(KERN_WARNING + "MCE: Using only %u machine check banks out of %u\n", + MAX_NR_BANKS, b); + b = MAX_NR_BANKS; + } + + /* Don't support asymmetric configurations today */ + WARN_ON(banks != 0 && b != banks); + banks = b; + if (!bank) { + bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); + if (!bank) + return -ENOMEM; + memset(bank, 0xff, banks * sizeof(u64)); + } + + /* Use accurate RIP reporting if available. */ + if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) + rip_msr = MSR_IA32_MCG_EIP; + + return 0; +} + +static void mce_init(void *dummy) +{ + mce_banks_t all_banks; + u64 cap; + int i; + + /* + * Log the machine checks left over from the previous reset. + */ + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); + + set_in_cr4(X86_CR4_MCE); + + rdmsrl(MSR_IA32_MCG_CAP, cap); + if (cap & MCG_CTL_P) + wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + + for (i = 0; i < banks; i++) { + wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } +} + +/* Add per CPU specific workarounds here */ +static void mce_cpu_quirks(struct cpuinfo_x86 *c) +{ + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86_vendor == X86_VENDOR_AMD) { + if (c->x86 == 15 && banks > 4) { + /* + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: + */ + clear_bit(10, (unsigned long *)&bank[4]); + } + if (c->x86 <= 17 && mce_bootlog < 0) { + /* + * Lots of broken BIOS around that don't clear them + * by default and leave crap in there. Don't log: + */ + mce_bootlog = 0; + } + } + +} + +static void mce_cpu_features(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_init(c); + break; + case X86_VENDOR_AMD: + mce_amd_feature_init(c); + break; + default: + break; + } +} + +static void mce_init_timer(void) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + int *n = &__get_cpu_var(next_interval); + + *n = check_interval * HZ; + if (!*n) + return; + setup_timer(t, mcheck_timer, smp_processor_id()); + t->expires = round_jiffies(jiffies + *n); + add_timer(t); +} + +/* + * Called for each booted CPU to set up machine checks. + * Must be called with preempt off: + */ +void __cpuinit mcheck_init(struct cpuinfo_x86 *c) +{ + if (!mce_available(c)) + return; + + if (mce_cap_init() < 0) { + mce_dont_init = 1; + return; + } + mce_cpu_quirks(c); + + mce_init(NULL); + mce_cpu_features(c); + mce_init_timer(); +} + +/* + * Character device to read and clear the MCE log. + */ + +static DEFINE_SPINLOCK(mce_state_lock); +static int open_count; /* #times opened */ +static int open_exclu; /* already open exclusive? */ + +static int mce_open(struct inode *inode, struct file *file) +{ + lock_kernel(); + spin_lock(&mce_state_lock); + + if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&mce_state_lock); + unlock_kernel(); + + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + open_exclu = 1; + open_count++; + + spin_unlock(&mce_state_lock); + unlock_kernel(); + + return nonseekable_open(inode, file); +} + +static int mce_release(struct inode *inode, struct file *file) +{ + spin_lock(&mce_state_lock); + + open_count--; + open_exclu = 0; + + spin_unlock(&mce_state_lock); + + return 0; +} + +static void collect_tscs(void *data) +{ + unsigned long *cpu_tsc = (unsigned long *)data; + + rdtscll(cpu_tsc[smp_processor_id()]); +} + +static DEFINE_MUTEX(mce_read_mutex); + +static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, + loff_t *off) +{ + char __user *buf = ubuf; + unsigned long *cpu_tsc; + unsigned prev, next; + int i, err; + + cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); + if (!cpu_tsc) + return -ENOMEM; + + mutex_lock(&mce_read_mutex); + next = rcu_dereference(mcelog.next); + + /* Only supports full reads right now */ + if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { + mutex_unlock(&mce_read_mutex); + kfree(cpu_tsc); + + return -EINVAL; + } + + err = 0; + prev = 0; + do { + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + + while (!mcelog.entry[i].finished) { + if (time_after_eq(jiffies, start + 2)) { + memset(mcelog.entry + i, 0, + sizeof(struct mce)); + goto timeout; + } + cpu_relax(); + } + smp_rmb(); + err |= copy_to_user(buf, mcelog.entry + i, + sizeof(struct mce)); + buf += sizeof(struct mce); +timeout: + ; + } + + memset(mcelog.entry + prev, 0, + (next - prev) * sizeof(struct mce)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); + + synchronize_sched(); + + /* + * Collect entries that were still getting written before the + * synchronize. + */ + on_each_cpu(collect_tscs, cpu_tsc, 1); + + for (i = next; i < MCE_LOG_LEN; i++) { + if (mcelog.entry[i].finished && + mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { + err |= copy_to_user(buf, mcelog.entry+i, + sizeof(struct mce)); + smp_rmb(); + buf += sizeof(struct mce); + memset(&mcelog.entry[i], 0, sizeof(struct mce)); + } + } + mutex_unlock(&mce_read_mutex); + kfree(cpu_tsc); + + return err ? -EFAULT : buf - ubuf; +} + +static unsigned int mce_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &mce_wait, wait); + if (rcu_dereference(mcelog.next)) + return POLLIN | POLLRDNORM; + return 0; +} + +static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + int __user *p = (int __user *)arg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct mce), p); + case MCE_GET_LOG_LEN: + return put_user(MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + + do { + flags = mcelog.flags; + } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +static const struct file_operations mce_chrdev_ops = { + .open = mce_open, + .release = mce_release, + .read = mce_read, + .poll = mce_poll, + .unlocked_ioctl = mce_ioctl, +}; + +static struct miscdevice mce_log_device = { + MISC_MCELOG_MINOR, + "mcelog", + &mce_chrdev_ops, +}; + +/* + * Old style boot options parsing. Only for compatibility. + */ +static int __init mcheck_disable(char *str) +{ + mce_dont_init = 1; + return 1; +} +__setup("nomce", mcheck_disable); + +/* + * mce=off disables machine check + * mce=TOLERANCELEVEL (number, see above) + * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + * mce=nobootlog Don't log MCEs from before booting. + */ +static int __init mcheck_enable(char *str) +{ + if (!strcmp(str, "off")) + mce_dont_init = 1; + else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) + mce_bootlog = (str[0] == 'b'); + else if (isdigit(str[0])) + get_option(&str, &tolerant); + else { + printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n", + str); + return 0; + } + return 1; +} +__setup("mce=", mcheck_enable); + +/* + * Sysfs support + */ + +/* + * Disable machine checks on suspend and shutdown. We can't really handle + * them later. + */ +static int mce_disable(void) +{ + int i; + + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + return 0; +} + +static int mce_suspend(struct sys_device *dev, pm_message_t state) +{ + return mce_disable(); +} + +static int mce_shutdown(struct sys_device *dev) +{ + return mce_disable(); +} + +/* + * On resume clear all MCE state. Don't want to see leftovers from the BIOS. + * Only one CPU is active at this time, the others get re-added later using + * CPU hotplug: + */ +static int mce_resume(struct sys_device *dev) +{ + mce_init(NULL); + mce_cpu_features(¤t_cpu_data); + + return 0; +} + +static void mce_cpu_restart(void *data) +{ + del_timer_sync(&__get_cpu_var(mce_timer)); + if (mce_available(¤t_cpu_data)) + mce_init(NULL); + mce_init_timer(); +} + +/* Reinit MCEs after user configuration changes */ +static void mce_restart(void) +{ + on_each_cpu(mce_cpu_restart, NULL, 1); +} + +static struct sysdev_class mce_sysclass = { + .suspend = mce_suspend, + .shutdown = mce_shutdown, + .resume = mce_resume, + .name = "machinecheck", +}; + +DEFINE_PER_CPU(struct sys_device, device_mce); + +__cpuinitdata +void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); + +/* Why are there no generic functions for this? */ +#define ACCESSOR(name, var, start) \ + static ssize_t show_ ## name(struct sys_device *s, \ + struct sysdev_attribute *attr, \ + char *buf) { \ + return sprintf(buf, "%lx\n", (unsigned long)var); \ + } \ + static ssize_t set_ ## name(struct sys_device *s, \ + struct sysdev_attribute *attr, \ + const char *buf, size_t siz) { \ + char *end; \ + unsigned long new = simple_strtoul(buf, &end, 0); \ + \ + if (end == buf) \ + return -EINVAL; \ + var = new; \ + start; \ + \ + return end-buf; \ + } \ + static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); + +static struct sysdev_attribute *bank_attrs; + +static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, + char *buf) +{ + u64 b = bank[attr - bank_attrs]; + + return sprintf(buf, "%llx\n", b); +} + +static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, + const char *buf, size_t siz) +{ + char *end; + u64 new = simple_strtoull(buf, &end, 0); + + if (end == buf) + return -EINVAL; + + bank[attr - bank_attrs] = new; + mce_restart(); + + return end-buf; +} + +static ssize_t +show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) +{ + strcpy(buf, trigger); + strcat(buf, "\n"); + return strlen(trigger) + 1; +} + +static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, + const char *buf, size_t siz) +{ + char *p; + int len; + + strncpy(trigger, buf, sizeof(trigger)); + trigger[sizeof(trigger)-1] = 0; + len = strlen(trigger); + p = strchr(trigger, '\n'); + + if (*p) + *p = 0; + + return len; +} + +static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); +static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); + +ACCESSOR(check_interval, check_interval, mce_restart()) + +static struct sysdev_attribute *mce_attributes[] = { + &attr_tolerant.attr, &attr_check_interval, &attr_trigger, + NULL +}; + +static cpumask_var_t mce_device_initialized; + +/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ +static __cpuinit int mce_create_device(unsigned int cpu) +{ + int err; + int i; + + if (!mce_available(&boot_cpu_data)) + return -EIO; + + memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); + per_cpu(device_mce, cpu).id = cpu; + per_cpu(device_mce, cpu).cls = &mce_sysclass; + + err = sysdev_register(&per_cpu(device_mce, cpu)); + if (err) + return err; + + for (i = 0; mce_attributes[i]; i++) { + err = sysdev_create_file(&per_cpu(device_mce, cpu), + mce_attributes[i]); + if (err) + goto error; + } + for (i = 0; i < banks; i++) { + err = sysdev_create_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); + if (err) + goto error2; + } + cpumask_set_cpu(cpu, mce_device_initialized); + + return 0; +error2: + while (--i >= 0) { + sysdev_remove_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); + } +error: + while (--i >= 0) { + sysdev_remove_file(&per_cpu(device_mce, cpu), + mce_attributes[i]); + } + sysdev_unregister(&per_cpu(device_mce, cpu)); + + return err; +} + +static __cpuinit void mce_remove_device(unsigned int cpu) +{ + int i; + + if (!cpumask_test_cpu(cpu, mce_device_initialized)) + return; + + for (i = 0; mce_attributes[i]; i++) + sysdev_remove_file(&per_cpu(device_mce, cpu), + mce_attributes[i]); + for (i = 0; i < banks; i++) + sysdev_remove_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); + sysdev_unregister(&per_cpu(device_mce, cpu)); + cpumask_clear_cpu(cpu, mce_device_initialized); +} + +/* Make sure there are no machine checks on offlined CPUs. */ +static void mce_disable_cpu(void *h) +{ + int i; + unsigned long action = *(unsigned long *)h; + + if (!mce_available(¤t_cpu_data)) + return; + if (!(action & CPU_TASKS_FROZEN)) + cmci_clear(); + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); +} + +static void mce_reenable_cpu(void *h) +{ + unsigned long action = *(unsigned long *)h; + int i; + + if (!mce_available(¤t_cpu_data)) + return; + + if (!(action & CPU_TASKS_FROZEN)) + cmci_reenable(); + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); +} + +/* Get notified when a cpu comes on/off. Be hotplug friendly. */ +static int __cpuinit +mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct timer_list *t = &per_cpu(mce_timer, cpu); + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + mce_create_device(cpu); + if (threshold_cpu_callback) + threshold_cpu_callback(action, cpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + if (threshold_cpu_callback) + threshold_cpu_callback(action, cpu); + mce_remove_device(cpu); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + del_timer_sync(t); + smp_call_function_single(cpu, mce_disable_cpu, &action, 1); + break; + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + t->expires = round_jiffies(jiffies + + __get_cpu_var(next_interval)); + add_timer_on(t, cpu); + smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); + break; + case CPU_POST_DEAD: + /* intentionally ignoring frozen here */ + cmci_rediscover(cpu); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block mce_cpu_notifier __cpuinitdata = { + .notifier_call = mce_cpu_callback, +}; + +static __init int mce_init_banks(void) +{ + int i; + + bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, + GFP_KERNEL); + if (!bank_attrs) + return -ENOMEM; + + for (i = 0; i < banks; i++) { + struct sysdev_attribute *a = &bank_attrs[i]; + + a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); + if (!a->attr.name) + goto nomem; + + a->attr.mode = 0644; + a->show = show_bank; + a->store = set_bank; + } + return 0; + +nomem: + while (--i >= 0) + kfree(bank_attrs[i].attr.name); + kfree(bank_attrs); + bank_attrs = NULL; + + return -ENOMEM; +} + +static __init int mce_init_device(void) +{ + int err; + int i = 0; + + if (!mce_available(&boot_cpu_data)) + return -EIO; + + alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); + + err = mce_init_banks(); + if (err) + return err; + + err = sysdev_class_register(&mce_sysclass); + if (err) + return err; + + for_each_online_cpu(i) { + err = mce_create_device(i); + if (err) + return err; + } + + register_hotcpu_notifier(&mce_cpu_notifier); + misc_register(&mce_log_device); + + return err; +} + +device_initcall(mce_init_device); + +#else /* CONFIG_X86_32: */ + +int mce_disabled; + +int nr_mce_banks; +EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ + +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; + +/* This has to be run for each processor */ +void mcheck_init(struct cpuinfo_x86 *c) +{ + if (mce_disabled == 1) + return; + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + amd_mcheck_init(c); + break; + + case X86_VENDOR_INTEL: + if (c->x86 == 5) + intel_p5_mcheck_init(c); + if (c->x86 == 6) + intel_p6_mcheck_init(c); + if (c->x86 == 15) + intel_p4_mcheck_init(c); + break; + + case X86_VENDOR_CENTAUR: + if (c->x86 == 5) + winchip_mcheck_init(c); + break; + + default: + break; + } +} + +static int __init mcheck_disable(char *str) +{ + mce_disabled = 1; + return 1; +} + +static int __init mcheck_enable(char *str) +{ + mce_disabled = -1; + return 1; +} + +__setup("nomce", mcheck_disable); +__setup("mce", mcheck_enable); + +#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c deleted file mode 100644 index 05979e7eff1..00000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * mce.c - x86 Machine Check Exception Reporting - * (c) 2002 Alan Cox , Dave Jones - */ -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "mce.h" - -int mce_disabled; - -int nr_mce_banks; -EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ - -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", - smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; - -/* This has to be run for each processor */ -void mcheck_init(struct cpuinfo_x86 *c) -{ - if (mce_disabled == 1) - return; - - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - amd_mcheck_init(c); - break; - - case X86_VENDOR_INTEL: - if (c->x86 == 5) - intel_p5_mcheck_init(c); - if (c->x86 == 6) - intel_p6_mcheck_init(c); - if (c->x86 == 15) - intel_p4_mcheck_init(c); - break; - - case X86_VENDOR_CENTAUR: - if (c->x86 == 5) - winchip_mcheck_init(c); - break; - - default: - break; - } -} - -static int __init mcheck_disable(char *str) -{ - mce_disabled = 1; - return 1; -} - -static int __init mcheck_enable(char *str) -{ - mce_disabled = -1; - return 1; -} - -__setup("nomce", mcheck_disable); -__setup("mce", mcheck_enable); diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c deleted file mode 100644 index 2e2c3d2e958..00000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ /dev/null @@ -1,1311 +0,0 @@ -/* - * Machine check handler. - * - * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. - * Rest from unknown author(s). - * 2004 Andi Kleen. Rewrote most of it. - * Copyright 2008 Intel Corporation - * Author: Andi Kleen - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "mce.h" - -#ifdef CONFIG_X86_64 - -#define MISC_MCELOG_MINOR 227 - -atomic_t mce_entry; - -static int mce_dont_init; - -/* - * Tolerant levels: - * 0: always panic on uncorrected errors, log corrected errors - * 1: panic or SIGBUS on uncorrected errors, log corrected errors - * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors - * 3: never panic or SIGBUS, log all errors (for testing only) - */ -static int tolerant = 1; -static int banks; -static u64 *bank; -static unsigned long notify_user; -static int rip_msr; -static int mce_bootlog = -1; -static atomic_t mce_events; - -static char trigger[128]; -static char *trigger_argv[2] = { trigger, NULL }; - -static DECLARE_WAIT_QUEUE_HEAD(mce_wait); - -/* MCA banks polled by the period polling timer for corrected events */ -DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { - [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL -}; - -/* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) -{ - memset(m, 0, sizeof(struct mce)); - m->cpu = smp_processor_id(); - rdtscll(m->tsc); -} - -/* - * Lockless MCE logging infrastructure. - * This avoids deadlocks on printk locks without having to break locks. Also - * separate MCEs from kernel messages to avoid bogus bug reports. - */ - -static struct mce_log mcelog = { - MCE_LOG_SIGNATURE, - MCE_LOG_LEN, -}; - -void mce_log(struct mce *mce) -{ - unsigned next, entry; - - atomic_inc(&mce_events); - mce->finished = 0; - wmb(); - for (;;) { - entry = rcu_dereference(mcelog.next); - for (;;) { - /* - * When the buffer fills up discard new entries. - * Assume that the earlier errors are the more - * interesting ones: - */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); - return; - } - /* Old left over entry. Skip: */ - if (mcelog.entry[entry].finished) { - entry++; - continue; - } - break; - } - smp_rmb(); - next = entry + 1; - if (cmpxchg(&mcelog.next, entry, next) == entry) - break; - } - memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - wmb(); - mcelog.entry[entry].finished = 1; - wmb(); - - set_bit(0, ¬ify_user); -} - -static void print_mce(struct mce *m) -{ - printk(KERN_EMERG "\n" - KERN_EMERG "HARDWARE ERROR\n" - KERN_EMERG - "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", - m->cpu, m->mcgstatus, m->bank, m->status); - if (m->ip) { - printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", - !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", - m->cs, m->ip); - if (m->cs == __KERNEL_CS) - print_symbol("{%s}", m->ip); - printk("\n"); - } - printk(KERN_EMERG "TSC %llx ", m->tsc); - if (m->addr) - printk("ADDR %llx ", m->addr); - if (m->misc) - printk("MISC %llx ", m->misc); - printk("\n"); - printk(KERN_EMERG "This is not a software problem!\n"); - printk(KERN_EMERG "Run through mcelog --ascii to decode " - "and contact your hardware vendor\n"); -} - -static void mce_panic(char *msg, struct mce *backup, unsigned long start) -{ - int i; - - oops_begin(); - for (i = 0; i < MCE_LOG_LEN; i++) { - unsigned long tsc = mcelog.entry[i].tsc; - - if (time_before(tsc, start)) - continue; - print_mce(&mcelog.entry[i]); - if (backup && mcelog.entry[i].tsc == backup->tsc) - backup = NULL; - } - if (backup) - print_mce(backup); - panic(msg); -} - -int mce_available(struct cpuinfo_x86 *c) -{ - if (mce_dont_init) - return 0; - return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); -} - -static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) -{ - if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { - m->ip = regs->ip; - m->cs = regs->cs; - } else { - m->ip = 0; - m->cs = 0; - } - if (rip_msr) { - /* Assume the RIP in the MSR is exact. Is this true? */ - m->mcgstatus |= MCG_STATUS_EIPV; - rdmsrl(rip_msr, m->ip); - m->cs = 0; - } -} - -/* - * Poll for corrected events or events that happened before reset. - * Those are just logged through /dev/mcelog. - * - * This is executed in standard interrupt context. - */ -void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) -{ - struct mce m; - int i; - - mce_setup(&m); - - rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); - for (i = 0; i < banks; i++) { - if (!bank[i] || !test_bit(i, *b)) - continue; - - m.misc = 0; - m.addr = 0; - m.bank = i; - m.tsc = 0; - - barrier(); - rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); - if (!(m.status & MCI_STATUS_VAL)) - continue; - - /* - * Uncorrected events are handled by the exception handler - * when it is enabled. But when the exception is disabled log - * everything. - * - * TBD do the same check for MCI_STATUS_EN here? - */ - if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) - continue; - - if (m.status & MCI_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); - if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); - - if (!(flags & MCP_TIMESTAMP)) - m.tsc = 0; - /* - * Don't get the IP here because it's unlikely to - * have anything to do with the actual error location. - */ - if (!(flags & MCP_DONTLOG)) { - mce_log(&m); - add_taint(TAINT_MACHINE_CHECK); - } - - /* - * Clear state for this bank. - */ - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - } - - /* - * Don't clear MCG_STATUS here because it's only defined for - * exceptions. - */ -} - -/* - * The actual machine check handler. This only handles real - * exceptions when something got corrupted coming in through int 18. - * - * This is executed in NMI context not subject to normal locking rules. This - * implies that most kernel services cannot be safely used. Don't even - * think about putting a printk in there! - */ -void do_machine_check(struct pt_regs *regs, long error_code) -{ - struct mce m, panicm; - int panicm_found = 0; - u64 mcestart = 0; - int i; - /* - * If no_way_out gets set, there is no safe way to recover from this - * MCE. If tolerant is cranked up, we'll try anyway. - */ - int no_way_out = 0; - /* - * If kill_it gets set, there might be a way to recover from this - * error. - */ - int kill_it = 0; - DECLARE_BITMAP(toclear, MAX_NR_BANKS); - - atomic_inc(&mce_entry); - - if (notify_die(DIE_NMI, "machine check", regs, error_code, - 18, SIGKILL) == NOTIFY_STOP) - goto out2; - if (!banks) - goto out2; - - mce_setup(&m); - - rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); - - /* if the restart IP is not valid, we're done for */ - if (!(m.mcgstatus & MCG_STATUS_RIPV)) - no_way_out = 1; - - rdtscll(mcestart); - barrier(); - - for (i = 0; i < banks; i++) { - __clear_bit(i, toclear); - if (!bank[i]) - continue; - - m.misc = 0; - m.addr = 0; - m.bank = i; - - rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); - if ((m.status & MCI_STATUS_VAL) == 0) - continue; - - /* - * Non uncorrected errors are handled by machine_check_poll - * Leave them alone. - */ - if ((m.status & MCI_STATUS_UC) == 0) - continue; - - /* - * Set taint even when machine check was not enabled. - */ - add_taint(TAINT_MACHINE_CHECK); - - __set_bit(i, toclear); - - if (m.status & MCI_STATUS_EN) { - /* if PCC was set, there's no way out */ - no_way_out |= !!(m.status & MCI_STATUS_PCC); - /* - * If this error was uncorrectable and there was - * an overflow, we're in trouble. If no overflow, - * we might get away with just killing a task. - */ - if (m.status & MCI_STATUS_UC) { - if (tolerant < 1 || m.status & MCI_STATUS_OVER) - no_way_out = 1; - kill_it = 1; - } - } else { - /* - * Machine check event was not enabled. Clear, but - * ignore. - */ - continue; - } - - if (m.status & MCI_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); - if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); - - mce_get_rip(&m, regs); - mce_log(&m); - - /* - * Did this bank cause the exception? - * - * Assume that the bank with uncorrectable errors did it, - * and that there is only a single one: - */ - if ((m.status & MCI_STATUS_UC) && - (m.status & MCI_STATUS_EN)) { - panicm = m; - panicm_found = 1; - } - } - - /* - * If we didn't find an uncorrectable error, pick - * the last one (shouldn't happen, just being safe). - */ - if (!panicm_found) - panicm = m; - - /* - * If we have decided that we just CAN'T continue, and the user - * has not set tolerant to an insane level, give up and die. - */ - if (no_way_out && tolerant < 3) - mce_panic("Machine check", &panicm, mcestart); - - /* - * If the error seems to be unrecoverable, something should be - * done. Try to kill as little as possible. If we can kill just - * one task, do that. If the user has set the tolerance very - * high, don't try to do anything at all. - */ - if (kill_it && tolerant < 3) { - int user_space = 0; - - /* - * If the EIPV bit is set, it means the saved IP is the - * instruction which caused the MCE. - */ - if (m.mcgstatus & MCG_STATUS_EIPV) - user_space = panicm.ip && (panicm.cs & 3); - - /* - * If we know that the error was in user space, send a - * SIGBUS. Otherwise, panic if tolerance is low. - * - * force_sig() takes an awful lot of locks and has a slight - * risk of deadlocking. - */ - if (user_space) { - force_sig(SIGBUS, current); - } else if (panic_on_oops || tolerant < 2) { - mce_panic("Uncorrected machine check", - &panicm, mcestart); - } - } - - /* notify userspace ASAP */ - set_thread_flag(TIF_MCE_NOTIFY); - - /* the last thing we do is clear state */ - for (i = 0; i < banks; i++) { - if (test_bit(i, toclear)) - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - } - wrmsrl(MSR_IA32_MCG_STATUS, 0); - out2: - atomic_dec(&mce_entry); -} - -#ifdef CONFIG_X86_MCE_INTEL -/*** - * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog - * @cpu: The CPU on which the event occurred. - * @status: Event status information - * - * This function should be called by the thermal interrupt after the - * event has been processed and the decision was made to log the event - * further. - * - * The status parameter will be saved to the 'status' field of 'struct mce' - * and historically has been the register value of the - * MSR_IA32_THERMAL_STATUS (Intel) msr. - */ -void mce_log_therm_throt_event(__u64 status) -{ - struct mce m; - - mce_setup(&m); - m.bank = MCE_THERMAL_BANK; - m.status = status; - mce_log(&m); -} -#endif /* CONFIG_X86_MCE_INTEL */ - -/* - * Periodic polling timer for "silent" machine check errors. If the - * poller finds an MCE, poll 2x faster. When the poller finds no more - * errors, poll 2x slower (up to check_interval seconds). - */ -static int check_interval = 5 * 60; /* 5 minutes */ - -static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ -static DEFINE_PER_CPU(struct timer_list, mce_timer); - -static void mcheck_timer(unsigned long data) -{ - struct timer_list *t = &per_cpu(mce_timer, data); - int *n; - - WARN_ON(smp_processor_id() != data); - - if (mce_available(¤t_cpu_data)) { - machine_check_poll(MCP_TIMESTAMP, - &__get_cpu_var(mce_poll_banks)); - } - - /* - * Alert userspace if needed. If we logged an MCE, reduce the - * polling interval, otherwise increase the polling interval. - */ - n = &__get_cpu_var(next_interval); - if (mce_notify_user()) { - *n = max(*n/2, HZ/100); - } else { - *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); - } - - t->expires = jiffies + *n; - add_timer(t); -} - -static void mce_do_trigger(struct work_struct *work) -{ - call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); -} - -static DECLARE_WORK(mce_trigger_work, mce_do_trigger); - -/* - * Notify the user(s) about new machine check events. - * Can be called from interrupt context, but not from machine check/NMI - * context. - */ -int mce_notify_user(void) -{ - /* Not more than two messages every minute */ - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - - clear_thread_flag(TIF_MCE_NOTIFY); - - if (test_and_clear_bit(0, ¬ify_user)) { - wake_up_interruptible(&mce_wait); - - /* - * There is no risk of missing notifications because - * work_pending is always cleared before the function is - * executed. - */ - if (trigger[0] && !work_pending(&mce_trigger_work)) - schedule_work(&mce_trigger_work); - - if (__ratelimit(&ratelimit)) - printk(KERN_INFO "Machine check events logged\n"); - - return 1; - } - return 0; -} - -/* see if the idle task needs to notify userspace: */ -static int -mce_idle_callback(struct notifier_block *nfb, unsigned long action, - void *unused) -{ - /* IDLE_END should be safe - interrupts are back on */ - if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) - mce_notify_user(); - - return NOTIFY_OK; -} - -static struct notifier_block mce_idle_notifier = { - .notifier_call = mce_idle_callback, -}; - -static __init int periodic_mcheck_init(void) -{ - idle_notifier_register(&mce_idle_notifier); - return 0; -} -__initcall(periodic_mcheck_init); - -/* - * Initialize Machine Checks for a CPU. - */ -static int mce_cap_init(void) -{ - unsigned b; - u64 cap; - - rdmsrl(MSR_IA32_MCG_CAP, cap); - b = cap & 0xff; - if (b > MAX_NR_BANKS) { - printk(KERN_WARNING - "MCE: Using only %u machine check banks out of %u\n", - MAX_NR_BANKS, b); - b = MAX_NR_BANKS; - } - - /* Don't support asymmetric configurations today */ - WARN_ON(banks != 0 && b != banks); - banks = b; - if (!bank) { - bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); - if (!bank) - return -ENOMEM; - memset(bank, 0xff, banks * sizeof(u64)); - } - - /* Use accurate RIP reporting if available. */ - if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) - rip_msr = MSR_IA32_MCG_EIP; - - return 0; -} - -static void mce_init(void *dummy) -{ - mce_banks_t all_banks; - u64 cap; - int i; - - /* - * Log the machine checks left over from the previous reset. - */ - bitmap_fill(all_banks, MAX_NR_BANKS); - machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); - - set_in_cr4(X86_CR4_MCE); - - rdmsrl(MSR_IA32_MCG_CAP, cap); - if (cap & MCG_CTL_P) - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - - for (i = 0; i < banks; i++) { - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - } -} - -/* Add per CPU specific workarounds here */ -static void mce_cpu_quirks(struct cpuinfo_x86 *c) -{ - /* This should be disabled by the BIOS, but isn't always */ - if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && banks > 4) { - /* - * disable GART TBL walk error reporting, which - * trips off incorrectly with the IOMMU & 3ware - * & Cerberus: - */ - clear_bit(10, (unsigned long *)&bank[4]); - } - if (c->x86 <= 17 && mce_bootlog < 0) { - /* - * Lots of broken BIOS around that don't clear them - * by default and leave crap in there. Don't log: - */ - mce_bootlog = 0; - } - } - -} - -static void mce_cpu_features(struct cpuinfo_x86 *c) -{ - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - mce_intel_feature_init(c); - break; - case X86_VENDOR_AMD: - mce_amd_feature_init(c); - break; - default: - break; - } -} - -static void mce_init_timer(void) -{ - struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(next_interval); - - *n = check_interval * HZ; - if (!*n) - return; - setup_timer(t, mcheck_timer, smp_processor_id()); - t->expires = round_jiffies(jiffies + *n); - add_timer(t); -} - -/* - * Called for each booted CPU to set up machine checks. - * Must be called with preempt off: - */ -void __cpuinit mcheck_init(struct cpuinfo_x86 *c) -{ - if (!mce_available(c)) - return; - - if (mce_cap_init() < 0) { - mce_dont_init = 1; - return; - } - mce_cpu_quirks(c); - - mce_init(NULL); - mce_cpu_features(c); - mce_init_timer(); -} - -/* - * Character device to read and clear the MCE log. - */ - -static DEFINE_SPINLOCK(mce_state_lock); -static int open_count; /* #times opened */ -static int open_exclu; /* already open exclusive? */ - -static int mce_open(struct inode *inode, struct file *file) -{ - lock_kernel(); - spin_lock(&mce_state_lock); - - if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { - spin_unlock(&mce_state_lock); - unlock_kernel(); - - return -EBUSY; - } - - if (file->f_flags & O_EXCL) - open_exclu = 1; - open_count++; - - spin_unlock(&mce_state_lock); - unlock_kernel(); - - return nonseekable_open(inode, file); -} - -static int mce_release(struct inode *inode, struct file *file) -{ - spin_lock(&mce_state_lock); - - open_count--; - open_exclu = 0; - - spin_unlock(&mce_state_lock); - - return 0; -} - -static void collect_tscs(void *data) -{ - unsigned long *cpu_tsc = (unsigned long *)data; - - rdtscll(cpu_tsc[smp_processor_id()]); -} - -static DEFINE_MUTEX(mce_read_mutex); - -static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, - loff_t *off) -{ - char __user *buf = ubuf; - unsigned long *cpu_tsc; - unsigned prev, next; - int i, err; - - cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); - if (!cpu_tsc) - return -ENOMEM; - - mutex_lock(&mce_read_mutex); - next = rcu_dereference(mcelog.next); - - /* Only supports full reads right now */ - if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { - mutex_unlock(&mce_read_mutex); - kfree(cpu_tsc); - - return -EINVAL; - } - - err = 0; - prev = 0; - do { - for (i = prev; i < next; i++) { - unsigned long start = jiffies; - - while (!mcelog.entry[i].finished) { - if (time_after_eq(jiffies, start + 2)) { - memset(mcelog.entry + i, 0, - sizeof(struct mce)); - goto timeout; - } - cpu_relax(); - } - smp_rmb(); - err |= copy_to_user(buf, mcelog.entry + i, - sizeof(struct mce)); - buf += sizeof(struct mce); -timeout: - ; - } - - memset(mcelog.entry + prev, 0, - (next - prev) * sizeof(struct mce)); - prev = next; - next = cmpxchg(&mcelog.next, prev, 0); - } while (next != prev); - - synchronize_sched(); - - /* - * Collect entries that were still getting written before the - * synchronize. - */ - on_each_cpu(collect_tscs, cpu_tsc, 1); - - for (i = next; i < MCE_LOG_LEN; i++) { - if (mcelog.entry[i].finished && - mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { - err |= copy_to_user(buf, mcelog.entry+i, - sizeof(struct mce)); - smp_rmb(); - buf += sizeof(struct mce); - memset(&mcelog.entry[i], 0, sizeof(struct mce)); - } - } - mutex_unlock(&mce_read_mutex); - kfree(cpu_tsc); - - return err ? -EFAULT : buf - ubuf; -} - -static unsigned int mce_poll(struct file *file, poll_table *wait) -{ - poll_wait(file, &mce_wait, wait); - if (rcu_dereference(mcelog.next)) - return POLLIN | POLLRDNORM; - return 0; -} - -static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) -{ - int __user *p = (int __user *)arg; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (cmd) { - case MCE_GET_RECORD_LEN: - return put_user(sizeof(struct mce), p); - case MCE_GET_LOG_LEN: - return put_user(MCE_LOG_LEN, p); - case MCE_GETCLEAR_FLAGS: { - unsigned flags; - - do { - flags = mcelog.flags; - } while (cmpxchg(&mcelog.flags, flags, 0) != flags); - - return put_user(flags, p); - } - default: - return -ENOTTY; - } -} - -static const struct file_operations mce_chrdev_ops = { - .open = mce_open, - .release = mce_release, - .read = mce_read, - .poll = mce_poll, - .unlocked_ioctl = mce_ioctl, -}; - -static struct miscdevice mce_log_device = { - MISC_MCELOG_MINOR, - "mcelog", - &mce_chrdev_ops, -}; - -/* - * Old style boot options parsing. Only for compatibility. - */ -static int __init mcheck_disable(char *str) -{ - mce_dont_init = 1; - return 1; -} -__setup("nomce", mcheck_disable); - -/* - * mce=off disables machine check - * mce=TOLERANCELEVEL (number, see above) - * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. - * mce=nobootlog Don't log MCEs from before booting. - */ -static int __init mcheck_enable(char *str) -{ - if (!strcmp(str, "off")) - mce_dont_init = 1; - else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) - mce_bootlog = (str[0] == 'b'); - else if (isdigit(str[0])) - get_option(&str, &tolerant); - else { - printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n", - str); - return 0; - } - return 1; -} -__setup("mce=", mcheck_enable); - -/* - * Sysfs support - */ - -/* - * Disable machine checks on suspend and shutdown. We can't really handle - * them later. - */ -static int mce_disable(void) -{ - int i; - - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); - return 0; -} - -static int mce_suspend(struct sys_device *dev, pm_message_t state) -{ - return mce_disable(); -} - -static int mce_shutdown(struct sys_device *dev) -{ - return mce_disable(); -} - -/* - * On resume clear all MCE state. Don't want to see leftovers from the BIOS. - * Only one CPU is active at this time, the others get re-added later using - * CPU hotplug: - */ -static int mce_resume(struct sys_device *dev) -{ - mce_init(NULL); - mce_cpu_features(¤t_cpu_data); - - return 0; -} - -static void mce_cpu_restart(void *data) -{ - del_timer_sync(&__get_cpu_var(mce_timer)); - if (mce_available(¤t_cpu_data)) - mce_init(NULL); - mce_init_timer(); -} - -/* Reinit MCEs after user configuration changes */ -static void mce_restart(void) -{ - on_each_cpu(mce_cpu_restart, NULL, 1); -} - -static struct sysdev_class mce_sysclass = { - .suspend = mce_suspend, - .shutdown = mce_shutdown, - .resume = mce_resume, - .name = "machinecheck", -}; - -DEFINE_PER_CPU(struct sys_device, device_mce); - -__cpuinitdata -void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); - -/* Why are there no generic functions for this? */ -#define ACCESSOR(name, var, start) \ - static ssize_t show_ ## name(struct sys_device *s, \ - struct sysdev_attribute *attr, \ - char *buf) { \ - return sprintf(buf, "%lx\n", (unsigned long)var); \ - } \ - static ssize_t set_ ## name(struct sys_device *s, \ - struct sysdev_attribute *attr, \ - const char *buf, size_t siz) { \ - char *end; \ - unsigned long new = simple_strtoul(buf, &end, 0); \ - \ - if (end == buf) \ - return -EINVAL; \ - var = new; \ - start; \ - \ - return end-buf; \ - } \ - static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); - -static struct sysdev_attribute *bank_attrs; - -static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, - char *buf) -{ - u64 b = bank[attr - bank_attrs]; - - return sprintf(buf, "%llx\n", b); -} - -static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, - const char *buf, size_t siz) -{ - char *end; - u64 new = simple_strtoull(buf, &end, 0); - - if (end == buf) - return -EINVAL; - - bank[attr - bank_attrs] = new; - mce_restart(); - - return end-buf; -} - -static ssize_t -show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) -{ - strcpy(buf, trigger); - strcat(buf, "\n"); - return strlen(trigger) + 1; -} - -static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, - const char *buf, size_t siz) -{ - char *p; - int len; - - strncpy(trigger, buf, sizeof(trigger)); - trigger[sizeof(trigger)-1] = 0; - len = strlen(trigger); - p = strchr(trigger, '\n'); - - if (*p) - *p = 0; - - return len; -} - -static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); - -ACCESSOR(check_interval, check_interval, mce_restart()) - -static struct sysdev_attribute *mce_attributes[] = { - &attr_tolerant.attr, &attr_check_interval, &attr_trigger, - NULL -}; - -static cpumask_var_t mce_device_initialized; - -/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ -static __cpuinit int mce_create_device(unsigned int cpu) -{ - int err; - int i; - - if (!mce_available(&boot_cpu_data)) - return -EIO; - - memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); - per_cpu(device_mce, cpu).id = cpu; - per_cpu(device_mce, cpu).cls = &mce_sysclass; - - err = sysdev_register(&per_cpu(device_mce, cpu)); - if (err) - return err; - - for (i = 0; mce_attributes[i]; i++) { - err = sysdev_create_file(&per_cpu(device_mce, cpu), - mce_attributes[i]); - if (err) - goto error; - } - for (i = 0; i < banks; i++) { - err = sysdev_create_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); - if (err) - goto error2; - } - cpumask_set_cpu(cpu, mce_device_initialized); - - return 0; -error2: - while (--i >= 0) { - sysdev_remove_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); - } -error: - while (--i >= 0) { - sysdev_remove_file(&per_cpu(device_mce, cpu), - mce_attributes[i]); - } - sysdev_unregister(&per_cpu(device_mce, cpu)); - - return err; -} - -static __cpuinit void mce_remove_device(unsigned int cpu) -{ - int i; - - if (!cpumask_test_cpu(cpu, mce_device_initialized)) - return; - - for (i = 0; mce_attributes[i]; i++) - sysdev_remove_file(&per_cpu(device_mce, cpu), - mce_attributes[i]); - for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); - sysdev_unregister(&per_cpu(device_mce, cpu)); - cpumask_clear_cpu(cpu, mce_device_initialized); -} - -/* Make sure there are no machine checks on offlined CPUs. */ -static void mce_disable_cpu(void *h) -{ - int i; - unsigned long action = *(unsigned long *)h; - - if (!mce_available(¤t_cpu_data)) - return; - if (!(action & CPU_TASKS_FROZEN)) - cmci_clear(); - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); -} - -static void mce_reenable_cpu(void *h) -{ - unsigned long action = *(unsigned long *)h; - int i; - - if (!mce_available(¤t_cpu_data)) - return; - - if (!(action & CPU_TASKS_FROZEN)) - cmci_reenable(); - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); -} - -/* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int __cpuinit -mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct timer_list *t = &per_cpu(mce_timer, cpu); - - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - mce_create_device(cpu); - if (threshold_cpu_callback) - threshold_cpu_callback(action, cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - if (threshold_cpu_callback) - threshold_cpu_callback(action, cpu); - mce_remove_device(cpu); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - del_timer_sync(t); - smp_call_function_single(cpu, mce_disable_cpu, &action, 1); - break; - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - t->expires = round_jiffies(jiffies + - __get_cpu_var(next_interval)); - add_timer_on(t, cpu); - smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); - break; - case CPU_POST_DEAD: - /* intentionally ignoring frozen here */ - cmci_rediscover(cpu); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block mce_cpu_notifier __cpuinitdata = { - .notifier_call = mce_cpu_callback, -}; - -static __init int mce_init_banks(void) -{ - int i; - - bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, - GFP_KERNEL); - if (!bank_attrs) - return -ENOMEM; - - for (i = 0; i < banks; i++) { - struct sysdev_attribute *a = &bank_attrs[i]; - - a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); - if (!a->attr.name) - goto nomem; - - a->attr.mode = 0644; - a->show = show_bank; - a->store = set_bank; - } - return 0; - -nomem: - while (--i >= 0) - kfree(bank_attrs[i].attr.name); - kfree(bank_attrs); - bank_attrs = NULL; - - return -ENOMEM; -} - -static __init int mce_init_device(void) -{ - int err; - int i = 0; - - if (!mce_available(&boot_cpu_data)) - return -EIO; - - alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); - - err = mce_init_banks(); - if (err) - return err; - - err = sysdev_class_register(&mce_sysclass); - if (err) - return err; - - for_each_online_cpu(i) { - err = mce_create_device(i); - if (err) - return err; - } - - register_hotcpu_notifier(&mce_cpu_notifier); - misc_register(&mce_log_device); - - return err; -} - -device_initcall(mce_init_device); - -#else /* CONFIG_X86_32: */ - -int mce_disabled; - -int nr_mce_banks; -EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ - -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", - smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; - -/* This has to be run for each processor */ -void mcheck_init(struct cpuinfo_x86 *c) -{ - if (mce_disabled == 1) - return; - - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - amd_mcheck_init(c); - break; - - case X86_VENDOR_INTEL: - if (c->x86 == 5) - intel_p5_mcheck_init(c); - if (c->x86 == 6) - intel_p6_mcheck_init(c); - if (c->x86 == 15) - intel_p4_mcheck_init(c); - break; - - case X86_VENDOR_CENTAUR: - if (c->x86 == 5) - winchip_mcheck_init(c); - break; - - default: - break; - } -} - -static int __init mcheck_disable(char *str) -{ - mce_disabled = 1; - return 1; -} - -static int __init mcheck_enable(char *str) -{ - mce_disabled = -1; - return 1; -} - -__setup("nomce", mcheck_disable); -__setup("mce", mcheck_enable); - -#endif /* CONFIG_X86_32 */ -- cgit v1.2.3 From cb491fca55e5282f0a95ef39c55352e00d6ca75e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:17 +0200 Subject: x86, mce: Rename sysfs variables Shorten variable names. This also compacts the code a bit. device_mce => mce_dev mce_device_initialized => mce_dev_initialized mce_attribute => mce_attrs [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 58 +++++++++++++++------------------ arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 10 +++--- 2 files changed, 32 insertions(+), 36 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2e2c3d2e958..ba8dd41a10d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -957,7 +957,7 @@ static struct sysdev_class mce_sysclass = { .name = "machinecheck", }; -DEFINE_PER_CPU(struct sys_device, device_mce); +DEFINE_PER_CPU(struct sys_device, mce_dev); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); @@ -1039,12 +1039,12 @@ static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); ACCESSOR(check_interval, check_interval, mce_restart()) -static struct sysdev_attribute *mce_attributes[] = { +static struct sysdev_attribute *mce_attrs[] = { &attr_tolerant.attr, &attr_check_interval, &attr_trigger, NULL }; -static cpumask_var_t mce_device_initialized; +static cpumask_var_t mce_dev_initialized; /* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ static __cpuinit int mce_create_device(unsigned int cpu) @@ -1055,40 +1055,36 @@ static __cpuinit int mce_create_device(unsigned int cpu) if (!mce_available(&boot_cpu_data)) return -EIO; - memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); - per_cpu(device_mce, cpu).id = cpu; - per_cpu(device_mce, cpu).cls = &mce_sysclass; + memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); + per_cpu(mce_dev, cpu).id = cpu; + per_cpu(mce_dev, cpu).cls = &mce_sysclass; - err = sysdev_register(&per_cpu(device_mce, cpu)); + err = sysdev_register(&per_cpu(mce_dev, cpu)); if (err) return err; - for (i = 0; mce_attributes[i]; i++) { - err = sysdev_create_file(&per_cpu(device_mce, cpu), - mce_attributes[i]); + for (i = 0; mce_attrs[i]; i++) { + err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); if (err) goto error; } for (i = 0; i < banks; i++) { - err = sysdev_create_file(&per_cpu(device_mce, cpu), + err = sysdev_create_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); if (err) goto error2; } - cpumask_set_cpu(cpu, mce_device_initialized); + cpumask_set_cpu(cpu, mce_dev_initialized); return 0; error2: - while (--i >= 0) { - sysdev_remove_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); - } + while (--i >= 0) + sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); error: - while (--i >= 0) { - sysdev_remove_file(&per_cpu(device_mce, cpu), - mce_attributes[i]); - } - sysdev_unregister(&per_cpu(device_mce, cpu)); + while (--i >= 0) + sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + + sysdev_unregister(&per_cpu(mce_dev, cpu)); return err; } @@ -1097,24 +1093,24 @@ static __cpuinit void mce_remove_device(unsigned int cpu) { int i; - if (!cpumask_test_cpu(cpu, mce_device_initialized)) + if (!cpumask_test_cpu(cpu, mce_dev_initialized)) return; - for (i = 0; mce_attributes[i]; i++) - sysdev_remove_file(&per_cpu(device_mce, cpu), - mce_attributes[i]); + for (i = 0; mce_attrs[i]; i++) + sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); - sysdev_unregister(&per_cpu(device_mce, cpu)); - cpumask_clear_cpu(cpu, mce_device_initialized); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); + + sysdev_unregister(&per_cpu(mce_dev, cpu)); + cpumask_clear_cpu(cpu, mce_dev_initialized); } /* Make sure there are no machine checks on offlined CPUs. */ static void mce_disable_cpu(void *h) { - int i; unsigned long action = *(unsigned long *)h; + int i; if (!mce_available(¤t_cpu_data)) return; @@ -1221,7 +1217,7 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; - alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); + alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); err = mce_init_banks(); if (err) diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 4d90ec3eb51..083f270251f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -517,7 +517,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (!b) goto out; - err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, + err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj, b->kobj, name); if (err) goto out; @@ -540,7 +540,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } - b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); + b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj); if (!b->kobj) goto out_free; @@ -560,7 +560,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (i == cpu) continue; - err = sysfs_create_link(&per_cpu(device_mce, i).kobj, + err = sysfs_create_link(&per_cpu(mce_dev, i).kobj, b->kobj, name); if (err) goto out; @@ -638,7 +638,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); + sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; return; @@ -650,7 +650,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (i == cpu) continue; - sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); + sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name); per_cpu(threshold_banks, i)[bank] = NULL; } -- cgit v1.2.3 From b659294b779565c60f5e12ef505328e2b974eb62 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:27 +0200 Subject: x86, mce: print number of MCE banks The number of MCE banks supported by a CPU is a useful number to know, so print it out during CPU initialization. [ Impact: add printout ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ba8dd41a10d..49c74222359 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -570,6 +570,8 @@ static int mce_cap_init(void) rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & 0xff; + printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); + if (b > MAX_NR_BANKS) { printk(KERN_WARNING "MCE: Using only %u machine check banks out of %u\n", @@ -1287,6 +1289,7 @@ void mcheck_init(struct cpuinfo_x86 *c) default: break; } + printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); } static int __init mcheck_disable(char *str) -- cgit v1.2.3 From ba2d0f2b0c56d7174a0208f7c463271f39040728 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Apr 2009 12:31:24 +0200 Subject: x86, mce: Cleanup symbols in intel thermal codes Decode magic constants and turn them into symbols. [ Cleanup to use symbols already exists - HS ] [ Impact: cleanup ] Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_intel.c | 9 +++++---- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +- arch/x86/kernel/cpu/mcheck/p4.c | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index bad3cbb0e56..2b011d2d857 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -32,13 +32,13 @@ void intel_init_thermal(struct cpuinfo_x86 *c) */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); h = apic_read(APIC_LVTTHMR); - if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { + if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); return; } - if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) + if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) tm2 = 1; /* Check whether a vector already exists */ @@ -54,12 +54,13 @@ void intel_init_thermal(struct cpuinfo_x86 *c) apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); intel_set_thermal_handler(); rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); + wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); /* Unmask the thermal vector: */ l = apic_read(APIC_LVTTHMR); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 38f9632306f..13abafcb72e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -29,7 +29,7 @@ asmlinkage void smp_thermal_interrupt(void) irq_enter(); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & 1)) + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) mce_log_therm_throt_event(msr_val); inc_irq_stat(irq_thermal_count); diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index f979ffea330..82cee108a2d 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -51,7 +51,7 @@ static void intel_thermal_interrupt(struct pt_regs *regs) ack_APIC_irq(); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - therm_throt_process(msr_val & 0x1); + therm_throt_process(msr_val & THERM_STATUS_PROCHOT); } /* Thermal interrupt handler for this CPU setup: */ -- cgit v1.2.3 From 01c6680a547a3ee8dd170c269ea8e037b3191b71 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Apr 2009 12:31:24 +0200 Subject: x86, mce: Cleanup MCG definitions Decode more magic constants and turn them into symbols. [ Sort definitions bitwise, introduce MCG_EXT_CNT - HS ] Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 49c74222359..14733362741 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -569,7 +569,8 @@ static int mce_cap_init(void) u64 cap; rdmsrl(MSR_IA32_MCG_CAP, cap); - b = cap & 0xff; + + b = cap & MCG_BANKCNT_MASK; printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); if (b > MAX_NR_BANKS) { @@ -590,7 +591,7 @@ static int mce_cap_init(void) } /* Use accurate RIP reporting if available. */ - if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) + if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) rip_msr = MSR_IA32_MCG_EIP; return 0; -- cgit v1.2.3 From 3cde5c8c839bf46a7be799ed0e1d0b4780aaf794 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Apr 2009 18:01:31 +0200 Subject: x86, mce: initial steps to make 64bit mce code 32bit clean Replace unsigned long with u64s if they need to contain 64bit values. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 14733362741..cd1313b4750 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -156,15 +156,15 @@ static void print_mce(struct mce *m) "and contact your hardware vendor\n"); } -static void mce_panic(char *msg, struct mce *backup, unsigned long start) +static void mce_panic(char *msg, struct mce *backup, u64 start) { int i; oops_begin(); for (i = 0; i < MCE_LOG_LEN; i++) { - unsigned long tsc = mcelog.entry[i].tsc; + u64 tsc = mcelog.entry[i].tsc; - if (time_before(tsc, start)) + if ((s64)(tsc - start) < 0) continue; print_mce(&mcelog.entry[i]); if (backup && mcelog.entry[i].tsc == backup->tsc) @@ -970,13 +970,13 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); static ssize_t show_ ## name(struct sys_device *s, \ struct sysdev_attribute *attr, \ char *buf) { \ - return sprintf(buf, "%lx\n", (unsigned long)var); \ + return sprintf(buf, "%Lx\n", (u64)var); \ } \ static ssize_t set_ ## name(struct sys_device *s, \ struct sysdev_attribute *attr, \ const char *buf, size_t siz) { \ char *end; \ - unsigned long new = simple_strtoul(buf, &end, 0); \ + u64 new = simple_strtoull(buf, &end, 0); \ \ if (end == buf) \ return -EINVAL; \ -- cgit v1.2.3 From 06b7a7a5ec917761969444fee967c43868a76468 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Apr 2009 18:37:43 +0200 Subject: x86, mce: implement the PPro bank 0 quirk in the 64bit machine check code Quoting the comment: * SDM documents that on family 6 bank 0 should not be written * because it aliases to another special BIOS controlled * register. * But it's not aliased anymore on model 0x1a+ * Don't ignore bank 0 completely because there could be a valid * event later, merely don't write CTL0. This is mostly a port on the 32bit code, except that 32bit always didn't write it and didn't have the 0x1a heuristic. I checked with the CPU designers that the quirk is not required starting with this model. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index cd1313b4750..1dcd3be0332 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -65,6 +65,8 @@ static atomic_t mce_events; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; +static unsigned long dont_init_banks; + static DECLARE_WAIT_QUEUE_HEAD(mce_wait); /* MCA banks polled by the period polling timer for corrected events */ @@ -72,6 +74,11 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL }; +static inline int skip_bank_init(int i) +{ + return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); +} + /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { @@ -616,6 +623,8 @@ static void mce_init(void *dummy) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { + if (skip_bank_init(i)) + continue; wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } @@ -643,6 +652,19 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) } } + if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. + */ + + if (c->x86 == 6 && c->x86_model < 0x1A) + __set_bit(0, &dont_init_banks); + } } static void mce_cpu_features(struct cpuinfo_x86 *c) @@ -911,8 +933,10 @@ static int mce_disable(void) { int i; - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + for (i = 0; i < banks; i++) { + if (!skip_bank_init(i)) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + } return 0; } @@ -1119,8 +1143,10 @@ static void mce_disable_cpu(void *h) return; if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + for (i = 0; i < banks; i++) { + if (!skip_bank_init(i)) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + } } static void mce_reenable_cpu(void *h) @@ -1133,8 +1159,10 @@ static void mce_reenable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); + for (i = 0; i < banks; i++) { + if (!skip_bank_init(i)) + wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); + } } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -- cgit v1.2.3 From 2e6f694fde0a7158590e121962ca2e3c06633528 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Apr 2009 18:42:48 +0200 Subject: x86, mce: port K7 bank 0 quirk to 64bit mce code Various K7 have broken bank 0s. Don't enable it by default Port from the 32bit code. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1dcd3be0332..1336280edcc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -650,6 +650,12 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) */ mce_bootlog = 0; } + /* + * Various K7s with broken bank 0 around. Always disable + * by default. + */ + if (c->x86 == 6) + bank[0] = 0; } if (c->x86_vendor == X86_VENDOR_INTEL) { -- cgit v1.2.3 From 5d7279268b654d1f8ac43b0eb6cd9598d9cf55fd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Apr 2009 19:25:48 +0200 Subject: x86, mce: use a call vector to call the 64bit mce handler Allows to call different machine check handlers from the low level machine check entry vector. This is needed for later when it will be used for 32bit too. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 23 ++++++++++++----------- arch/x86/kernel/cpu/mcheck/mce.h | 3 ++- arch/x86/kernel/entry_64.S | 2 +- 3 files changed, 15 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1336280edcc..d99318b470d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -39,6 +39,16 @@ #include "mce.h" +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; #ifdef CONFIG_X86_64 #define MISC_MCELOG_MINOR 227 @@ -715,6 +725,8 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) } mce_cpu_quirks(c); + machine_check_vector = do_machine_check; + mce_init(NULL); mce_cpu_features(c); mce_init_timer(); @@ -1285,17 +1297,6 @@ int mce_disabled; int nr_mce_banks; EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", - smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; - /* This has to be run for each processor */ void mcheck_init(struct cpuinfo_x86 *c) { diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h index cd6cffcc2de..966ae3c5cb1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ b/arch/x86/kernel/cpu/mcheck/mce.h @@ -7,11 +7,12 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c); void intel_p6_mcheck_init(struct cpuinfo_x86 *c); void winchip_mcheck_init(struct cpuinfo_x86 *c); -#ifdef CONFIG_X86_32 /* Call the installed machine check handler for this CPU setup. */ extern void (*machine_check_vector)(struct pt_regs *, long error_code); +#ifdef CONFIG_X86_32 + extern int nr_mce_banks; void intel_set_thermal_handler(void); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 38946c6e843..63276c45bff 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1382,7 +1382,7 @@ paranoiderrorentry stack_segment do_stack_segment errorentry general_protection do_general_protection errorentry page_fault do_page_fault #ifdef CONFIG_X86_MCE -paranoidzeroentry machine_check do_machine_check +paranoidzeroentry machine_check *machine_check_vector(%rip) #endif /* -- cgit v1.2.3 From 04b2b1a4df6cd0fdaa598f3c623a19c2d93cb48a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 22:50:19 +0200 Subject: x86, mce: rename 64bit mce_dont_init to mce_disabled Give it the same name as on 32bit. This makes further merging easier. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d99318b470d..6ab477060f5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -49,14 +49,15 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code) /* Call the installed machine check handler for this CPU setup. */ void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; + +int mce_disabled; + #ifdef CONFIG_X86_64 #define MISC_MCELOG_MINOR 227 atomic_t mce_entry; -static int mce_dont_init; - /* * Tolerant levels: * 0: always panic on uncorrected errors, log corrected errors @@ -194,7 +195,7 @@ static void mce_panic(char *msg, struct mce *backup, u64 start) int mce_available(struct cpuinfo_x86 *c) { - if (mce_dont_init) + if (mce_disabled) return 0; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } @@ -720,7 +721,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) return; if (mce_cap_init() < 0) { - mce_dont_init = 1; + mce_disabled = 1; return; } mce_cpu_quirks(c); @@ -911,7 +912,7 @@ static struct miscdevice mce_log_device = { */ static int __init mcheck_disable(char *str) { - mce_dont_init = 1; + mce_disabled = 1; return 1; } __setup("nomce", mcheck_disable); @@ -925,7 +926,7 @@ __setup("nomce", mcheck_disable); static int __init mcheck_enable(char *str) { if (!strcmp(str, "off")) - mce_dont_init = 1; + mce_disabled = 1; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) mce_bootlog = (str[0] == 'b'); else if (isdigit(str[0])) @@ -1292,8 +1293,6 @@ device_initcall(mce_init_device); #else /* CONFIG_X86_32: */ -int mce_disabled; - int nr_mce_banks; EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ -- cgit v1.2.3 From d7c3c9a609563868d8a70e220399d06a25aba095 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 23:07:25 +0200 Subject: x86, mce: move mce_disabled option into common 32bit/64bit code It's the same function, so let's share it. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 6ab477060f5..5395200dc9d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -907,16 +907,6 @@ static struct miscdevice mce_log_device = { &mce_chrdev_ops, }; -/* - * Old style boot options parsing. Only for compatibility. - */ -static int __init mcheck_disable(char *str) -{ - mce_disabled = 1; - return 1; -} -__setup("nomce", mcheck_disable); - /* * mce=off disables machine check * mce=TOLERANCELEVEL (number, see above) @@ -1327,19 +1317,22 @@ void mcheck_init(struct cpuinfo_x86 *c) printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); } -static int __init mcheck_disable(char *str) -{ - mce_disabled = 1; - return 1; -} - static int __init mcheck_enable(char *str) { mce_disabled = -1; return 1; } -__setup("nomce", mcheck_disable); __setup("mce", mcheck_enable); -#endif /* CONFIG_X86_32 */ +#endif /* CONFIG_X86_OLD_MCE */ + +/* + * Old style boot options parsing. Only for compatibility. + */ +static int __init mcheck_disable(char *str) +{ + mce_disabled = 1; + return 1; +} +__setup("nomce", mcheck_disable); -- cgit v1.2.3 From 8e97aef5f43ec715f394bc15015ff263b80c3ad6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 14:23:18 +0200 Subject: x86, mce: remove machine check handler idle notify on 64bit i386 has no idle notifiers, but the 64bit machine check code uses them to wake up mcelog from a fatal machine check exception. For corrected machine checks found by the poller or threshold interrupts going through an idle notifier is not needed because the wake_up can is just done directly and doesn't need the idle notifier. It is only needed for logging exceptions. To be honest I never liked the idle notifier even though I signed off on it. On closer investigation the code actually turned out to be nearly. Right now machine check exceptions on x86 are always unrecoverable (lead to panic due to PCC), which means we never execute the idle notifier path. The only exception is the somewhat weird tolerant==3 case, which ignores PCC. I'll fix this in a future patch in a much cleaner way. So remove the "mcelog wakeup through idle notifier" code from 64bit. This allows to compile the 64bit machine check handler on 32bit which doesn't have idle notifiers. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5395200dc9d..7562c1f674f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -555,29 +555,6 @@ int mce_notify_user(void) return 0; } -/* see if the idle task needs to notify userspace: */ -static int -mce_idle_callback(struct notifier_block *nfb, unsigned long action, - void *unused) -{ - /* IDLE_END should be safe - interrupts are back on */ - if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) - mce_notify_user(); - - return NOTIFY_OK; -} - -static struct notifier_block mce_idle_notifier = { - .notifier_call = mce_idle_callback, -}; - -static __init int periodic_mcheck_init(void) -{ - idle_notifier_register(&mce_idle_notifier); - return 0; -} -__initcall(periodic_mcheck_init); - /* * Initialize Machine Checks for a CPU. */ -- cgit v1.2.3 From d896a940ef4f12a0a6bc432853b249dcfbacabf0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 14:25:18 +0200 Subject: x86, mce: remove oops_begin() use in 64bit machine check First 32bit doesn't have oops_begin, so it's a barrier of using this code on 32bit. On closer examination it turns out oops_begin is not a good idea in a machine check panic anyways. All oops_begin does it so check for recursive/parallel oopses and implement the "wait on oops" heuristic. But there's actually no good reason to lock machine checks against oopses or prevent them from recursion. Also "wait on oops" does not really make sense for a machine check too. Replace it with a manual bust_spinlocks/console_verbose. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7562c1f674f..f4d6841d2bd 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -178,7 +178,8 @@ static void mce_panic(char *msg, struct mce *backup, u64 start) { int i; - oops_begin(); + bust_spinlocks(1); + console_verbose(); for (i = 0; i < MCE_LOG_LEN; i++) { u64 tsc = mcelog.entry[i].tsc; -- cgit v1.2.3 From 4efc0670baf4b14bc95502e54a83ccf639146125 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 19:07:31 +0200 Subject: x86, mce: use 64bit machine check code on 32bit The 64bit machine check code is in many ways much better than the 32bit machine check code: it is more specification compliant, is cleaner, only has a single code base versus one per CPU, has better infrastructure for recovery, has a cleaner way to communicate with user space etc. etc. Use the 64bit code for 32bit too. This is the second attempt to do this. There was one a couple of years ago to unify this code for 32bit and 64bit. Back then this ran into some trouble with K7s and was reverted. I believe this time the K7 problems (and some others) are addressed. I went over the old handlers and was very careful to retain all quirks. But of course this needs a lot of testing on old systems. On newer 64bit capable systems I don't expect much problems because they have been already tested with the 64bit kernel. I made this a CONFIG for now that still allows to select the old machine check code. This is mostly to make testing easier, if someone runs into a problem we can ask them to try with the CONFIG switched. The new code is default y for more coverage. Once there is confidence the 64bit code works well on older hardware too the CONFIG_X86_OLD_MCE and the associated code can be easily removed. This causes a behaviour change for 32bit installations. They now have to install the mcelog package to be able to log corrected machine checks. The 64bit machine check code only handles CPUs which support the standard Intel machine check architecture described in the IA32 SDM. The 32bit code has special support for some older CPUs which have non standard machine check architectures, in particular WinChip C3 and Intel P5. I made those a separate CONFIG option and kept them for now. The WinChip variant could be probably removed without too much pain, it doesn't really do anything interesting. P5 is also disabled by default (like it was before) because many motherboards have it miswired, but according to Alan Cox a few embedded setups use that one. Forward ported/heavily changed version of old patch, original patch included review/fixes from Thomas Gleixner, Bert Wesarg. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/apic.c | 4 ++-- arch/x86/kernel/apic/nmi.c | 2 +- arch/x86/kernel/cpu/mcheck/Makefile | 3 ++- arch/x86/kernel/cpu/mcheck/mce.c | 32 ++++++++++++++++++++++++++++---- arch/x86/kernel/cpu/mcheck/mce.h | 18 +++++++++++++++--- arch/x86/kernel/cpu/mcheck/p5.c | 5 +++++ arch/x86/kernel/irq.c | 4 ++-- arch/x86/kernel/irqinit_32.c | 2 +- arch/x86/kernel/signal.c | 4 ++-- arch/x86/kernel/traps.c | 4 ++-- 10 files changed, 60 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f2870920f24..ad532289ef2 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -843,7 +843,7 @@ void clear_local_APIC(void) } /* lets not touch this if we didn't frob it */ -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) +#ifdef CONFIG_X86_THERMAL_VECTOR if (maxlvt >= 5) { v = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); @@ -1962,7 +1962,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) +#ifdef CONFIG_X86_THERMAL_VECTOR if (maxlvt >= 5) apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); #endif diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index ce4fbfa315a..c4762276c17 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu) static inline int mce_in_progress(void) { -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) +#if defined(CONFIG_X86_NEW_MCE) return atomic_read(&mce_entry) > 0; #endif return 0; diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 55f01b39a10..5f8b09425d3 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,6 +1,7 @@ obj-y = mce.o therm_throt.o -obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o +obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o +obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index f4d6841d2bd..e193de44ef1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -52,7 +52,7 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = int mce_disabled; -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_NEW_MCE #define MISC_MCELOG_MINOR 227 @@ -662,6 +662,21 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) } } +static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) +{ + if (c->x86 != 5) + return; + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + if (mce_p5_enabled()) + intel_p5_mcheck_init(c); + break; + case X86_VENDOR_CENTAUR: + winchip_mcheck_init(c); + break; + } +} + static void mce_cpu_features(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { @@ -695,6 +710,11 @@ static void mce_init_timer(void) */ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { + if (mce_disabled) + return; + + mce_ancient_init(c); + if (!mce_available(c)) return; @@ -893,6 +913,10 @@ static struct miscdevice mce_log_device = { */ static int __init mcheck_enable(char *str) { + if (*str == 0) + enable_p5_mce(); + if (*str == '=') + str++; if (!strcmp(str, "off")) mce_disabled = 1; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) @@ -900,13 +924,13 @@ static int __init mcheck_enable(char *str) else if (isdigit(str[0])) get_option(&str, &tolerant); else { - printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n", + printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", str); return 0; } return 1; } -__setup("mce=", mcheck_enable); +__setup("mce", mcheck_enable); /* * Sysfs support @@ -1259,7 +1283,7 @@ static __init int mce_init_device(void) device_initcall(mce_init_device); -#else /* CONFIG_X86_32: */ +#else /* CONFIG_X86_OLD_MCE: */ int nr_mce_banks; EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h index 966ae3c5cb1..84a552b458c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ b/arch/x86/kernel/cpu/mcheck/mce.h @@ -1,17 +1,29 @@ #include #include +#ifdef CONFIG_X86_OLD_MCE void amd_mcheck_init(struct cpuinfo_x86 *c); void intel_p4_mcheck_init(struct cpuinfo_x86 *c); -void intel_p5_mcheck_init(struct cpuinfo_x86 *c); void intel_p6_mcheck_init(struct cpuinfo_x86 *c); -void winchip_mcheck_init(struct cpuinfo_x86 *c); +#endif +#ifdef CONFIG_X86_ANCIENT_MCE +void intel_p5_mcheck_init(struct cpuinfo_x86 *c); +void winchip_mcheck_init(struct cpuinfo_x86 *c); +extern int mce_p5_enable; +static inline int mce_p5_enabled(void) { return mce_p5_enable; } +static inline void enable_p5_mce(void) { mce_p5_enable = 1; } +#else +static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} +static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} +static inline int mce_p5_enabled(void) { return 0; } +static inline void enable_p5_mce(void) { } +#endif /* Call the installed machine check handler for this CPU setup. */ extern void (*machine_check_vector)(struct pt_regs *, long error_code); -#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_OLD_MCE extern int nr_mce_banks; diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 8812f544183..015f481ab1b 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -14,6 +14,9 @@ #include "mce.h" +/* By default disabled */ +int mce_p5_enable; + /* Machine check handler for Pentium class Intel CPUs: */ static void pentium_machine_check(struct pt_regs *regs, long error_code) { @@ -44,9 +47,11 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_MCE)) return; +#ifdef CONFIG_X86_OLD_MCE /* Default P5 to off as its often misconnected: */ if (mce_disabled != -1) return; +#endif machine_check_vector = pentium_machine_check; /* Make sure the vector pointer is visible before we enable MCEs: */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c3fe010d74c..35eddc9ec99 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -89,7 +89,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); seq_printf(p, " Thermal event interrupts\n"); -# ifdef CONFIG_X86_64 +# ifdef CONFIG_X86_MCE_THRESHOLD seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); @@ -176,7 +176,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #endif #ifdef CONFIG_X86_MCE sum += irq_stats(cpu)->irq_thermal_count; -# ifdef CONFIG_X86_64 +# ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; #endif #endif diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 368b0a8836f..98846e03211 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -181,7 +181,7 @@ void __init native_init_IRQ(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); #endif -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) +#ifdef CONFIG_X86_THERMAL_VECTOR /* thermal monitor LVT interrupt */ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 14425166b8e..d0851e3f77e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -25,11 +25,11 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include #include -#include #endif /* CONFIG_X86_64 */ #include @@ -857,7 +857,7 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) +#ifdef CONFIG_X86_NEW_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) mce_notify_user(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a1d288327ff..ad771f15bdd 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -798,7 +798,8 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) return new_kesp; } -#else +#endif + asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) { } @@ -806,7 +807,6 @@ asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) { } -#endif /* * 'math_state_restore()' saves the current math information in the -- cgit v1.2.3 From 7856f6cce4a8cda8c1f94b99605c07d16b8d8dec Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 23:32:56 +0200 Subject: x86, mce: enable MCE_INTEL for 32bit new MCE Enable the 64bit MCE_INTEL code (CMCI, thermal interrupts) for 32bit NEW_MCE. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/threshold.c | 2 +- arch/x86/kernel/entry_64.S | 2 +- arch/x86/kernel/irqinit_32.c | 4 ++++ arch/x86/kernel/traps.c | 2 +- 4 files changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 23ee9e730f7..d746df2909c 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -17,7 +17,7 @@ static void default_threshold_interrupt(void) void (*mce_threshold_vector)(void) = default_threshold_interrupt; -asmlinkage void mce_threshold_interrupt(void) +asmlinkage void smp_threshold_interrupt(void) { exit_idle(); irq_enter(); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 63276c45bff..a31a7f29cff 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1007,7 +1007,7 @@ apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \ #endif apicinterrupt THRESHOLD_APIC_VECTOR \ - threshold_interrupt mce_threshold_interrupt + threshold_interrupt smp_threshold_interrupt apicinterrupt THERMAL_APIC_VECTOR \ thermal_interrupt smp_thermal_interrupt diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 98846e03211..2512ad93dab 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -186,6 +186,10 @@ void __init native_init_IRQ(void) alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif +#ifdef CONFIG_X86_MCE_THRESHOLD + alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); +#endif + if (!acpi_ioapic) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ad771f15bdd..0d358c884b3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -804,7 +804,7 @@ asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) { } -asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) +asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) { } -- cgit v1.2.3 From 5f8c1a54cab6f449fe04d42d0661bc796fa4e73e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 29 Apr 2009 19:29:12 +0200 Subject: x86, mce: add MSR read wrappers for easier error injection This will be used by future patches to allow machine check error injection. Right now it's a nop, except for adding some wrappers around the MSR reads. This is early in the sequence to avoid too many conflicts. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e193de44ef1..e755c95674d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -194,6 +194,19 @@ static void mce_panic(char *msg, struct mce *backup, u64 start) panic(msg); } +/* MSR access wrappers used for error injection */ +static u64 mce_rdmsrl(u32 msr) +{ + u64 v; + rdmsrl(msr, v); + return v; +} + +static void mce_wrmsrl(u32 msr, u64 v) +{ + wrmsrl(msr, v); +} + int mce_available(struct cpuinfo_x86 *c) { if (mce_disabled) @@ -213,7 +226,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) if (rip_msr) { /* Assume the RIP in the MSR is exact. Is this true? */ m->mcgstatus |= MCG_STATUS_EIPV; - rdmsrl(rip_msr, m->ip); + m->ip = mce_rdmsrl(rip_msr); m->cs = 0; } } @@ -231,7 +244,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_setup(&m); - rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); for (i = 0; i < banks; i++) { if (!bank[i] || !test_bit(i, *b)) continue; @@ -242,7 +255,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.tsc = 0; barrier(); - rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); if (!(m.status & MCI_STATUS_VAL)) continue; @@ -257,9 +270,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; if (m.status & MCI_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; @@ -275,7 +288,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) /* * Clear state for this bank. */ - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } /* @@ -320,7 +333,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_setup(&m); - rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); /* if the restart IP is not valid, we're done for */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) @@ -338,7 +351,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) m.addr = 0; m.bank = i; - rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); if ((m.status & MCI_STATUS_VAL) == 0) continue; @@ -378,9 +391,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) } if (m.status & MCI_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); mce_get_rip(&m, regs); mce_log(&m); @@ -449,9 +462,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* the last thing we do is clear state */ for (i = 0; i < banks; i++) { if (test_bit(i, toclear)) - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } - wrmsrl(MSR_IA32_MCG_STATUS, 0); + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); out2: atomic_dec(&mce_entry); } -- cgit v1.2.3 From ea149b36c7f511d17dd89fee734cb09778a91fa0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 29 Apr 2009 19:31:00 +0200 Subject: x86, mce: add basic error injection infrastructure Allow user programs to write mce records into /dev/mcelog. When they do that a fake machine check is triggered to test the machine check code. This uses the MCE MSR wrappers added earlier. The implementation is straight forward. There is a struct mce record per CPU and the MCE MSR accesses get data from there if there is valid data injected there. This allows to test the machine check code relatively realistically because only the lowest layer of hardware access is intercepted. The test suite and injector are available at git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 1 + arch/x86/kernel/cpu/mcheck/mce-inject.c | 126 ++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/mcheck/mce.c | 39 +++++++++- 3 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/cpu/mcheck/mce-inject.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 5f8b09425d3..60ee182c6c5 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o +obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c new file mode 100644 index 00000000000..58afac4b5df --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -0,0 +1,126 @@ +/* + * Machine check injection support. + * Copyright 2008 Intel Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Authors: + * Andi Kleen + * Ying Huang + */ +#include +#include +#include +#include +#include +#include +#include +#include + +/* Update fake mce registers on current CPU. */ +static void inject_mce(struct mce *m) +{ + struct mce *i = &per_cpu(injectm, m->cpu); + + /* Make sure noone reads partially written injectm */ + i->finished = 0; + mb(); + m->finished = 0; + /* First set the fields after finished */ + i->cpu = m->cpu; + mb(); + /* Now write record in order, finished last (except above) */ + memcpy(i, m, sizeof(struct mce)); + /* Finally activate it */ + mb(); + i->finished = 1; +} + +struct delayed_mce { + struct timer_list timer; + struct mce m; +}; + +/* Inject mce on current CPU */ +static void raise_mce(unsigned long data) +{ + struct delayed_mce *dm = (struct delayed_mce *)data; + struct mce *m = &dm->m; + int cpu = m->cpu; + + inject_mce(m); + if (m->status & MCI_STATUS_UC) { + struct pt_regs regs; + memset(®s, 0, sizeof(struct pt_regs)); + regs.ip = m->ip; + regs.cs = m->cs; + printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); + do_machine_check(®s, 0); + printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); + } else { + mce_banks_t b; + memset(&b, 0xff, sizeof(mce_banks_t)); + printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); + machine_check_poll(0, &b); + mce_notify_user(); + printk(KERN_INFO "Finished machine check poll on CPU %d\n", + cpu); + } + kfree(dm); +} + +/* Error injection interface */ +static ssize_t mce_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) +{ + struct delayed_mce *dm; + struct mce m; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* + * There are some cases where real MSR reads could slip + * through. + */ + if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) + return -EIO; + + if ((unsigned long)usize > sizeof(struct mce)) + usize = sizeof(struct mce); + if (copy_from_user(&m, ubuf, usize)) + return -EFAULT; + + if (m.cpu >= NR_CPUS || !cpu_online(m.cpu)) + return -EINVAL; + + dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); + if (!dm) + return -ENOMEM; + + /* + * Need to give user space some time to set everything up, + * so do it a jiffie or two later everywhere. + * Should we use a hrtimer here for better synchronization? + */ + memcpy(&dm->m, &m, sizeof(struct mce)); + setup_timer(&dm->timer, raise_mce, (unsigned long)dm); + dm->timer.expires = jiffies + 2; + add_timer_on(&dm->timer, m.cpu); + return usize; +} + +static int inject_init(void) +{ + printk(KERN_INFO "Machine check injector initialized\n"); + mce_chrdev_ops.write = mce_write; + return 0; +} + +module_init(inject_init); +/* Cannot tolerate unloading currently because we cannot + * guarantee all openers of mce_chrdev will get a reference to us. + */ +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e755c95674d..fe216bd10f4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -98,6 +98,9 @@ void mce_setup(struct mce *m) rdtscll(m->tsc); } +DEFINE_PER_CPU(struct mce, injectm); +EXPORT_PER_CPU_SYMBOL_GPL(injectm); + /* * Lockless MCE logging infrastructure. * This avoids deadlocks on printk locks without having to break locks. Also @@ -194,16 +197,46 @@ static void mce_panic(char *msg, struct mce *backup, u64 start) panic(msg); } +/* Support code for software error injection */ + +static int msr_to_offset(u32 msr) +{ + unsigned bank = __get_cpu_var(injectm.bank); + if (msr == rip_msr) + return offsetof(struct mce, ip); + if (msr == MSR_IA32_MC0_STATUS + bank*4) + return offsetof(struct mce, status); + if (msr == MSR_IA32_MC0_ADDR + bank*4) + return offsetof(struct mce, addr); + if (msr == MSR_IA32_MC0_MISC + bank*4) + return offsetof(struct mce, misc); + if (msr == MSR_IA32_MCG_STATUS) + return offsetof(struct mce, mcgstatus); + return -1; +} + /* MSR access wrappers used for error injection */ static u64 mce_rdmsrl(u32 msr) { u64 v; + if (__get_cpu_var(injectm).finished) { + int offset = msr_to_offset(msr); + if (offset < 0) + return 0; + return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); + } rdmsrl(msr, v); return v; } static void mce_wrmsrl(u32 msr, u64 v) { + if (__get_cpu_var(injectm).finished) { + int offset = msr_to_offset(msr); + if (offset >= 0) + *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; + return; + } wrmsrl(msr, v); } @@ -296,6 +329,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * exceptions. */ } +EXPORT_SYMBOL_GPL(machine_check_poll); /* * The actual machine check handler. This only handles real @@ -468,6 +502,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) out2: atomic_dec(&mce_entry); } +EXPORT_SYMBOL_GPL(do_machine_check); #ifdef CONFIG_X86_MCE_INTEL /*** @@ -568,6 +603,7 @@ int mce_notify_user(void) } return 0; } +EXPORT_SYMBOL_GPL(mce_notify_user); /* * Initialize Machine Checks for a CPU. @@ -904,13 +940,14 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) } } -static const struct file_operations mce_chrdev_ops = { +struct file_operations mce_chrdev_ops = { .open = mce_open, .release = mce_release, .read = mce_read, .poll = mce_poll, .unlocked_ioctl = mce_ioctl, }; +EXPORT_SYMBOL_GPL(mce_chrdev_ops); static struct miscdevice mce_log_device = { MISC_MCELOG_MINOR, -- cgit v1.2.3 From a1ff41bfc1bb7a6d19cf958f89a9b539678781e5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 25 May 2009 22:16:14 -0700 Subject: x86, mce: add comment about mce_chrdev_ops being writable Add a comment explaining that mce_chrdev_ops is intentionally writable. [ Impact: comment only ] Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index fe216bd10f4..156cdf6d918 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -940,6 +940,7 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) } } +/* Modified in mce-inject.c, so not static or const */ struct file_operations mce_chrdev_ops = { .open = mce_open, .release = mce_release, -- cgit v1.2.3 From 5706001aacba5d3db5f224ca135e5e91a30be39c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 25 May 2009 22:18:17 -0700 Subject: x86, mce: fix comment style in mce-inject.c Fix style of winged comment in mce-inject.c. [ Impact: comment only ] Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-inject.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 58afac4b5df..673c7285502 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -120,7 +120,8 @@ static int inject_init(void) } module_init(inject_init); -/* Cannot tolerate unloading currently because we cannot +/* + * Cannot tolerate unloading currently because we cannot * guarantee all openers of mce_chrdev will get a reference to us. */ MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 88921be30296e126896ee4d30758f989d1c4ddfb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:51 +0200 Subject: x86, mce: synchronize core after machine check handling The example code in the IA32 SDM recommends to synchronize the CPU after machine check handling. So do that here. [ Impact: Spec compliance ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 156cdf6d918..495c9680866 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -328,6 +328,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * Don't clear MCG_STATUS here because it's only defined for * exceptions. */ + + sync_core(); } EXPORT_SYMBOL_GPL(machine_check_poll); @@ -501,6 +503,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); out2: atomic_dec(&mce_entry); + sync_core(); } EXPORT_SYMBOL_GPL(do_machine_check); -- cgit v1.2.3 From b56f642d2bf8c1f7c6499c1e55b23311a33cc796 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:52 +0200 Subject: x86, mce: use extended sysattrs for the check_interval attribute. Instead of using own callbacks use the generic ones provided by the sysdev later. This finally allows to get rid of the ugly ACCESSOR macros. Should also save some text size. [ Impact: cleanup ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 495c9680866..3cac9da7ce2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1054,28 +1054,6 @@ DEFINE_PER_CPU(struct sys_device, mce_dev); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -/* Why are there no generic functions for this? */ -#define ACCESSOR(name, var, start) \ - static ssize_t show_ ## name(struct sys_device *s, \ - struct sysdev_attribute *attr, \ - char *buf) { \ - return sprintf(buf, "%Lx\n", (u64)var); \ - } \ - static ssize_t set_ ## name(struct sys_device *s, \ - struct sysdev_attribute *attr, \ - const char *buf, size_t siz) { \ - char *end; \ - u64 new = simple_strtoull(buf, &end, 0); \ - \ - if (end == buf) \ - return -EINVAL; \ - var = new; \ - start; \ - \ - return end-buf; \ - } \ - static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); - static struct sysdev_attribute *bank_attrs; static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, @@ -1126,13 +1104,26 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, return len; } +static ssize_t store_int_with_restart(struct sys_device *s, + struct sysdev_attribute *attr, + const char *buf, size_t size) +{ + ssize_t ret = sysdev_store_int(s, attr, buf, size); + mce_restart(); + return ret; +} + static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); -ACCESSOR(check_interval, check_interval, mce_restart()) +static struct sysdev_ext_attribute attr_check_interval = { + _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, + store_int_with_restart), + &check_interval +}; static struct sysdev_attribute *mce_attrs[] = { - &attr_tolerant.attr, &attr_check_interval, &attr_trigger, + &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, NULL }; -- cgit v1.2.3 From fc016a49c2d92f2efbe22c1fb66eb7a5d2a06ed1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:53 +0200 Subject: x86, mce: remove unused mce_events variable Remove unused mce_events static variable. [ Impact: cleanup ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3cac9da7ce2..69aad7e96a5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -71,7 +71,6 @@ static u64 *bank; static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; -static atomic_t mce_events; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; @@ -116,7 +115,6 @@ void mce_log(struct mce *mce) { unsigned next, entry; - atomic_inc(&mce_events); mce->finished = 0; wmb(); for (;;) { -- cgit v1.2.3 From 8be9110569aec1f65d86b08aef7ec49659137bf9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 27 May 2009 21:56:53 +0200 Subject: x86, mce: remove mce_init unused argument Remove unused mce_init argument. [ Impact: cleanup ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 69aad7e96a5..20c7e7c669d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -643,7 +643,7 @@ static int mce_cap_init(void) return 0; } -static void mce_init(void *dummy) +static void mce_init(void) { mce_banks_t all_banks; u64 cap; @@ -776,7 +776,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) machine_check_vector = do_machine_check; - mce_init(NULL); + mce_init(); mce_cpu_features(c); mce_init_timer(); } @@ -1020,7 +1020,7 @@ static int mce_shutdown(struct sys_device *dev) */ static int mce_resume(struct sys_device *dev) { - mce_init(NULL); + mce_init(); mce_cpu_features(¤t_cpu_data); return 0; @@ -1030,7 +1030,7 @@ static void mce_cpu_restart(void *data) { del_timer_sync(&__get_cpu_var(mce_timer)); if (mce_available(¤t_cpu_data)) - mce_init(NULL); + mce_init(); mce_init_timer(); } -- cgit v1.2.3 From 32561696c23028596f24b353d98f2e23b58f91f7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:53 +0200 Subject: x86, mce: rename and align out2 label There's only a single out path in do_machine_check now, so rename the label from out2 to out. Also align it at the first column. [ Impact: minor cleanup, no functional changes ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 20c7e7c669d..18d505d8022 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -361,9 +361,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL) == NOTIFY_STOP) - goto out2; + goto out; if (!banks) - goto out2; + goto out; mce_setup(&m); @@ -499,7 +499,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); - out2: +out: atomic_dec(&mce_entry); sync_core(); } -- cgit v1.2.3 From b170204ddb7844ffff62d2d537b20c0eeb97725e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:54 +0200 Subject: x86, mce: drop BKL in mce_open BKL is not needed for anything in mce_open because it has an own spinlock. Remove it. [ Impact: cleanup ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 18d505d8022..8ab28368bb9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -791,12 +790,10 @@ static int open_exclu; /* already open exclusive? */ static int mce_open(struct inode *inode, struct file *file) { - lock_kernel(); spin_lock(&mce_state_lock); if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { spin_unlock(&mce_state_lock); - unlock_kernel(); return -EBUSY; } @@ -806,7 +803,6 @@ static int mce_open(struct inode *inode, struct file *file) open_count++; spin_unlock(&mce_state_lock); - unlock_kernel(); return nonseekable_open(inode, file); } -- cgit v1.2.3 From 9319cec8c185e84fc5281afb6ac5d4c47a234841 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 14 Apr 2009 17:26:30 +0900 Subject: x86, mce: use strict_strtoull Use strict_strtoull instead of simple_strtoull. Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 9 ++++----- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 16 ++++++---------- 2 files changed, 10 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 8ab28368bb9..4375ffb5459 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1059,18 +1059,17 @@ static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, } static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, - const char *buf, size_t siz) + const char *buf, size_t size) { - char *end; - u64 new = simple_strtoull(buf, &end, 0); + u64 new; - if (end == buf) + if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; bank[attr - bank_attrs] = new; mce_restart(); - return end-buf; + return size; } static ssize_t diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 083f270251f..0c563432e25 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -269,14 +269,12 @@ SHOW_FIELDS(interrupt_enable) SHOW_FIELDS(threshold_limit) static ssize_t -store_interrupt_enable(struct threshold_block *b, const char *buf, size_t count) +store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) { struct thresh_restart tr; unsigned long new; - char *end; - new = simple_strtoul(buf, &end, 0); - if (end == buf) + if (strict_strtoul(buf, 0, &new) < 0) return -EINVAL; b->interrupt_enable = !!new; @@ -287,18 +285,16 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t count) smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - return end - buf; + return size; } static ssize_t -store_threshold_limit(struct threshold_block *b, const char *buf, size_t count) +store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) { struct thresh_restart tr; unsigned long new; - char *end; - new = simple_strtoul(buf, &end, 0); - if (end == buf) + if (strict_strtoul(buf, 0, &new) < 0) return -EINVAL; if (new > THRESHOLD_MAX) @@ -313,7 +309,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t count) smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - return end - buf; + return size; } struct threshold_block_cross_cpu { -- cgit v1.2.3 From cc3aec52ab8e013984270a79d1aa51f691d239b0 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 30 Apr 2009 15:58:22 +0900 Subject: x86, mce: trivial clean up for therm_throt.c This patch removes following checkpatch warning: WARNING: Use #include instead of +#include Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index a2b5d7ddb19..7b1ae2e20ba 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -20,7 +20,6 @@ #include #include -#include /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) -- cgit v1.2.3 From 14a02530e2239f753a0f3f089847e723adbdaa47 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 30 Apr 2009 16:04:51 +0900 Subject: x86, mce: trivial clean up for mce.c This fixs following checkpatch warnings: WARNING: Use #include instead of +#include WARNING: Use #include instead of +#include WARNING: line over 80 characters + set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); WARNING: braces {} are not necessary for any arm of this statement + if (mce_notify_user()) { [...] + } else { [...] Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 4375ffb5459..1d0aa9c4e15 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -27,14 +28,13 @@ #include #include #include +#include #include #include -#include #include #include #include -#include #include "mce.h" @@ -125,7 +125,8 @@ void mce_log(struct mce *mce) * interesting ones: */ if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); + set_bit(MCE_OVERFLOW, + (unsigned long *)&mcelog.flags); return; } /* Old left over entry. Skip: */ @@ -556,11 +557,10 @@ static void mcheck_timer(unsigned long data) * polling interval, otherwise increase the polling interval. */ n = &__get_cpu_var(next_interval); - if (mce_notify_user()) { + if (mce_notify_user()) *n = max(*n/2, HZ/100); - } else { + else *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); - } t->expires = jiffies + *n; add_timer(t); -- cgit v1.2.3 From 34fa1967aa0827776e37feb5666df0327575a0f2 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 8 Apr 2009 12:31:18 +0200 Subject: x86, mce: trivial clean up for mce_amd_64.c Fix for followings: WARNING: Use #include instead of +#include ERROR: Macros with multiple statements should be enclosed in a do - while loop +#define THRESHOLD_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +}; WARNING: usage of NR_CPUS is often wrong - consider using cpu_possible(), num_possible_cpus(), for_each_possible_cpu(), etc + if (cpu >= NR_CPUS) Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 0c563432e25..ddae21620bd 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -24,7 +25,6 @@ #include #include -#include #include #include #include @@ -344,17 +344,13 @@ static ssize_t store_error_count(struct threshold_block *b, return 1; } -#define THRESHOLD_ATTR(_name, _mode, _show, _store) \ -{ \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ +#define RW_ATTR(val) \ +static struct threshold_attr val = { \ + .attr = {.name = __stringify(val), .mode = 0644 }, \ + .show = show_## val, \ + .store = store_## val, \ }; -#define RW_ATTR(name) \ -static struct threshold_attr name = \ - THRESHOLD_ATTR(name, 0644, show_## name, store_## name) - RW_ATTR(interrupt_enable); RW_ATTR(threshold_limit); RW_ATTR(error_count); @@ -675,9 +671,6 @@ static void threshold_remove_device(unsigned int cpu) static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) { - if (cpu >= NR_CPUS) - return; - switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: -- cgit v1.2.3 From 61a021a0700c22ee527d73d92f9acb109ff478f8 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 14 Apr 2009 17:09:04 +0900 Subject: x86, mce: trivial clean up for mce_intel_64.c Fix for: WARNING: space prohibited between function name and open parenthesis '(' + for_each_online_cpu (cpu) { Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 13abafcb72e..eff3740501a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -196,7 +196,7 @@ void cmci_rediscover(int dying) return; cpumask_copy(old, ¤t->cpus_allowed); - for_each_online_cpu (cpu) { + for_each_online_cpu(cpu) { if (cpu == dying) continue; if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) -- cgit v1.2.3 From 98a9c8c3ba13dfc3df8e6d2a126d2fa4e4621e9c Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 28 May 2009 11:41:01 +0900 Subject: x86, mce: trivial clean up for mce-inject.c Fix for: WARNING: Use #include instead of +#include WARNING: usage of NR_CPUS is often wrong - consider using cpu_possible(), num_possible_cpus(), for_each_possible_cpu(), etc + if (m.cpu >= NR_CPUS || !cpu_online(m.cpu)) ERROR: trailing whitespace +/* $ Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-inject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 673c7285502..7b3a5428396 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -11,13 +11,13 @@ * Andi Kleen * Ying Huang */ +#include #include #include #include #include #include #include -#include #include /* Update fake mce registers on current CPU. */ @@ -93,7 +93,7 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, if (copy_from_user(&m, ubuf, usize)) return -EFAULT; - if (m.cpu >= NR_CPUS || !cpu_online(m.cpu)) + if (m.cpu >= num_possible_cpus() || !cpu_online(m.cpu)) return -EINVAL; dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); -- cgit v1.2.3 From c323d95fa4dbe0b6bf6d59e24a0b7db067dd08a7 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Fri, 29 May 2009 13:28:35 +0800 Subject: perf_counter/x86: Always use NMI for performance-monitoring interrupt Always use NMI for performance-monitoring interrupt as there could be racy situations if we switch between irq and nmi mode frequently. Signed-off-by: Yong Wang LKML-Reference: <20090529052835.GA13657@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 19 +++++-------------- 2 files changed, 6 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 89b63b5fad3..60df2efd7c8 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1135,7 +1135,7 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - perf_counters_lapic_init(0); + perf_counters_lapic_init(); preempt_disable(); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2eeaa99add1..316b0c995f3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -604,7 +604,7 @@ try_generic: hwc->counter_base = x86_pmu.perfctr; } - perf_counters_lapic_init(hwc->nmi); + perf_counters_lapic_init(); x86_pmu.disable(hwc, idx); @@ -863,24 +863,15 @@ void set_perf_counter_pending(void) apic->send_IPI_self(LOCAL_PENDING_VECTOR); } -void perf_counters_lapic_init(int nmi) +void perf_counters_lapic_init(void) { - u32 apic_val; - if (!x86_pmu_initialized()) return; /* - * Enable the performance counter vector in the APIC LVT: + * Always use NMI for PMU */ - apic_val = apic_read(APIC_LVTERR); - - apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED); - if (nmi) - apic_write(APIC_LVTPC, APIC_DM_NMI); - else - apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); - apic_write(APIC_LVTERR, apic_val); + apic_write(APIC_LVTPC, APIC_DM_NMI); } static int __kprobes @@ -1054,7 +1045,7 @@ void __init init_hw_perf_counters(void) pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_lapic_init(0); + perf_counters_lapic_init(); register_die_notifier(&perf_counter_nmi_notifier); } -- cgit v1.2.3 From 61c8c67e3ad67ea1d1360f2e88688bd942834756 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 May 2009 14:58:39 -0700 Subject: acpi-cpufreq: fix printk typo and indentation Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Len Brown --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 208ecf6643d..54b6de2cd94 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -693,8 +693,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE && policy->cpuinfo.transition_latency > 20 * 1000) { policy->cpuinfo.transition_latency = 20 * 1000; - printk_once(KERN_INFO "Capping off P-state tranision" - " latency at 20 uS\n"); + printk_once(KERN_INFO + "P-state transition latency capped at 20 uS\n"); } /* table init */ -- cgit v1.2.3 From 58f892e022e88438183c48661dcdc6a2997dab99 Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar Date: Tue, 26 May 2009 21:48:07 +0000 Subject: x86: Print real IOAPIC version for x86-64 Fix the fact that the IOAPIC version number in the x86_64 code path always gets assigned to 0, instead of the correct value. Before the patch: (from "dmesg" output): ACPI: IOAPIC (id[0x08] address[0xfec00000] gsi_base[0]) IOAPIC[0]: apic_id 8, version 0, address 0xfec00000, GSI 0-23 <--- After the patch: ACPI: IOAPIC (id[0x08] address[0xfec00000] gsi_base[0]) IOAPIC[0]: apic_id 8, version 32, address 0xfec00000, GSI 0-23 <--- History: io_apic_get_version() was compiled out of the x86_64 code path in the commit f2c2cca3acef8b253a36381d9b469ad4fb08563a: Author: Andi Kleen Date: Tue Sep 26 10:52:37 2006 +0200 [PATCH] Remove APIC version/cpu capability mpparse checking/printing ACPI went to great trouble to get the APIC version and CPU capabilities of different CPUs before passing them to the mpparser. But all that data was used was to print it out. Actually it even faked some data based on the boot cpu, not on the actual CPU being booted. Remove all this code because it's not needed. Cc: len.brown@intel.com At the time, the IOAPIC version number was deliberately not printed in the x86_64 code path. However, after the x86 and x86_64 files were merged, the net result is that the IOAPIC version is printed incorrectly in the x86_64 code path. The patch below provides a fix. I have tested it with acpi, and with acpi=off, and did not see any problems. Signed-off-by: Naga Chumbalkar Acked-by: Yinghai Lu LKML-Reference: <20090416014230.4885.94926.sendpatchset@localhost.localdomain> Signed-off-by: Ingo Molnar ************************* --- arch/x86/kernel/acpi/boot.c | 5 +---- arch/x86/kernel/apic/io_apic.c | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 844e5e25213..631086159c5 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -985,11 +985,8 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); mp_ioapics[idx].apicid = uniq_ioapic_id(id); -#ifdef CONFIG_X86_32 mp_ioapics[idx].apicver = io_apic_get_version(idx); -#else - mp_ioapics[idx].apicver = 0; -#endif + /* * Build basic GSI lookup table to facilitate gsi->io_apic lookups * and to prevent reprogramming of IOAPIC pins (PCI GSIs). diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ac7f3b6ad58..f712f8ff403 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -4012,6 +4012,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) return apic_id; } +#endif int __init io_apic_get_version(int ioapic) { @@ -4024,7 +4025,6 @@ int __init io_apic_get_version(int ioapic) return reg_01.bits.version; } -#endif int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) { -- cgit v1.2.3 From 3d58829b0510244596079c1d2f1762c53aef2e97 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 28 May 2009 09:54:47 +0200 Subject: x86, apic: Restore irqs on fail paths lapic_resume forgets to restore interrupts on fail paths. Fix that. Signed-off-by: Jiri Slaby Acked-by: Cyrill Gorcunov LKML-Reference: <1243497289-18591-1-git-send-email-jirislaby@gmail.com> Signed-off-by: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/kernel/apic/apic.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b0fd26442c4..e82488d3f0b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2027,7 +2027,7 @@ static int lapic_resume(struct sys_device *dev) unsigned int l, h; unsigned long flags; int maxlvt; - int ret; + int ret = 0; struct IO_APIC_route_entry **ioapic_entries = NULL; if (!apic_pm_state.active) @@ -2038,14 +2038,15 @@ static int lapic_resume(struct sys_device *dev) ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { WARN(1, "Alloc ioapic_entries in lapic resume failed."); - return -ENOMEM; + ret = -ENOMEM; + goto restore; } ret = save_IO_APIC_setup(ioapic_entries); if (ret) { WARN(1, "Saving IO-APIC state failed: %d\n", ret); free_ioapic_entries(ioapic_entries); - return ret; + goto restore; } mask_IO_APIC_setup(ioapic_entries); @@ -2097,10 +2098,10 @@ static int lapic_resume(struct sys_device *dev) restore_IO_APIC_setup(ioapic_entries); free_ioapic_entries(ioapic_entries); } - +restore: local_irq_restore(flags); - return 0; + return ret; } /* -- cgit v1.2.3 From b23f3325ed465f1bd914384884269af0d106778c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:13:03 +0200 Subject: perf_counter: Rename various fields A few renames: s/irq_period/sample_period/ s/irq_freq/sample_freq/ s/PERF_RECORD_/PERF_SAMPLE_/ s/record_type/sample_type/ And change both the new sample_type and read_format to u64. Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 316b0c995f3..ec06aa5e928 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -290,11 +290,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; hw_event->nmi = 1; - if (!hwc->irq_period) - hwc->irq_period = x86_pmu.max_period; + if (!hwc->sample_period) + hwc->sample_period = x86_pmu.max_period; atomic64_set(&hwc->period_left, - min(x86_pmu.max_period, hwc->irq_period)); + min(x86_pmu.max_period, hwc->sample_period)); /* * Raw event type provide the config in the event structure @@ -462,7 +462,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s64 period = min(x86_pmu.max_period, hwc->irq_period); + s64 period = min(x86_pmu.max_period, hwc->sample_period); int err; /* -- cgit v1.2.3 From 8a016db386195b193e2a8aeddff9fe937dcb7a40 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:27:45 +0200 Subject: perf_counter: Remove the last nmi/irq bits IRQ (non-NMI) sampling is not used anymore - remove the last few bits. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ec06aa5e928..9e144fbebd2 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -284,12 +284,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!hw_event->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - /* - * Use NMI events all the time: - */ - hwc->nmi = 1; - hw_event->nmi = 1; - if (!hwc->sample_period) hwc->sample_period = x86_pmu.max_period; -- cgit v1.2.3 From e4abb5d4f7ddabc1fc7c392cf0a10d8e5868c9ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 16:08:20 +0200 Subject: perf_counter: x86: Emulate longer sample periods Do as Power already does, emulate sample periods up to 2^63-1 by composing them of smaller values limited by hardware capabilities. Only once we wrap the software period do we generate an overflow event. Just 10 lines of new code. Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9e144fbebd2..904571bea71 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -287,8 +287,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!hwc->sample_period) hwc->sample_period = x86_pmu.max_period; - atomic64_set(&hwc->period_left, - min(x86_pmu.max_period, hwc->sample_period)); + atomic64_set(&hwc->period_left, hwc->sample_period); /* * Raw event type provide the config in the event structure @@ -451,13 +450,13 @@ static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); * Set the next IRQ period, based on the hwc->period_left value. * To be called with the counter disabled in hw: */ -static void +static int x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s64 period = min(x86_pmu.max_period, hwc->sample_period); - int err; + s64 period = hwc->sample_period; + int err, ret = 0; /* * If we are way outside a reasoable range then just skip forward: @@ -465,11 +464,13 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (unlikely(left <= -period)) { left = period; atomic64_set(&hwc->period_left, left); + ret = 1; } if (unlikely(left <= 0)) { left += period; atomic64_set(&hwc->period_left, left); + ret = 1; } /* * Quirk: certain CPUs dont like it if just 1 event is left: @@ -477,6 +478,9 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (unlikely(left < 2)) left = 2; + if (left > x86_pmu.max_period) + left = x86_pmu.max_period; + per_cpu(prev_left[idx], smp_processor_id()) = left; /* @@ -487,6 +491,8 @@ x86_perf_counter_set_period(struct perf_counter *counter, err = checking_wrmsrl(hwc->counter_base + idx, (u64)(-left) & x86_pmu.counter_mask); + + return ret; } static inline void @@ -706,16 +712,19 @@ static void x86_pmu_disable(struct perf_counter *counter) * Save and restart an expired counter. Called by NMI contexts, * so it has to be careful about preempting normal counter ops: */ -static void intel_pmu_save_and_restart(struct perf_counter *counter) +static int intel_pmu_save_and_restart(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; + int ret; x86_perf_counter_update(counter, hwc, idx); - x86_perf_counter_set_period(counter, hwc, idx); + ret = x86_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) intel_pmu_enable_counter(hwc, idx); + + return ret; } static void intel_pmu_reset(void) @@ -782,7 +791,9 @@ again: if (!test_bit(bit, cpuc->active_mask)) continue; - intel_pmu_save_and_restart(counter); + if (!intel_pmu_save_and_restart(counter)) + continue; + if (perf_counter_overflow(counter, nmi, regs, 0)) intel_pmu_disable_counter(&counter->hw, bit); } @@ -824,9 +835,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) continue; /* counter overflow */ - x86_perf_counter_set_period(counter, hwc, idx); handled = 1; inc_irq_stat(apic_perf_irqs); + if (!x86_perf_counter_set_period(counter, hwc, idx)) + continue; + if (perf_counter_overflow(counter, nmi, regs, 0)) amd_pmu_disable_counter(hwc, idx); } -- cgit v1.2.3 From 0d48696f87e3618b0d35bd3e4e9d7c188d51e7de Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 19:22:16 +0200 Subject: perf_counter: Rename perf_counter_hw_event => perf_counter_attr The structure isn't hw only and when I read event, I think about those things that fall out the other end. Rename the thing. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 904571bea71..e16e8c13132 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -247,11 +247,11 @@ static inline int x86_pmu_initialized(void) } /* - * Setup the hardware configuration for a given hw_event_type + * Setup the hardware configuration for a given attr_type */ static int __hw_perf_counter_init(struct perf_counter *counter) { - struct perf_counter_hw_event *hw_event = &counter->hw_event; + struct perf_counter_attr *attr = &counter->attr; struct hw_perf_counter *hwc = &counter->hw; int err; @@ -279,9 +279,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Count user and OS events unless requested not to. */ - if (!hw_event->exclude_user) + if (!attr->exclude_user) hwc->config |= ARCH_PERFMON_EVENTSEL_USR; - if (!hw_event->exclude_kernel) + if (!attr->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; if (!hwc->sample_period) @@ -292,15 +292,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (perf_event_raw(hw_event)) { - hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event)); + if (perf_event_raw(attr)) { + hwc->config |= x86_pmu.raw_event(perf_event_config(attr)); } else { - if (perf_event_id(hw_event) >= x86_pmu.max_events) + if (perf_event_id(attr) >= x86_pmu.max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= x86_pmu.event_map(perf_event_id(hw_event)); + hwc->config |= x86_pmu.event_map(perf_event_id(attr)); } counter->destroy = hw_perf_counter_destroy; -- cgit v1.2.3 From a32881066e58346f2901afe0ebdfbf0c562877e5 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Wed, 3 Jun 2009 13:12:55 +0800 Subject: perf_counter/x86: Remove the IRQ (non-NMI) handling bits Remove the IRQ (non-NMI) handling bits as NMI will be used always. Signed-off-by: Yong Wang Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090603051255.GA2791@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 21 ++++++--------------- arch/x86/kernel/entry_64.S | 2 -- arch/x86/kernel/irqinit_32.c | 1 - arch/x86/kernel/irqinit_64.c | 1 - 4 files changed, 6 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index e16e8c13132..12cc05ed9f4 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -40,7 +40,7 @@ struct cpu_hw_counters { struct x86_pmu { const char *name; int version; - int (*handle_irq)(struct pt_regs *, int); + int (*handle_irq)(struct pt_regs *); void (*disable_all)(void); void (*enable_all)(void); void (*enable)(struct hw_perf_counter *, int); @@ -755,7 +755,7 @@ static void intel_pmu_reset(void) * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: */ -static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) +static int intel_pmu_handle_irq(struct pt_regs *regs) { struct cpu_hw_counters *cpuc; struct cpu_hw_counters; @@ -794,7 +794,7 @@ again: if (!intel_pmu_save_and_restart(counter)) continue; - if (perf_counter_overflow(counter, nmi, regs, 0)) + if (perf_counter_overflow(counter, 1, regs, 0)) intel_pmu_disable_counter(&counter->hw, bit); } @@ -812,7 +812,7 @@ again: return 1; } -static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) +static int amd_pmu_handle_irq(struct pt_regs *regs) { int cpu, idx, handled = 0; struct cpu_hw_counters *cpuc; @@ -840,22 +840,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) if (!x86_perf_counter_set_period(counter, hwc, idx)) continue; - if (perf_counter_overflow(counter, nmi, regs, 0)) + if (perf_counter_overflow(counter, 1, regs, 0)) amd_pmu_disable_counter(hwc, idx); } return handled; } -void smp_perf_counter_interrupt(struct pt_regs *regs) -{ - irq_enter(); - apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); - ack_APIC_irq(); - x86_pmu.handle_irq(regs, 0); - irq_exit(); -} - void smp_perf_pending_interrupt(struct pt_regs *regs) { irq_enter(); @@ -910,7 +901,7 @@ perf_counter_nmi_handler(struct notifier_block *self, * If the first NMI handles both, the latter will be empty and daze * the CPU. */ - x86_pmu.handle_irq(regs, 1); + x86_pmu.handle_irq(regs); return NOTIFY_STOP; } diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 89100461914..7985c010f8a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1026,8 +1026,6 @@ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt #ifdef CONFIG_PERF_COUNTERS -apicinterrupt LOCAL_PERF_VECTOR \ - perf_counter_interrupt smp_perf_counter_interrupt apicinterrupt LOCAL_PENDING_VECTOR \ perf_pending_interrupt smp_perf_pending_interrupt #endif diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 3190a6b961e..205bdd880d3 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -165,7 +165,6 @@ static void __init apic_intr_init(void) alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); # ifdef CONFIG_PERF_COUNTERS - alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); # endif diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 53ceb26f80f..fa6ef692000 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -155,7 +155,6 @@ static void __init apic_intr_init(void) /* Performance monitoring interrupt: */ #ifdef CONFIG_PERF_COUNTERS - alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); #endif } -- cgit v1.2.3 From 367d04c4ec02dad34d80452e32e3370db7fb6fee Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 28 May 2009 09:54:48 +0200 Subject: amd_iommu: fix lock imbalance In alloc_coherent there is an omitted unlock on the path where mapping fails. Add the unlock. [ Impact: fix lock imbalance in alloc_coherent ] Signed-off-by: Jiri Slaby Cc: Joerg Roedel Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index d6898833c36..9f89bb645b3 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1541,8 +1541,10 @@ static void *alloc_coherent(struct device *dev, size_t size, *dma_addr = __map_single(dev, iommu, domain->priv, paddr, size, DMA_BIDIRECTIONAL, true, dma_mask); - if (*dma_addr == bad_dma_address) + if (*dma_addr == bad_dma_address) { + spin_unlock_irqrestore(&domain->lock, flags); goto out_free; + } iommu_completion_wait(iommu); -- cgit v1.2.3 From 0e2595cdfd7df9f1128f7185152601ae5417483b Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 20 May 2009 08:10:57 -0500 Subject: x86: Fix UV BAU activation descriptor init The UV tlb shootdown code has a serious initialization error. An array of structures [32*8] is initialized as if it were [32]. The array is indexed by (cpu number on the blade)*8, so the short initialization works for up to 4 cpus on a blade. But above that, we provide an invalid opcode to the hub's broadcast assist unit. This patch changes the allocation of the array to use its symbolic dimensions for better clarity. And initializes all 32*8 entries. Shortened 'UV_ACTIVATION_DESCRIPTOR_SIZE' to 'UV_ADP_SIZE' per Ingo's recommendation. Tested on the UV simulator. Signed-off-by: Cliff Wickman Cc: LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index ed0c33761e6..16f0fd4f18e 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -715,7 +715,12 @@ uv_activation_descriptor_init(int node, int pnode) struct bau_desc *adp; struct bau_desc *ad2; - adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node); + /* + * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) + * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade + */ + adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* + UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); BUG_ON(!adp); pa = uv_gpa(adp); /* need the real nasid*/ @@ -729,7 +734,13 @@ uv_activation_descriptor_init(int node, int pnode) (n << UV_DESC_BASE_PNODE_SHIFT | m)); } - for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) { + /* + * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each + * cpu even though we only use the first one; one descriptor can + * describe a broadcast to 256 nodes. + */ + for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); + i++, ad2++) { memset(ad2, 0, sizeof(struct bau_desc)); ad2->header.sw_ack_flag = 1; /* -- cgit v1.2.3 From 128f048f0f0d2a477ad2555e7acd2ad15a1b6061 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2009 22:19:36 +0200 Subject: perf_counter: Fix throttling lock-up Throttling logic is broken and we can lock up with too small hw sampling intervals. Make the throttling code more robust: disable counters even if we already disabled them. ( Also clean up whitespace damage i noticed while reading various pieces of code related to throttling. ) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 12cc05ed9f4..8f53f3a7da2 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -91,7 +91,7 @@ static u64 intel_pmu_raw_event(u64 event) #define CORE_EVNTSEL_INV_MASK 0x00800000ULL #define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL -#define CORE_EVNTSEL_MASK \ +#define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ CORE_EVNTSEL_UNIT_MASK | \ CORE_EVNTSEL_EDGE_MASK | \ -- cgit v1.2.3 From 01ca79f1411eae2a45352709c838b946b1af9fbd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:52 +0200 Subject: x86, mce: add machine check exception count in /proc/interrupts Useful for debugging, but it's also good general policy to have a counter for all special interrupts there. This makes it easier to diagnose where a CPU is spending its time. [ Impact: feature, debugging tool ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++++ arch/x86/kernel/irq.c | 10 ++++++++++ 2 files changed, 14 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1d0aa9c4e15..287268d2183 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -57,6 +57,8 @@ int mce_disabled; atomic_t mce_entry; +DEFINE_PER_CPU(unsigned, mce_exception_count); + /* * Tolerant levels: * 0: always panic on uncorrected errors, log corrected errors @@ -359,6 +361,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) atomic_inc(&mce_entry); + __get_cpu_var(mce_exception_count)++; + if (notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL) == NOTIFY_STOP) goto out; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index a05660bf029..05fc635c28c 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -12,6 +12,7 @@ #include #include #include +#include atomic_t irq_err_count; @@ -93,6 +94,12 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); # endif +#endif +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) + seq_printf(p, "%*s: ", prec, "MCE"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); + seq_printf(p, " Machine check exceptions\n"); #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) @@ -161,6 +168,9 @@ u64 arch_irq_stat_cpu(unsigned int cpu) { u64 sum = irq_stats(cpu)->__nmi_count; +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) + sum += per_cpu(mce_exception_count, cpu); +#endif #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; -- cgit v1.2.3 From ca84f69697da0f004135e45b63ca560b6bd3554e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:57 +0200 Subject: x86, mce: add MCE poll count to /proc/interrupts Keep a count of the machine check polls (or CMCI events) in /proc/interrupts. Andi needs this for debugging, but it's also useful in general to see what's going in by the kernel. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++++ arch/x86/kernel/irq.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 287268d2183..784f6ae9d6f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -264,6 +264,8 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } } +DEFINE_PER_CPU(unsigned, mce_poll_count); + /* * Poll for corrected events or events that happened before reset. * Those are just logged through /dev/mcelog. @@ -275,6 +277,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) struct mce m; int i; + __get_cpu_var(mce_poll_count)++; + mce_setup(&m); m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 05fc635c28c..eff46b5de62 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -100,6 +100,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); seq_printf(p, " Machine check exceptions\n"); + seq_printf(p, "%*s: ", prec, "MCP"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); + seq_printf(p, " Machine check polls\n"); #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) -- cgit v1.2.3 From f6fb0ac0869500323c78fa21992fe1933af61e91 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:55 +0200 Subject: x86, mce: store record length into memory struct mce anchor This makes it easier for tools who want to extract the mcelog out of crash images or memory dumps to adapt to changing struct mce size. The length field replaces padding, so it's fully compatible. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 784f6ae9d6f..3db047e7a0f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -108,8 +108,9 @@ EXPORT_PER_CPU_SYMBOL_GPL(injectm); */ static struct mce_log mcelog = { - MCE_LOG_SIGNATURE, - MCE_LOG_LEN, + .signature = MCE_LOG_SIGNATURE, + .len = MCE_LOG_LEN, + .recordlen = sizeof(struct mce), }; void mce_log(struct mce *mce) -- cgit v1.2.3 From d620c67fb92aa11736112f9a03e31d8e3079c57a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:56 +0200 Subject: x86, mce: support more than 256 CPUs in struct mce The old struct mce had a limitation to 256 CPUs. But x86 Linux supports more than that now with x2apic. Add a new field extcpu to report the extended number. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-inject.c | 10 +++++----- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 7b3a5428396..7d858fb4ce6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -23,14 +23,14 @@ /* Update fake mce registers on current CPU. */ static void inject_mce(struct mce *m) { - struct mce *i = &per_cpu(injectm, m->cpu); + struct mce *i = &per_cpu(injectm, m->extcpu); /* Make sure noone reads partially written injectm */ i->finished = 0; mb(); m->finished = 0; /* First set the fields after finished */ - i->cpu = m->cpu; + i->extcpu = m->extcpu; mb(); /* Now write record in order, finished last (except above) */ memcpy(i, m, sizeof(struct mce)); @@ -49,7 +49,7 @@ static void raise_mce(unsigned long data) { struct delayed_mce *dm = (struct delayed_mce *)data; struct mce *m = &dm->m; - int cpu = m->cpu; + int cpu = m->extcpu; inject_mce(m); if (m->status & MCI_STATUS_UC) { @@ -93,7 +93,7 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, if (copy_from_user(&m, ubuf, usize)) return -EFAULT; - if (m.cpu >= num_possible_cpus() || !cpu_online(m.cpu)) + if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) return -EINVAL; dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); @@ -108,7 +108,7 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, memcpy(&dm->m, &m, sizeof(struct mce)); setup_timer(&dm->timer, raise_mce, (unsigned long)dm); dm->timer.expires = jiffies + 2; - add_timer_on(&dm->timer, m.cpu); + add_timer_on(&dm->timer, m.extcpu); return usize; } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3db047e7a0f..2c4dd6c422c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -94,7 +94,7 @@ static inline int skip_bank_init(int i) void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); - m->cpu = smp_processor_id(); + m->cpu = m->extcpu = smp_processor_id(); rdtscll(m->tsc); } @@ -158,7 +158,7 @@ static void print_mce(struct mce *m) KERN_EMERG "HARDWARE ERROR\n" KERN_EMERG "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", - m->cpu, m->mcgstatus, m->bank, m->status); + m->extcpu, m->mcgstatus, m->bank, m->status); if (m->ip) { printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", -- cgit v1.2.3 From 8ee08347c1e8b5680b3b3ce081e42e97bcaa1abe Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:56 +0200 Subject: x86, mce: extend struct mce user interface with more information. Experience has shown that struct mce which is used to pass an machine check to the user space daemon currently a few limitations. Also some data which is useful to print at panic level is also missing. This patch addresses most of them. The same information is also printed out together with mce panic. struct mce can be painlessly extended in a compatible way, the mcelog user space code just ignores additional fields with a warning. - It doesn't provide a wall time timestamp. There have been a few complaints about that. Fix that by adding a 64bit time_t - It doesn't provide the exact CPU identification. This makes it awkward for mcelog to decode the event correctly, especially when there are variations in the supported MCE codes on different CPU models or when mcelog is running on a different host after a panic. Previously the administrator had to specify the correct CPU when mcelog ran on a different host, but with the more variation in machine checks now it's better to auto detect that. It's also useful for more detailed analysis of CPU events. Pass CPUID 1.EAX and the cpu vendor (as encoded in processor.h) instead. - Socket ID and initial APIC ID are useful to report because they allow to identify the failing CPU in some (not all) cases. This is also especially useful for the panic situation. This addresses one of the complaints from Thomas Gleixner earlier. - The MCG capabilities MSR needs to be reported for some advanced error processing in mcelog Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2c4dd6c422c..ba68449c22a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -96,6 +96,15 @@ void mce_setup(struct mce *m) memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); rdtscll(m->tsc); + /* We hope get_seconds stays lockless */ + m->time = get_seconds(); + m->cpuvendor = boot_cpu_data.x86_vendor; + m->cpuid = cpuid_eax(1); +#ifdef CONFIG_SMP + m->socketid = cpu_data(m->extcpu).phys_proc_id; +#endif + m->apicid = cpu_data(m->extcpu).initial_apicid; + rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); } DEFINE_PER_CPU(struct mce, injectm); @@ -173,6 +182,9 @@ static void print_mce(struct mce *m) if (m->misc) printk("MISC %llx ", m->misc); printk("\n"); + printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, + m->apicid); printk(KERN_EMERG "This is not a software problem!\n"); printk(KERN_EMERG "Run through mcelog --ascii to decode " "and contact your hardware vendor\n"); -- cgit v1.2.3 From de8a84d85ad8bb46d01d72ebc57030b95075603c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:53 +0200 Subject: x86, mce: log corrected errors when panicing Normally the machine check handler ignores corrected errors and leaves them to machine_check_poll(). But when panicing mcp won't run, so log all errors. Note: this can still miss some cases until the "early no way out" patch later is applied too. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ba68449c22a..86806e52fc4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -412,9 +412,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* * Non uncorrected errors are handled by machine_check_poll - * Leave them alone. + * Leave them alone, unless this panics. */ - if ((m.status & MCI_STATUS_UC) == 0) + if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out) continue; /* -- cgit v1.2.3 From a0189c70e5f17f4253dd7bc575c97469900e23d6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:54 +0200 Subject: x86, mce: remove TSC print heuristic Previously mce_panic used a simple heuristic to avoid printing old so far unreported machine check events on a mce panic. This worked by comparing the TSC value at the start of the machine check handler with the event time stamp and only printing newer ones. This has a couple of issues, in particular on systems where the TSC is not fully synchronized between CPUs it could lose events or print old ones. It is also problematic with full system synchronization as it is added by the next patch. Remove the TSC heuristic and instead replace it with a simple heuristic to print corrected errors first and after that uncorrected errors and finally the worst machine check as determined by the machine check handler. This simplifies the code because there is no need to pass the original TSC value around. Contains fixes from Ying Huang [ Impact: bug fix, cleanup ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Cc: Ying Huang Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 86806e52fc4..6773610061d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -158,6 +158,7 @@ void mce_log(struct mce *mce) mcelog.entry[entry].finished = 1; wmb(); + mce->finished = 1; set_bit(0, ¬ify_user); } @@ -190,23 +191,29 @@ static void print_mce(struct mce *m) "and contact your hardware vendor\n"); } -static void mce_panic(char *msg, struct mce *backup, u64 start) +static void mce_panic(char *msg, struct mce *final) { int i; bust_spinlocks(1); console_verbose(); + /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { - u64 tsc = mcelog.entry[i].tsc; - - if ((s64)(tsc - start) < 0) + struct mce *m = &mcelog.entry[i]; + if ((m->status & MCI_STATUS_VAL) && + !(m->status & MCI_STATUS_UC)) + print_mce(m); + } + /* Now print uncorrected but with the final one last */ + for (i = 0; i < MCE_LOG_LEN; i++) { + struct mce *m = &mcelog.entry[i]; + if (!(m->status & MCI_STATUS_VAL)) continue; - print_mce(&mcelog.entry[i]); - if (backup && mcelog.entry[i].tsc == backup->tsc) - backup = NULL; + if (!final || memcmp(m, final, sizeof(struct mce))) + print_mce(m); } - if (backup) - print_mce(backup); + if (final) + print_mce(final); panic(msg); } @@ -362,7 +369,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) { struct mce m, panicm; int panicm_found = 0; - u64 mcestart = 0; int i; /* * If no_way_out gets set, there is no safe way to recover from this @@ -394,7 +400,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (!(m.mcgstatus & MCG_STATUS_RIPV)) no_way_out = 1; - rdtscll(mcestart); barrier(); for (i = 0; i < banks; i++) { @@ -478,7 +483,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * has not set tolerant to an insane level, give up and die. */ if (no_way_out && tolerant < 3) - mce_panic("Machine check", &panicm, mcestart); + mce_panic("Machine check", &panicm); /* * If the error seems to be unrecoverable, something should be @@ -506,8 +511,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (user_space) { force_sig(SIGBUS, current); } else if (panic_on_oops || tolerant < 2) { - mce_panic("Uncorrected machine check", - &panicm, mcestart); + mce_panic("Uncorrected machine check", &panicm); } } -- cgit v1.2.3 From 817f32d02a52dd7f5941534e0699883691e918df Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:54 +0200 Subject: x86, mce: add table driven machine check grading The machine check grading (as in deciding what should be done for a given register value) has to be done multiple times soon and it's also getting more complicated. So it makes sense to consolidate it into a single function. To get smaller and more straight forward and possibly more extensible code I opted towards a new table driven method. The various rules are put into a table when is then executed by a very simple interpreter. The grading engine is in a new file mce-severity.c. I also added a private include file mce-internal.h, because mce.h is already a bit too cluttered. This is dead code right now, but will be used in followon patches. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 1 + arch/x86/kernel/cpu/mcheck/mce-internal.h | 10 +++++ arch/x86/kernel/cpu/mcheck/mce-severity.c | 61 +++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 arch/x86/kernel/cpu/mcheck/mce-internal.h create mode 100644 arch/x86/kernel/cpu/mcheck/mce-severity.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 60ee182c6c5..45004faf67e 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,5 +1,6 @@ obj-y = mce.o therm_throt.o +obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h new file mode 100644 index 00000000000..f126b4ae7a2 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -0,0 +1,10 @@ +#include + +enum severity_level { + MCE_NO_SEVERITY, + MCE_SOME_SEVERITY, + MCE_UC_SEVERITY, + MCE_PANIC_SEVERITY, +}; + +int mce_severity(struct mce *a, int tolerant, char **msg); diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c new file mode 100644 index 00000000000..c189e89a89a --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -0,0 +1,61 @@ +/* + * MCE grading rules. + * Copyright 2008, 2009 Intel Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Author: Andi Kleen + */ +#include +#include + +#include "mce-internal.h" + +/* + * Grade an mce by severity. In general the most severe ones are processed + * first. Since there are quite a lot of combinations test the bits in a + * table-driven way. The rules are simply processed in order, first + * match wins. + */ + +static struct severity { + u64 mask; + u64 result; + unsigned char sev; + unsigned char mcgmask; + unsigned char mcgres; + char *msg; +} severities[] = { +#define SEV(s) .sev = MCE_ ## s ## _SEVERITY +#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } +#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } +#define MCGMASK(x, res, s, m, r...) \ + { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } + BITCLR(MCI_STATUS_VAL, NO, "Invalid"), + BITCLR(MCI_STATUS_EN, NO, "Not enabled"), + BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), + MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "No restart IP"), + BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), + BITSET(MCI_STATUS_UC, UC, "Uncorrected"), + BITSET(0, SOME, "No match") /* always matches. keep at end */ +}; + +int mce_severity(struct mce *a, int tolerant, char **msg) +{ + struct severity *s; + for (s = severities;; s++) { + if ((a->status & s->mask) != s->result) + continue; + if ((a->mcgstatus & s->mcgmask) != s->mcgres) + continue; + if (s->sev > MCE_NO_SEVERITY && (a->status & MCI_STATUS_UC) && + tolerant < 1) + return MCE_PANIC_SEVERITY; + if (msg) + *msg = s->msg; + return s->sev; + } +} -- cgit v1.2.3 From bd19a5e6b73df276e1ccedf9059e9ee70c372d7d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:55 +0200 Subject: x86, mce: check early in exception handler if panic is needed The exception handler should behave differently if the exception is fatal versus one that can be returned from. In the first case it should never clear any registers because these need to be preserved for logging after the next boot. Otherwise it should clear them on each CPU step by step so that other CPUs sharing the same bank don't see duplicate events. Otherwise we risk reporting events multiple times on any CPUs which have shared machine check banks, which is a common problem on Intel Nehalem which has both SMT (two CPU threads sharing banks) and shared machine check banks in the uncore. Determine early in a special pass if any event requires a panic. This uses the mce_severity() function added earlier. This is needed for the next patch. Also fixes a problem together with an earlier patch that corrected events weren't logged on a fatal MCE. [ Impact: Feature ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 6773610061d..5031814ac94 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -36,6 +36,7 @@ #include #include +#include "mce-internal.h" #include "mce.h" /* Handle unconfigured int18 (should never happen) */ @@ -191,7 +192,7 @@ static void print_mce(struct mce *m) "and contact your hardware vendor\n"); } -static void mce_panic(char *msg, struct mce *final) +static void mce_panic(char *msg, struct mce *final, char *exp) { int i; @@ -214,6 +215,8 @@ static void mce_panic(char *msg, struct mce *final) } if (final) print_mce(final); + if (exp) + printk(KERN_EMERG "Machine check: %s\n", exp); panic(msg); } @@ -357,6 +360,22 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) } EXPORT_SYMBOL_GPL(machine_check_poll); +/* + * Do a quick check if any of the events requires a panic. + * This decides if we keep the events around or clear them. + */ +static int mce_no_way_out(struct mce *m, char **msg) +{ + int i; + + for (i = 0; i < banks; i++) { + m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) + return 1; + } + return 0; +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -381,6 +400,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) */ int kill_it = 0; DECLARE_BITMAP(toclear, MAX_NR_BANKS); + char *msg = "Unknown"; atomic_inc(&mce_entry); @@ -395,10 +415,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_setup(&m); m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - - /* if the restart IP is not valid, we're done for */ - if (!(m.mcgstatus & MCG_STATUS_RIPV)) - no_way_out = 1; + no_way_out = mce_no_way_out(&m, &msg); barrier(); @@ -430,18 +447,13 @@ void do_machine_check(struct pt_regs *regs, long error_code) __set_bit(i, toclear); if (m.status & MCI_STATUS_EN) { - /* if PCC was set, there's no way out */ - no_way_out |= !!(m.status & MCI_STATUS_PCC); /* * If this error was uncorrectable and there was * an overflow, we're in trouble. If no overflow, * we might get away with just killing a task. */ - if (m.status & MCI_STATUS_UC) { - if (tolerant < 1 || m.status & MCI_STATUS_OVER) - no_way_out = 1; + if (m.status & MCI_STATUS_UC) kill_it = 1; - } } else { /* * Machine check event was not enabled. Clear, but @@ -483,7 +495,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * has not set tolerant to an insane level, give up and die. */ if (no_way_out && tolerant < 3) - mce_panic("Machine check", &panicm); + mce_panic("Machine check", &panicm, msg); /* * If the error seems to be unrecoverable, something should be @@ -511,7 +523,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (user_space) { force_sig(SIGBUS, current); } else if (panic_on_oops || tolerant < 2) { - mce_panic("Uncorrected machine check", &panicm); + mce_panic("Uncorrected machine check", &panicm, msg); } } -- cgit v1.2.3 From ccc3c3192ae78dd56dcdf5353fd1a9ef5f9a3e2b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:54 +0200 Subject: x86, mce: implement bootstrapping for machine check wakeups Machine checks support waking up the mcelog daemon quickly. The original wake up code for this was pretty ugly, relying on a idle notifier and a special process flag. The reason it did it this way is that the machine check handler is not subject to normal interrupt locking rules so it's not safe to call wake_up(). Instead it set a process flag and then either did the wakeup in the syscall return or in the idle notifier. This patch adds a new "bootstraping" method as replacement. The idea is that the handler checks if it's in a state where it is unsafe to call wake_up(). If it's safe it calls it directly. When it's not safe -- that is it interrupted in a critical section with interrupts disables -- it uses a new "self IPI" to trigger an IPI to its own CPU. This can be done safely because IPI triggers are atomic with some care. The IPI is raised once the interrupts are reenabled and can then safely call wake_up(). When APICs are disabled the event is just queued and will be picked up eventually by the next polling timer. I think that's a reasonable compromise, since it should only happen quite rarely. Contains fixes from Ying Huang. [ solve conflict on irqinit, make it work on 32bit (entry_arch.h) - HS ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 54 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/entry_64.S | 5 ++++ arch/x86/kernel/irqinit.c | 3 +++ 3 files changed, 62 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5031814ac94..12178162785 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -32,7 +33,10 @@ #include #include +#include +#include #include +#include #include #include @@ -287,6 +291,54 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } } +#ifdef CONFIG_X86_LOCAL_APIC +/* + * Called after interrupts have been reenabled again + * when a MCE happened during an interrupts off region + * in the kernel. + */ +asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) +{ + ack_APIC_irq(); + exit_idle(); + irq_enter(); + mce_notify_user(); + irq_exit(); +} +#endif + +static void mce_report_event(struct pt_regs *regs) +{ + if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { + mce_notify_user(); + return; + } + +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Without APIC do not notify. The event will be picked + * up eventually. + */ + if (!cpu_has_apic) + return; + + /* + * When interrupts are disabled we cannot use + * kernel services safely. Trigger an self interrupt + * through the APIC to instead do the notification + * after interrupts are reenabled again. + */ + apic->send_IPI_self(MCE_SELF_VECTOR); + + /* + * Wait for idle afterwards again so that we don't leave the + * APIC in a non idle state because the normal APIC writes + * cannot exclude us. + */ + apic_wait_icr_idle(); +#endif +} + DEFINE_PER_CPU(unsigned, mce_poll_count); /* @@ -530,6 +582,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* notify userspace ASAP */ set_thread_flag(TIF_MCE_NOTIFY); + mce_report_event(regs); + /* the last thing we do is clear state */ for (i = 0; i < banks; i++) { if (test_bit(i, toclear)) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a31a7f29cff..711c130a841 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1011,6 +1011,11 @@ apicinterrupt THRESHOLD_APIC_VECTOR \ apicinterrupt THERMAL_APIC_VECTOR \ thermal_interrupt smp_thermal_interrupt +#ifdef CONFIG_X86_MCE +apicinterrupt MCE_SELF_VECTOR \ + mce_self_interrupt smp_mce_self_interrupt +#endif + #ifdef CONFIG_SMP apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ call_function_single_interrupt smp_call_function_single_interrupt diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index aab3d277766..441f6ec6e9d 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -187,6 +187,9 @@ static void __init apic_intr_init(void) #ifdef CONFIG_X86_THRESHOLD alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif +#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) + alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); +#endif #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* self generated IPI for local APIC timer */ -- cgit v1.2.3 From f94b61c2c9fdcc90773c49df9ccf9ede3ad0d7db Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:55 +0200 Subject: x86, mce: implement panic synchronization In some circumstances multiple CPUs can enter mce_panic() in parallel. This gives quite confused output because they will all dump the same machine check buffer. The other problem is that they would all panic in parallel, but not process each other's shutdown IPIs because interrupts are disabled. Detect this situation early on in mce_panic(). On the first CPU entering will do the panic, the others will just wait to be killed. For paranoia reasons in case the other CPU dies during the MCE I added a 5 seconds timeout. If it expires each CPU will panic on its own again. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 12178162785..421020f1d7d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -196,10 +196,32 @@ static void print_mce(struct mce *m) "and contact your hardware vendor\n"); } +#define PANIC_TIMEOUT 5 /* 5 seconds */ + +static atomic_t mce_paniced; + +/* Panic in progress. Enable interrupts and wait for final IPI */ +static void wait_for_panic(void) +{ + long timeout = PANIC_TIMEOUT*USEC_PER_SEC; + preempt_disable(); + local_irq_enable(); + while (timeout-- > 0) + udelay(1); + panic("Panicing machine check CPU died"); +} + static void mce_panic(char *msg, struct mce *final, char *exp) { int i; + /* + * Make sure only one CPU runs in machine check panic + */ + if (atomic_add_return(1, &mce_paniced) > 1) + wait_for_panic(); + barrier(); + bust_spinlocks(1); console_verbose(); /* First print corrected ones that are still unlogged */ -- cgit v1.2.3 From 3c0797925f4ef9d55a32059d2af61a9c262e639d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:55 +0200 Subject: x86, mce: switch x86 machine check handler to Monarch election. On Intel platforms machine check exceptions are always broadcast to all CPUs. This patch makes the machine check handler synchronize all these machine checks, elect a Monarch to handle the event and collect the worst event from all CPUs and then process it first. This has some advantages: - When there is a truly data corrupting error the system panics as quickly as possible. This improves containment of corrupted data and makes sure the corrupted data never hits stable storage. - The panics are synchronized and do not reenter the panic code on multiple CPUs (which currently does not handle this well). - All the errors are reported. Currently it often happens that another CPU happens to do the panic first, but reports useless information (empty machine check) because the real error happened on another CPU which came in later. This is a big advantage on Nehalem where the 8 threads per CPU lead to often the wrong CPU winning the race and dumping useless information on a machine check. The problem also occurs in a less severe form on older CPUs. - The system can detect when no CPUs detected a machine check and shut down the system. This can happen when one CPU is so badly hung that that it cannot process a machine check anymore or when some external agent wants to stop the system by asserting the machine check pin. This follows Intel hardware recommendations. - This matches the recommended error model by the CPU designers. - The events can be output in true severity order - When a panic happens on another CPU it makes sure to be actually be able to process the stop IPI by enabling interrupts. The code is extremly careful to handle timeouts while waiting for other CPUs. It can't rely on the normal timing mechanisms (jiffies, ktime_get) because of its asynchronous/lockless nature, so it uses own timeouts using ndelay() and a "SPINUNIT" The timeout is configurable. By default it waits for upto one second for the other CPUs. This can be also disabled. From some informal testing AMD systems do not see to broadcast machine checks, so right now it's always disabled by default on non Intel CPUs or also on very old Intel systems. Includes fixes from Ying Huang Fixed a "ecception" in a comment (H.Seto) Moved global_nwo reset later based on suggestion from H.Seto v2: Avoid duplicate messages [ Impact: feature, fixes long standing problems. ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 360 +++++++++++++++++++++++++++++++++++---- 1 file changed, 331 insertions(+), 29 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 421020f1d7d..ba431893e31 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -60,6 +62,8 @@ int mce_disabled; #define MISC_MCELOG_MINOR 227 +#define SPINUNIT 100 /* 100ns */ + atomic_t mce_entry; DEFINE_PER_CPU(unsigned, mce_exception_count); @@ -77,6 +81,7 @@ static u64 *bank; static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; +static int monarch_timeout = -1; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; @@ -84,6 +89,9 @@ static char *trigger_argv[2] = { trigger, NULL }; static unsigned long dont_init_banks; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +static DEFINE_PER_CPU(struct mce, mces_seen); +static int cpu_missing; + /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { @@ -241,6 +249,8 @@ static void mce_panic(char *msg, struct mce *final, char *exp) } if (final) print_mce(final); + if (cpu_missing) + printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); if (exp) printk(KERN_EMERG "Machine check: %s\n", exp); panic(msg); @@ -450,6 +460,264 @@ static int mce_no_way_out(struct mce *m, char **msg) return 0; } +/* + * Variable to establish order between CPUs while scanning. + * Each CPU spins initially until executing is equal its number. + */ +static atomic_t mce_executing; + +/* + * Defines order of CPUs on entry. First CPU becomes Monarch. + */ +static atomic_t mce_callin; + +/* + * Check if a timeout waiting for other CPUs happened. + */ +static int mce_timed_out(u64 *t) +{ + /* + * The others already did panic for some reason. + * Bail out like in a timeout. + * rmb() to tell the compiler that system_state + * might have been modified by someone else. + */ + rmb(); + if (atomic_read(&mce_paniced)) + wait_for_panic(); + if (!monarch_timeout) + goto out; + if ((s64)*t < SPINUNIT) { + /* CHECKME: Make panic default for 1 too? */ + if (tolerant < 1) + mce_panic("Timeout synchronizing machine check over CPUs", + NULL, NULL); + cpu_missing = 1; + return 1; + } + *t -= SPINUNIT; +out: + touch_nmi_watchdog(); + return 0; +} + +/* + * The Monarch's reign. The Monarch is the CPU who entered + * the machine check handler first. It waits for the others to + * raise the exception too and then grades them. When any + * error is fatal panic. Only then let the others continue. + * + * The other CPUs entering the MCE handler will be controlled by the + * Monarch. They are called Subjects. + * + * This way we prevent any potential data corruption in a unrecoverable case + * and also makes sure always all CPU's errors are examined. + * + * Also this detects the case of an machine check event coming from outer + * space (not detected by any CPUs) In this case some external agent wants + * us to shut down, so panic too. + * + * The other CPUs might still decide to panic if the handler happens + * in a unrecoverable place, but in this case the system is in a semi-stable + * state and won't corrupt anything by itself. It's ok to let the others + * continue for a bit first. + * + * All the spin loops have timeouts; when a timeout happens a CPU + * typically elects itself to be Monarch. + */ +static void mce_reign(void) +{ + int cpu; + struct mce *m = NULL; + int global_worst = 0; + char *msg = NULL; + char *nmsg = NULL; + + /* + * This CPU is the Monarch and the other CPUs have run + * through their handlers. + * Grade the severity of the errors of all the CPUs. + */ + for_each_possible_cpu(cpu) { + int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, + &nmsg); + if (severity > global_worst) { + msg = nmsg; + global_worst = severity; + m = &per_cpu(mces_seen, cpu); + } + } + + /* + * Cannot recover? Panic here then. + * This dumps all the mces in the log buffer and stops the + * other CPUs. + */ + if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) + mce_panic("Fatal machine check", m, msg); + + /* + * For UC somewhere we let the CPU who detects it handle it. + * Also must let continue the others, otherwise the handling + * CPU could deadlock on a lock. + */ + + /* + * No machine check event found. Must be some external + * source or one CPU is hung. Panic. + */ + if (!m && tolerant < 3) + mce_panic("Machine check from unknown source", NULL, NULL); + + /* + * Now clear all the mces_seen so that they don't reappear on + * the next mce. + */ + for_each_possible_cpu(cpu) + memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); +} + +static atomic_t global_nwo; + +/* + * Start of Monarch synchronization. This waits until all CPUs have + * entered the exception handler and then determines if any of them + * saw a fatal event that requires panic. Then it executes them + * in the entry order. + * TBD double check parallel CPU hotunplug + */ +static int mce_start(int no_way_out, int *order) +{ + int nwo; + int cpus = num_online_cpus(); + u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + + if (!timeout) { + *order = -1; + return no_way_out; + } + + atomic_add(no_way_out, &global_nwo); + + /* + * Wait for everyone. + */ + while (atomic_read(&mce_callin) != cpus) { + if (mce_timed_out(&timeout)) { + atomic_set(&global_nwo, 0); + *order = -1; + return no_way_out; + } + ndelay(SPINUNIT); + } + + /* + * Cache the global no_way_out state. + */ + nwo = atomic_read(&global_nwo); + + /* + * Monarch starts executing now, the others wait. + */ + if (*order == 1) { + atomic_set(&mce_executing, 1); + return nwo; + } + + /* + * Now start the scanning loop one by one + * in the original callin order. + * This way when there are any shared banks it will + * be only seen by one CPU before cleared, avoiding duplicates. + */ + while (atomic_read(&mce_executing) < *order) { + if (mce_timed_out(&timeout)) { + atomic_set(&global_nwo, 0); + *order = -1; + return no_way_out; + } + ndelay(SPINUNIT); + } + return nwo; +} + +/* + * Synchronize between CPUs after main scanning loop. + * This invokes the bulk of the Monarch processing. + */ +static int mce_end(int order) +{ + int ret = -1; + u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + + if (!timeout) + goto reset; + if (order < 0) + goto reset; + + /* + * Allow others to run. + */ + atomic_inc(&mce_executing); + + if (order == 1) { + /* CHECKME: Can this race with a parallel hotplug? */ + int cpus = num_online_cpus(); + + /* + * Monarch: Wait for everyone to go through their scanning + * loops. + */ + while (atomic_read(&mce_executing) <= cpus) { + if (mce_timed_out(&timeout)) + goto reset; + ndelay(SPINUNIT); + } + + mce_reign(); + barrier(); + ret = 0; + } else { + /* + * Subject: Wait for Monarch to finish. + */ + while (atomic_read(&mce_executing) != 0) { + if (mce_timed_out(&timeout)) + goto reset; + ndelay(SPINUNIT); + } + + /* + * Don't reset anything. That's done by the Monarch. + */ + return 0; + } + + /* + * Reset all global state. + */ +reset: + atomic_set(&global_nwo, 0); + atomic_set(&mce_callin, 0); + barrier(); + + /* + * Let others run again. + */ + atomic_set(&mce_executing, 0); + return ret; +} + +static void mce_clear_state(unsigned long *toclear) +{ + int i; + + for (i = 0; i < banks; i++) { + if (test_bit(i, toclear)) + mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -457,12 +725,23 @@ static int mce_no_way_out(struct mce *m, char **msg) * This is executed in NMI context not subject to normal locking rules. This * implies that most kernel services cannot be safely used. Don't even * think about putting a printk in there! + * + * On Intel systems this is entered on all CPUs in parallel through + * MCE broadcast. However some CPUs might be broken beyond repair, + * so be always careful when synchronizing with others. */ void do_machine_check(struct pt_regs *regs, long error_code) { - struct mce m, panicm; - int panicm_found = 0; + struct mce m, *final; int i; + int worst = 0; + int severity; + /* + * Establish sequential order between the CPUs entering the machine + * check handler. + */ + int order; + /* * If no_way_out gets set, there is no safe way to recover from this * MCE. If tolerant is cranked up, we'll try anyway. @@ -486,13 +765,23 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (!banks) goto out; + order = atomic_add_return(1, &mce_callin); mce_setup(&m); m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); no_way_out = mce_no_way_out(&m, &msg); + final = &__get_cpu_var(mces_seen); + *final = m; + barrier(); + /* + * Go through all the banks in exclusion of the other CPUs. + * This way we don't report duplicated events on shared banks + * because the first one to see it will clear it. + */ + no_way_out = mce_start(no_way_out, &order); for (i = 0; i < banks; i++) { __clear_bit(i, toclear); if (!bank[i]) @@ -544,32 +833,32 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_get_rip(&m, regs); mce_log(&m); - /* - * Did this bank cause the exception? - * - * Assume that the bank with uncorrectable errors did it, - * and that there is only a single one: - */ - if ((m.status & MCI_STATUS_UC) && - (m.status & MCI_STATUS_EN)) { - panicm = m; - panicm_found = 1; + severity = mce_severity(&m, tolerant, NULL); + if (severity > worst) { + *final = m; + worst = severity; } } + if (!no_way_out) + mce_clear_state(toclear); + /* - * If we didn't find an uncorrectable error, pick - * the last one (shouldn't happen, just being safe). + * Do most of the synchronization with other CPUs. + * When there's any problem use only local no_way_out state. */ - if (!panicm_found) - panicm = m; + if (mce_end(order) < 0) + no_way_out = worst >= MCE_PANIC_SEVERITY; /* * If we have decided that we just CAN'T continue, and the user * has not set tolerant to an insane level, give up and die. + * + * This is mainly used in the case when the system doesn't + * support MCE broadcasting or it has been disabled. */ if (no_way_out && tolerant < 3) - mce_panic("Machine check", &panicm, msg); + mce_panic("Machine check", final, msg); /* * If the error seems to be unrecoverable, something should be @@ -585,7 +874,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * instruction which caused the MCE. */ if (m.mcgstatus & MCG_STATUS_EIPV) - user_space = panicm.ip && (panicm.cs & 3); + user_space = final->ip && (final->cs & 3); /* * If we know that the error was in user space, send a @@ -597,20 +886,15 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (user_space) { force_sig(SIGBUS, current); } else if (panic_on_oops || tolerant < 2) { - mce_panic("Uncorrected machine check", &panicm, msg); + mce_panic("Uncorrected machine check", final, msg); } } /* notify userspace ASAP */ set_thread_flag(TIF_MCE_NOTIFY); - mce_report_event(regs); - - /* the last thing we do is clear state */ - for (i = 0; i < banks; i++) { - if (test_bit(i, toclear)) - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - } + if (worst > 0) + mce_report_event(regs); mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); out: atomic_dec(&mce_entry); @@ -821,7 +1105,17 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model < 0x1A) __set_bit(0, &dont_init_banks); + + /* + * All newer Intel systems support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && + monarch_timeout < 0) + monarch_timeout = USEC_PER_SEC; } + if (monarch_timeout < 0) + monarch_timeout = 0; } static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) @@ -1068,7 +1362,9 @@ static struct miscdevice mce_log_device = { /* * mce=off disables machine check - * mce=TOLERANCELEVEL (number, see above) + * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) + * monarchtimeout is how long to wait for other CPUs on machine + * check, or 0 to not wait * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. * mce=nobootlog Don't log MCEs from before booting. */ @@ -1082,9 +1378,13 @@ static int __init mcheck_enable(char *str) mce_disabled = 1; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) mce_bootlog = (str[0] == 'b'); - else if (isdigit(str[0])) + else if (isdigit(str[0])) { get_option(&str, &tolerant); - else { + if (*str == ',') { + ++str; + get_option(&str, &monarch_timeout); + } + } else { printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", str); return 0; @@ -1221,6 +1521,7 @@ static ssize_t store_int_with_restart(struct sys_device *s, static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); +static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); static struct sysdev_ext_attribute attr_check_interval = { _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, @@ -1230,6 +1531,7 @@ static struct sysdev_ext_attribute attr_check_interval = { static struct sysdev_attribute *mce_attrs[] = { &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, + &attr_monarch_timeout.attr, NULL }; -- cgit v1.2.3 From ac9603754dc7e286e62ae4f1067958d5b0075f99 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:58 +0200 Subject: x86, mce: make non Monarch panic message "Fatal machine check" too ... instead of "Machine check". This is for consistency with the Monarch panic message. Based on a report from Ying Huang. v2: But add a descriptive postfix so that the test suite can distingush. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ba431893e31..d5cb0b4c17f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -554,7 +554,7 @@ static void mce_reign(void) * other CPUs. */ if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) - mce_panic("Fatal machine check", m, msg); + mce_panic("Fatal Machine check", m, msg); /* * For UC somewhere we let the CPU who detects it handle it. @@ -858,7 +858,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * support MCE broadcasting or it has been disabled. */ if (no_way_out && tolerant < 3) - mce_panic("Machine check", final, msg); + mce_panic("Fatal machine check on current CPU", final, msg); /* * If the error seems to be unrecoverable, something should be -- cgit v1.2.3 From 1b2797dcc9f0ad89bc382ace26c6baafbc7e33c2 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 27 May 2009 21:56:51 +0200 Subject: x86, mce: improve mce_get_rip Assume IP on the stack is valid when either EIPV or RIPV are set. This influences whether the machine check exception handler decides to return or panic. This fixes a test case in the mce-test suite and is more compliant to the specification. This currently only makes a difference in a artificial testing scenario with the mce-test test suite. Also in addition do not force the EIPV to be valid with the exact register MSRs, and keep in trust the CS value on stack even if MSR is available. [AK: combination of patches from Huang Ying and Hidetoshi Seto, with new description by me] [add some description, no code changed - HS] Signed-off-by: Huang Ying Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d5cb0b4c17f..a7dc369a997 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -306,21 +306,22 @@ int mce_available(struct cpuinfo_x86 *c) return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } +/* + * Get the address of the instruction at the time of the machine check + * error. + */ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) { - if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { + + if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) { m->ip = regs->ip; m->cs = regs->cs; } else { m->ip = 0; m->cs = 0; } - if (rip_msr) { - /* Assume the RIP in the MSR is exact. Is this true? */ - m->mcgstatus |= MCG_STATUS_EIPV; + if (rip_msr) m->ip = mce_rdmsrl(rip_msr); - m->cs = 0; - } } #ifdef CONFIG_X86_LOCAL_APIC -- cgit v1.2.3 From 29b0f591d678838435fbb3e15ef20266f1a9e01d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:56 +0200 Subject: x86, mce: default to panic timeout for machine checks Fatal machine checks can be logged to disk after boot, but only if the system did a warm reboot. That's unfortunately difficult with the default panic behaviour, which waits forever and the admin has to press the power button because modern systems usually miss a reset button. This clears the machine checks in the registers and make it impossible to log them. This patch changes the default for machine check panic to always reboot after 30s. Then the mce can be successfully logged after reboot. I believe this will improve machine check experience for any system running the X server. This is dependent on successfull boot logging of MCEs. This currently only works on Intel systems, on AMD there are quite a lot of systems around which leave junk in the machine check registers after boot, so it's disabled here. These systems will continue to default to endless waiting panic. v2: Only force panic timeout when it's shorter (H.Seto) v3: Only force timeout when there is no timeout (based on comment H.Seto) [ Fix changelog - HS ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a7dc369a997..79d243145b8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -82,6 +82,7 @@ static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; static int monarch_timeout = -1; +static int mce_panic_timeout; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; @@ -216,6 +217,8 @@ static void wait_for_panic(void) local_irq_enable(); while (timeout-- > 0) udelay(1); + if (panic_timeout == 0) + panic_timeout = mce_panic_timeout; panic("Panicing machine check CPU died"); } @@ -253,6 +256,8 @@ static void mce_panic(char *msg, struct mce *final, char *exp) printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); if (exp) printk(KERN_EMERG "Machine check: %s\n", exp); + if (panic_timeout == 0) + panic_timeout = mce_panic_timeout; panic(msg); } @@ -1117,6 +1122,8 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) } if (monarch_timeout < 0) monarch_timeout = 0; + if (mce_bootlog != 0) + mce_panic_timeout = 30; } static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) -- cgit v1.2.3 From 86503560e48153aba539ff117450d31ab2ef76d7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:58 +0200 Subject: x86, mce: print header/footer only once for multiple MCEs When multiple MCEs are printed print the "HARDWARE ERROR" header and "This is not a software error" footer only once. This makes the output much more compact with many CPUs. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 79d243145b8..ff9c732989d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -176,11 +176,13 @@ void mce_log(struct mce *mce) set_bit(0, ¬ify_user); } -static void print_mce(struct mce *m) +static void print_mce(struct mce *m, int *first) { - printk(KERN_EMERG "\n" - KERN_EMERG "HARDWARE ERROR\n" - KERN_EMERG + if (*first) { + printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); + *first = 0; + } + printk(KERN_EMERG "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); if (m->ip) { @@ -200,9 +202,12 @@ static void print_mce(struct mce *m) printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); - printk(KERN_EMERG "This is not a software problem!\n"); - printk(KERN_EMERG "Run through mcelog --ascii to decode " - "and contact your hardware vendor\n"); +} + +static void print_mce_tail(void) +{ + printk(KERN_EMERG "This is not a software problem!\n" + KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ @@ -225,6 +230,7 @@ static void wait_for_panic(void) static void mce_panic(char *msg, struct mce *final, char *exp) { int i; + int first = 1; /* * Make sure only one CPU runs in machine check panic @@ -240,7 +246,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) struct mce *m = &mcelog.entry[i]; if ((m->status & MCI_STATUS_VAL) && !(m->status & MCI_STATUS_UC)) - print_mce(m); + print_mce(m, &first); } /* Now print uncorrected but with the final one last */ for (i = 0; i < MCE_LOG_LEN; i++) { @@ -248,12 +254,13 @@ static void mce_panic(char *msg, struct mce *final, char *exp) if (!(m->status & MCI_STATUS_VAL)) continue; if (!final || memcmp(m, final, sizeof(struct mce))) - print_mce(m); + print_mce(m, &first); } if (final) - print_mce(final); + print_mce(final, &first); if (cpu_missing) printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); + print_mce_tail(); if (exp) printk(KERN_EMERG "Machine check: %s\n", exp); if (panic_timeout == 0) -- cgit v1.2.3 From ed7290d0ee8f81aa78bfe816f01b012f208cafc5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:57 +0200 Subject: x86, mce: implement new status bits The x86 architecture recently added some new machine check status bits: S(ignalled) and AR (Action-Required). Signalled allows to check if a specific event caused an exception or was just logged through CMCI. AR allows the kernel to decide if an event needs immediate action or can be delayed or ignored. Implement support for these new status bits. mce_severity() uses the new bits to grade the machine check correctly and decide what to do. The exception handler uses AR to decide to kill or not. The S bit is used to separate events between the poll/CMCI handler and the exception handler. Classical UC always leads to panic. That was true before anyways because the existing CPUs always passed a PCC with it. Also corrects the rules whether to kill in user or kernel context and how to handle missing RIPV. The machine check handler largely uses the mce-severity grading engine now instead of making its own decisions. This means the logic is centralized in one place. This is useful because it has to be evaluated multiple times. v2: Some rule fixes; Add AO events Fix RIPV, RIPV|EIPV order (Ying Huang) Fix UCNA with AR=1 message (Ying Huang) Add comment about panicing in m_c_p. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-internal.h | 5 ++ arch/x86/kernel/cpu/mcheck/mce-severity.c | 82 ++++++++++++++++++++++++++++-- arch/x86/kernel/cpu/mcheck/mce.c | 84 ++++++++++++++++--------------- 3 files changed, 127 insertions(+), 44 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index f126b4ae7a2..54dcb8ff12e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -2,9 +2,14 @@ enum severity_level { MCE_NO_SEVERITY, + MCE_KEEP_SEVERITY, MCE_SOME_SEVERITY, + MCE_AO_SEVERITY, MCE_UC_SEVERITY, + MCE_AR_SEVERITY, MCE_PANIC_SEVERITY, }; int mce_severity(struct mce *a, int tolerant, char **msg); + +extern int mce_ser; diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index c189e89a89a..4f4d2caf404 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -19,43 +19,117 @@ * first. Since there are quite a lot of combinations test the bits in a * table-driven way. The rules are simply processed in order, first * match wins. + * + * Note this is only used for machine check exceptions, the corrected + * errors use much simpler rules. The exceptions still check for the corrected + * errors, but only to leave them alone for the CMCI handler (except for + * panic situations) */ +enum context { IN_KERNEL = 1, IN_USER = 2 }; +enum ser { SER_REQUIRED = 1, NO_SER = 2 }; + static struct severity { u64 mask; u64 result; unsigned char sev; unsigned char mcgmask; unsigned char mcgres; + unsigned char ser; + unsigned char context; char *msg; } severities[] = { +#define KERNEL .context = IN_KERNEL +#define USER .context = IN_USER +#define SER .ser = SER_REQUIRED +#define NOSER .ser = NO_SER #define SEV(s) .sev = MCE_ ## s ## _SEVERITY #define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } #define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } #define MCGMASK(x, res, s, m, r...) \ { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } +#define MASK(x, y, s, m, r...) \ + { .mask = x, .result = y, SEV(s), .msg = m, ## r } +#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) +#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) +#define MCACOD 0xffff + BITCLR(MCI_STATUS_VAL, NO, "Invalid"), BITCLR(MCI_STATUS_EN, NO, "Not enabled"), BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), - MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "No restart IP"), + /* When MCIP is not set something is very confused */ + MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"), + /* Neither return not error IP -- no chance to recover -> PANIC */ + MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC, + "Neither restart nor error IP"), + MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP", + KERNEL), + BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER), + MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME, + "Spurious not enabled", SER), + + /* ignore OVER for UCNA */ + MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP, + "Uncorrected no action required", SER), + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC, + "Illegal combination (UCNA with AR=1)", SER), + MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER), + + /* AR add known MCACODs here */ + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC, + "Action required with lost events", SER), + MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC, + "Action required; unknown MCACOD", SER), + + /* known AO MCACODs: */ + MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO, + "Action optional: memory scrubbing error", SER), + MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO, + "Action optional: last level cache writeback error", SER), + + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME, + "Action optional unknown MCACOD", SER), + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME, + "Action optional with lost events", SER), BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), BITSET(MCI_STATUS_UC, UC, "Uncorrected"), BITSET(0, SOME, "No match") /* always matches. keep at end */ }; +/* + * If the EIPV bit is set, it means the saved IP is the + * instruction which caused the MCE. + */ +static int error_context(struct mce *m) +{ + if (m->mcgstatus & MCG_STATUS_EIPV) + return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL; + /* Unknown, assume kernel */ + return IN_KERNEL; +} + int mce_severity(struct mce *a, int tolerant, char **msg) { + enum context ctx = error_context(a); struct severity *s; + for (s = severities;; s++) { if ((a->status & s->mask) != s->result) continue; if ((a->mcgstatus & s->mcgmask) != s->mcgres) continue; - if (s->sev > MCE_NO_SEVERITY && (a->status & MCI_STATUS_UC) && - tolerant < 1) - return MCE_PANIC_SEVERITY; + if (s->ser == SER_REQUIRED && !mce_ser) + continue; + if (s->ser == NO_SER && mce_ser) + continue; + if (s->context && ctx != s->context) + continue; if (msg) *msg = s->msg; + if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { + if (panic_on_oops || tolerant < 1) + return MCE_PANIC_SEVERITY; + } return s->sev; } } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ff9c732989d..f051a7807ab 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -83,6 +83,7 @@ static int rip_msr; static int mce_bootlog = -1; static int monarch_timeout = -1; static int mce_panic_timeout; +int mce_ser; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; @@ -391,6 +392,15 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); * Those are just logged through /dev/mcelog. * * This is executed in standard interrupt context. + * + * Note: spec recommends to panic for fatal unsignalled + * errors here. However this would be quite problematic -- + * we would need to reimplement the Monarch handling and + * it would mess up the exclusion between exception handler + * and poll hander -- * so we skip this for now. + * These cases should not happen anyways, or only when the CPU + * is already totally * confused. In this case it's likely it will + * not fully execute the machine check handler either. */ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { @@ -417,13 +427,13 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; /* - * Uncorrected events are handled by the exception handler - * when it is enabled. But when the exception is disabled log - * everything. + * Uncorrected or signalled events are handled by the exception + * handler when it is enabled, so don't process those here. * * TBD do the same check for MCI_STATUS_EN here? */ - if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) + if (!(flags & MCP_UC) && + (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) continue; if (m.status & MCI_STATUS_MISCV) @@ -789,6 +799,12 @@ void do_machine_check(struct pt_regs *regs, long error_code) barrier(); + /* + * When no restart IP must always kill or panic. + */ + if (!(m.mcgstatus & MCG_STATUS_RIPV)) + kill_it = 1; + /* * Go through all the banks in exclusion of the other CPUs. * This way we don't report duplicated events on shared banks @@ -809,10 +825,11 @@ void do_machine_check(struct pt_regs *regs, long error_code) continue; /* - * Non uncorrected errors are handled by machine_check_poll - * Leave them alone, unless this panics. + * Non uncorrected or non signaled errors are handled by + * machine_check_poll. Leave them alone, unless this panics. */ - if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out) + if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && + !no_way_out) continue; /* @@ -820,17 +837,16 @@ void do_machine_check(struct pt_regs *regs, long error_code) */ add_taint(TAINT_MACHINE_CHECK); - __set_bit(i, toclear); + severity = mce_severity(&m, tolerant, NULL); - if (m.status & MCI_STATUS_EN) { - /* - * If this error was uncorrectable and there was - * an overflow, we're in trouble. If no overflow, - * we might get away with just killing a task. - */ - if (m.status & MCI_STATUS_UC) - kill_it = 1; - } else { + /* + * When machine check was for corrected handler don't touch, + * unless we're panicing. + */ + if (severity == MCE_KEEP_SEVERITY && !no_way_out) + continue; + __set_bit(i, toclear); + if (severity == MCE_NO_SEVERITY) { /* * Machine check event was not enabled. Clear, but * ignore. @@ -838,6 +854,12 @@ void do_machine_check(struct pt_regs *regs, long error_code) continue; } + /* + * Kill on action required. + */ + if (severity == MCE_AR_SEVERITY) + kill_it = 1; + if (m.status & MCI_STATUS_MISCV) m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); if (m.status & MCI_STATUS_ADDRV) @@ -846,7 +868,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_get_rip(&m, regs); mce_log(&m); - severity = mce_severity(&m, tolerant, NULL); if (severity > worst) { *final = m; worst = severity; @@ -879,29 +900,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) * one task, do that. If the user has set the tolerance very * high, don't try to do anything at all. */ - if (kill_it && tolerant < 3) { - int user_space = 0; - - /* - * If the EIPV bit is set, it means the saved IP is the - * instruction which caused the MCE. - */ - if (m.mcgstatus & MCG_STATUS_EIPV) - user_space = final->ip && (final->cs & 3); - /* - * If we know that the error was in user space, send a - * SIGBUS. Otherwise, panic if tolerance is low. - * - * force_sig() takes an awful lot of locks and has a slight - * risk of deadlocking. - */ - if (user_space) { - force_sig(SIGBUS, current); - } else if (panic_on_oops || tolerant < 2) { - mce_panic("Uncorrected machine check", final, msg); - } - } + if (kill_it && tolerant < 3) + force_sig(SIGBUS, current); /* notify userspace ASAP */ set_thread_flag(TIF_MCE_NOTIFY); @@ -1049,6 +1050,9 @@ static int mce_cap_init(void) if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) rip_msr = MSR_IA32_MCG_EIP; + if (cap & MCG_SER_P) + mce_ser = 1; + return 0; } -- cgit v1.2.3 From 4611a6fa4b37cf6b8b6066ed0d605c994c62a1a0 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 27 May 2009 21:56:57 +0200 Subject: x86, mce: export MCE severities coverage via debugfs The MCE severity judgement code is data-driven, so code coverage tools such as gcov can not be used for measuring coverage. Instead a dedicated coverage mechanism is implemented. The kernel keeps track of rules executed and reports them in debugfs. This is useful for increasing coverage of the mce-test testsuite. Right now it's unconditionally enabled because it's very little code. Signed-off-by: Huang Ying Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 83 +++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 4f4d2caf404..ff0807f9705 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -10,6 +10,9 @@ * Author: Andi Kleen */ #include +#include +#include +#include #include #include "mce-internal.h" @@ -37,6 +40,7 @@ static struct severity { unsigned char mcgres; unsigned char ser; unsigned char context; + unsigned char covered; char *msg; } severities[] = { #define KERNEL .context = IN_KERNEL @@ -126,6 +130,7 @@ int mce_severity(struct mce *a, int tolerant, char **msg) continue; if (msg) *msg = s->msg; + s->covered = 1; if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { if (panic_on_oops || tolerant < 1) return MCE_PANIC_SEVERITY; @@ -133,3 +138,81 @@ int mce_severity(struct mce *a, int tolerant, char **msg) return s->sev; } } + +static void *s_start(struct seq_file *f, loff_t *pos) +{ + if (*pos >= ARRAY_SIZE(severities)) + return NULL; + return &severities[*pos]; +} + +static void *s_next(struct seq_file *f, void *data, loff_t *pos) +{ + if (++(*pos) >= ARRAY_SIZE(severities)) + return NULL; + return &severities[*pos]; +} + +static void s_stop(struct seq_file *f, void *data) +{ +} + +static int s_show(struct seq_file *f, void *data) +{ + struct severity *ser = data; + seq_printf(f, "%d\t%s\n", ser->covered, ser->msg); + return 0; +} + +static const struct seq_operations severities_seq_ops = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static int severities_coverage_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &severities_seq_ops); +} + +static ssize_t severities_coverage_write(struct file *file, + const char __user *ubuf, + size_t count, loff_t *ppos) +{ + int i; + for (i = 0; i < ARRAY_SIZE(severities); i++) + severities[i].covered = 0; + return count; +} + +static const struct file_operations severities_coverage_fops = { + .open = severities_coverage_open, + .release = seq_release, + .read = seq_read, + .write = severities_coverage_write, +}; + +static int __init severities_debugfs_init(void) +{ + struct dentry *dmce = NULL, *fseverities_coverage = NULL; + + dmce = debugfs_create_dir("mce", NULL); + if (dmce == NULL) + goto err_out; + fseverities_coverage = debugfs_create_file("severities-coverage", + 0444, dmce, NULL, + &severities_coverage_fops); + if (fseverities_coverage == NULL) + goto err_out; + + return 0; + +err_out: + if (fseverities_coverage) + debugfs_remove(fseverities_coverage); + if (dmce) + debugfs_remove(dmce); + return -ENOMEM; +} +late_initcall(severities_debugfs_init); -- cgit v1.2.3 From 4ef702c10b5df18ab04921fc252c26421d4d6c75 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:52 +0200 Subject: x86: fix panic with interrupts off (needed for MCE) For some time each panic() called with interrupts disabled triggered the !irqs_disabled() WARN_ON in smp_call_function(), producing ugly backtraces and confusing users. This is a common situation with machine checks for example which tend to call panic with interrupts disabled, but will also hit in other situations e.g. panic during early boot. In fact it means that panic cannot be called in many circumstances, which would be bad. This all started with the new fancy queued smp_call_function, which is then used by the shutdown path to shut down the other CPUs. On closer examination it turned out that the fancy RCU smp_call_function() does lots of things not suitable in a panic situation anyways, like allocating memory and relying on complex system state. I originally tried to patch this over by checking for panic there, but it was quite complicated and the original patch was also not very popular. This also didn't fix some of the underlying complexity problems. The new code in post 2.6.29 tries to patch around this by checking for oops_in_progress, but that is not enough to make this fully safe and I don't think that's a real solution because panic has to be reliable. So instead use an own vector to reboot. This makes the reboot code extremly straight forward, which is definitely a big plus in a panic situation where it is important to avoid relying on too much kernel state. The new simple code is also safe to be called from interupts off region because it is very very simple. There can be situations where it is important that panic is reliable. For example on a fatal machine check the panic is needed to get the system up again and running as quickly as possible. So it's important that panic is reliable and all function it calls simple. This is why I came up with this simple vector scheme. It's very hard to beat in simplicity. Vectors are not particularly precious anymore since all big systems are using per CPU vectors. Another possibility would have been to use an NMI similar to kdump, but there is still the problem that NMIs don't work reliably on some systems due to BIOS issues. NMIs would have been able to stop CPUs running with interrupts off too. In the sake of universal reliability I opted for using a non NMI vector for now. I put the reboot vector into the highest priority bucket of the APIC vectors and moved the 64bit UV_BAU message down instead into the next lower priority. [ Impact: bug fix, fixes an old regression ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_64.S | 2 ++ arch/x86/kernel/irqinit.c | 3 +++ arch/x86/kernel/smp.c | 28 +++++++++++++++++++++++++++- 3 files changed, 32 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 711c130a841..4234b123565 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -976,6 +976,8 @@ END(\sym) #ifdef CONFIG_SMP apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +apicinterrupt REBOOT_VECTOR \ + reboot_interrupt smp_reboot_interrupt #endif #ifdef CONFIG_X86_UV diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 441f6ec6e9d..4a69ec55be3 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -173,6 +173,9 @@ static void __init smp_intr_init(void) /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); + + /* IPI used for rebooting/stopping */ + alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt); #endif #endif /* CONFIG_SMP */ } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index f6db48c405b..bf1831aa14f 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -150,14 +150,40 @@ void native_send_call_func_ipi(const struct cpumask *mask) * this function calls the 'stop' function on all other CPUs in the system. */ +asmlinkage void smp_reboot_interrupt(void) +{ + ack_APIC_irq(); + irq_enter(); + stop_this_cpu(NULL); + irq_exit(); +} + static void native_smp_send_stop(void) { unsigned long flags; + unsigned long wait; if (reboot_force) return; - smp_call_function(stop_this_cpu, NULL, 0); + /* + * Use an own vector here because smp_call_function + * does lots of things not suitable in a panic situation. + * On most systems we could also use an NMI here, + * but there are a few systems around where NMI + * is problematic so stay with an non NMI for now + * (this implies we cannot stop CPUs spinning with irq off + * currently) + */ + if (num_online_cpus() > 1) { + apic->send_IPI_allbutself(REBOOT_VECTOR); + + /* Don't wait longer than a second */ + wait = USEC_PER_SEC; + while (num_online_cpus() > 1 && wait--) + udelay(1); + } + local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); -- cgit v1.2.3 From 9ff36ee9668ff41ec3274597c730524645929b0f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:58 +0200 Subject: x86, mce: rename mce_notify_user to mce_notify_irq Rename the mce_notify_user function to mce_notify_irq. The next patch will split the wakeup handling of interrupt context and of process context and it's better to give it a clearer name for this. Contains a fix from Ying Huang [ Impact: cleanup ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Cc: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-inject.c | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 10 +++++----- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +- arch/x86/kernel/signal.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 7d858fb4ce6..a3a235a53f0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -65,7 +65,7 @@ static void raise_mce(unsigned long data) memset(&b, 0xff, sizeof(mce_banks_t)); printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); machine_check_poll(0, &b); - mce_notify_user(); + mce_notify_irq(); printk(KERN_INFO "Finished machine check poll on CPU %d\n", cpu); } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index f051a7807ab..13e1b7ffe73 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -348,7 +348,7 @@ asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) ack_APIC_irq(); exit_idle(); irq_enter(); - mce_notify_user(); + mce_notify_irq(); irq_exit(); } #endif @@ -356,7 +356,7 @@ asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) static void mce_report_event(struct pt_regs *regs) { if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { - mce_notify_user(); + mce_notify_irq(); return; } @@ -968,7 +968,7 @@ static void mcheck_timer(unsigned long data) * polling interval, otherwise increase the polling interval. */ n = &__get_cpu_var(next_interval); - if (mce_notify_user()) + if (mce_notify_irq()) *n = max(*n/2, HZ/100); else *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); @@ -989,7 +989,7 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger); * Can be called from interrupt context, but not from machine check/NMI * context. */ -int mce_notify_user(void) +int mce_notify_irq(void) { /* Not more than two messages every minute */ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); @@ -1014,7 +1014,7 @@ int mce_notify_user(void) } return 0; } -EXPORT_SYMBOL_GPL(mce_notify_user); +EXPORT_SYMBOL_GPL(mce_notify_irq); /* * Initialize Machine Checks for a CPU. diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index eff3740501a..b7c5a2470b4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -80,7 +80,7 @@ static int cmci_supported(int *banks) static void intel_threshold_interrupt(void) { machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); - mce_notify_user(); + mce_notify_irq(); } static void print_update(char *type, int *hdr, int num) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index d0851e3f77e..d5dc15bce00 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -860,7 +860,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) #ifdef CONFIG_X86_NEW_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) - mce_notify_user(); + mce_notify_irq(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ /* deal with pending signal delivery */ -- cgit v1.2.3 From 9b1beaf2b551a8a1604f104025b24e9c535c8963 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:59 +0200 Subject: x86, mce: support action-optional machine checks Newer Intel CPUs support a new class of machine checks called recoverable action optional. Action Optional means that the CPU detected some form of corruption in the background and tells the OS about using a machine check exception. The OS can then take appropiate action, like killing the process with the corrupted data or logging the event properly to disk. This is done by the new generic high level memory failure handler added in a earlier patch. The high level handler takes the address with the failed memory and does the appropiate action, like killing the process. In this version of the patch the high level handler is stubbed out with a weak function to not create a direct dependency on the hwpoison branch. The high level handler cannot be directly called from the machine check exception though, because it has to run in a defined process context to be able to sleep when taking VM locks (it is not expected to sleep for a long time, just do so in some exceptional cases like lock contention) Thus the MCE handler has to queue a work item for process context, trigger process context and then call the high level handler from there. This patch adds two path to process context: through a per thread kernel exit notify_user() callback or through a high priority work item. The first runs when the process exits back to user space, the other when it goes to sleep and there is no higher priority process. The machine check handler will schedule both, and whoever runs first will grab the event. This is done because quick reaction to this event is critical to avoid a potential more fatal machine check when the corruption is consumed. There is a simple lock less ring buffer to queue the corrupted addresses between the exception handler and the process context handler. Then in process context it just calls the high level VM code with the corrupted PFNs. The code adds the required code to extract the failed address from the CPU's machine check registers. It doesn't try to handle all possible cases -- the specification has 6 different ways to specify memory address -- but only the linear address. Most of the required checking has been already done earlier in the mce_severity rule checking engine. Following the Intel recommendations Action Optional errors are only enabled for known situations (encoded in MCACODs). The errors are ignored otherwise, because they are action optional. v2: Improve comment, disable preemption while processing ring buffer (reported by Ying Huang) Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 133 +++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/signal.c | 2 +- 2 files changed, 134 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 13e1b7ffe73..d4e7b5947a0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -105,6 +106,8 @@ static inline int skip_bank_init(int i) return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); } +static DEFINE_PER_CPU(struct work_struct, mce_work); + /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { @@ -312,6 +315,61 @@ static void mce_wrmsrl(u32 msr, u64 v) wrmsrl(msr, v); } +/* + * Simple lockless ring to communicate PFNs from the exception handler with the + * process context work function. This is vastly simplified because there's + * only a single reader and a single writer. + */ +#define MCE_RING_SIZE 16 /* we use one entry less */ + +struct mce_ring { + unsigned short start; + unsigned short end; + unsigned long ring[MCE_RING_SIZE]; +}; +static DEFINE_PER_CPU(struct mce_ring, mce_ring); + +/* Runs with CPU affinity in workqueue */ +static int mce_ring_empty(void) +{ + struct mce_ring *r = &__get_cpu_var(mce_ring); + + return r->start == r->end; +} + +static int mce_ring_get(unsigned long *pfn) +{ + struct mce_ring *r; + int ret = 0; + + *pfn = 0; + get_cpu(); + r = &__get_cpu_var(mce_ring); + if (r->start == r->end) + goto out; + *pfn = r->ring[r->start]; + r->start = (r->start + 1) % MCE_RING_SIZE; + ret = 1; +out: + put_cpu(); + return ret; +} + +/* Always runs in MCE context with preempt off */ +static int mce_ring_add(unsigned long pfn) +{ + struct mce_ring *r = &__get_cpu_var(mce_ring); + unsigned next; + + next = (r->end + 1) % MCE_RING_SIZE; + if (next == r->start) + return -1; + r->ring[r->end] = pfn; + wmb(); + r->end = next; + return 0; +} + int mce_available(struct cpuinfo_x86 *c) { if (mce_disabled) @@ -319,6 +377,15 @@ int mce_available(struct cpuinfo_x86 *c) return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } +static void mce_schedule_work(void) +{ + if (!mce_ring_empty()) { + struct work_struct *work = &__get_cpu_var(mce_work); + if (!work_pending(work)) + schedule_work(work); + } +} + /* * Get the address of the instruction at the time of the machine check * error. @@ -349,6 +416,7 @@ asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) exit_idle(); irq_enter(); mce_notify_irq(); + mce_schedule_work(); irq_exit(); } #endif @@ -357,6 +425,13 @@ static void mce_report_event(struct pt_regs *regs) { if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { mce_notify_irq(); + /* + * Triggering the work queue here is just an insurance + * policy in case the syscall exit notify handler + * doesn't run soon enough or ends up running on the + * wrong CPU (can happen when audit sleeps) + */ + mce_schedule_work(); return; } @@ -731,6 +806,23 @@ reset: return ret; } +/* + * Check if the address reported by the CPU is in a format we can parse. + * It would be possible to add code for most other cases, but all would + * be somewhat complicated (e.g. segment offset would require an instruction + * parser). So only support physical addresses upto page granuality for now. + */ +static int mce_usable_address(struct mce *m) +{ + if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) + return 0; + if ((m->misc & 0x3f) > PAGE_SHIFT) + return 0; + if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) + return 0; + return 1; +} + static void mce_clear_state(unsigned long *toclear) { int i; @@ -865,6 +957,16 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (m.status & MCI_STATUS_ADDRV) m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + /* + * Action optional error. Queue address for later processing. + * When the ring overflows we just ignore the AO error. + * RED-PEN add some logging mechanism when + * usable_address or mce_add_ring fails. + * RED-PEN don't ignore overflow for tolerant == 0 + */ + if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) + mce_ring_add(m.addr >> PAGE_SHIFT); + mce_get_rip(&m, regs); mce_log(&m); @@ -916,6 +1018,36 @@ out: } EXPORT_SYMBOL_GPL(do_machine_check); +/* dummy to break dependency. actual code is in mm/memory-failure.c */ +void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) +{ + printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); +} + +/* + * Called after mce notification in process context. This code + * is allowed to sleep. Call the high level VM handler to process + * any corrupted pages. + * Assume that the work queue code only calls this one at a time + * per CPU. + * Note we don't disable preemption, so this code might run on the wrong + * CPU. In this case the event is picked up by the scheduled work queue. + * This is merely a fast path to expedite processing in some common + * cases. + */ +void mce_notify_process(void) +{ + unsigned long pfn; + mce_notify_irq(); + while (mce_ring_get(&pfn)) + memory_failure(pfn, MCE_VECTOR); +} + +static void mce_process_work(struct work_struct *dummy) +{ + mce_notify_process(); +} + #ifdef CONFIG_X86_MCE_INTEL /*** * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog @@ -1204,6 +1336,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) mce_init(); mce_cpu_features(c); mce_init_timer(); + INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); } /* diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index d5dc15bce00..4976888094f 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -860,7 +860,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) #ifdef CONFIG_X86_NEW_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) - mce_notify_irq(); + mce_notify_process(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ /* deal with pending signal delivery */ -- cgit v1.2.3 From 8051dbd2dfd1427cc102888d7d96bf39de0be150 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 2 Jun 2009 16:53:23 +0900 Subject: x86, mce: fix for mce counters Make the MCE counters work on 32bit and add poll count in arch_irq_stat_cpu. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/irq.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index eff46b5de62..9773395aa75 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -95,7 +95,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, " Threshold APIC interrupts\n"); # endif #endif -#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) +#ifdef CONFIG_X86_NEW_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); @@ -172,9 +172,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu) { u64 sum = irq_stats(cpu)->__nmi_count; -#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) - sum += per_cpu(mce_exception_count, cpu); -#endif #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; @@ -191,6 +188,10 @@ u64 arch_irq_stat_cpu(unsigned int cpu) # ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; # endif +#endif +#ifdef CONFIG_X86_NEW_MCE + sum += per_cpu(mce_exception_count, cpu); + sum += per_cpu(mce_poll_count, cpu); #endif return sum; } -- cgit v1.2.3 From 2c701b10283b58937201004276319ef9d9051b5d Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Fri, 5 Jun 2009 12:37:07 -0400 Subject: [CPUFREQ] powernow-k8: check space_id of _PCT registers to be FFH The powernow-k8 driver checks to see that the Performance Control/Status Registers are declared as FFH (functional fixed hardware) by the BIOS. However, this check got broken in the commit: 0e64a0c982c06a6b8f5e2a7f29eb108fdf257b2f [CPUFREQ] checkpatch cleanups for powernow-k8 Fix based on an original patch from Naga Chumbalkar. Signed-off-by: Naga Chumbalkar Cc: Mark Langsdorf Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index f6b32d11235..35dc8fbe92b 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -835,7 +835,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { struct cpufreq_frequency_table *powernow_table; int ret_val = -ENODEV; - acpi_integer space_id; + acpi_integer control, status; if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { dprintk("register performance failed: bad ACPI data\n"); @@ -848,12 +848,13 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) goto err_out; } - space_id = data->acpi_data.control_register.space_id; - if ((space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || - (space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { + control = data->acpi_data.control_register.space_id; + status = data->acpi_data.status_register.space_id; + + if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) || + (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) { dprintk("Invalid control/status registers (%x - %x)\n", - data->acpi_data.control_register.space_id, - space_id); + control, status); goto err_out; } -- cgit v1.2.3 From fe2245c905631a3a353504fc04388ce3dfaf9d9e Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Sun, 5 Jul 2009 15:50:52 -0500 Subject: x86: enable GART-IOMMU only after setting up protection methods The current code to set up the GART as an IOMMU enables GART translations before it removes the aperture from the kernel memory map, sets the GART PTEs to UC, sets up the guard and scratch pages, or does a wbinvd(). This leaves the possibility of cache aliasing open and can cause system crashes. Re-order the code so as to enable the GART translations only after all safeguards are in place and the tlb has been flushed. AMD has tested this patch on both Istanbul systems and 1st generation Opteron systems with APG enabled and seen no adverse effects. Istanbul systems with HT Assist enabled sometimes see MCE errors due to cache artifacts with the unmodified code. Signed-off-by: Mark Langsdorf Cc: Cc: Joerg Roedel Cc: akpm@linux-foundation.org Cc: jbarnes@virtuousgeek.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 1e8920d98f7..cfd9f906389 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -658,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info) agp_gatt_table = gatt; - enable_gart_translations(); - error = sysdev_class_register(&gart_sysdev_class); if (!error) error = sysdev_register(&device_gart); @@ -816,6 +814,14 @@ void __init gart_iommu_init(void) * the pages as Not-Present: */ wbinvd(); + + /* + * Now all caches are flushed and we can safely enable + * GART hardware. Doing it early leaves the possibility + * of stale cache entries that can lead to GART PTE + * errors. + */ + enable_gart_translations(); /* * Try to workaround a bug (thanks to BenH): -- cgit v1.2.3 From a21ca2cac582886a3e95c8bb84ff7c52d4d15e54 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 6 Jun 2009 09:58:57 +0200 Subject: perf_counter: Separate out attr->type from attr->config Counter type is a frequently used value and we do a lot of bit juggling by encoding and decoding it from attr->config. Clean this up by creating a separate attr->type field. Also clean up the various similarly complex user-space bits all around counter attribute management. The net improvement is significant, and it will be easier to add a new major type (which is what triggered this cleanup). (This changes the ABI, all tools are adapted.) (PowerPC build-tested.) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8f53f3a7da2..430e048f285 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -292,15 +292,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (perf_event_raw(attr)) { - hwc->config |= x86_pmu.raw_event(perf_event_config(attr)); + if (attr->type == PERF_TYPE_RAW) { + hwc->config |= x86_pmu.raw_event(attr->config); } else { - if (perf_event_id(attr) >= x86_pmu.max_events) + if (attr->config >= x86_pmu.max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= x86_pmu.event_map(perf_event_id(attr)); + hwc->config |= x86_pmu.event_map(attr->config); } counter->destroy = hw_perf_counter_destroy; -- cgit v1.2.3 From 8326f44da090d6d304d29b9fdc7fb3e20889e329 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 5 Jun 2009 20:22:46 +0200 Subject: perf_counter: Implement generalized cache event types Extend generic event enumeration with the PERF_TYPE_HW_CACHE method. This is a 3-dimensional space: { L1-D, L1-I, L2, ITLB, DTLB, BPU } x { load, store, prefetch } x { accesses, misses } User-space passes in the 3 coordinates and the kernel provides a counter. (if the hardware supports that type and if the combination makes sense.) Combinations that make no sense produce a -EINVAL. Combinations that are not supported by the hardware produce -ENOTSUP. Extend the tools to deal with this, and rewrite the event symbol parsing code with various popular aliases for the units and access methods above. So 'l1-cache-miss' and 'l1d-read-ops' are both valid aliases. ( x86 is supported for now, with the Nehalem event table filled in, and with Core2 and Atom having placeholder tables. ) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 201 +++++++++++++++++++++++++++++++++++-- 1 file changed, 193 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 430e048f285..e86679fa521 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -83,6 +83,128 @@ static u64 intel_pmu_event_map(int event) return intel_perfmon_event_map[event]; } +/* + * Generalized hw caching related event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'event makes no sense on + * this CPU', any other value means the raw event + * ID. + */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +static u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +static const u64 nehalem_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0480, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(L2 ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES */ + [ C(RESULT_MISS) ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static const u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + /* To be filled in */ +}; + +static const u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + /* To be filled in */ +}; + static u64 intel_pmu_raw_event(u64 event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL @@ -246,6 +368,39 @@ static inline int x86_pmu_initialized(void) return x86_pmu.handle_irq != NULL; } +static inline int +set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) +{ + unsigned int cache_type, cache_op, cache_result; + u64 config, val; + + config = attr->config; + + cache_type = (config >> 0) & 0xff; + if (cache_type >= PERF_COUNT_HW_CACHE_MAX) + return -EINVAL; + + cache_op = (config >> 8) & 0xff; + if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) + return -EINVAL; + + cache_result = (config >> 16) & 0xff; + if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + val = hw_cache_event_ids[cache_type][cache_op][cache_result]; + + if (val == 0) + return -ENOENT; + + if (val == -1) + return -EINVAL; + + hwc->config |= val; + + return 0; +} + /* * Setup the hardware configuration for a given attr_type */ @@ -288,22 +443,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->sample_period = x86_pmu.max_period; atomic64_set(&hwc->period_left, hwc->sample_period); + counter->destroy = hw_perf_counter_destroy; /* * Raw event type provide the config in the event structure */ if (attr->type == PERF_TYPE_RAW) { hwc->config |= x86_pmu.raw_event(attr->config); - } else { - if (attr->config >= x86_pmu.max_events) - return -EINVAL; - /* - * The generic map: - */ - hwc->config |= x86_pmu.event_map(attr->config); + return 0; } - counter->destroy = hw_perf_counter_destroy; + if (attr->type == PERF_TYPE_HW_CACHE) + return set_ext_hw_attr(hwc, attr); + + if (attr->config >= x86_pmu.max_events) + return -EINVAL; + /* + * The generic map: + */ + hwc->config |= x86_pmu.event_map(attr->config); return 0; } @@ -989,6 +1147,33 @@ static int intel_pmu_init(void) rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + /* + * Nehalem: + */ + switch (boot_cpu_data.x86_model) { + case 17: + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* + PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + + pr_info("... installed Core2 event tables\n"); + break; + default: + case 26: + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* + PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + + pr_info("... installed Nehalem/Corei7 event tables\n"); + break; + case 28: + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* + PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + + pr_info("... installed Atom event tables\n"); + break; + } return 0; } -- cgit v1.2.3 From 5095f59bda6793a7b8f0856096d6893fe98e0e51 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 5 Jun 2009 23:27:17 +0530 Subject: x86: cpu_debug: Remove model information to reduce encoding-decoding Remove model information, encoding/decoding and reduce bookkeeping. This, besides removing a lot of code and cleaning up the code, also enables these features on many more CPUs that were enumerated before. Reported-by: Ingo Molnar Signed-off-by: Jaswinder Singh Rajput Cc: Alan Cox LKML-Reference: <1244224637.8212.6.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpu_debug.c | 417 +++++++++------------------------------- 1 file changed, 96 insertions(+), 321 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 46e29ab96c6..86afe13fc31 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -32,9 +32,7 @@ static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); -static DEFINE_PER_CPU(unsigned, cpu_modelflag); static DEFINE_PER_CPU(int, cpu_priv_count); -static DEFINE_PER_CPU(unsigned, cpu_model); static DEFINE_MUTEX(cpu_debug_lock); @@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = { { "value", CPU_REG_ALL, 1 }, }; -/* Intel Registers Range */ -static struct cpu_debug_range cpu_intel_range[] = { - { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL }, - { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE }, - { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL }, - { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM }, - { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE }, - { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE }, - - { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE }, - { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON }, - { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON }, - { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE }, - - { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE }, - { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT }, - { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT }, - { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM }, - - { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE }, - { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 }, - { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE }, - { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON }, - - { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT }, - { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT }, - { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT }, - { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE }, - - { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 }, - { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 }, - { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX }, - { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 }, - { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT }, - - { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE }, - { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE }, - { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE }, - { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT }, - { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE }, - { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE }, - { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE }, - { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE }, - - { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT }, - { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON }, - { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE }, - { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON }, - { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE }, - { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 }, - { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE }, - { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 }, - - { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE }, - { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE }, - - { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON }, - { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE }, - { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON }, - { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT }, - { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON }, - { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT }, - { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE }, - { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON }, - - { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON }, - { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON }, - { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON }, - { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE }, - { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON }, - { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE }, - { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE }, - - { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE }, - { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON }, - { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON }, - - { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON }, - { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON }, - { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON }, - { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON }, +/* CPU Registers Range */ +static struct cpu_debug_range cpu_reg_range[] = { + { 0x00000000, 0x00000001, CPU_MC, }, + { 0x00000006, 0x00000007, CPU_MONITOR, }, + { 0x00000010, 0x00000010, CPU_TIME, }, + { 0x00000011, 0x00000013, CPU_PMC, }, + { 0x00000017, 0x00000017, CPU_PLATFORM, }, + { 0x0000001B, 0x0000001B, CPU_APIC, }, + { 0x0000002A, 0x0000002B, CPU_POWERON, }, + { 0x0000002C, 0x0000002C, CPU_FREQ, }, + { 0x0000003A, 0x0000003A, CPU_CONTROL, }, + { 0x00000040, 0x00000047, CPU_LBRANCH, }, + { 0x00000060, 0x00000067, CPU_LBRANCH, }, + { 0x00000079, 0x00000079, CPU_BIOS, }, + { 0x00000088, 0x0000008A, CPU_CACHE, }, + { 0x0000008B, 0x0000008B, CPU_BIOS, }, + { 0x0000009B, 0x0000009B, CPU_MONITOR, }, + { 0x000000C1, 0x000000C4, CPU_PMC, }, + { 0x000000CD, 0x000000CD, CPU_FREQ, }, + { 0x000000E7, 0x000000E8, CPU_PERF, }, + { 0x000000FE, 0x000000FE, CPU_MTRR, }, + + { 0x00000116, 0x0000011E, CPU_CACHE, }, + { 0x00000174, 0x00000176, CPU_SYSENTER, }, + { 0x00000179, 0x0000017B, CPU_MC, }, + { 0x00000186, 0x00000189, CPU_PMC, }, + { 0x00000198, 0x00000199, CPU_PERF, }, + { 0x0000019A, 0x0000019A, CPU_TIME, }, + { 0x0000019B, 0x0000019D, CPU_THERM, }, + { 0x000001A0, 0x000001A0, CPU_MISC, }, + { 0x000001C9, 0x000001C9, CPU_LBRANCH, }, + { 0x000001D7, 0x000001D8, CPU_LBRANCH, }, + { 0x000001D9, 0x000001D9, CPU_DEBUG, }, + { 0x000001DA, 0x000001E0, CPU_LBRANCH, }, + + { 0x00000200, 0x0000020F, CPU_MTRR, }, + { 0x00000250, 0x00000250, CPU_MTRR, }, + { 0x00000258, 0x00000259, CPU_MTRR, }, + { 0x00000268, 0x0000026F, CPU_MTRR, }, + { 0x00000277, 0x00000277, CPU_PAT, }, + { 0x000002FF, 0x000002FF, CPU_MTRR, }, + + { 0x00000300, 0x00000311, CPU_PMC, }, + { 0x00000345, 0x00000345, CPU_PMC, }, + { 0x00000360, 0x00000371, CPU_PMC, }, + { 0x0000038D, 0x00000390, CPU_PMC, }, + { 0x000003A0, 0x000003BE, CPU_PMC, }, + { 0x000003C0, 0x000003CD, CPU_PMC, }, + { 0x000003E0, 0x000003E1, CPU_PMC, }, + { 0x000003F0, 0x000003F2, CPU_PMC, }, + + { 0x00000400, 0x00000417, CPU_MC, }, + { 0x00000480, 0x0000048B, CPU_VMX, }, + + { 0x00000600, 0x00000600, CPU_DEBUG, }, + { 0x00000680, 0x0000068F, CPU_LBRANCH, }, + { 0x000006C0, 0x000006CF, CPU_LBRANCH, }, + + { 0x000107CC, 0x000107D3, CPU_PMC, }, + + { 0xC0000080, 0xC0000080, CPU_FEATURES, }, + { 0xC0000081, 0xC0000084, CPU_CALL, }, + { 0xC0000100, 0xC0000102, CPU_BASE, }, + { 0xC0000103, 0xC0000103, CPU_TIME, }, + + { 0xC0010000, 0xC0010007, CPU_PMC, }, + { 0xC0010010, 0xC0010010, CPU_CONF, }, + { 0xC0010015, 0xC0010015, CPU_CONF, }, + { 0xC0010016, 0xC001001A, CPU_MTRR, }, + { 0xC001001D, 0xC001001D, CPU_MTRR, }, + { 0xC001001F, 0xC001001F, CPU_CONF, }, + { 0xC0010030, 0xC0010035, CPU_BIOS, }, + { 0xC0010044, 0xC0010048, CPU_MC, }, + { 0xC0010050, 0xC0010056, CPU_SMM, }, + { 0xC0010058, 0xC0010058, CPU_CONF, }, + { 0xC0010060, 0xC0010060, CPU_CACHE, }, + { 0xC0010061, 0xC0010068, CPU_SMM, }, + { 0xC0010069, 0xC001006B, CPU_SMM, }, + { 0xC0010070, 0xC0010071, CPU_SMM, }, + { 0xC0010111, 0xC0010113, CPU_SMM, }, + { 0xC0010114, 0xC0010118, CPU_SVM, }, + { 0xC0010140, 0xC0010141, CPU_OSVM, }, + { 0xC0011022, 0xC0011023, CPU_CONF, }, }; -/* AMD Registers Range */ -static struct cpu_debug_range cpu_amd_range[] = { - { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, }, - { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, }, - { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, }, - { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS }, - { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS }, - { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, }, - - { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, }, - { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, }, - { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, }, - { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, }, - - { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, }, - { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, }, - - { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, }, - { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, }, - { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, }, - { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, }, - - { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, }, - { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, }, - { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, }, - { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, }, - { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, }, - { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, }, - { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, }, - { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, }, - { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, }, - { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, }, - { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, }, - { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, }, - { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, }, - { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, }, - { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, }, - { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, }, - { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, }, - { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, }, -}; - - -/* Intel */ -static int get_intel_modelflag(unsigned model) -{ - int flag; - - switch (model) { - case 0x0501: - case 0x0502: - case 0x0504: - flag = CPU_INTEL_PENTIUM; - break; - case 0x0601: - case 0x0603: - case 0x0605: - case 0x0607: - case 0x0608: - case 0x060A: - case 0x060B: - flag = CPU_INTEL_P6; - break; - case 0x0609: - case 0x060D: - flag = CPU_INTEL_PENTIUM_M; - break; - case 0x060E: - flag = CPU_INTEL_CORE; - break; - case 0x060F: - case 0x0617: - flag = CPU_INTEL_CORE2; - break; - case 0x061C: - flag = CPU_INTEL_ATOM; - break; - case 0x0F00: - case 0x0F01: - case 0x0F02: - case 0x0F03: - case 0x0F04: - flag = CPU_INTEL_XEON_P4; - break; - case 0x0F06: - flag = CPU_INTEL_XEON_MP; - break; - default: - flag = CPU_NONE; - break; - } - - return flag; -} - -/* AMD */ -static int get_amd_modelflag(unsigned model) -{ - int flag; - - switch (model >> 8) { - case 0x6: - flag = CPU_AMD_K6; - break; - case 0x7: - flag = CPU_AMD_K7; - break; - case 0x8: - flag = CPU_AMD_K8; - break; - case 0xf: - flag = CPU_AMD_0F; - break; - case 0x10: - flag = CPU_AMD_10; - break; - case 0x11: - flag = CPU_AMD_11; - break; - default: - flag = CPU_NONE; - break; - } - - return flag; -} - -static int get_cpu_modelflag(unsigned cpu) -{ - int flag; - - flag = per_cpu(cpu_model, cpu); - - switch (flag >> 16) { - case X86_VENDOR_INTEL: - flag = get_intel_modelflag(flag); - break; - case X86_VENDOR_AMD: - flag = get_amd_modelflag(flag & 0xffff); - break; - default: - flag = CPU_NONE; - break; - } - - return flag; -} - -static int get_cpu_range_count(unsigned cpu) -{ - int index; - - switch (per_cpu(cpu_model, cpu) >> 16) { - case X86_VENDOR_INTEL: - index = ARRAY_SIZE(cpu_intel_range); - break; - case X86_VENDOR_AMD: - index = ARRAY_SIZE(cpu_amd_range); - break; - default: - index = 0; - break; - } - - return index; -} - static int is_typeflag_valid(unsigned cpu, unsigned flag) { - unsigned vendor, modelflag; - int i, index; + int i; /* Standard Registers should be always valid */ if (flag >= CPU_TSS) return 1; - modelflag = per_cpu(cpu_modelflag, cpu); - vendor = per_cpu(cpu_model, cpu) >> 16; - index = get_cpu_range_count(cpu); - - for (i = 0; i < index; i++) { - switch (vendor) { - case X86_VENDOR_INTEL: - if ((cpu_intel_range[i].model & modelflag) && - (cpu_intel_range[i].flag & flag)) - return 1; - break; - case X86_VENDOR_AMD: - if ((cpu_amd_range[i].model & modelflag) && - (cpu_amd_range[i].flag & flag)) - return 1; - break; - } + for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { + if (cpu_reg_range[i].flag == flag) + return 1; } /* Invalid */ @@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag) static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, int index, unsigned flag) { - unsigned modelflag; - - modelflag = per_cpu(cpu_modelflag, cpu); - *max = 0; - switch (per_cpu(cpu_model, cpu) >> 16) { - case X86_VENDOR_INTEL: - if ((cpu_intel_range[index].model & modelflag) && - (cpu_intel_range[index].flag & flag)) { - *min = cpu_intel_range[index].min; - *max = cpu_intel_range[index].max; - } - break; - case X86_VENDOR_AMD: - if ((cpu_amd_range[index].model & modelflag) && - (cpu_amd_range[index].flag & flag)) { - *min = cpu_amd_range[index].min; - *max = cpu_amd_range[index].max; - } - break; - } + if (cpu_reg_range[index].flag == flag) { + *min = cpu_reg_range[index].min; + *max = cpu_reg_range[index].max; + } else + *max = 0; return *max; } @@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) unsigned msr, msr_min, msr_max; struct cpu_private *priv; u32 low, high; - int i, range; + int i; if (seq) { priv = seq->private; @@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) } } - range = get_cpu_range_count(cpu); - - for (i = 0; i < range; i++) { + for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) continue; @@ -788,13 +569,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry) { struct dentry *cpu_dentry = NULL; unsigned reg, reg_min, reg_max; - int i, range, err = 0; + int i, err = 0; char reg_dir[12]; u32 low, high; - range = get_cpu_range_count(cpu); - - for (i = 0; i < range; i++) { + for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { if (!get_cpu_range(cpu, ®_min, ®_max, i, cpu_base[type].flag)) continue; @@ -850,10 +629,6 @@ static int cpu_init_cpu(void) cpui = &cpu_data(cpu); if (!cpu_has(cpui, X86_FEATURE_MSR)) continue; - per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) | - (cpui->x86 << 8) | - (cpui->x86_model)); - per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu); sprintf(cpu_dir, "cpu%d", cpu); cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); -- cgit v1.2.3 From 4a4aca641bc4598e77b866804f47c651ec4a764d Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 5 Jun 2009 12:02:38 +0200 Subject: x86: Add quirk for reboot stalls on a Dell Optiplex 360 The Dell Optiplex 360 hangs on reboot, just like the Optiplex 330, so the same quirk is needed. Signed-off-by: Jean Delvare Cc: Steve Conklin Cc: Leann Ogasawara Cc: LKML-Reference: <200906051202.38311.jdelvare@suse.de> Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 667188e0b5a..d2d1ce8170f 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0KP561"), }, }, + { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 360", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"), + DMI_MATCH(DMI_BOARD_NAME, "0T656F"), + }, + }, { /* Handle problems with rebooting on Dell 2400's */ .callback = set_bios_reboot, .ident = "Dell PowerEdge 2400", -- cgit v1.2.3 From 103428e57be323c3c5545db8ad12667099bc6005 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 7 Jun 2009 16:48:40 +0400 Subject: x86, apic: Fix dummy apic read operation together with broken MP handling Ingo Molnar reported that read_apic is buggy novadays: [ 0.000000] Using APIC driver default [ 0.000000] SMP: Allowing 1 CPUs, 0 hotplug CPUs [ 0.000000] Local APIC disabled by BIOS -- you can enable it with "lapic" [ 0.000000] APIC: disable apic facility [ 0.000000] ------------[ cut here ]------------ [ 0.000000] WARNING: at arch/x86/kernel/apic/apic.c:254 native_apic_read_dummy+0x2d/0x3b() [ 0.000000] Hardware name: HP OmniBook PC Indeed we still rely on apic->read operation for SMP compiled kernel. And instead of disfigure the SMP code with #ifdef we allow to call apic->read. To capture any unexpected results we check for apic->read being called for sane reason via WARN_ON_ONCE but(!) instead of OR we should use AND logical operation (thanks Yinghai for spotting the root of the problem). Along with that we could be have bad MP table and we are to fix it that way no SMP started and no complains about BIOS bug if apic was just disabled via command line. Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu LKML-Reference: <20090607124840.GD4547@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 9 ++++++++- arch/x86/kernel/smpboot.c | 8 +++++--- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e82488d3f0b..a4c9cf0bf70 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -249,7 +249,7 @@ static void native_apic_write_dummy(u32 reg, u32 v) static u32 native_apic_read_dummy(u32 reg) { - WARN_ON_ONCE((cpu_has_apic || !disable_apic)); + WARN_ON_ONCE((cpu_has_apic && !disable_apic)); return 0; } @@ -1609,6 +1609,13 @@ void __init init_apic_mappings(void) new_apicid = read_apic_id(); if (boot_cpu_physical_apicid != new_apicid) { boot_cpu_physical_apicid = new_apicid; + /* + * yeah -- we lie about apic_version + * in case if apic was disabled via boot option + * but it's not a problem for SMP compiled kernel + * since smp_sanity_check is prepared for such a case + * and disable smp mode + */ apic_version[new_apicid] = GET_APIC_VERSION(apic_read(APIC_LVR)); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d2e8de95815..7c80007ea5f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -992,10 +992,12 @@ static int __init smp_sanity_check(unsigned max_cpus) */ if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - printk(KERN_ERR "... forcing use of dummy APIC emulation." + if (!disable_apic) { + pr_err("BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_physical_apicid); + pr_err("... forcing use of dummy APIC emulation." "(tell your hw vendor)\n"); + } smpboot_clear_io_apic(); arch_disable_smp_support(); return -1; -- cgit v1.2.3 From 3aa6b186f86c5d06d6d92d14311ffed51f091f40 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Sun, 7 Jun 2009 16:23:48 +0200 Subject: x86: Fix non-lazy GS handling in sys_vm86() This fixes a stack corruption panic or null dereference oops due to a bad GS in resume_userspace() when returning from sys_vm86() and calling lockdep_sys_exit(). Only a problem when CONFIG_LOCKDEP and CONFIG_CC_STACKPROTECTOR enabled. Signed-off-by: Lubomir Rintel Cc: H. Peter Anvin LKML-Reference: <1244384628.2323.4.camel@bimbo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vm86_32.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index d7ac84e7fc1..6a177694058 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk info->regs.pt.ds = 0; info->regs.pt.es = 0; info->regs.pt.fs = 0; - -/* we are clearing gs later just before "jmp resume_userspace", - * because it is not saved/restored. - */ +#ifndef CONFIG_X86_32_LAZY_GS + info->regs.pt.gs = 0; +#endif /* * The flags register is also special: we cannot trust that the user @@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk __asm__ __volatile__( "movl %0,%%esp\n\t" "movl %1,%%ebp\n\t" +#ifdef CONFIG_X86_32_LAZY_GS "mov %2, %%gs\n\t" +#endif "jmp resume_userspace" : /* no outputs */ :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); -- cgit v1.2.3 From aeef50bc0483fa70ce0bddb686ec84a274b7f3d4 Mon Sep 17 00:00:00 2001 From: "Figo.zhang" Date: Sun, 7 Jun 2009 22:30:36 +0800 Subject: x86, microcode: Simplify vfree() use vfree() does its own 'NULL' check, so no need for check before calling it. In v2, remove the stray newline. [ Impact: cleanup ] Signed-off-by: Figo.zhang Cc: Dmitry Adamushko LKML-Reference: <1244385036.3402.11.camel@myhost> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index c8be20f1644..366baa17991 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -241,10 +241,8 @@ static int install_equiv_cpu_table(const u8 *buf) static void free_equiv_cpu_table(void) { - if (equiv_cpu_table) { - vfree(equiv_cpu_table); - equiv_cpu_table = NULL; - } + vfree(equiv_cpu_table); + equiv_cpu_table = NULL; } static enum ucode_state @@ -279,8 +277,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) mc_header = (struct microcode_header_amd *)mc; if (get_matching_microcode(cpu, mc, new_rev)) { - if (new_mc) - vfree(new_mc); + vfree(new_mc); new_rev = mc_header->patch_id; new_mc = mc; } else @@ -292,8 +289,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) if (new_mc) { if (!leftover) { - if (uci->mc) - vfree(uci->mc); + vfree(uci->mc); uci->mc = new_mc; pr_debug("microcode: CPU%d found a matching microcode " "update with version 0x%x (current=0x%x)\n", -- cgit v1.2.3 From 0312af84164215a452f2a94957ebd9bce86e0204 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Jun 2009 07:42:04 +0200 Subject: perf_counter, x86: Implement generalized cache event types, add Core2 support Fill in core2_hw_cache_event_id[] with the Core2 model specific events. The events can be used in all the tools via the -e (--event) parameter, for example "-e l1-misses" or -"-e l2-accesses" or "-e l2-write-misses". Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 85 +++++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index e86679fa521..b1f71ff5025 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -194,7 +194,90 @@ static const u64 core2_hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - /* To be filled in */ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L2 ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; static const u64 atom_hw_cache_event_ids -- cgit v1.2.3 From ad689220614b6c7c0b13b70d742f358e9310e71e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Jun 2009 09:30:41 +0200 Subject: perf_counter, x86: Implement generalized cache event types, add Atom support Fill in core2_hw_cache_event_id[] with the Atom model specific events. The events can be used in all the tools via the -e (--event) parameter, for example "-e l1-misses" or -"-e l2-accesses" or "-e l2-write-misses". ( Note: these are straight from the Intel manuals - not tested yet.) Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 85 +++++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b1f71ff5025..71590e09d16 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -285,7 +285,90 @@ static const u64 atom_hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - /* To be filled in */ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2241, /* L1D_CACHE.ST */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L2 ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; static u64 intel_pmu_raw_event(u64 event) -- cgit v1.2.3 From 1123e3ad73697d64ad99f0104bbe49f8b52d7d65 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 May 2009 11:25:09 +0200 Subject: perf_counter: Clean up x86 boot messages Standardize and tidy up all the messages we print during perfcounter initialization. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 46 ++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 71590e09d16..0339d195a3f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1298,23 +1298,22 @@ static int intel_pmu_init(void) if (version < 2) return -ENODEV; - x86_pmu = intel_pmu; - x86_pmu.version = version; - x86_pmu.num_counters = eax.split.num_counters; + x86_pmu = intel_pmu; + x86_pmu.version = version; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.counter_bits = eax.split.bit_width; + x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; /* * Quirk: v2 perfmon does not report fixed-purpose counters, so * assume at least 3 counters: */ - x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); - - x86_pmu.counter_bits = eax.split.bit_width; - x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); /* - * Nehalem: + * Install the hw-cache-events table: */ switch (boot_cpu_data.x86_model) { case 17: @@ -1322,7 +1321,7 @@ static int intel_pmu_init(void) sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); - pr_info("... installed Core2 event tables\n"); + pr_cont("Core2 events, "); break; default: case 26: @@ -1330,14 +1329,14 @@ static int intel_pmu_init(void) sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); - pr_info("... installed Nehalem/Corei7 event tables\n"); + pr_cont("Nehalem/Corei7 events, "); break; case 28: memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); - pr_info("... installed Atom event tables\n"); + pr_cont("Atom events, "); break; } return 0; @@ -1353,6 +1352,8 @@ void __init init_hw_perf_counters(void) { int err; + pr_info("Performance Counters: "); + switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: err = intel_pmu_init(); @@ -1363,14 +1364,13 @@ void __init init_hw_perf_counters(void) default: return; } - if (err != 0) + if (err != 0) { + pr_cont("no PMU driver, software counters only.\n"); return; + } - pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name); - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.counter_bits); + pr_cont("%s PMU driver.\n", x86_pmu.name); - pr_info("... num counters: %d\n", x86_pmu.num_counters); if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { x86_pmu.num_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", @@ -1379,23 +1379,25 @@ void __init init_hw_perf_counters(void) perf_counter_mask = (1 << x86_pmu.num_counters) - 1; perf_max_counters = x86_pmu.num_counters; - pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); } - pr_info("... fixed counters: %d\n", x86_pmu.num_counters_fixed); perf_counter_mask |= ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; - pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_lapic_init(); register_die_notifier(&perf_counter_nmi_notifier); + + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.counter_bits); + pr_info("... generic counters: %d\n", x86_pmu.num_counters); + pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed); + pr_info("... counter mask: %016Lx\n", perf_counter_mask); } static inline void x86_pmu_read(struct perf_counter *counter) -- cgit v1.2.3 From c4ed3f04ba9defe22aa729d1646f970f791c03d7 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Mon, 8 Jun 2009 10:44:05 -0500 Subject: x86, UV: Fix macros for multiple coherency domains Fix bug in the SGI UV macros that support systems with multiple coherency domains. The macros used for referencing global MMR (chipset registers) are failing to correctly "or" the NASID (node identifier) bits that reside above M+N. These high bits are supplied automatically by the chipset for memory accesses coming from the processor socket. However, the bits must be present for references to the special global MMR space used to map chipset registers. (See uv_hub.h for more details ...) The bug results in references to invalid/incorrect nodes. Signed-off-by: Jack Steiner Cc: LKML-Reference: <20090608154405.GA16395@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 2bda6935297..39f2af4b546 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -562,7 +562,7 @@ void __init uv_system_init(void) union uvh_node_id_u node_id; unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; - int max_pnode = 0; + int gnode_extra, max_pnode = 0; unsigned long mmr_base, present, paddr; unsigned short pnode_mask; @@ -574,6 +574,13 @@ void __init uv_system_init(void) mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; + pnode_mask = (1 << n_val) - 1; + node_id.v = uv_read_local_mmr(UVH_NODE_ID); + gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; + gnode_upper = ((unsigned long)gnode_extra << m_val); + printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", + n_val, m_val, gnode_upper, gnode_extra); + printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) @@ -607,11 +614,6 @@ void __init uv_system_init(void) } } - pnode_mask = (1 << n_val) - 1; - node_id.v = uv_read_local_mmr(UVH_NODE_ID); - gnode_upper = (((unsigned long)node_id.s.node_id) & - ~((1 << n_val) - 1)) << m_val; - uv_bios_init(); uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size); @@ -634,6 +636,7 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; + uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; -- cgit v1.2.3 From f86748e91a14bd6cc49477560f33ed5d59896e89 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Jun 2009 22:33:10 +0200 Subject: perf_counter, x86: Implement generalized cache event types, add AMD support Fill in amd_hw_cache_event_id[] with the AMD CPU specific events, for family 0x0f, 0x10 and 0x11. There's apparently no distinction between load and store events, so we only fill in the load events. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 102 +++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 0339d195a3f..93af821ebe5 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -389,6 +389,97 @@ static u64 intel_pmu_raw_event(u64 event) return event & CORE_EVNTSEL_MASK; } +static const u64 amd_0f_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ + [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L2 ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ + [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ + [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + /* * AMD Performance Monitor K7 and later. */ @@ -1345,6 +1436,17 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { x86_pmu = amd_pmu; + + switch (boot_cpu_data.x86) { + case 0x0f: + case 0x10: + case 0x11: + memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("AMD Family 0f/10/11 events, "); + break; + } return 0; } -- cgit v1.2.3 From 820a644211bc1ac7715333abdb0f0b9ea4fbb549 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Jun 2009 19:10:25 +0200 Subject: perf_counter, x86: Clean up hw_cache_event ids copies Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 93af821ebe5..56001feeffc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1409,23 +1409,20 @@ static int intel_pmu_init(void) switch (boot_cpu_data.x86_model) { case 17: memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, - sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* - PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + sizeof(hw_cache_event_ids)); pr_cont("Core2 events, "); break; default: case 26: memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, - sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* - PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + sizeof(hw_cache_event_ids)); pr_cont("Nehalem/Corei7 events, "); break; case 28: memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, - sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* - PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + sizeof(hw_cache_event_ids)); pr_cont("Atom events, "); break; -- cgit v1.2.3 From 29150078d7a1758df8c7a6cd2ec066ac65e1fab9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 9 Jun 2009 10:54:18 +0200 Subject: amd-iommu: remove BUS_NOTIFY_BOUND_DRIVER handling Handling this event causes device assignment in KVM to fail because the device gets re-attached as soon as the pci-stub registers as the driver for the device. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 8510e90ebfe..81872604eb7 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1145,17 +1145,6 @@ static int device_change_notifier(struct notifier_block *nb, "to a non-dma-ops domain\n", dev_name(dev)); switch (action) { - case BUS_NOTIFY_BOUND_DRIVER: - if (domain) - goto out; - dma_domain = find_protection_domain(devid); - if (!dma_domain) - dma_domain = iommu->default_dom; - attach_device(iommu, &dma_domain->domain, devid); - DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain " - "%d for device %s\n", - dma_domain->domain.id, dev_name(dev)); - break; case BUS_NOTIFY_UNBOUND_DRIVER: if (!domain) goto out; -- cgit v1.2.3 From 71ff3bca2f70264effe8cbbdd5bc10cf6be5f2f0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 8 Jun 2009 13:47:33 -0700 Subject: amd-iommu: detach device explicitly before attaching it to a new domain This fixes a bug with a device that could not be assigned to a KVM guest because it is still assigned to a dma_ops protection domain. [chrisw: simply remove WARN_ON(), will always fire since dev->driver will be pci-sub] Signed-off-by: Chris Wright Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 81872604eb7..772e91088e4 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2073,7 +2073,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, old_domain = domain_for_device(devid); if (old_domain) - return -EBUSY; + detach_device(old_domain, devid); attach_device(iommu, domain, devid); -- cgit v1.2.3 From e9a22a13c71986851e931bdfa054f68839ff8576 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 9 Jun 2009 12:00:37 +0200 Subject: amd-iommu: remove unnecessary "AMD IOMMU: " prefix That prefix is already included in the DUMP_printk macro. So there is no need to repeat it in the format string. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 772e91088e4..1c60554537c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1266,9 +1266,8 @@ static int get_device_resources(struct device *dev, dma_dom = (*iommu)->default_dom; *domain = &dma_dom->domain; attach_device(*iommu, *domain, *bdf); - DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain " - "%d for device %s\n", - (*domain)->id, dev_name(dev)); + DUMP_printk("Using protection domain %d for device %s\n", + (*domain)->id, dev_name(dev)); } if (domain_for_device(_bdf) == NULL) -- cgit v1.2.3 From eaa958402ea40851097d051f52ba1bb7a885efe9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 6 Jun 2009 14:51:36 -0700 Subject: cpumask: alloc zeroed cpumask for static cpumask_var_ts These are defined as static cpumask_var_t so if MAXSMP is not used, they are cleared already. Avoid surprises when MAXSMP is enabled. Signed-off-by: Yinghai Lu Signed-off-by: Rusty Russell --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +- arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 2 +- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_64.c | 2 +- arch/x86/kernel/tlb_uv.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 54b6de2cd94..752e8c6b2c7 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -550,7 +550,7 @@ static int __init acpi_cpufreq_early_init(void) return -ENOMEM; } for_each_possible_cpu(i) { - if (!alloc_cpumask_var_node( + if (!zalloc_cpumask_var_node( &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, GFP_KERNEL, cpu_to_node(i))) { diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index a8363e5be4e..d47c775eb0a 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -322,7 +322,7 @@ static int powernow_acpi_init(void) goto err0; } - if (!alloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, + if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, GFP_KERNEL)) { retval = -ENOMEM; goto err05; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 35dc8fbe92b..cf52215d9eb 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -887,7 +887,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) /* notify BIOS that we exist */ acpi_processor_notify_smm(THIS_MODULE); - if (!alloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { printk(KERN_ERR PFX "unable to alloc powernow_k8_data cpumask\n"); ret_val = -ENOMEM; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index c9f1fdc0283..55c831ed71c 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -471,7 +471,7 @@ static int centrino_target (struct cpufreq_policy *policy, if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) return -ENOMEM; - if (unlikely(!alloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { + if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { free_cpumask_var(saved_mask); return -ENOMEM; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 6fb0b359d2a..09dd1d414fc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -1163,7 +1163,7 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; - alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); + zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); err = mce_init_banks(); if (err) diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index ed0c33761e6..8c7b03b0cfc 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -832,7 +832,7 @@ static int __init uv_bau_init(void) return 0; for_each_possible_cpu(cur_cpu) - alloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), + zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), GFP_KERNEL, cpu_to_node(cur_cpu)); uv_bau_retry_limit = 1; -- cgit v1.2.3 From 42937e81a82b6bbc51a309c83da140b3a7ca5945 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Mon, 8 Jun 2009 15:55:09 +0200 Subject: x86: Detect use of extended APIC ID for AMD CPUs Booting a 32-bit kernel on Magny-Cours results in the following panic: ... Using APIC driver default ... Overriding APIC driver with bigsmp ... Getting VERSION: 80050010 Getting VERSION: 80050010 Getting ID: 10000000 Getting ID: ef000000 Getting LVT0: 700 Getting LVT1: 10000 Kernel panic - not syncing: Boot APIC ID in local APIC unexpected (16 vs 0) Pid: 1, comm: swapper Not tainted 2.6.30-rcX #2 Call Trace: [] ? panic+0x38/0xd3 [] ? native_smp_prepare_cpus+0x259/0x31f [] ? kernel_init+0x3e/0x141 [] ? kernel_init+0x0/0x141 [] ? kernel_thread_helper+0x7/0x10 The reason is that default_get_apic_id handled extension of local APIC ID field just in case of XAPIC. Thus for this AMD CPU, default_get_apic_id() returns 0 and bigsmp_get_apic_id() returns 16 which leads to the respective kernel panic. This patch introduces a Linux specific feature flag to indicate support for extended APIC id (8 bits instead of 4 bits width) and sets the flag on AMD CPUs if applicable. Signed-off-by: Andreas Herrmann Cc: LKML-Reference: <20090608135509.GA12431@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7e4a459daa6..0802e151c2c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -6,6 +6,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 # include @@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) (c->x86_model == 8 && c->x86_mask >= 8)) set_cpu_cap(c, X86_FEATURE_K6_MTRR); #endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) + /* check CPU config space for extended APIC ID */ + if (c->x86 >= 0xf) { + unsigned int val; + val = read_pci_config(0, 24, 0, 0x68); + if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + } +#endif } static void __cpuinit init_amd(struct cpuinfo_x86 *c) -- cgit v1.2.3 From fecc8ac8496fce96069724f54daba8e7078b0082 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Tue, 9 Jun 2009 21:15:53 +0800 Subject: perf_counter, x86: Correct some event and umask values for Intel processors Correct some event and UMASK values according to Intel SDM, in the Nehalem and Atom tables. Signed-off-by: Yong Wang Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090609131553.GA12489@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 56001feeffc..40978aac6e0 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -119,7 +119,7 @@ static const u64 nehalem_hw_cache_event_ids }, [ C(L1I ) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0480, /* L1I.READS */ + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ }, [ C(OP_WRITE) ] = { @@ -162,7 +162,7 @@ static const u64 nehalem_hw_cache_event_ids [ C(ITLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISS_RETIRED */ + [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = -1, @@ -291,7 +291,7 @@ static const u64 atom_hw_cache_event_ids [ C(RESULT_MISS) ] = 0, }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2241, /* L1D_CACHE.ST */ + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ [ C(RESULT_MISS) ] = 0, }, [ C(OP_PREFETCH) ] = { @@ -301,8 +301,8 @@ static const u64 atom_hw_cache_event_ids }, [ C(L1I ) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = -1, @@ -329,11 +329,11 @@ static const u64 atom_hw_cache_event_ids }, [ C(DTLB) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ }, [ C(OP_PREFETCH) ] = { -- cgit v1.2.3 From a90ede7b17d122acd58e6e1ff911be9dcf5263cc Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 11 Feb 2009 22:45:42 -0200 Subject: KVM: x86: paravirt skip pit-through-ioapic boot check Skip the test which checks if the PIT is properly routed when using the IOAPIC, aimed at buggy hardware. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kernel/kvm.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 33019ddb56b..057173db6ad 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -27,6 +27,7 @@ #include #include #include +#include #define MMU_QUEUE_SIZE 1024 @@ -230,6 +231,9 @@ static void paravirt_ops_setup(void) pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; } +#ifdef CONFIG_X86_IO_APIC + no_timer_check = 1; +#endif } void __init kvm_guest_init(void) -- cgit v1.2.3 From 32f8840064d88cc3f6e85203aec7b6b57bebcb97 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 7 May 2009 17:55:12 -0300 Subject: KVM: use smp_send_reschedule in kvm_vcpu_kick KVM uses a function call IPI to cause the exit of a guest running on a physical cpu. For virtual interrupt notification there is no need to wait on IPI receival, or to execute any function. This is exactly what the reschedule IPI does, without the overhead of function IPI. So use it instead of smp_call_function_single in kvm_vcpu_kick. Also change the "guest_mode" variable to a bit in vcpu->requests, and use that to collapse multiple IPI's that would be issued between the first one and zeroing of guest mode. This allows kvm_vcpu_kick to called with interrupts disabled. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kernel/smp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 13f33ea8cca..3b2e55e8ad2 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -172,6 +172,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); + /* + * KVM uses this interrupt to force a cpu out of guest mode + */ } void smp_call_function_interrupt(struct pt_regs *regs) -- cgit v1.2.3 From a0861c02a981c943573478ea13b29b1fb958ee5b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 8 Jun 2009 17:37:09 +0800 Subject: KVM: Add VT-x machine check support VT-x needs an explicit MC vector intercept to handle machine checks in the hyper visor. It also has a special option to catch machine checks that happen during VT entry. Do these interceptions and forward them to the Linux machine check handler. Make it always look like user space is interrupted because the machine check handler treats kernel/user space differently. Thanks to Jiang Yunhong for help and testing. Cc: stable@kernel.org Signed-off-by: Andi Kleen Signed-off-by: Huang Ying Signed-off-by: Avi Kivity --- arch/x86/kernel/cpu/mcheck/mce_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 09dd1d414fc..289cc481502 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -420,6 +420,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) out2: atomic_dec(&mce_entry); } +EXPORT_SYMBOL_GPL(do_machine_check); #ifdef CONFIG_X86_MCE_INTEL /*** -- cgit v1.2.3 From dc81081b2d9a6a9d64dad1bef1e5fc9fb660e53e Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Wed, 10 Jun 2009 17:06:12 +0800 Subject: perf_counter/x86: Fix the model number of Intel Core2 processors Fix the model number of Intel Core2 processors according to the documentation: Intel Processor Identification with the CPUID Instruction: http://www.intel.com/support/processors/sb/cs-009861.htm Signed-off-by: Yong Wang Also-Reported-by: Arnd Bergmann Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090610090612.GA26580@ywang-moblin2.bj.intel.com> [ Added two more model numbers suggested by Arnd Bergmann ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 40978aac6e0..49f258537cb 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1407,7 +1407,10 @@ static int intel_pmu_init(void) * Install the hw-cache-events table: */ switch (boot_cpu_data.x86_model) { - case 17: + case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ + case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 29: /* six-core 45 nm xeon "Dunnington" */ memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, sizeof(hw_cache_event_ids)); -- cgit v1.2.3 From bd2b5b12849a3446abad0b25e920f86f5480b309 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 13:40:57 +0200 Subject: perf_counter: More aggressive frequency adjustment Also employ the overflow handler to adjust the frequency, this results in a stable frequency in about 40~50 samples, instead of that many ticks. This also means we can start sampling at a sample period of 1 without running head-first into the throttle. It relies on sched_clock() to accurately measure the time difference between the overflow NMIs. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 49f258537cb..240ca563063 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -696,10 +696,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!attr->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (!hwc->sample_period) + if (!hwc->sample_period) { hwc->sample_period = x86_pmu.max_period; + atomic64_set(&hwc->period_left, hwc->sample_period); + } - atomic64_set(&hwc->period_left, hwc->sample_period); counter->destroy = hw_perf_counter_destroy; /* -- cgit v1.2.3 From 0de51088e6a82bc8413d3ca9e28bbca2788b5b53 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Mon, 8 Jun 2009 18:27:54 +0800 Subject: CPUFREQ: Enable acpi-cpufreq driver for VIA/Centaur CPUs The VIA/Centaur C7, C7-M and Nano CPU's all support ACPI based cpu p-states using a MSR interface. The Linux driver just never made use of it, since in addition to the check for the EST flag it also checked if the vendor is Intel. Signed-off-by: Harald Welte [ Removed the vendor checks entirely - Linus ] Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 752e8c6b2c7..ae9b503220c 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid) { struct cpuinfo_x86 *cpu = &cpu_data(cpuid); - if (cpu->x86_vendor != X86_VENDOR_INTEL || - !cpu_has(cpu, X86_FEATURE_EST)) - return 0; - - return 1; + return cpu_has(cpu, X86_FEATURE_EST); } static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) -- cgit v1.2.3 From 0fea615e526b4b7eff0363ee02d5753e5f924089 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Mon, 8 Jun 2009 18:29:36 +0800 Subject: CPUFREQ: Mark e_powersaver driver as EXPERIMENTAL and DANGEROUS The e_powersaver driver for VIA's C7 CPU's needs to be marked as DANGEROUS as it configures the CPU to power states that are out of specification. According to Centaur, all systems with C7 and Nano CPU's support the ACPI p-state method. Thus, the acpi-cpufreq driver should be used instead. Signed-off-by: Harald Welte Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/cpufreq/Kconfig | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index 52c83987547..f138c6c389b 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -220,11 +220,14 @@ config X86_LONGHAUL If in doubt, say N. config X86_E_POWERSAVER - tristate "VIA C7 Enhanced PowerSaver" + tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)" select CPU_FREQ_TABLE - depends on X86_32 + depends on X86_32 && EXPERIMENTAL help - This adds the CPUFreq driver for VIA C7 processors. + This adds the CPUFreq driver for VIA C7 processors. However, this driver + does not have any safeguards to prevent operating the CPU out of spec + and is thus considered dangerous. Please use the regular ACPI cpufreq + driver, enabled by CONFIG_X86_ACPI_CPUFREQ. If in doubt, say N. -- cgit v1.2.3 From df1a132bf3d3508f863336c80a27806a2ac947e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 21:02:22 +0200 Subject: perf_counter: Introduce struct for sample data For easy extension of the sample data, put it in a structure. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 240ca563063..82a23d487f9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1173,11 +1173,14 @@ static void intel_pmu_reset(void) */ static int intel_pmu_handle_irq(struct pt_regs *regs) { + struct perf_sample_data data; struct cpu_hw_counters *cpuc; - struct cpu_hw_counters; int bit, cpu, loops; u64 ack, status; + data.regs = regs; + data.addr = 0; + cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -1210,7 +1213,7 @@ again: if (!intel_pmu_save_and_restart(counter)) continue; - if (perf_counter_overflow(counter, 1, regs, 0)) + if (perf_counter_overflow(counter, 1, &data)) intel_pmu_disable_counter(&counter->hw, bit); } @@ -1230,12 +1233,16 @@ again: static int amd_pmu_handle_irq(struct pt_regs *regs) { - int cpu, idx, handled = 0; + struct perf_sample_data data; struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; + int cpu, idx, handled = 0; u64 val; + data.regs = regs; + data.addr = 0; + cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -1256,7 +1263,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_counter_set_period(counter, hwc, idx)) continue; - if (perf_counter_overflow(counter, 1, regs, 0)) + if (perf_counter_overflow(counter, 1, &data)) amd_pmu_disable_counter(hwc, idx); } -- cgit v1.2.3 From 9e350de37ac9607012fcf9c5314a28fbddf8f43c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 21:34:59 +0200 Subject: perf_counter: Accurate period data We currently log hw.sample_period for PERF_SAMPLE_PERIOD, however this is incorrect. When we adjust the period, it will only take effect the next cycle but report it for the current cycle. So when we adjust the period for every cycle, we're always wrong. Solve this by keeping track of the last_period. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 82a23d487f9..57ae1bec81b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -698,6 +698,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!hwc->sample_period) { hwc->sample_period = x86_pmu.max_period; + hwc->last_period = hwc->sample_period; atomic64_set(&hwc->period_left, hwc->sample_period); } @@ -880,12 +881,14 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (unlikely(left <= -period)) { left = period; atomic64_set(&hwc->period_left, left); + hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; atomic64_set(&hwc->period_left, left); + hwc->last_period = period; ret = 1; } /* @@ -1257,9 +1260,12 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) if (val & (1ULL << (x86_pmu.counter_bits - 1))) continue; - /* counter overflow */ - handled = 1; - inc_irq_stat(apic_perf_irqs); + /* + * counter overflow + */ + handled = 1; + data.period = counter->hw.last_period; + if (!x86_perf_counter_set_period(counter, hwc, idx)) continue; @@ -1267,6 +1273,9 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) amd_pmu_disable_counter(hwc, idx); } + if (handled) + inc_irq_stat(apic_perf_irqs); + return handled; } -- cgit v1.2.3 From 77e26cca20013e9352a8df86a54640543304a23a Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 11 Jun 2009 16:04:35 +0900 Subject: x86, mce: Fix mce printing This patch: - Adds print_mce_head() instead of first flag - Makes the header to be printed always - Stops double printing of corrected errors [ This portion originates from Huang Ying's patch ] Originally-From: Huang Ying Signed-off-by: Hidetoshi Seto LKML-Reference: <4A30AC83.5010708@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d4e7b5947a0..6a3127ecb5c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -180,12 +180,8 @@ void mce_log(struct mce *mce) set_bit(0, ¬ify_user); } -static void print_mce(struct mce *m, int *first) +static void print_mce(struct mce *m) { - if (*first) { - printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); - *first = 0; - } printk(KERN_EMERG "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); @@ -208,6 +204,11 @@ static void print_mce(struct mce *m, int *first) m->apicid); } +static void print_mce_head(void) +{ + printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); +} + static void print_mce_tail(void) { printk(KERN_EMERG "This is not a software problem!\n" @@ -234,7 +235,6 @@ static void wait_for_panic(void) static void mce_panic(char *msg, struct mce *final, char *exp) { int i; - int first = 1; /* * Make sure only one CPU runs in machine check panic @@ -245,23 +245,27 @@ static void mce_panic(char *msg, struct mce *final, char *exp) bust_spinlocks(1); console_verbose(); + print_mce_head(); /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { struct mce *m = &mcelog.entry[i]; - if ((m->status & MCI_STATUS_VAL) && - !(m->status & MCI_STATUS_UC)) - print_mce(m, &first); + if (!(m->status & MCI_STATUS_VAL)) + continue; + if (!(m->status & MCI_STATUS_UC)) + print_mce(m); } /* Now print uncorrected but with the final one last */ for (i = 0; i < MCE_LOG_LEN; i++) { struct mce *m = &mcelog.entry[i]; if (!(m->status & MCI_STATUS_VAL)) continue; + if (!(m->status & MCI_STATUS_UC)) + continue; if (!final || memcmp(m, final, sizeof(struct mce))) - print_mce(m, &first); + print_mce(m); } if (final) - print_mce(final, &first); + print_mce(final); if (cpu_missing) printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); print_mce_tail(); -- cgit v1.2.3 From 62fdac5913f71f8f200bd2c9bd59a02e9a1498e9 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 11 Jun 2009 16:06:07 +0900 Subject: x86, mce: Add boot options for corrected errors This patch introduces three boot options (no_cmci, dont_log_ce and ignore_ce) to control handling for corrected errors. The "mce=no_cmci" boot option disables the CMCI feature. Since CMCI is a new feature so having boot controls to disable it will be a help if the hardware is misbehaving. The "mce=dont_log_ce" boot option disables logging for corrected errors. All reported corrected errors will be cleared silently. This option will be useful if you never care about corrected errors. The "mce=ignore_ce" boot option disables features for corrected errors, i.e. polling timer and cmci. All corrected events are not cleared and kept in bank MSRs. Usually this disablement is not recommended, however it will be a help if there are some conflict with the BIOS or hardware monitoring applications etc., that clears corrected events in banks instead of OS. [ And trivial cleanup (space -> tab) for doc is included. ] Signed-off-by: Hidetoshi Seto Reviewed-by: Andi Kleen LKML-Reference: <4A30ACDF.5030408@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 19 +++++++++++++++++-- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 3 +++ 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 6a3127ecb5c..fabba15e455 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -84,6 +84,9 @@ static int rip_msr; static int mce_bootlog = -1; static int monarch_timeout = -1; static int mce_panic_timeout; +static int mce_dont_log_ce; +int mce_cmci_disabled; +int mce_ignore_ce; int mce_ser; static char trigger[128]; @@ -526,7 +529,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (!(flags & MCP_DONTLOG)) { + if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { mce_log(&m); add_taint(TAINT_MACHINE_CHECK); } @@ -1307,6 +1310,9 @@ static void mce_init_timer(void) struct timer_list *t = &__get_cpu_var(mce_timer); int *n = &__get_cpu_var(next_interval); + if (mce_ignore_ce) + return; + *n = check_interval * HZ; if (!*n) return; @@ -1517,7 +1523,10 @@ static struct miscdevice mce_log_device = { }; /* - * mce=off disables machine check + * mce=off Disables machine check + * mce=no_cmci Disables CMCI + * mce=dont_log_ce Clears corrected events silently, no log created for CEs. + * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) * monarchtimeout is how long to wait for other CPUs on machine * check, or 0 to not wait @@ -1532,6 +1541,12 @@ static int __init mcheck_enable(char *str) str++; if (!strcmp(str, "off")) mce_disabled = 1; + else if (!strcmp(str, "no_cmci")) + mce_cmci_disabled = 1; + else if (!strcmp(str, "dont_log_ce")) + mce_dont_log_ce = 1; + else if (!strcmp(str, "ignore_ce")) + mce_ignore_ce = 1; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) mce_bootlog = (str[0] == 'b'); else if (isdigit(str[0])) { diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index b7c5a2470b4..046087e9808 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -57,6 +57,9 @@ static int cmci_supported(int *banks) { u64 cap; + if (mce_cmci_disabled || mce_ignore_ce) + return 0; + /* * Vendor check is not strictly needed, but the initial * initialization is vendor keyed and this -- cgit v1.2.3 From f4dbfa8f3131a84257223393905f7efad0ca5996 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 14:06:28 +0200 Subject: perf_counter: Standardize event names Pure renames only, to PERF_COUNT_HW_* and PERF_COUNT_SW_*. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 57ae1bec81b..572fb434a66 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -69,13 +69,13 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { */ static const u64 intel_perfmon_event_map[] = { - [PERF_COUNT_CPU_CYCLES] = 0x003c, - [PERF_COUNT_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_CACHE_MISSES] = 0x412e, - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_BUS_CYCLES] = 0x013c, + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; static u64 intel_pmu_event_map(int event) @@ -485,12 +485,12 @@ static const u64 amd_0f_hw_cache_event_ids */ static const u64 amd_perfmon_event_map[] = { - [PERF_COUNT_CPU_CYCLES] = 0x0076, - [PERF_COUNT_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_CACHE_MISSES] = 0x0081, - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, }; static u64 amd_pmu_event_map(int event) @@ -970,11 +970,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; -- cgit v1.2.3 From 8be6e8f3c3a13900169f1141870562d0c723b010 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 14:19:11 +0200 Subject: perf_counter: Rename L2 to LL cache The top (fastest) and last level (biggest) caches are the most interesting ones, performance wise. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: [ Fixed the Nehalem LL table to LLC Reference/Miss events ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 572fb434a66..895c82e7845 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -131,7 +131,7 @@ static const u64 nehalem_hw_cache_event_ids [ C(RESULT_MISS) ] = 0x0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ @@ -141,8 +141,8 @@ static const u64 nehalem_hw_cache_event_ids [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES */ - [ C(RESULT_MISS) ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS */ + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ }, }, [ C(DTLB) ] = { @@ -222,7 +222,7 @@ static const u64 core2_hw_cache_event_ids [ C(RESULT_MISS) ] = 0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ @@ -313,7 +313,7 @@ static const u64 atom_hw_cache_event_ids [ C(RESULT_MISS) ] = 0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ @@ -422,7 +422,7 @@ static const u64 amd_0f_hw_cache_event_ids [ C(RESULT_MISS) ] = 0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0, [ C(RESULT_MISS) ] = 0, -- cgit v1.2.3 From 38c7fed2f5ffee17e1fa3e0f78b0e1bf43d52d13 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 25 May 2009 15:10:58 +0300 Subject: x86: remove some alloc_bootmem_cpumask_var calling Now that we set up the slab allocator earlier, we can get rid of some alloc_bootmem_cpumask_var() calls in boot code. Cc: Ingo Molnar Cc: Johannes Weiner Cc: Linus Torvalds Signed-off-by: Yinghai Lu Signed-off-by: Pekka Enberg --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1946fac42ab..139201a562b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -185,8 +185,8 @@ int __init arch_early_irq_init(void) for (i = 0; i < count; i++) { desc = irq_to_desc(i); desc->chip_data = &cfg[i]; - alloc_bootmem_cpumask_var(&cfg[i].domain); - alloc_bootmem_cpumask_var(&cfg[i].old_domain); + alloc_cpumask_var(&cfg[i].domain, GFP_NOWAIT); + alloc_cpumask_var(&cfg[i].old_domain, GFP_NOWAIT); if (i < NR_IRQS_LEGACY) cpumask_setall(cfg[i].domain); } -- cgit v1.2.3 From dad213aeb59718623fc59defeff95fe8c3feb8a0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 28 May 2009 18:14:40 -0700 Subject: irq/cpumask: make memoryless node zero happy Don't hardcode to node zero for early boot IRQ setup memory allocations. [ penberg@cs.helsinki.fi: minor cleanups ] Cc: Ingo Molnar Cc: Johannes Weiner Cc: Linus Torvalds Signed-off-by: Yinghai Lu Signed-off-by: Pekka Enberg --- arch/x86/kernel/apic/io_apic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 139201a562b..94605e7f6a5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -177,16 +177,18 @@ int __init arch_early_irq_init(void) struct irq_cfg *cfg; struct irq_desc *desc; int count; + int node; int i; cfg = irq_cfgx; count = ARRAY_SIZE(irq_cfgx); + node= cpu_to_node(boot_cpu_id); for (i = 0; i < count; i++) { desc = irq_to_desc(i); desc->chip_data = &cfg[i]; - alloc_cpumask_var(&cfg[i].domain, GFP_NOWAIT); - alloc_cpumask_var(&cfg[i].old_domain, GFP_NOWAIT); + alloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); + alloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); if (i < NR_IRQS_LEGACY) cpumask_setall(cfg[i].domain); } -- cgit v1.2.3 From 8c5dd8f43367f4f266dd616f11658005bc2d20ef Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Jun 2009 19:14:22 -0700 Subject: x86: handle initrd that extends into unusable memory On a system where system memory (according e820) is not covered by mtrr, mtrr_trim_memory converts a portion of memory to reserved, but bootloader has already put the initrd in that range. Thus, we need to have 64bit to use relocate_initrd too. [ Impact: fix using initrd when mtrr_trim_memory happen ] Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin Cc: stable@kernel.org --- arch/x86/kernel/setup.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d1c636bf31a..be5ae80f897 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -301,15 +301,13 @@ static void __init reserve_brk(void) #ifdef CONFIG_BLK_DEV_INITRD -#ifdef CONFIG_X86_32 - #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; @@ -365,14 +363,13 @@ static void __init relocate_initrd(void) ramdisk_image, ramdisk_image + ramdisk_size - 1, ramdisk_here, ramdisk_here + ramdisk_size - 1); } -#endif static void __init reserve_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 ramdisk_end = ramdisk_image + ramdisk_size; - u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -402,14 +399,8 @@ static void __init reserve_initrd(void) return; } -#ifdef CONFIG_X86_32 relocate_initrd(); -#else - printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08llx > 0x%08llx)\ndisabling initrd\n", - ramdisk_end, end_of_lowmem); - initrd_start = 0; -#endif + free_early(ramdisk_image, ramdisk_end); } #else -- cgit v1.2.3 From 12274e96b42884f043dfaa87eb1e5e10281bfac3 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 11 Jun 2009 15:07:48 -0700 Subject: x86: use zalloc_cpumask_var in arch_early_irq_init So we make sure MAXSMP gets a cleared cpumask Signed-off-by: Yinghai Lu Signed-off-by: Linus Torvalds --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 94605e7f6a5..ef8d9290c7e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -187,8 +187,8 @@ int __init arch_early_irq_init(void) for (i = 0; i < count; i++) { desc = irq_to_desc(i); desc->chip_data = &cfg[i]; - alloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); - alloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); + zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); + zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); if (i < NR_IRQS_LEGACY) cpumask_setall(cfg[i].domain); } -- cgit v1.2.3 From 1260866a271abc84274345580f3d613d3bceff87 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 11 May 2009 13:22:00 +0100 Subject: x86: Provide _sdata in the vmlinux.lds.S file _sdata is a common symbol defined by many architectures and made available to the kernel via asm-generic/sections.h. Kmemleak uses this symbol when scanning the data sections. [ Impact: add new global symbol ] Signed-off-by: Catalin Marinas LKML-Reference: <20090511122105.26556.96593.stgit@pc1117.cambridge.arm.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4c85b2e2bb6..367e8788204 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -108,6 +108,8 @@ SECTIONS /* Data */ . = ALIGN(PAGE_SIZE); .data : AT(ADDR(.data) - LOAD_OFFSET) { + /* Start of data section */ + _sdata = .; DATA_DATA CONSTRUCTORS -- cgit v1.2.3 From 55cd63676e0c5710fbe1ea86dfd9f8ea9aaa90f2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 12 Jun 2009 11:36:52 +0300 Subject: x86: make zap_low_mapping could be used early Only one cpu is there, just call __flush_tlb for it. Fixes the following boot warning on x86: [ 0.000000] Memory: 885032k/915540k available (5993k kernel code, 29844k reserved, 3842k data, 428k init, 0k highmem) [ 0.000000] virtual kernel memory layout: [ 0.000000] fixmap : 0xffe17000 - 0xfffff000 (1952 kB) [ 0.000000] vmalloc : 0xf8615000 - 0xffe15000 ( 120 MB) [ 0.000000] lowmem : 0xc0000000 - 0xf7e15000 ( 894 MB) [ 0.000000] .init : 0xc19a5000 - 0xc1a10000 ( 428 kB) [ 0.000000] .data : 0xc15da4bb - 0xc199af6c (3842 kB) [ 0.000000] .text : 0xc1000000 - 0xc15da4bb (5993 kB) [ 0.000000] Checking if this processor honours the WP bit even in supervisor mode...Ok. [ 0.000000] ------------[ cut here ]------------ [ 0.000000] WARNING: at kernel/smp.c:369 smp_call_function_many+0x50/0x1b0() [ 0.000000] Hardware name: System Product Name [ 0.000000] Modules linked in: [ 0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-tip #52504 [ 0.000000] Call Trace: [ 0.000000] [] warn_slowpath_common+0x65/0x95 [ 0.000000] [] warn_slowpath_null+0x12/0x15 [ 0.000000] [] smp_call_function_many+0x50/0x1b0 [ 0.000000] [] ? do_flush_tlb_all+0x0/0x41 [ 0.000000] [] ? do_flush_tlb_all+0x0/0x41 [ 0.000000] [] smp_call_function+0x31/0x58 [ 0.000000] [] ? do_flush_tlb_all+0x0/0x41 [ 0.000000] [] on_each_cpu+0x26/0x65 [ 0.000000] [] flush_tlb_all+0x19/0x1b [ 0.000000] [] zap_low_mappings+0x4d/0x56 [ 0.000000] [] ? printk+0x14/0x17 [ 0.000000] [] mem_init+0x23d/0x245 [ 0.000000] [] start_kernel+0x17a/0x2d5 [ 0.000000] [] ? unknown_bootoption+0x0/0x19a [ 0.000000] [] __init_begin+0x39/0x41 [ 0.000000] ---[ end trace 4eaa2a86a8e2da22 ]--- Reported-by: Ingo Molnar Signed-off-by: Yinghai Lu Signed-off-by: Pekka Enberg --- arch/x86/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7c80007ea5f..2fecda69ee6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -873,7 +873,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) err = do_boot_cpu(apicid, cpu); - zap_low_mappings(); + zap_low_mappings(false); low_mappings = 0; #else err = do_boot_cpu(apicid, cpu); -- cgit v1.2.3 From dff5da6d09daaab40a8741dce0ed3c2e94079de2 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Fri, 12 Jun 2009 16:08:55 +0800 Subject: perf_counter/x86: Add a quirk for Atom processors The fixed-function performance counters do not work on current Atom processors. Use the general-purpose ones instead. Signed-off-by: Yong Wang Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090612080855.GA2286@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 895c82e7845..275bc142cd5 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -968,6 +968,13 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) if (!x86_pmu.num_counters_fixed) return -1; + /* + * Quirk, IA32_FIXED_CTRs do not work on current Atom processors: + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86_model == 28) + return -1; + event = hwc->config & ARCH_PERFMON_EVENT_MASK; if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) -- cgit v1.2.3 From 2d5bf28fb9e3c178db4f5536e2fe38d3a5ed7f40 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Wed, 3 Jun 2009 21:46:09 -0400 Subject: x86 module: merge the same functions in module_32.c and module_64.c Merge the same functions both in module_32.c and module_64.c into module.c. This is the first step to merge both of them finally. Signed-off-by: WANG Cong Cc: Ingo Molnar Signed-off-by: Rusty Russell --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/module.c | 98 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/module_32.c | 62 ---------------------------- arch/x86/kernel/module_64.c | 61 ---------------------------- 4 files changed, 99 insertions(+), 124 deletions(-) create mode 100644 arch/x86/kernel/module.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4f78bd68212..1cd55c769cd 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -73,7 +73,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_MODULES) += module_$(BITS).o +obj-$(CONFIG_MODULES) += module.o module_$(BITS).o obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c new file mode 100644 index 00000000000..1ea42d08349 --- /dev/null +++ b/arch/x86/kernel/module.c @@ -0,0 +1,98 @@ +/* Kernel module help for x86. + Copyright (C) 2001 Rusty Russell. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt...) +#endif + +/* Free memory returned from module_alloc */ +void module_free(struct module *mod, void *module_region) +{ + vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ +} + +/* We don't need anything special. */ +int module_frob_arch_sections(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + char *secstrings, + struct module *mod) +{ + return 0; +} + +int module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + *para = NULL; + char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".text", secstrings + s->sh_name)) + text = s; + if (!strcmp(".altinstructions", secstrings + s->sh_name)) + alt = s; + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks = s; + if (!strcmp(".parainstructions", secstrings + s->sh_name)) + para = s; + } + + if (alt) { + /* patch .altinstructions */ + void *aseg = (void *)alt->sh_addr; + apply_alternatives(aseg, aseg + alt->sh_size); + } + if (locks && text) { + void *lseg = (void *)locks->sh_addr; + void *tseg = (void *)text->sh_addr; + alternatives_smp_module_add(me, me->name, + lseg, lseg + locks->sh_size, + tseg, tseg + text->sh_size); + } + + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } + + return module_bug_finalize(hdr, sechdrs, me); +} + +void module_arch_cleanup(struct module *mod) +{ + alternatives_smp_module_del(mod); + module_bug_cleanup(mod); +} diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c index 0edd819050e..6145e68feb0 100644 --- a/arch/x86/kernel/module_32.c +++ b/arch/x86/kernel/module_32.c @@ -37,23 +37,6 @@ void *module_alloc(unsigned long size) } -/* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) -{ - vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ -} - -/* We don't need anything special. */ -int module_frob_arch_sections(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - char *secstrings, - struct module *mod) -{ - return 0; -} - int apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, @@ -105,48 +88,3 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, return -ENOEXEC; } -int module_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL; - char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - if (!strcmp(".text", secstrings + s->sh_name)) - text = s; - if (!strcmp(".altinstructions", secstrings + s->sh_name)) - alt = s; - if (!strcmp(".smp_locks", secstrings + s->sh_name)) - locks = s; - if (!strcmp(".parainstructions", secstrings + s->sh_name)) - para = s; - } - - if (alt) { - /* patch .altinstructions */ - void *aseg = (void *)alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size); - } - if (locks && text) { - void *lseg = (void *)locks->sh_addr; - void *tseg = (void *)text->sh_addr; - alternatives_smp_module_add(me, me->name, - lseg, lseg + locks->sh_size, - tseg, tseg + text->sh_size); - } - - if (para) { - void *pseg = (void *)para->sh_addr; - apply_paravirt(pseg, pseg + para->sh_size); - } - - return module_bug_finalize(hdr, sechdrs, me); -} - -void module_arch_cleanup(struct module *mod) -{ - alternatives_smp_module_del(mod); - module_bug_cleanup(mod); -} diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c index c23880b90b5..68ec7d87bb4 100644 --- a/arch/x86/kernel/module_64.c +++ b/arch/x86/kernel/module_64.c @@ -33,13 +33,6 @@ #define DEBUGP(fmt...) #ifndef CONFIG_UML -void module_free(struct module *mod, void *module_region) -{ - vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ -} - void *module_alloc(unsigned long size) { struct vm_struct *area; @@ -58,15 +51,6 @@ void *module_alloc(unsigned long size) } #endif -/* We don't need anything special. */ -int module_frob_arch_sections(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - char *secstrings, - struct module *mod) -{ - return 0; -} - int apply_relocate_add(Elf64_Shdr *sechdrs, const char *strtab, unsigned int symindex, @@ -147,48 +131,3 @@ int apply_relocate(Elf_Shdr *sechdrs, return -ENOSYS; } -int module_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL; - char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - if (!strcmp(".text", secstrings + s->sh_name)) - text = s; - if (!strcmp(".altinstructions", secstrings + s->sh_name)) - alt = s; - if (!strcmp(".smp_locks", secstrings + s->sh_name)) - locks = s; - if (!strcmp(".parainstructions", secstrings + s->sh_name)) - para = s; - } - - if (alt) { - /* patch .altinstructions */ - void *aseg = (void *)alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size); - } - if (locks && text) { - void *lseg = (void *)locks->sh_addr; - void *tseg = (void *)text->sh_addr; - alternatives_smp_module_add(me, me->name, - lseg, lseg + locks->sh_size, - tseg, tseg + text->sh_size); - } - - if (para) { - void *pseg = (void *)para->sh_addr; - apply_paravirt(pseg, pseg + para->sh_size); - } - - return module_bug_finalize(hdr, sechdrs, me); -} - -void module_arch_cleanup(struct module *mod) -{ - alternatives_smp_module_del(mod); - module_bug_cleanup(mod); -} -- cgit v1.2.3 From 0fdc83b950df9e2eb45db6fca9c3d92c66fd5028 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Wed, 3 Jun 2009 21:46:19 -0400 Subject: x86 module: merge the rest functions with macros Merge the rest functions together, with proper preprocessing directives. Finally remove module_{32|64}.c. Signed-off-by: WANG Cong Cc: Ingo Molnar Signed-off-by: Rusty Russell --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/module.c | 160 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/module_32.c | 90 ------------------------- arch/x86/kernel/module_64.c | 133 ------------------------------------ 4 files changed, 161 insertions(+), 224 deletions(-) delete mode 100644 arch/x86/kernel/module_32.c delete mode 100644 arch/x86/kernel/module_64.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 1cd55c769cd..f3477bb8456 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -73,7 +73,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_MODULES) += module.o module_$(BITS).o +obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 1ea42d08349..b92fbcf4860 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -34,6 +34,32 @@ #define DEBUGP(fmt...) #endif +#if defined(CONFIG_UML) || defined(CONFIG_X86_32) +void *module_alloc(unsigned long size) +{ + if (size == 0) + return NULL; + return vmalloc_exec(size); +} +#else /*X86_64*/ +void *module_alloc(unsigned long size) +{ + struct vm_struct *area; + + if (!size) + return NULL; + size = PAGE_ALIGN(size); + if (size > MODULES_LEN) + return NULL; + + area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); + if (!area) + return NULL; + + return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); +} +#endif + /* Free memory returned from module_alloc */ void module_free(struct module *mod, void *module_region) { @@ -51,6 +77,140 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, return 0; } +#ifdef CONFIG_X86_32 +int apply_relocate(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; + Elf32_Sym *sym; + uint32_t *location; + + DEBUGP("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + /* This is where to make the change */ + location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rel[i].r_offset; + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. */ + sym = (Elf32_Sym *)sechdrs[symindex].sh_addr + + ELF32_R_SYM(rel[i].r_info); + + switch (ELF32_R_TYPE(rel[i].r_info)) { + case R_386_32: + /* We add the value into the location given */ + *location += sym->st_value; + break; + case R_386_PC32: + /* Add the value, subtract its postition */ + *location += sym->st_value - (uint32_t)location; + break; + default: + printk(KERN_ERR "module %s: Unknown relocation: %u\n", + me->name, ELF32_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; +} + +int apply_relocate_add(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", + me->name); + return -ENOEXEC; +} +#else /*X86_64*/ +int apply_relocate_add(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; + Elf64_Sym *sym; + void *loc; + u64 val; + + DEBUGP("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + /* This is where to make the change */ + loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rel[i].r_offset; + + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. */ + sym = (Elf64_Sym *)sechdrs[symindex].sh_addr + + ELF64_R_SYM(rel[i].r_info); + + DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), + sym->st_value, rel[i].r_addend, (u64)loc); + + val = sym->st_value + rel[i].r_addend; + + switch (ELF64_R_TYPE(rel[i].r_info)) { + case R_X86_64_NONE: + break; + case R_X86_64_64: + *(u64 *)loc = val; + break; + case R_X86_64_32: + *(u32 *)loc = val; + if (val != *(u32 *)loc) + goto overflow; + break; + case R_X86_64_32S: + *(s32 *)loc = val; + if ((s64)val != *(s32 *)loc) + goto overflow; + break; + case R_X86_64_PC32: + val -= (u64)loc; + *(u32 *)loc = val; +#if 0 + if ((s64)val != *(s32 *)loc) + goto overflow; +#endif + break; + default: + printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n", + me->name, ELF64_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; + +overflow: + printk(KERN_ERR "overflow in relocation type %d val %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), val); + printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", + me->name); + return -ENOEXEC; +} + +int apply_relocate(Elf_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + printk(KERN_ERR "non add relocation not supported\n"); + return -ENOSYS; +} + +#endif + int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c deleted file mode 100644 index 6145e68feb0..00000000000 --- a/arch/x86/kernel/module_32.c +++ /dev/null @@ -1,90 +0,0 @@ -/* Kernel module help for i386. - Copyright (C) 2001 Rusty Russell. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ -#include -#include -#include -#include -#include -#include -#include - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(fmt...) -#endif - -void *module_alloc(unsigned long size) -{ - if (size == 0) - return NULL; - return vmalloc_exec(size); -} - - -int apply_relocate(Elf32_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - unsigned int i; - Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; - Elf32_Sym *sym; - uint32_t *location; - - DEBUGP("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); - for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { - /* This is where to make the change */ - location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr - + rel[i].r_offset; - /* This is the symbol it is referring to. Note that all - undefined symbols have been resolved. */ - sym = (Elf32_Sym *)sechdrs[symindex].sh_addr - + ELF32_R_SYM(rel[i].r_info); - - switch (ELF32_R_TYPE(rel[i].r_info)) { - case R_386_32: - /* We add the value into the location given */ - *location += sym->st_value; - break; - case R_386_PC32: - /* Add the value, subtract its postition */ - *location += sym->st_value - (uint32_t)location; - break; - default: - printk(KERN_ERR "module %s: Unknown relocation: %u\n", - me->name, ELF32_R_TYPE(rel[i].r_info)); - return -ENOEXEC; - } - } - return 0; -} - -int apply_relocate_add(Elf32_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", - me->name); - return -ENOEXEC; -} - diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c deleted file mode 100644 index 68ec7d87bb4..00000000000 --- a/arch/x86/kernel/module_64.c +++ /dev/null @@ -1,133 +0,0 @@ -/* Kernel module help for x86-64 - Copyright (C) 2001 Rusty Russell. - Copyright (C) 2002,2003 Andi Kleen, SuSE Labs. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#define DEBUGP(fmt...) - -#ifndef CONFIG_UML -void *module_alloc(unsigned long size) -{ - struct vm_struct *area; - - if (!size) - return NULL; - size = PAGE_ALIGN(size); - if (size > MODULES_LEN) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); -} -#endif - -int apply_relocate_add(Elf64_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - unsigned int i; - Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; - Elf64_Sym *sym; - void *loc; - u64 val; - - DEBUGP("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); - for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { - /* This is where to make the change */ - loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr - + rel[i].r_offset; - - /* This is the symbol it is referring to. Note that all - undefined symbols have been resolved. */ - sym = (Elf64_Sym *)sechdrs[symindex].sh_addr - + ELF64_R_SYM(rel[i].r_info); - - DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", - (int)ELF64_R_TYPE(rel[i].r_info), - sym->st_value, rel[i].r_addend, (u64)loc); - - val = sym->st_value + rel[i].r_addend; - - switch (ELF64_R_TYPE(rel[i].r_info)) { - case R_X86_64_NONE: - break; - case R_X86_64_64: - *(u64 *)loc = val; - break; - case R_X86_64_32: - *(u32 *)loc = val; - if (val != *(u32 *)loc) - goto overflow; - break; - case R_X86_64_32S: - *(s32 *)loc = val; - if ((s64)val != *(s32 *)loc) - goto overflow; - break; - case R_X86_64_PC32: - val -= (u64)loc; - *(u32 *)loc = val; -#if 0 - if ((s64)val != *(s32 *)loc) - goto overflow; -#endif - break; - default: - printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n", - me->name, ELF64_R_TYPE(rel[i].r_info)); - return -ENOEXEC; - } - } - return 0; - -overflow: - printk(KERN_ERR "overflow in relocation type %d val %Lx\n", - (int)ELF64_R_TYPE(rel[i].r_info), val); - printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", - me->name); - return -ENOEXEC; -} - -int apply_relocate(Elf_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - printk(KERN_ERR "non add relocation not supported\n"); - return -ENOSYS; -} - -- cgit v1.2.3 From c398df30d5caad626ac72bfab0361a7b0f67a661 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Wed, 3 Jun 2009 21:46:46 -0400 Subject: module: merge module_alloc() finally As Christoph Hellwig suggested, module_alloc() actually can be unified for i386 and x86_64 (of course, also UML). Signed-off-by: WANG Cong Cc: Christoph Hellwig Cc: 'Ingo Molnar' Signed-off-by: Rusty Russell --- arch/x86/kernel/module.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index b92fbcf4860..894bb718a6f 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -34,14 +34,6 @@ #define DEBUGP(fmt...) #endif -#if defined(CONFIG_UML) || defined(CONFIG_X86_32) -void *module_alloc(unsigned long size) -{ - if (size == 0) - return NULL; - return vmalloc_exec(size); -} -#else /*X86_64*/ void *module_alloc(unsigned long size) { struct vm_struct *area; @@ -56,9 +48,9 @@ void *module_alloc(unsigned long size) if (!area) return NULL; - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); + return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, + PAGE_KERNEL_EXEC); } -#endif /* Free memory returned from module_alloc */ void module_free(struct module *mod, void *module_region) -- cgit v1.2.3 From 5933048c69edb546f1e93c26dc93816f0be9f754 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 21:47:04 -0600 Subject: module: cleanup FIXME comments about trimming exception table entries. Everyone cut and paste this comment from my original one. We now do it generically, so cut the comments. Signed-off-by: Rusty Russell Cc: Amerigo Wang --- arch/x86/kernel/module.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 894bb718a6f..89f386f044e 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -56,8 +56,6 @@ void *module_alloc(unsigned long size) void module_free(struct module *mod, void *module_region) { vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ } /* We don't need anything special. */ -- cgit v1.2.3 From 61f4bc83fea248a3092beb7ba43daa5629615513 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:03 -0600 Subject: lguest: optimize by coding restore_flags and irq_enable in assembler. The downside of the last patch which made restore_flags and irq_enable check interrupts is that they are now too big to be patched directly into the callsites, so the C versions are always used. But the C versions go via PV_CALLEE_SAVE_REGS_THUNK which saves all the registers. In fact, we don't need any registers in the fast path, so we can do better than this if we actually code them in assembler. The results are in the noise, but since it's about the same amount of code, it's worth applying. 1GB Guest->Host: input(suppressed),output(suppressed) Before: Seconds: 0:16.53 Packets: 377268,753673 Interrupts: 22461,24297 Notifications: 1(5245),21303(732370) Net IRQs triggered: 377023(245),42578(711095) After: Seconds: 0:16.48 Packets: 377289,753673 Interrupts: 22281,24465 Notifications: 1(5245),21296(732377) Net IRQs triggered: 377060(229),42564(711109) Signed-off-by: Rusty Russell --- arch/x86/kernel/asm-offsets_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 1a830cbd701..dfdbf640389 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -126,6 +126,7 @@ void foo(void) #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); + OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); BLANK(); -- cgit v1.2.3 From ce4b3c55475e451cb489e857640396c37ca88974 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 18 Apr 2009 13:44:57 +0200 Subject: PM/ACPI/x86: Fix sparse warning in arch/x86/kernel/acpi/sleep.c One of the numbers in arch/x86/kernel/acpi/sleep.c is long, but it is not annotated appropriately, so sparese warns about it. Fix that. [rjw: added the changelog.] Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/sleep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 7c243a2c511..ca93638ba43 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -104,7 +104,7 @@ int acpi_save_state_mem(void) initial_gs = per_cpu_offset(smp_processor_id()); #endif initial_code = (unsigned long)wakeup_long64; - saved_magic = 0x123456789abcdef0; + saved_magic = 0x123456789abcdef0L; #endif /* CONFIG_64BIT */ return 0; -- cgit v1.2.3 From e39a71ef80877f4e30d808af9acceec80f4d2f7c Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 15 May 2009 00:53:26 +0200 Subject: PM: Rename device_power_down/up() Rename the functions performing "_noirq" dev_pm_ops operations from device_power_down() and device_power_up() to device_suspend_noirq() and device_resume_noirq(). The new function names are chosen to show that the functions are responsible for calling the _noirq() versions to finalize the suspend/resume operation. The current function names do not perform power down/up anymore so the names may be misleading. Global function renames: - device_power_down() -> device_suspend_noirq() - device_power_up() -> device_resume_noirq() Static function renames: - suspend_device_noirq() -> __device_suspend_noirq() - resume_device_noirq() -> __device_resume_noirq() Signed-off-by: Magnus Damm Acked-by: Greg Kroah-Hartman Acked-by: Len Brown Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/apm_32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 49e0939bac4..31ae547da15 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1235,7 +1235,7 @@ static int suspend(int vetoable) device_suspend(PMSG_SUSPEND); - device_power_down(PMSG_SUSPEND); + device_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); @@ -1259,7 +1259,7 @@ static int suspend(int vetoable) sysdev_resume(); local_irq_enable(); - device_power_up(PMSG_RESUME); + device_resume_noirq(PMSG_RESUME); device_resume(PMSG_RESUME); queue_event(APM_NORMAL_RESUME, NULL); @@ -1277,7 +1277,7 @@ static void standby(void) { int err; - device_power_down(PMSG_SUSPEND); + device_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); @@ -1291,7 +1291,7 @@ static void standby(void) sysdev_resume(); local_irq_enable(); - device_power_up(PMSG_RESUME); + device_resume_noirq(PMSG_RESUME); } static apm_event_t get_event(void) -- cgit v1.2.3 From d161630297a20802d01c55847bfcba85d2118a9f Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Sun, 24 May 2009 22:05:42 +0200 Subject: PM core: rename suspend and resume functions This patch (as1241) renames a bunch of functions in the PM core. Rather than go through a boring list of name changes, suffice it to say that in the end we have a bunch of pairs of functions: device_resume_noirq dpm_resume_noirq device_resume dpm_resume device_complete dpm_complete device_suspend_noirq dpm_suspend_noirq device_suspend dpm_suspend device_prepare dpm_prepare in which device_X does the X operation on a single device and dpm_X invokes device_X for all devices in the dpm_list. In addition, the old dpm_power_up and device_resume_noirq have been combined into a single function (dpm_resume_noirq). Lastly, dpm_suspend_start and dpm_resume_end are the renamed versions of the former top-level device_suspend and device_resume routines. Signed-off-by: Alan Stern Acked-by: Magnus Damm Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/apm_32.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 31ae547da15..79302e9a33a 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1233,9 +1233,9 @@ static int suspend(int vetoable) int err; struct apm_user *as; - device_suspend(PMSG_SUSPEND); + dpm_suspend_start(PMSG_SUSPEND); - device_suspend_noirq(PMSG_SUSPEND); + dpm_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); @@ -1259,9 +1259,9 @@ static int suspend(int vetoable) sysdev_resume(); local_irq_enable(); - device_resume_noirq(PMSG_RESUME); + dpm_resume_noirq(PMSG_RESUME); - device_resume(PMSG_RESUME); + dpm_resume_end(PMSG_RESUME); queue_event(APM_NORMAL_RESUME, NULL); spin_lock(&user_list_lock); for (as = user_list; as != NULL; as = as->next) { @@ -1277,7 +1277,7 @@ static void standby(void) { int err; - device_suspend_noirq(PMSG_SUSPEND); + dpm_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); @@ -1291,7 +1291,7 @@ static void standby(void) sysdev_resume(); local_irq_enable(); - device_resume_noirq(PMSG_RESUME); + dpm_resume_noirq(PMSG_RESUME); } static apm_event_t get_event(void) @@ -1376,7 +1376,7 @@ static void check_events(void) ignore_bounce = 1; if ((event != APM_NORMAL_RESUME) || (ignore_normal_resume == 0)) { - device_resume(PMSG_RESUME); + dpm_resume_end(PMSG_RESUME); queue_event(event, NULL); } ignore_normal_resume = 0; -- cgit v1.2.3 From acc6be5405b90c9f0fb0eb8a74ec4d4b7b5bf48f Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 20 May 2008 11:15:43 +0200 Subject: x86: add save_stack_trace_bp() for tracing from a specific stack frame This will help kmemcheck (and possibly other debugging tools) since we can now simply pass regs->bp to the stack tracer instead of specifying the number of stack frames to skip, which is unreliable if gcc decides to inline functions, etc. Note that this makes the API incomplete for other architectures, but I expect that those can be updated lazily, e.g. when they need it. Cc: Arjan van de Ven Signed-off-by: Vegard Nossum --- arch/x86/kernel/stacktrace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 4aaf7e48394..c3eb207181f 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -77,6 +77,13 @@ void save_stack_trace(struct stack_trace *trace) } EXPORT_SYMBOL_GPL(save_stack_trace); +void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) +{ + dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} + void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); -- cgit v1.2.3 From f85612967c93b67b10dd240e3e8bf8a0eee9def7 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 4 Apr 2008 00:53:23 +0200 Subject: x86: add hooks for kmemcheck The hooks that we modify are: - Page fault handler (to handle kmemcheck faults) - Debug exception handler (to hide pages after single-stepping the instruction that caused the page fault) Also redefine memset() to use the optimized version if kmemcheck is enabled. (Thanks to Pekka Enberg for minimizing the impact on the page fault handler.) As kmemcheck doesn't handle MMX/SSE instructions (yet), we also disable the optimized xor code, and rely instead on the generic C implementation in order to avoid false-positive warnings. Signed-off-by: Vegard Nossum [whitespace fixlet] Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar [rebased for mainline inclusion] Signed-off-by: Vegard Nossum --- arch/x86/kernel/cpu/intel.c | 23 +++++++++++++++++++++++ arch/x86/kernel/traps.c | 5 +++++ 2 files changed, 28 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index daed39ba261..3260ab04499 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -86,6 +86,29 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) */ if (c->x86 == 6 && c->x86_model < 15) clear_cpu_cap(c, X86_FEATURE_PAT); + +#ifdef CONFIG_KMEMCHECK + /* + * P4s have a "fast strings" feature which causes single- + * stepping REP instructions to only generate a #DB on + * cache-line boundaries. + * + * Ingo Molnar reported a Pentium D (model 6) and a Xeon + * (model 2) with the same problem. + */ + if (c->x86 == 15) { + u64 misc_enable; + + rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + + if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { + printk(KERN_INFO "kmemcheck: Disabling fast string operations\n"); + + misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; + wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + } + } +#endif } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 07d60c870ce..e7a28e6aa4b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -45,6 +45,7 @@ #include #endif +#include #include #include #include @@ -534,6 +535,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) get_debugreg(condition, 6); + /* Catch kmemcheck conditions first of all! */ + if (condition & DR_STEP && kmemcheck_trap(regs)) + return; + /* * The processor cleared BTF, so don't mark that we need it set. */ -- cgit v1.2.3 From 2dff440525f8faba8836e9f05297b76f23b4af30 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 31 May 2008 15:56:17 +0200 Subject: kmemcheck: add mm functions With kmemcheck enabled, the slab allocator needs to do this: 1. Tell kmemcheck to allocate the shadow memory which stores the status of each byte in the allocation proper, e.g. whether it is initialized or uninitialized. 2. Tell kmemcheck which parts of memory that should be marked uninitialized. There are actually a few more states, such as "not yet allocated" and "recently freed". If a slab cache is set up using the SLAB_NOTRACK flag, it will never return memory that can take page faults because of kmemcheck. If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still request memory with the __GFP_NOTRACK flag. This does not prevent the page faults from occuring, however, but marks the object in question as being initialized so that no warnings will ever be produced for this object. In addition to (and in contrast to) __GFP_NOTRACK, the __GFP_NOTRACK_FALSE_POSITIVE flag indicates that the allocation should not be tracked _because_ it would produce a false positive. Their values are identical, but need not be so in the future (for example, we could now enable/disable false positives with a config option). Parts of this patch were contributed by Pekka Enberg but merged for atomicity. Signed-off-by: Vegard Nossum Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar [rebased for mainline inclusion] Signed-off-by: Vegard Nossum --- arch/x86/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 3bb2be1649b..994dd6a4a2a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -63,7 +63,7 @@ void arch_task_cache_init(void) task_xstate_cachep = kmem_cache_create("task_xstate", xstate_size, __alignof__(union thread_xstate), - SLAB_PANIC, NULL); + SLAB_PANIC | SLAB_NOTRACK, NULL); } /* -- cgit v1.2.3 From d405640539555b601e52f7d18f1f0b1345d18bf5 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 30 Apr 2009 15:23:42 +0200 Subject: Driver Core: misc: add nodename support for misc devices. This adds support for misc devices to report their requested nodename to userspace. It also updates a number of misc drivers to provide the needed subdirectory and device name to be used for them. Signed-off-by: Kay Sievers Signed-off-by: Jan Blunck Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/microcode_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 9c4461501fc..9371448290a 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -236,6 +236,7 @@ static const struct file_operations microcode_fops = { static struct miscdevice microcode_dev = { .minor = MICROCODE_MINOR, .name = "microcode", + .devnode = "cpu/microcode", .fops = µcode_fops, }; -- cgit v1.2.3 From 07e9bb8eebb04c81323cbd3eabf07a3f6f05b204 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 30 Apr 2009 15:23:42 +0200 Subject: Driver Core: x86: add nodename for cpuid and msr drivers. This adds support to the x86 cpuid and msr drivers to report the proper device name to userspace for their devices. Signed-off-by: Kay Sievers Signed-off-by: Jan Blunck Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpuid.c | 6 ++++++ arch/x86/kernel/msr.c | 6 ++++++ 2 files changed, 12 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 2ac1f0c2beb..b07af886124 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -182,6 +182,11 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier = .notifier_call = cpuid_class_cpu_callback, }; +static char *cpuid_nodename(struct device *dev) +{ + return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); +} + static int __init cpuid_init(void) { int i, err = 0; @@ -198,6 +203,7 @@ static int __init cpuid_init(void) err = PTR_ERR(cpuid_class); goto out_chrdev; } + cpuid_class->nodename = cpuid_nodename; for_each_online_cpu(i) { err = cpuid_device_create(i); if (err != 0) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 3cf3413ec62..98fd6cd4e3a 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -196,6 +196,11 @@ static struct notifier_block __refdata msr_class_cpu_notifier = { .notifier_call = msr_class_cpu_callback, }; +static char *msr_nodename(struct device *dev) +{ + return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); +} + static int __init msr_init(void) { int i, err = 0; @@ -212,6 +217,7 @@ static int __init msr_init(void) err = PTR_ERR(msr_class); goto out_chrdev; } + msr_class->nodename = msr_nodename; for_each_online_cpu(i) { err = msr_device_create(i); if (err != 0) -- cgit v1.2.3 From 08604bd9935dc98fb62ef61d5b7baa7ccc10f8c2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 Jun 2009 15:31:12 -0700 Subject: time: move PIT_TICK_RATE to linux/timex.h PIT_TICK_RATE is currently defined in four architectures, but in three different places. While linux/timex.h is not the perfect place for it, it is still a reasonable replacement for those drivers that traditionally use asm/timex.h to get CLOCK_TICK_RATE and expect it to be the PIT frequency. Note that for Alpha, the actual value changed from 1193182UL to 1193180UL. This is unlikely to make a difference, and probably can only improve accuracy. There was a discussion on the correct value of CLOCK_TICK_RATE a few years ago, after which every existing instance was getting changed to 1193182. According to the specification, it should be 1193181.818181... Signed-off-by: Arnd Bergmann Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Len Brown Cc: john stultz Cc: Dmitry Torokhov Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/i8253.c | 1 + arch/x86/kernel/tsc.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index c2e0bb0890d..5cf36c053ac 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3e1c057e98f..ae3180c506a 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From bb1f17b0372de93758653ca3454bc0df18dc2e5c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 16 Jun 2009 15:31:18 -0700 Subject: mm: consolidate init_mm definition * create mm/init-mm.c, move init_mm there * remove INIT_MM, initialize init_mm with C99 initializer * unexport init_mm on all arches: init_mm is already unexported on x86. One strange place is some OMAP driver (drivers/video/omap/) which won't build modular, but it's already wants get_vm_area() export. Somebody should look there. [akpm@linux-foundation.org: add missing #includes] Signed-off-by: Alexey Dobriyan Cc: Mike Frysinger Cc: Americo Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/init_task.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index df3bf269bea..270ff83efc1 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -12,7 +12,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); /* * Initial thread structure. -- cgit v1.2.3 From a9c569539312cfd3c820b38036679a9d72c55331 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:33:44 -0700 Subject: use printk_once() in several places There are some places to be able to use printk_once instead of hard coding. Signed-off-by: Minchan Kim Cc: Dominik Brodowski Cc: David S. Miller Cc: Ingo Molnar Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/common.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3ffdcfa9abd..9fa33886c0d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -487,7 +487,6 @@ out: static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; - static int printed; int i; for (i = 0; i < X86_VENDOR_NUM; i++) { @@ -504,13 +503,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) } } - if (!printed) { - printed++; - printk(KERN_ERR - "CPU: vendor_id '%s' unknown, using generic init.\n", v); - - printk(KERN_ERR "CPU: Your system may be unstable.\n"); - } + printk_once(KERN_ERR + "CPU: vendor_id '%s' unknown, using generic init.\n" \ + "CPU: Your system may be unstable.\n", v); c->x86_vendor = X86_VENDOR_UNKNOWN; this_cpu = &default_cpu; -- cgit v1.2.3