From d662ed26734473d4cb5f3d78cebfec8f9126e97c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jan 2009 17:01:53 +1100 Subject: powerpc/perf_counter: Add perf_counter system call on powerpc ... with an empty/dummy asm/perf_counter.h so it builds. Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/perf_counter.h | 10 ++++++++++ arch/powerpc/include/asm/systbl.h | 1 + arch/powerpc/include/asm/unistd.h | 3 ++- arch/powerpc/platforms/Kconfig.cputype | 1 + 4 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/include/asm/perf_counter.h (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h new file mode 100644 index 00000000000..59530ae1d53 --- /dev/null +++ b/arch/powerpc/include/asm/perf_counter.h @@ -0,0 +1,10 @@ +/* + * Performance counter support - PowerPC-specific definitions. + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index 803def23665..da300c4d288 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -322,3 +322,4 @@ SYSCALL_SPU(epoll_create1) SYSCALL_SPU(dup3) SYSCALL_SPU(pipe2) SYSCALL(inotify_init1) +SYSCALL(perf_counter_open) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index e07d0c76ed7..7cef5afe89d 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -341,10 +341,11 @@ #define __NR_dup3 316 #define __NR_pipe2 317 #define __NR_inotify_init1 318 +#define __NR_perf_counter_open 319 #ifdef __KERNEL__ -#define __NR_syscalls 319 +#define __NR_syscalls 320 #define __NR__exit __NR_exit #define NR_syscalls __NR_syscalls diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 3d0c776f888..94dd1fb9a00 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -1,6 +1,7 @@ config PPC64 bool "64-bit kernel" default n + select HAVE_PERF_COUNTERS help This option selects whether a 32-bit or a 64-bit kernel will be built. -- cgit v1.2.3 From 93a6d3ce6962044fe9badf528fed46b455d58292 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jan 2009 16:52:19 +1100 Subject: powerpc: Provide a way to defer perf counter work until interrupts are enabled Because 64-bit powerpc uses lazy (soft) interrupt disabling, it is possible for a performance monitor exception to come in when the kernel thinks interrupts are disabled (i.e. when they are soft-disabled but hard-enabled). In such a situation the performance monitor exception handler might have some processing to do (such as process wakeups) which can't be done in what is effectively an NMI handler. This provides a way to defer that work until interrupts get enabled, either in raw_local_irq_restore() or by returning from an interrupt handler to code that had interrupts enabled. We have a per-processor flag that indicates that there is work pending to do when interrupts subsequently get re-enabled. This flag is checked in the interrupt return path and in raw_local_irq_restore(), and if it is set, perf_counter_do_pending() is called to do the pending work. 
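To make the flow concrete, here is a minimal user-space model of the deferral
scheme, with invented names; the kernel equivalents are the
paca->perf_counter_pending flag and the set_perf_counter_pending() /
perf_counter_do_pending() helpers added in the diff below:

	/* Minimal model of deferring NMI-unsafe work until interrupts
	 * are re-enabled (illustrative names, not the kernel code). */
	static volatile int perf_pending;	/* per-CPU flag stand-in */

	static void do_pending_work(void)
	{
		/* e.g. wake_up() of tasks waiting on counters */
	}

	/* Effectively an NMI handler: it may run while interrupts are
	 * soft-disabled, so sleepable work must be deferred. */
	static void perf_mon_exception(int irqs_soft_enabled)
	{
		if (irqs_soft_enabled)
			do_pending_work();	/* safe to do it now */
		else
			perf_pending = 1;	/* defer until enable */
	}

	/* Models the check in raw_local_irq_restore() and on the
	 * interrupt-return path. */
	static void irq_restore_model(int enable)
	{
		if (enable && perf_pending) {
			perf_pending = 0;
			do_pending_work();
		}
	}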
Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/hw_irq.h | 31 +++++++++++++++++++++++++++++++ arch/powerpc/include/asm/paca.h | 1 + arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kernel/entry_64.S | 9 +++++++++ arch/powerpc/kernel/irq.c | 10 ++++++++++ 5 files changed, 52 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index f75a5fc64d2..e10f151c3db 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -131,5 +131,36 @@ static inline int irqs_disabled_flags(unsigned long flags) */ struct hw_interrupt_type; +#ifdef CONFIG_PERF_COUNTERS +static inline unsigned long get_perf_counter_pending(void) +{ + unsigned long x; + + asm volatile("lbz %0,%1(13)" + : "=r" (x) + : "i" (offsetof(struct paca_struct, perf_counter_pending))); + return x; +} + +static inline void set_perf_counter_pending(int x) +{ + asm volatile("stb %0,%1(13)" : : + "r" (x), + "i" (offsetof(struct paca_struct, perf_counter_pending))); +} + +extern void perf_counter_do_pending(void); + +#else + +static inline unsigned long get_perf_counter_pending(void) +{ + return 0; +} + +static inline void set_perf_counter_pending(int x) {} +static inline void perf_counter_do_pending(void) {} +#endif /* CONFIG_PERF_COUNTERS */ + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HW_IRQ_H */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 082b3aedf14..6ef05572301 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -99,6 +99,7 @@ struct paca_struct { u8 soft_enabled; /* irq soft-enable flag */ u8 hard_enabled; /* set if irqs are enabled in MSR */ u8 io_sync; /* writel() needs spin_unlock sync */ + u8 perf_counter_pending; /* PM interrupt while soft-disabled */ /* Stuff for accurate time accounting */ u64 user_time; /* accumulated usermode TB ticks */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 661d07d2146..cea46290011 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -127,6 +127,7 @@ int main(void) DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled)); + DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending)); DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache)); DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr)); DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 383ed6eb008..f30b4e553c5 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -526,6 +526,15 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES) 2: TRACE_AND_RESTORE_IRQ(r5); +#ifdef CONFIG_PERF_COUNTERS + /* check paca->perf_counter_pending if we're enabling ints */ + lbz r3,PACAPERFPEND(r13) + and. 
r3,r3,r5 + beq 27f + bl .perf_counter_do_pending +27: +#endif /* CONFIG_PERF_COUNTERS */ + /* extract EE bit and use it to restore paca->hard_enabled */ ld r3,_MSR(r1) rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index ac222d0ab12..4efb886ea43 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -104,6 +104,13 @@ static inline notrace void set_soft_enabled(unsigned long enable) : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); } +#ifdef CONFIG_PERF_COUNTERS +notrace void __weak perf_counter_do_pending(void) +{ + set_perf_counter_pending(0); +} +#endif + notrace void raw_local_irq_restore(unsigned long en) { /* @@ -135,6 +142,9 @@ notrace void raw_local_irq_restore(unsigned long en) iseries_handle_interrupts(); } + if (get_perf_counter_pending()) + perf_counter_do_pending(); + /* * if (get_paca()->hard_enabled) return; * But again we need to take care that gcc gets hard_enabled directly -- cgit v1.2.3 From 4574910e5087085a1f330ff8373cee4503f5c77c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jan 2009 20:21:55 +1100 Subject: powerpc/perf_counter: Add generic support for POWER-family PMU hardware This provides the architecture-specific functions needed to access PMU hardware on the 64-bit PowerPC processors. It has been designed for the IBM POWER family (POWER 4/4+/5/5+/6 and PPC970) but will hopefully also suit other 64-bit PowerPC machines (although probably not Cell given how different it is in this area). This doesn't include back-ends for any specific processors. This implements a system which allows back-ends to express the constraints that their hardware has on what events can be counted simultaneously. The constraints are expressed as a 64-bit mask + 64-bit value for each event, and the encoding is capable of expressing the constraints arising from having a set of multiplexers feeding an event bus, with some events being available through multiple multiplexer settings, such as we get on POWER4 and PPC970. Furthermore, the back-end can supply alternative event codes for each event, and the constraint checking code will try all possible combinations of alternative event codes to try to find a combination that will fit. Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/perf_counter.h | 62 +++ arch/powerpc/kernel/Makefile | 1 + arch/powerpc/kernel/perf_counter.c | 754 ++++++++++++++++++++++++++++++++ 3 files changed, 817 insertions(+) create mode 100644 arch/powerpc/kernel/perf_counter.c (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h index 59530ae1d53..9d7ff6d7fb5 100644 --- a/arch/powerpc/include/asm/perf_counter.h +++ b/arch/powerpc/include/asm/perf_counter.h @@ -8,3 +8,65 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#include + +#define MAX_HWCOUNTERS 8 +#define MAX_EVENT_ALTERNATIVES 8 + +/* + * This struct provides the constants and functions needed to + * describe the PMU on a particular POWER-family CPU. 
+ */ +struct power_pmu { + int n_counter; + int max_alternatives; + u64 add_fields; + u64 test_adder; + int (*compute_mmcr)(unsigned int events[], int n_ev, + unsigned int hwc[], u64 mmcr[]); + int (*get_constraint)(unsigned int event, u64 *mskp, u64 *valp); + int (*get_alternatives)(unsigned int event, unsigned int alt[]); + void (*disable_pmc)(unsigned int pmc, u64 mmcr[]); + int n_generic; + int *generic_events; +}; + +extern struct power_pmu *ppmu; + +/* + * The power_pmu.get_constraint function returns a 64-bit value and + * a 64-bit mask that express the constraints between this event and + * other events. + * + * The value and mask are divided up into (non-overlapping) bitfields + * of three different types: + * + * Select field: this expresses the constraint that some set of bits + * in MMCR* needs to be set to a specific value for this event. For a + * select field, the mask contains 1s in every bit of the field, and + * the value contains a unique value for each possible setting of the + * MMCR* bits. The constraint checking code will ensure that two events + * that set the same field in their masks have the same value in their + * value dwords. + * + * Add field: this expresses the constraint that there can be at most + * N events in a particular class. A field of k bits can be used for + * N <= 2^(k-1) - 1. The mask has the most significant bit of the field + * set (and the other bits 0), and the value has only the least significant + * bit of the field set. In addition, the 'add_fields' and 'test_adder' + * in the struct power_pmu for this processor come into play. The + * add_fields value contains 1 in the LSB of the field, and the + * test_adder contains 2^(k-1) - 1 - N in the field. + * + * NAND field: this expresses the constraint that you may not have events + * in all of a set of classes. (For example, on PPC970, you can't select + * events from the FPU, ISU and IDU simultaneously, although any two are + * possible.) For N classes, the field is N+1 bits wide, and each class + * is assigned one bit from the least-significant N bits. The mask has + * only the most-significant bit set, and the value has only the bit + * for the event's class set. The test_adder has the least significant + * bit set in the field. + * + * If an event is not subject to the constraint expressed by a particular + * field, then it will have 0 in both the mask and value for that field. + */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 1308a86e907..fde190bbb2b 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -94,6 +94,7 @@ obj-$(CONFIG_AUDIT) += audit.o obj64-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c new file mode 100644 index 00000000000..c7d4c2966a5 --- /dev/null +++ b/arch/powerpc/kernel/perf_counter.c @@ -0,0 +1,754 @@ +/* + * Performance counter support - powerpc architecture code + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include +#include +#include +#include +#include +#include +#include + +struct cpu_hw_counters { + int n_counters; + int n_percpu; + int disabled; + int n_added; + struct perf_counter *counter[MAX_HWCOUNTERS]; + unsigned int events[MAX_HWCOUNTERS]; + u64 mmcr[3]; +}; +DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); + +struct power_pmu *ppmu; + +void perf_counter_print_debug(void) +{ +} + +/* + * Return 1 for a software counter, 0 for a hardware counter + */ +static inline int is_software_counter(struct perf_counter *counter) +{ + return !counter->hw_event.raw && counter->hw_event.type < 0; +} + +/* + * Read one performance monitor counter (PMC). + */ +static unsigned long read_pmc(int idx) +{ + unsigned long val; + + switch (idx) { + case 1: + val = mfspr(SPRN_PMC1); + break; + case 2: + val = mfspr(SPRN_PMC2); + break; + case 3: + val = mfspr(SPRN_PMC3); + break; + case 4: + val = mfspr(SPRN_PMC4); + break; + case 5: + val = mfspr(SPRN_PMC5); + break; + case 6: + val = mfspr(SPRN_PMC6); + break; + case 7: + val = mfspr(SPRN_PMC7); + break; + case 8: + val = mfspr(SPRN_PMC8); + break; + default: + printk(KERN_ERR "oops trying to read PMC%d\n", idx); + val = 0; + } + return val; +} + +/* + * Write one PMC. + */ +static void write_pmc(int idx, unsigned long val) +{ + switch (idx) { + case 1: + mtspr(SPRN_PMC1, val); + break; + case 2: + mtspr(SPRN_PMC2, val); + break; + case 3: + mtspr(SPRN_PMC3, val); + break; + case 4: + mtspr(SPRN_PMC4, val); + break; + case 5: + mtspr(SPRN_PMC5, val); + break; + case 6: + mtspr(SPRN_PMC6, val); + break; + case 7: + mtspr(SPRN_PMC7, val); + break; + case 8: + mtspr(SPRN_PMC8, val); + break; + default: + printk(KERN_ERR "oops trying to write PMC%d\n", idx); + } +} + +/* + * Check if a set of events can all go on the PMU at once. + * If they can't, this will look at alternative codes for the events + * and see if any combination of alternative codes is feasible. + * The feasible set is returned in event[]. + */ +static int power_check_constraints(unsigned int event[], int n_ev) +{ + u64 mask, value, nv; + unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; + u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; + u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; + u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS]; + int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS]; + int i, j; + u64 addf = ppmu->add_fields; + u64 tadd = ppmu->test_adder; + + if (n_ev > ppmu->n_counter) + return -1; + + /* First see if the events will go on as-is */ + for (i = 0; i < n_ev; ++i) { + alternatives[i][0] = event[i]; + if (ppmu->get_constraint(event[i], &amasks[i][0], + &avalues[i][0])) + return -1; + choice[i] = 0; + } + value = mask = 0; + for (i = 0; i < n_ev; ++i) { + nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf); + if ((((nv + tadd) ^ value) & mask) != 0 || + (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0) + break; + value = nv; + mask |= amasks[i][0]; + } + if (i == n_ev) + return 0; /* all OK */ + + /* doesn't work, gather alternatives... 
*/ + if (!ppmu->get_alternatives) + return -1; + for (i = 0; i < n_ev; ++i) { + n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]); + for (j = 1; j < n_alt[i]; ++j) + ppmu->get_constraint(alternatives[i][j], + &amasks[i][j], &avalues[i][j]); + } + + /* enumerate all possibilities and see if any will work */ + i = 0; + j = -1; + value = mask = nv = 0; + while (i < n_ev) { + if (j >= 0) { + /* we're backtracking, restore context */ + value = svalues[i]; + mask = smasks[i]; + j = choice[i]; + } + /* + * See if any alternative k for event i, + * where k > j, will satisfy the constraints. + */ + while (++j < n_alt[i]) { + nv = (value | avalues[i][j]) + + (value & avalues[i][j] & addf); + if ((((nv + tadd) ^ value) & mask) == 0 && + (((nv + tadd) ^ avalues[i][j]) + & amasks[i][j]) == 0) + break; + } + if (j >= n_alt[i]) { + /* + * No feasible alternative, backtrack + * to event i-1 and continue enumerating its + * alternatives from where we got up to. + */ + if (--i < 0) + return -1; + } else { + /* + * Found a feasible alternative for event i, + * remember where we got up to with this event, + * go on to the next event, and start with + * the first alternative for it. + */ + choice[i] = j; + svalues[i] = value; + smasks[i] = mask; + value = nv; + mask |= amasks[i][j]; + ++i; + j = -1; + } + } + + /* OK, we have a feasible combination, tell the caller the solution */ + for (i = 0; i < n_ev; ++i) + event[i] = alternatives[i][choice[i]]; + return 0; +} + +static void power_perf_read(struct perf_counter *counter) +{ + long val, delta, prev; + + if (!counter->hw.idx) + return; + /* + * Performance monitor interrupts come even when interrupts + * are soft-disabled, as long as interrupts are hard-enabled. + * Therefore we treat them like NMIs. + */ + do { + prev = atomic64_read(&counter->hw.prev_count); + barrier(); + val = read_pmc(counter->hw.idx); + } while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev); + + /* The counters are only 32 bits wide */ + delta = (val - prev) & 0xfffffffful; + atomic64_add(delta, &counter->count); + atomic64_sub(delta, &counter->hw.period_left); +} + +/* + * Disable all counters to prevent PMU interrupts and to allow + * counters to be added or removed. + */ +u64 hw_perf_save_disable(void) +{ + struct cpu_hw_counters *cpuhw; + unsigned long ret; + unsigned long flags; + + local_irq_save(flags); + cpuhw = &__get_cpu_var(cpu_hw_counters); + + ret = cpuhw->disabled; + if (!ret) { + cpuhw->disabled = 1; + cpuhw->n_added = 0; + + /* + * Set the 'freeze counters' bit. + * The barrier is to make sure the mtspr has been + * executed and the PMU has frozen the counters + * before we return. + */ + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC); + mb(); + } + local_irq_restore(flags); + return ret; +} + +/* + * Re-enable all counters if disable == 0. + * If we were previously disabled and counters were added, then + * put the new config on the PMU. + */ +void hw_perf_restore(u64 disable) +{ + struct perf_counter *counter; + struct cpu_hw_counters *cpuhw; + unsigned long flags; + long i; + unsigned long val; + s64 left; + unsigned int hwc_index[MAX_HWCOUNTERS]; + + if (disable) + return; + local_irq_save(flags); + cpuhw = &__get_cpu_var(cpu_hw_counters); + cpuhw->disabled = 0; + + /* + * If we didn't change anything, or only removed counters, + * no need to recalculate MMCR* settings and reset the PMCs. + * Just reenable the PMU with the current MMCR* settings + * (possibly updated for removal of counters). 
+ */ + if (!cpuhw->n_added) { + mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); + mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); + mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); + goto out; + } + + /* + * Compute MMCR* values for the new set of counters + */ + if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index, + cpuhw->mmcr)) { + /* shouldn't ever get here */ + printk(KERN_ERR "oops compute_mmcr failed\n"); + goto out; + } + + /* + * Write the new configuration to MMCR* with the freeze + * bit set and set the hardware counters to their initial values. + * Then unfreeze the counters. + */ + mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); + mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); + mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)) + | MMCR0_FC); + + /* + * Read off any pre-existing counters that need to move + * to another PMC. + */ + for (i = 0; i < cpuhw->n_counters; ++i) { + counter = cpuhw->counter[i]; + if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) { + power_perf_read(counter); + write_pmc(counter->hw.idx, 0); + counter->hw.idx = 0; + } + } + + /* + * Initialize the PMCs for all the new and moved counters. + */ + for (i = 0; i < cpuhw->n_counters; ++i) { + counter = cpuhw->counter[i]; + if (counter->hw.idx) + continue; + val = 0; + if (counter->hw_event.irq_period) { + left = atomic64_read(&counter->hw.period_left); + if (left < 0x80000000L) + val = 0x80000000L - left; + } + atomic64_set(&counter->hw.prev_count, val); + counter->hw.idx = hwc_index[i] + 1; + write_pmc(counter->hw.idx, val); + } + mb(); + cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; + mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); + + out: + local_irq_restore(flags); +} + +static int collect_events(struct perf_counter *group, int max_count, + struct perf_counter *ctrs[], unsigned int *events) +{ + int n = 0; + struct perf_counter *counter; + + if (!is_software_counter(group)) { + if (n >= max_count) + return -1; + ctrs[n] = group; + events[n++] = group->hw.config; + } + list_for_each_entry(counter, &group->sibling_list, list_entry) { + if (!is_software_counter(counter) && + counter->state != PERF_COUNTER_STATE_OFF) { + if (n >= max_count) + return -1; + ctrs[n] = counter; + events[n++] = counter->hw.config; + } + } + return n; +} + +static void counter_sched_in(struct perf_counter *counter, int cpu) +{ + counter->state = PERF_COUNTER_STATE_ACTIVE; + counter->oncpu = cpu; + if (is_software_counter(counter)) + counter->hw_ops->enable(counter); +} + +/* + * Called to enable a whole group of counters. + * Returns 1 if the group was enabled, or -EAGAIN if it could not be. + * Assumes the caller has disabled interrupts and has + * frozen the PMU with hw_perf_save_disable. 
+ */ +int hw_perf_group_sched_in(struct perf_counter *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, int cpu) +{ + struct cpu_hw_counters *cpuhw; + long i, n, n0; + struct perf_counter *sub; + + cpuhw = &__get_cpu_var(cpu_hw_counters); + n0 = cpuhw->n_counters; + n = collect_events(group_leader, ppmu->n_counter - n0, + &cpuhw->counter[n0], &cpuhw->events[n0]); + if (n < 0) + return -EAGAIN; + if (power_check_constraints(cpuhw->events, n + n0)) + return -EAGAIN; + cpuhw->n_counters = n0 + n; + cpuhw->n_added += n; + + /* + * OK, this group can go on; update counter states etc., + * and enable any software counters + */ + for (i = n0; i < n0 + n; ++i) + cpuhw->counter[i]->hw.config = cpuhw->events[i]; + n = 1; + counter_sched_in(group_leader, cpu); + list_for_each_entry(sub, &group_leader->sibling_list, list_entry) { + if (sub->state != PERF_COUNTER_STATE_OFF) { + counter_sched_in(sub, cpu); + ++n; + } + } + cpuctx->active_oncpu += n; + ctx->nr_active += n; + + return 1; +} + +/* + * Add a counter to the PMU. + * If all counters are not already frozen, then we disable and + * re-enable the PMU in order to get hw_perf_restore to do the + * actual work of reconfiguring the PMU. + */ +static int power_perf_enable(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuhw; + unsigned long flags; + u64 pmudis; + int n0; + int ret = -EAGAIN; + + local_irq_save(flags); + pmudis = hw_perf_save_disable(); + + /* + * Add the counter to the list (if there is room) + * and check whether the total set is still feasible. + */ + cpuhw = &__get_cpu_var(cpu_hw_counters); + n0 = cpuhw->n_counters; + if (n0 >= ppmu->n_counter) + goto out; + cpuhw->counter[n0] = counter; + cpuhw->events[n0] = counter->hw.config; + if (power_check_constraints(cpuhw->events, n0 + 1)) + goto out; + + counter->hw.config = cpuhw->events[n0]; + ++cpuhw->n_counters; + ++cpuhw->n_added; + + ret = 0; + out: + hw_perf_restore(pmudis); + local_irq_restore(flags); + return ret; +} + +/* + * Remove a counter from the PMU. 
+ */ +static void power_perf_disable(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuhw; + long i; + u64 pmudis; + unsigned long flags; + + local_irq_save(flags); + pmudis = hw_perf_save_disable(); + + power_perf_read(counter); + + cpuhw = &__get_cpu_var(cpu_hw_counters); + for (i = 0; i < cpuhw->n_counters; ++i) { + if (counter == cpuhw->counter[i]) { + while (++i < cpuhw->n_counters) + cpuhw->counter[i-1] = cpuhw->counter[i]; + --cpuhw->n_counters; + ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); + write_pmc(counter->hw.idx, 0); + counter->hw.idx = 0; + break; + } + } + if (cpuhw->n_counters == 0) { + /* disable exceptions if no counters are running */ + cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); + } + + hw_perf_restore(pmudis); + local_irq_restore(flags); +} + +struct hw_perf_counter_ops power_perf_ops = { + .enable = power_perf_enable, + .disable = power_perf_disable, + .read = power_perf_read +}; + +const struct hw_perf_counter_ops * +hw_perf_counter_init(struct perf_counter *counter) +{ + unsigned long ev; + struct perf_counter *ctrs[MAX_HWCOUNTERS]; + unsigned int events[MAX_HWCOUNTERS]; + int n; + + if (!ppmu) + return NULL; + if ((s64)counter->hw_event.irq_period < 0) + return NULL; + ev = counter->hw_event.type; + if (!counter->hw_event.raw) { + if (ev >= ppmu->n_generic || + ppmu->generic_events[ev] == 0) + return NULL; + ev = ppmu->generic_events[ev]; + } + counter->hw.config_base = ev; + counter->hw.idx = 0; + + /* + * If this is in a group, check if it can go on with all the + * other hardware counters in the group. We assume the counter + * hasn't been linked into its leader's sibling list at this point. + */ + n = 0; + if (counter->group_leader != counter) { + n = collect_events(counter->group_leader, ppmu->n_counter - 1, + ctrs, events); + if (n < 0) + return NULL; + } + events[n++] = ev; + if (power_check_constraints(events, n)) + return NULL; + + counter->hw.config = events[n - 1]; + atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); + return &power_perf_ops; +} + +/* + * Handle wakeups. + */ +void perf_counter_do_pending(void) +{ + int i; + struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); + struct perf_counter *counter; + + set_perf_counter_pending(0); + for (i = 0; i < cpuhw->n_counters; ++i) { + counter = cpuhw->counter[i]; + if (counter && counter->wakeup_pending) { + counter->wakeup_pending = 0; + wake_up(&counter->waitq); + } + } +} + +/* + * Record data for an irq counter. + * This function was lifted from the x86 code; maybe it should + * go in the core? + */ +static void perf_store_irq_data(struct perf_counter *counter, u64 data) +{ + struct perf_data *irqdata = counter->irqdata; + + if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { + irqdata->overrun++; + } else { + u64 *p = (u64 *) &irqdata->data[irqdata->len]; + + *p = data; + irqdata->len += sizeof(u64); + } +} + +/* + * Record all the values of the counters in a group + */ +static void perf_handle_group(struct perf_counter *counter) +{ + struct perf_counter *leader, *sub; + + leader = counter->group_leader; + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + if (sub != counter) + sub->hw_ops->read(sub); + perf_store_irq_data(counter, sub->hw_event.type); + perf_store_irq_data(counter, atomic64_read(&sub->count)); + } +} + +/* + * A counter has overflowed; update its count and record + * things if requested. Note that interrupts are hard-disabled + * here so there is no possibility of being interrupted. 
+ */ +static void record_and_restart(struct perf_counter *counter, long val, + struct pt_regs *regs) +{ + s64 prev, delta, left; + int record = 0; + + /* we don't have to worry about interrupts here */ + prev = atomic64_read(&counter->hw.prev_count); + delta = (val - prev) & 0xfffffffful; + atomic64_add(delta, &counter->count); + + /* + * See if the total period for this counter has expired, + * and update for the next period. + */ + val = 0; + left = atomic64_read(&counter->hw.period_left) - delta; + if (counter->hw_event.irq_period) { + if (left <= 0) { + left += counter->hw_event.irq_period; + if (left <= 0) + left = counter->hw_event.irq_period; + record = 1; + } + if (left < 0x80000000L) + val = 0x80000000L - left; + } + write_pmc(counter->hw.idx, val); + atomic64_set(&counter->hw.prev_count, val); + atomic64_set(&counter->hw.period_left, left); + + /* + * Finally record data if requested. + */ + if (record) { + switch (counter->hw_event.record_type) { + case PERF_RECORD_SIMPLE: + break; + case PERF_RECORD_IRQ: + perf_store_irq_data(counter, instruction_pointer(regs)); + counter->wakeup_pending = 1; + break; + case PERF_RECORD_GROUP: + perf_handle_group(counter); + counter->wakeup_pending = 1; + break; + } + } +} + +/* + * Performance monitor interrupt stuff + */ +static void perf_counter_interrupt(struct pt_regs *regs) +{ + int i; + struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); + struct perf_counter *counter; + long val; + int need_wakeup = 0, found = 0; + + for (i = 0; i < cpuhw->n_counters; ++i) { + counter = cpuhw->counter[i]; + val = read_pmc(counter->hw.idx); + if ((int)val < 0) { + /* counter has overflowed */ + found = 1; + record_and_restart(counter, val, regs); + if (counter->wakeup_pending) + need_wakeup = 1; + } + } + + /* + * In case we didn't find and reset the counter that caused + * the interrupt, scan all counters and reset any that are + * negative, to avoid getting continual interrupts. + * Any that we processed in the previous loop will not be negative. + */ + if (!found) { + for (i = 0; i < ppmu->n_counter; ++i) { + val = read_pmc(i + 1); + if ((int)val < 0) + write_pmc(i + 1, 0); + } + } + + /* + * Reset MMCR0 to its normal value. This will set PMXE and + * clear FC (freeze counters) and PMAO (perf mon alert occurred) + * and thus allow interrupts to occur again. + * XXX might want to use MSR.PM to keep the counters frozen until + * we get back out of this interrupt. + */ + mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); + + /* + * If we need a wakeup, check whether interrupts were soft-enabled + * when we took the interrupt. If they were, we can wake stuff up + * immediately; otherwise we'll have to set a flag and do the + * wakeup when interrupts get soft-enabled. + */ + if (need_wakeup) { + if (regs->softe) { + irq_enter(); + perf_counter_do_pending(); + irq_exit(); + } else { + set_perf_counter_pending(1); + } + } +} + +static int init_perf_counters(void) +{ + if (reserve_pmc_hardware(perf_counter_interrupt)) { + printk(KERN_ERR "Couldn't init performance monitor subsystem\n"); + return -EBUSY; + } + + return 0; +} + +arch_initcall(init_perf_counters); -- cgit v1.2.3 From 16b067993dee3dfde61b20027e0b168dc06201ee Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 10 Jan 2009 16:34:07 +1100 Subject: powerpc/perf_counter: Add support for PPC970 family This adds the back-end for the PMU on the PPC970 family. The PPC970 allows events from the ISU to be selected in two different ways. 
Rather than use alternative event codes to express this, we instead use a single encoding for ISU events and express the resulting constraint (that you can't select events from all three of FPU/IFU/VPU, ISU and IDU/STS at the same time, since they all come in through only 2 multiplexers) using a NAND constraint field, and work out which multiplexer is used for ISU events at compute_mmcr time. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/perf_counter.c | 13 ++ arch/powerpc/kernel/ppc970-pmu.c | 375 +++++++++++++++++++++++++++++++++++++ 3 files changed, 389 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/kernel/ppc970-pmu.c (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index fde190bbb2b..45798f6fb13 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -94,7 +94,7 @@ obj-$(CONFIG_AUDIT) += audit.o obj64-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ppc970-pmu.o obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index c7d4c2966a5..5561ecb02a4 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -741,13 +741,26 @@ static void perf_counter_interrupt(struct pt_regs *regs) } } +extern struct power_pmu ppc970_pmu; + static int init_perf_counters(void) { + unsigned long pvr; + if (reserve_pmc_hardware(perf_counter_interrupt)) { printk(KERN_ERR "Couldn't init performance monitor subsystem\n"); return -EBUSY; } + /* XXX should get this from cputable */ + pvr = mfspr(SPRN_PVR); + switch (PVR_VER(pvr)) { + case PV_970: + case PV_970FX: + case PV_970MP: + ppmu = &ppc970_pmu; + break; + } return 0; } diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c new file mode 100644 index 00000000000..c3256580be1 --- /dev/null +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -0,0 +1,375 @@ +/* + * Performance counter support for PPC970-family processors. + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include +#include +#include + +/* + * Bits in event code for PPC970 + */ +#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */ +#define PM_PMC_MSK 0xf +#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */ +#define PM_UNIT_MSK 0xf +#define PM_BYTE_SH 4 /* Byte number of event bus to use */ +#define PM_BYTE_MSK 3 +#define PM_PMCSEL_MSK 0xf + +/* Values in PM_UNIT field */ +#define PM_NONE 0 +#define PM_FPU 1 +#define PM_VPU 2 +#define PM_ISU 3 +#define PM_IFU 4 +#define PM_IDU 5 +#define PM_STS 6 +#define PM_LSU0 7 +#define PM_LSU1U 8 +#define PM_LSU1L 9 +#define PM_LASTUNIT 9 + +/* + * Bits in MMCR0 for PPC970 + */ +#define MMCR0_PMC1SEL_SH 8 +#define MMCR0_PMC2SEL_SH 1 +#define MMCR_PMCSEL_MSK 0x1f + +/* + * Bits in MMCR1 for PPC970 + */ +#define MMCR1_TTM0SEL_SH 62 +#define MMCR1_TTM1SEL_SH 59 +#define MMCR1_TTM3SEL_SH 53 +#define MMCR1_TTMSEL_MSK 3 +#define MMCR1_TD_CP_DBG0SEL_SH 50 +#define MMCR1_TD_CP_DBG1SEL_SH 48 +#define MMCR1_TD_CP_DBG2SEL_SH 46 +#define MMCR1_TD_CP_DBG3SEL_SH 44 +#define MMCR1_PMC1_ADDER_SEL_SH 39 +#define MMCR1_PMC2_ADDER_SEL_SH 38 +#define MMCR1_PMC6_ADDER_SEL_SH 37 +#define MMCR1_PMC5_ADDER_SEL_SH 36 +#define MMCR1_PMC8_ADDER_SEL_SH 35 +#define MMCR1_PMC7_ADDER_SEL_SH 34 +#define MMCR1_PMC3_ADDER_SEL_SH 33 +#define MMCR1_PMC4_ADDER_SEL_SH 32 +#define MMCR1_PMC3SEL_SH 27 +#define MMCR1_PMC4SEL_SH 22 +#define MMCR1_PMC5SEL_SH 17 +#define MMCR1_PMC6SEL_SH 12 +#define MMCR1_PMC7SEL_SH 7 +#define MMCR1_PMC8SEL_SH 2 + +static short mmcr1_adder_bits[8] = { + MMCR1_PMC1_ADDER_SEL_SH, + MMCR1_PMC2_ADDER_SEL_SH, + MMCR1_PMC3_ADDER_SEL_SH, + MMCR1_PMC4_ADDER_SEL_SH, + MMCR1_PMC5_ADDER_SEL_SH, + MMCR1_PMC6_ADDER_SEL_SH, + MMCR1_PMC7_ADDER_SEL_SH, + MMCR1_PMC8_ADDER_SEL_SH +}; + +/* + * Bits in MMCRA + */ + +/* + * Layout of constraint bits: + * 6666555555555544444444443333333333222222222211111111110000000000 + * 3210987654321098765432109876543210987654321098765432109876543210 + * <><>[ >[ >[ >< >< >< >< ><><><><><><><><> + * T0T1 UC PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8 + * + * T0 - TTM0 constraint + * 46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000 + * + * T1 - TTM1 constraint + * 44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000 + * + * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS + * 43: UC3 error 0x0800_0000_0000 + * 42: FPU|IFU|VPU events needed 0x0400_0000_0000 + * 41: ISU events needed 0x0200_0000_0000 + * 40: IDU|STS events needed 0x0100_0000_0000 + * + * PS1 + * 39: PS1 error 0x0080_0000_0000 + * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000 + * + * PS2 + * 35: PS2 error 0x0008_0000_0000 + * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000 + * + * B0 + * 28-31: Byte 0 event source 0xf000_0000 + * Encoding as for the event code + * + * B1, B2, B3 + * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources + * + * P1 + * 15: P1 error 0x8000 + * 14-15: Count of events needing PMC1 + * + * P2..P8 + * 0-13: Count of events needing PMC2..PMC8 + */ + +/* Masks and values for using events from the various units */ +static u64 unit_cons[PM_LASTUNIT+1][2] = { + [PM_FPU] = { 0xc80000000000ull, 0x040000000000ull }, + [PM_VPU] = { 0xc80000000000ull, 0xc40000000000ull }, + [PM_ISU] = { 0x080000000000ull, 0x020000000000ull }, + [PM_IFU] = { 0xc80000000000ull, 0x840000000000ull }, + [PM_IDU] = { 0x380000000000ull, 0x010000000000ull }, + [PM_STS] = { 0x380000000000ull, 0x310000000000ull }, +}; + +static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +{ + int pmc, byte, 
unit, sh; + u64 mask = 0, value = 0; + int grp = -1; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc > 8) + return -1; + sh = (pmc - 1) * 2; + mask |= 2 << sh; + value |= 1 << sh; + grp = ((pmc - 1) >> 1) & 1; + } + unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; + if (unit) { + if (unit > PM_LASTUNIT) + return -1; + mask |= unit_cons[unit][0]; + value |= unit_cons[unit][1]; + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + /* + * Bus events on bytes 0 and 2 can be counted + * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8. + */ + if (!pmc) + grp = byte & 1; + /* Set byte lane select field */ + mask |= 0xfULL << (28 - 4 * byte); + value |= (u64)unit << (28 - 4 * byte); + } + if (grp == 0) { + /* increment PMC1/2/5/6 field */ + mask |= 0x8000000000ull; + value |= 0x1000000000ull; + } else if (grp == 1) { + /* increment PMC3/4/7/8 field */ + mask |= 0x800000000ull; + value |= 0x100000000ull; + } + *maskp = mask; + *valp = value; + return 0; +} + +static int p970_get_alternatives(unsigned int event, unsigned int alt[]) +{ + alt[0] = event; + + /* 2 alternatives for LSU empty */ + if (event == 0x2002 || event == 0x3002) { + alt[1] = event ^ 0x1000; + return 2; + } + + return 1; +} + +static int p970_compute_mmcr(unsigned int event[], int n_ev, + unsigned int hwc[], u64 mmcr[]) +{ + u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0; + unsigned int pmc, unit, byte, psel; + unsigned int ttm, grp; + unsigned int pmc_inuse = 0; + unsigned int pmc_grp_use[2]; + unsigned char busbyte[4]; + unsigned char unituse[16]; + unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 }; + unsigned char ttmuse[2]; + unsigned char pmcsel[8]; + int i; + + if (n_ev > 8) + return -1; + + /* First pass to count resource use */ + pmc_grp_use[0] = pmc_grp_use[1] = 0; + memset(busbyte, 0, sizeof(busbyte)); + memset(unituse, 0, sizeof(unituse)); + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc_inuse & (1 << (pmc - 1))) + return -1; + pmc_inuse |= 1 << (pmc - 1); + /* count 1/2/5/6 vs 3/4/7/8 use */ + ++pmc_grp_use[((pmc - 1) >> 1) & 1]; + } + unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; + byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; + if (unit) { + if (unit > PM_LASTUNIT) + return -1; + if (!pmc) + ++pmc_grp_use[byte & 1]; + if (busbyte[byte] && busbyte[byte] != unit) + return -1; + busbyte[byte] = unit; + unituse[unit] = 1; + } + } + if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4) + return -1; + + /* + * Assign resources and set multiplexer selects. + * + * PM_ISU can go either on TTM0 or TTM1, but that's the only + * choice we have to deal with. + */ + if (unituse[PM_ISU] & + (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU])) + unitmap[PM_ISU] = 2 | 4; /* move ISU to TTM1 */ + /* Set TTM[01]SEL fields. */ + ttmuse[0] = ttmuse[1] = 0; + for (i = PM_FPU; i <= PM_STS; ++i) { + if (!unituse[i]) + continue; + ttm = unitmap[i]; + ++ttmuse[(ttm >> 2) & 1]; + mmcr1 |= (u64)(ttm & ~4) << MMCR1_TTM1SEL_SH; + } + /* Check only one unit per TTMx */ + if (ttmuse[0] > 1 || ttmuse[1] > 1) + return -1; + + /* Set byte lane select fields and TTM3SEL. 
*/ + for (byte = 0; byte < 4; ++byte) { + unit = busbyte[byte]; + if (!unit) + continue; + if (unit <= PM_STS) + ttm = (unitmap[unit] >> 2) & 1; + else if (unit == PM_LSU0) + ttm = 2; + else { + ttm = 3; + if (unit == PM_LSU1L && byte >= 2) + mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte); + } + mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte); + } + + /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ + memset(pmcsel, 0x8, sizeof(pmcsel)); /* 8 means don't count */ + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; + byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; + psel = event[i] & PM_PMCSEL_MSK; + if (!pmc) { + /* Bus event or any-PMC direct event */ + if (unit) + psel |= 0x10 | ((byte & 2) << 2); + else + psel |= 8; + for (pmc = 0; pmc < 8; ++pmc) { + if (pmc_inuse & (1 << pmc)) + continue; + grp = (pmc >> 1) & 1; + if (unit) { + if (grp == (byte & 1)) + break; + } else if (pmc_grp_use[grp] < 4) { + ++pmc_grp_use[grp]; + break; + } + } + pmc_inuse |= 1 << pmc; + } else { + /* Direct event */ + --pmc; + if (psel == 0 && (byte & 2)) + /* add events on higher-numbered bus */ + mmcr1 |= 1ull << mmcr1_adder_bits[pmc]; + } + pmcsel[pmc] = psel; + hwc[i] = pmc; + } + for (pmc = 0; pmc < 2; ++pmc) + mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc); + for (; pmc < 8; ++pmc) + mmcr1 |= (u64)pmcsel[pmc] << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)); + if (pmc_inuse & 1) + mmcr0 |= MMCR0_PMC1CE; + if (pmc_inuse & 0xfe) + mmcr0 |= MMCR0_PMCjCE; + + mmcra |= 0x2000; /* mark only one IOP per PPC instruction */ + + /* Return MMCRx values */ + mmcr[0] = mmcr0; + mmcr[1] = mmcr1; + mmcr[2] = mmcra; + return 0; +} + +static void p970_disable_pmc(unsigned int pmc, u64 mmcr[]) +{ + int shift, i; + + if (pmc <= 1) { + shift = MMCR0_PMC1SEL_SH - 7 * pmc; + i = 0; + } else { + shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2); + i = 1; + } + /* + * Setting the PMCxSEL field to 0x08 disables PMC x. + */ + mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift); +} + +static int ppc970_generic_events[] = { + [PERF_COUNT_CPU_CYCLES] = 7, + [PERF_COUNT_INSTRUCTIONS] = 1, + [PERF_COUNT_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */ + [PERF_COUNT_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */ + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */ + [PERF_COUNT_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */ +}; + +struct power_pmu ppc970_pmu = { + .n_counter = 8, + .max_alternatives = 2, + .add_fields = 0x001100005555ull, + .test_adder = 0x013300000000ull, + .compute_mmcr = p970_compute_mmcr, + .get_constraint = p970_get_constraint, + .get_alternatives = p970_get_alternatives, + .disable_pmc = p970_disable_pmc, + .n_generic = ARRAY_SIZE(ppc970_generic_events), + .generic_events = ppc970_generic_events, +}; -- cgit v1.2.3 From f78628374a13bc150db77c6e02d4f2c0a7f932ef Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jan 2009 21:05:35 +1100 Subject: powerpc/perf_counter: Add support for POWER6 This adds the back-end for the PMU on the POWER6 processor. Fortunately, the event selection hardware is somewhat simpler on POWER6 than on other POWER family processors, so the constraints fit into only 32 bits. 
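As a worked example of the encoding (a self-contained sketch in standard C,
not kernel code): POWER6 gives each of PMC1-4 a two-bit counting field, so
add_fields is 0x55 and test_adder is 0, and trying to schedule two events
that both demand PMC1 makes a carry propagate out of the field's low bit and
collide with the mask bit above it:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t addf = 0x55, tadd = 0;	/* POWER6 values */
		/* constraint for "uses PMC1": mask 0b10, value 0b01 */
		uint64_t ev_mask = 2, ev_value = 1;
		uint64_t mask = 0, value = 0;
		int i;

		for (i = 0; i < 2; ++i) {	/* add the event twice */
			uint64_t nv = (value | ev_value) +
				      (value & ev_value & addf);
			if ((((nv + tadd) ^ value) & mask) != 0 ||
			    (((nv + tadd) ^ ev_value) & ev_mask) != 0) {
				printf("event %d: PMC1 already taken\n",
				       i + 1);
				return 0;
			}
			value = nv;
			mask |= ev_mask;
		}
		printf("no conflict\n");
		return 0;
	}

	/* On the second iteration nv = (1|1) + (1&1&0x55) = 2: the carry
	 * into bit 1 makes ((nv ^ value) & mask) non-zero -> conflict. */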
Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/perf_counter.c | 4 + arch/powerpc/kernel/power6-pmu.c | 283 +++++++++++++++++++++++++++++++++++++ 3 files changed, 288 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/kernel/power6-pmu.c (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 45798f6fb13..0ebf4d04d4b 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -94,7 +94,7 @@ obj-$(CONFIG_AUDIT) += audit.o obj64-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ppc970-pmu.o +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ppc970-pmu.o power6-pmu.o obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 5561ecb02a4..df3fe057dee 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -742,6 +742,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) } extern struct power_pmu ppc970_pmu; +extern struct power_pmu power6_pmu; static int init_perf_counters(void) { @@ -760,6 +761,9 @@ static int init_perf_counters(void) case PV_970MP: ppmu = &ppc970_pmu; break; + case 0x3e: + ppmu = &power6_pmu; + break; } return 0; } diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c new file mode 100644 index 00000000000..b1f61f3c97b --- /dev/null +++ b/arch/powerpc/kernel/power6-pmu.c @@ -0,0 +1,283 @@ +/* + * Performance counter support for POWER6 processors. + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include + +/* + * Bits in event code for POWER6 + */ +#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */ +#define PM_PMC_MSK 0x7 +#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH) +#define PM_UNIT_SH 16 /* Unit event comes (TTMxSEL encoding) */ +#define PM_UNIT_MSK 0xf +#define PM_UNIT_MSKS (PM_UNIT_MSK << PM_UNIT_SH) +#define PM_LLAV 0x8000 /* Load lookahead match value */ +#define PM_LLA 0x4000 /* Load lookahead match enable */ +#define PM_BYTE_SH 12 /* Byte of event bus to use */ +#define PM_BYTE_MSK 3 +#define PM_SUBUNIT_SH 8 /* Subunit event comes from (NEST_SEL enc.) 
*/ +#define PM_SUBUNIT_MSK 7 +#define PM_SUBUNIT_MSKS (PM_SUBUNIT_MSK << PM_SUBUNIT_SH) +#define PM_PMCSEL_MSK 0xff /* PMCxSEL value */ +#define PM_BUSEVENT_MSK 0xf3700 + +/* + * Bits in MMCR1 for POWER6 + */ +#define MMCR1_TTM0SEL_SH 60 +#define MMCR1_TTMSEL_SH(n) (MMCR1_TTM0SEL_SH - (n) * 4) +#define MMCR1_TTMSEL_MSK 0xf +#define MMCR1_TTMSEL(m, n) (((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK) +#define MMCR1_NESTSEL_SH 45 +#define MMCR1_NESTSEL_MSK 0x7 +#define MMCR1_NESTSEL(m) (((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK) +#define MMCR1_PMC1_LLA ((u64)1 << 44) +#define MMCR1_PMC1_LLA_VALUE ((u64)1 << 39) +#define MMCR1_PMC1_ADDR_SEL ((u64)1 << 35) +#define MMCR1_PMC1SEL_SH 24 +#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8) +#define MMCR1_PMCSEL_MSK 0xff + +/* + * Assign PMC numbers and compute MMCR1 value for a set of events + */ +static int p6_compute_mmcr(unsigned int event[], int n_ev, + unsigned int hwc[], u64 mmcr[]) +{ + u64 mmcr1 = 0; + int i; + unsigned int pmc, ev, b, u, s, psel; + unsigned int ttmset = 0; + unsigned int pmc_inuse = 0; + + if (n_ev > 4) + return -1; + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc_inuse & (1 << (pmc - 1))) + return -1; /* collision! */ + pmc_inuse |= 1 << (pmc - 1); + } + } + for (i = 0; i < n_ev; ++i) { + ev = event[i]; + pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + --pmc; + } else { + /* can go on any PMC; find a free one */ + for (pmc = 0; pmc < 4; ++pmc) + if (!(pmc_inuse & (1 << pmc))) + break; + pmc_inuse |= 1 << pmc; + } + hwc[i] = pmc; + psel = ev & PM_PMCSEL_MSK; + if (ev & PM_BUSEVENT_MSK) { + /* this event uses the event bus */ + b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK; + u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK; + /* check for conflict on this byte of event bus */ + if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u) + return -1; + mmcr1 |= (u64)u << MMCR1_TTMSEL_SH(b); + ttmset |= 1 << b; + if (u == 5) { + /* Nest events have a further mux */ + s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK; + if ((ttmset & 0x10) && + MMCR1_NESTSEL(mmcr1) != s) + return -1; + ttmset |= 0x10; + mmcr1 |= (u64)s << MMCR1_NESTSEL_SH; + } + if (0x30 <= psel && psel <= 0x3d) { + /* these need the PMCx_ADDR_SEL bits */ + if (b >= 2) + mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc; + } + /* bus select values are different for PMC3/4 */ + if (pmc >= 2 && (psel & 0x90) == 0x80) + psel ^= 0x20; + } + if (ev & PM_LLA) { + mmcr1 |= MMCR1_PMC1_LLA >> pmc; + if (ev & PM_LLAV) + mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc; + } + mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc); + } + mmcr[0] = 0; + if (pmc_inuse & 1) + mmcr[0] = MMCR0_PMC1CE; + if (pmc_inuse & 0xe) + mmcr[0] |= MMCR0_PMCjCE; + mmcr[1] = mmcr1; + mmcr[2] = 0; + return 0; +} + +/* + * Layout of constraint bits: + * + * 0-1 add field: number of uses of PMC1 (max 1) + * 2-3, 4-5, 6-7: ditto for PMC2, 3, 4 + * 8-10 select field: nest (subunit) event selector + * 16-19 select field: unit on byte 0 of event bus + * 20-23, 24-27, 28-31 ditto for bytes 1, 2, 3 + */ +static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +{ + int pmc, byte, sh; + unsigned int mask = 0, value = 0; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc > 4) + return -1; + sh = (pmc - 1) * 2; + mask |= 2 << sh; + value |= 1 << sh; + } + if (event & PM_BUSEVENT_MSK) { + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + sh = byte * 4; + mask |= PM_UNIT_MSKS << sh; + value |= (event & PM_UNIT_MSKS) << sh; + if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) { + 
mask |= PM_SUBUNIT_MSKS; + value |= event & PM_SUBUNIT_MSKS; + } + } + *maskp = mask; + *valp = value; + return 0; +} + +#define MAX_ALT 4 /* at most 4 alternatives for any event */ + +static const unsigned int event_alternatives[][MAX_ALT] = { + { 0x0130e8, 0x2000f6, 0x3000fc }, /* PM_PTEG_RELOAD_VALID */ + { 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */ + { 0x080088, 0x200054, 0x3000f0 }, /* PM_ST_MISS_L1 */ + { 0x10000a, 0x2000f4 }, /* PM_RUN_CYC */ + { 0x10000b, 0x2000f5 }, /* PM_RUN_COUNT */ + { 0x10000e, 0x400010 }, /* PM_PURR */ + { 0x100010, 0x4000f8 }, /* PM_FLUSH */ + { 0x10001a, 0x200010 }, /* PM_MRK_INST_DISP */ + { 0x100026, 0x3000f8 }, /* PM_TB_BIT_TRANS */ + { 0x100054, 0x2000f0 }, /* PM_ST_FIN */ + { 0x100056, 0x2000fc }, /* PM_L1_ICACHE_MISS */ + { 0x1000f0, 0x40000a }, /* PM_INST_IMC_MATCH_CMPL */ + { 0x1000f8, 0x200008 }, /* PM_GCT_EMPTY_CYC */ + { 0x1000fc, 0x400006 }, /* PM_LSU_DERAT_MISS_CYC */ + { 0x20000e, 0x400007 }, /* PM_LSU_DERAT_MISS */ + { 0x200012, 0x300012 }, /* PM_INST_DISP */ + { 0x2000f2, 0x3000f2 }, /* PM_INST_DISP */ + { 0x2000f8, 0x300010 }, /* PM_EXT_INT */ + { 0x2000fe, 0x300056 }, /* PM_DATA_FROM_L2MISS */ + { 0x2d0030, 0x30001a }, /* PM_MRK_FPU_FIN */ + { 0x30000a, 0x400018 }, /* PM_MRK_INST_FIN */ + { 0x3000f6, 0x40000e }, /* PM_L1_DCACHE_RELOAD_VALID */ + { 0x3000fe, 0x400056 }, /* PM_DATA_FROM_L3MISS */ +}; + +/* + * This could be made more efficient with a binary search on + * a presorted list, if necessary + */ +static int find_alternatives_list(unsigned int event) +{ + int i, j; + unsigned int alt; + + for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { + if (event < event_alternatives[i][0]) + return -1; + for (j = 0; j < MAX_ALT; ++j) { + alt = event_alternatives[i][j]; + if (!alt || event < alt) + break; + if (event == alt) + return i; + } + } + return -1; +} + +static int p6_get_alternatives(unsigned int event, unsigned int alt[]) +{ + int i, j; + unsigned int aevent, psel, pmc; + unsigned int nalt = 1; + + alt[0] = event; + + /* check the alternatives table */ + i = find_alternatives_list(event); + if (i >= 0) { + /* copy out alternatives from list */ + for (j = 0; j < MAX_ALT; ++j) { + aevent = event_alternatives[i][j]; + if (!aevent) + break; + if (aevent != event) + alt[nalt++] = aevent; + } + + } else { + /* Check for alternative ways of computing sum events */ + /* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */ + psel = event & (PM_PMCSEL_MSK & ~1); /* ignore edge bit */ + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc && (psel == 0x32 || psel == 0x34)) + alt[nalt++] = ((event ^ 0x6) & ~PM_PMC_MSKS) | + ((5 - pmc) << PM_PMC_SH); + + /* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */ + if (pmc && (psel == 0x38 || psel == 0x3a)) + alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) | + ((pmc > 2? 
pmc - 2: pmc + 2) << PM_PMC_SH); + } + + return nalt; +} + +static void p6_disable_pmc(unsigned int pmc, u64 mmcr[]) +{ + /* Set PMCxSEL to 0 to disable PMCx */ + mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc)); +} + +static int power6_generic_events[] = { + [PERF_COUNT_CPU_CYCLES] = 0x1e, + [PERF_COUNT_INSTRUCTIONS] = 2, + [PERF_COUNT_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */ + [PERF_COUNT_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */ + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */ + [PERF_COUNT_BRANCH_MISSES] = 0x400052, /* BR_MPRED */ +}; + +struct power_pmu power6_pmu = { + .n_counter = 4, + .max_alternatives = MAX_ALT, + .add_fields = 0x55, + .test_adder = 0, + .compute_mmcr = p6_compute_mmcr, + .get_constraint = p6_get_constraint, + .get_alternatives = p6_get_alternatives, + .disable_pmc = p6_disable_pmc, + .n_generic = ARRAY_SIZE(power6_generic_events), + .generic_events = power6_generic_events, +}; -- cgit v1.2.3 From 01d0287f068de2934109ba9b989d8807526cccc2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 14 Jan 2009 13:44:19 +1100 Subject: powerpc/perf_counter: Make sure PMU gets enabled properly This makes sure that we call the platform-specific ppc_md.enable_pmcs function on each CPU before we try to use the PMU on that CPU. If the CPU goes off-line and then on-line, we need to do the enable_pmcs call again, so we use the hw_perf_counter_setup hook to ensure that. It gets called as each CPU comes online, but it isn't called on the CPU that is coming up, so this adds the CPU number as an argument to it (there were no non-empty instances of hw_perf_counter_setup before). This also arranges to set the pmcregs_in_use field of the lppaca (data structure shared with the hypervisor) on each CPU when we are using the PMU and clear it when we are not. This allows the hypervisor to optimize partition switches by not saving/restoring the PMU registers when we aren't using the PMU. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_counter.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index df3fe057dee..85ad25923c2 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -15,6 +15,7 @@ #include #include #include +#include struct cpu_hw_counters { int n_counters; @@ -24,6 +25,7 @@ struct cpu_hw_counters { struct perf_counter *counter[MAX_HWCOUNTERS]; unsigned int events[MAX_HWCOUNTERS]; u64 mmcr[3]; + u8 pmcs_enabled; }; DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); @@ -261,6 +263,15 @@ u64 hw_perf_save_disable(void) cpuhw->disabled = 1; cpuhw->n_added = 0; + /* + * Check if we ever enabled the PMU on this cpu. + */ + if (!cpuhw->pmcs_enabled) { + if (ppc_md.enable_pmcs) + ppc_md.enable_pmcs(); + cpuhw->pmcs_enabled = 1; + } + /* * Set the 'freeze counters' bit. * The barrier is to make sure the mtspr has been @@ -305,6 +316,8 @@ void hw_perf_restore(u64 disable) mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); + if (cpuhw->n_counters == 0) + get_lppaca()->pmcregs_in_use = 0; goto out; } @@ -323,6 +336,7 @@ void hw_perf_restore(u64 disable) * bit set and set the hardware counters to their initial values. * Then unfreeze the counters. 
*/ + get_lppaca()->pmcregs_in_use = 1; mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)) @@ -741,6 +755,14 @@ static void perf_counter_interrupt(struct pt_regs *regs) } } +void hw_perf_counter_setup(int cpu) +{ + struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu); + + memset(cpuhw, 0, sizeof(*cpuhw)); + cpuhw->mmcr[0] = MMCR0_FC; +} + extern struct power_pmu ppc970_pmu; extern struct power_pmu power6_pmu; -- cgit v1.2.3 From 3b6f9e5cb21964b7ce12bf81076f830885563ec8 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 14 Jan 2009 21:00:30 +1100 Subject: perf_counter: Add support for pinned and exclusive counter groups Impact: New perf_counter features A pinned counter group is one that the user wants to have on the CPU whenever possible, i.e. whenever the associated task is running, for a per-task group, or always for a per-cpu group. If the system cannot satisfy that, it puts the group into an error state where it is not scheduled any more and reads from it return EOF (i.e. 0 bytes read). The group can be released from error state and made readable again using prctl(PR_TASK_PERF_COUNTERS_ENABLE). When we have finer-grained enable/disable controls on counters we'll be able to reset the error state on individual groups. An exclusive group is one that the user wants to be the only group using the CPU performance monitor hardware whenever it is on. The counter group scheduler will not schedule an exclusive group if there are already other groups on the CPU and will not schedule other groups onto the CPU if there is an exclusive group scheduled (that statement does not apply to groups containing only software counters, which can always go on and which do not prevent an exclusive group from going on). With an exclusive group, we will be able to let users program PMU registers at a low level without the concern that those settings will perturb other measurements. Along the way this reorganizes things a little: - is_software_counter() is moved to perf_counter.h. - cpuctx->active_oncpu now records the number of hardware counters on the CPU, i.e. it now excludes software counters. Nothing was reading cpuctx->active_oncpu before, so this change is harmless. - A new cpuctx->exclusive field records whether we currently have an exclusive group on the CPU. - counter_sched_out moves higher up in perf_counter.c and gets called from __perf_counter_remove_from_context and __perf_counter_exit_task, where we used to have essentially the same code. - __perf_counter_sched_in now goes through the counter list twice, doing the pinned counters in the first loop and the non-pinned counters in the second loop, in order to give the pinned counters the best chance to be scheduled in. Note that only a group leader can be exclusive or pinned, and that attribute applies to the whole group. This avoids some awkwardness in some corner cases (e.g. where a group leader is closed and the other group members get added to the context list). If we want to relax that restriction later, we can, and it is easier to relax a restriction than to apply a new one. This doesn't yet handle the case where a pinned counter is inherited and goes into error state in the child - the error state is not propagated up to the parent when the child exits, and arguably it should. 
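The following condensed sketch shows the scheduling policy described above;
the names and data structures are invented for illustration (the real logic
is in __perf_counter_sched_in and friends), and the pinned/exclusive/
software_only flags are taken as 0 or 1:

	struct grp {
		int pinned, exclusive, software_only, error;
	};

	static int group_can_go(struct grp *g, int active_hw, int have_excl)
	{
		if (g->software_only)
			return 1;	/* software counters always fit */
		if (have_excl)
			return 0;	/* an exclusive group owns the PMU */
		if (g->exclusive && active_hw)
			return 0;	/* wants the PMU to itself */
		return 1;		/* still subject to PMU constraints */
	}

	static void sched_in_groups(struct grp **g, int n)
	{
		int active_hw = 0, have_excl = 0;
		int pass, i;

		/* pass 0: pinned groups get first pick; pass 1: the rest */
		for (pass = 0; pass < 2; ++pass) {
			for (i = 0; i < n; ++i) {
				if (g[i]->error ||
				    g[i]->pinned != (pass == 0))
					continue;
				if (group_can_go(g[i], active_hw,
						 have_excl)) {
					if (!g[i]->software_only)
						++active_hw;
					have_excl |= g[i]->exclusive;
				} else if (g[i]->pinned) {
					g[i]->error = 1; /* reads give EOF */
				}
			}
		}
	}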
Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_counter.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 85ad25923c2..5b0211348c7 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -35,14 +35,6 @@ void perf_counter_print_debug(void) { } -/* - * Return 1 for a software counter, 0 for a hardware counter - */ -static inline int is_software_counter(struct perf_counter *counter) -{ - return !counter->hw_event.raw && counter->hw_event.type < 0; -} - /* * Read one performance monitor counter (PMC). */ @@ -443,6 +435,7 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, */ for (i = n0; i < n0 + n; ++i) cpuhw->counter[i]->hw.config = cpuhw->events[i]; + cpuctx->active_oncpu += n; n = 1; counter_sched_in(group_leader, cpu); list_for_each_entry(sub, &group_leader->sibling_list, list_entry) { @@ -451,7 +444,6 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, ++n; } } - cpuctx->active_oncpu += n; ctx->nr_active += n; return 1; -- cgit v1.2.3 From 0475f9ea8e2cc030298908949e0d5da9f2fc2cfe Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 11 Feb 2009 14:35:35 +1100 Subject: perf_counters: allow users to count user, kernel and/or hypervisor events Impact: new perf_counter feature This extends the perf_counter_hw_event struct with bits that specify that events in user, kernel and/or hypervisor mode should not be counted (i.e. should be excluded), and adds code to program the PMU mode selection bits accordingly on x86 and powerpc. For software counters, we don't currently have the infrastructure to distinguish which mode an event occurs in, so we currently fail the counter initialization if the setting of the hw_event.exclude_* bits would require us to distinguish. Context switches and CPU migrations are currently considered to occur in kernel mode. On x86, this changes the previous policy that only root can count kernel events. Now non-root users can count kernel events or exclude them. Non-root users still can't use NMI events, though. On x86 we don't appear to have any way to control whether hypervisor events are counted or not, so hw_event.exclude_hv is ignored. On powerpc, the selection of whether to count events in user, kernel and/or hypervisor mode is PMU-wide, not per-counter, so this adds a check that the hw_event.exclude_* settings are the same as other events on the PMU. Counters being added to a group have to have the same settings as the other hardware counters in the group. Counters and groups can only be enabled in hw_perf_group_sched_in or power_perf_enable if they have the same settings as any other counters already on the PMU. If we are not running on a hypervisor, the exclude_hv setting is ignored (by forcing it to 0) since we can't ever get any hypervisor events. 
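As a standalone illustration of how the new bits compose (a toy stand-in; the real perf_counter_hw_event has many more fields):

    #include <stdio.h>

    struct hw_event_bits {
            unsigned int exclude_user   : 1; /* don't count user-mode events */
            unsigned int exclude_kernel : 1; /* don't count kernel-mode events */
            unsigned int exclude_hv     : 1; /* don't count hypervisor events */
    };

    int main(void)
    {
            /* Count user-mode events only */
            struct hw_event_bits ev = { .exclude_kernel = 1, .exclude_hv = 1 };

            printf("count user:%d kernel:%d hv:%d\n",
                   !ev.exclude_user, !ev.exclude_kernel, !ev.exclude_hv);
            return 0;
    }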
Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_counter.c | 68 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 5b0211348c7..bd6ba85beb5 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -16,6 +16,7 @@ #include #include #include +#include struct cpu_hw_counters { int n_counters; @@ -214,6 +215,36 @@ static int power_check_constraints(unsigned int event[], int n_ev) return 0; } +/* + * Check if newly-added counters have consistent settings for + * exclude_{user,kernel,hv} with each other and any previously + * added counters. + */ +static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new) +{ + int eu, ek, eh; + int i, n; + struct perf_counter *counter; + + n = n_prev + n_new; + if (n <= 1) + return 0; + + eu = ctrs[0]->hw_event.exclude_user; + ek = ctrs[0]->hw_event.exclude_kernel; + eh = ctrs[0]->hw_event.exclude_hv; + if (n_prev == 0) + n_prev = 1; + for (i = n_prev; i < n; ++i) { + counter = ctrs[i]; + if (counter->hw_event.exclude_user != eu || + counter->hw_event.exclude_kernel != ek || + counter->hw_event.exclude_hv != eh) + return -EAGAIN; + } + return 0; +} + static void power_perf_read(struct perf_counter *counter) { long val, delta, prev; @@ -323,6 +354,20 @@ void hw_perf_restore(u64 disable) goto out; } + /* + * Add in MMCR0 freeze bits corresponding to the + * hw_event.exclude_* bits for the first counter. + * We have already checked that all counters have the + * same values for these bits as the first counter. + */ + counter = cpuhw->counter[0]; + if (counter->hw_event.exclude_user) + cpuhw->mmcr[0] |= MMCR0_FCP; + if (counter->hw_event.exclude_kernel) + cpuhw->mmcr[0] |= MMCR0_FCS; + if (counter->hw_event.exclude_hv) + cpuhw->mmcr[0] |= MMCR0_FCHV; + /* * Write the new configuration to MMCR* with the freeze * bit set and set the hardware counters to their initial values. @@ -424,6 +469,8 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, &cpuhw->counter[n0], &cpuhw->events[n0]); if (n < 0) return -EAGAIN; + if (check_excludes(cpuhw->counter, n0, n)) + return -EAGAIN; if (power_check_constraints(cpuhw->events, n + n0)) return -EAGAIN; cpuhw->n_counters = n0 + n; @@ -476,6 +523,8 @@ static int power_perf_enable(struct perf_counter *counter) goto out; cpuhw->counter[n0] = counter; cpuhw->events[n0] = counter->hw.config; + if (check_excludes(cpuhw->counter, n0, 1)) + goto out; if (power_check_constraints(cpuhw->events, n0 + 1)) goto out; @@ -554,6 +603,17 @@ hw_perf_counter_init(struct perf_counter *counter) counter->hw.config_base = ev; counter->hw.idx = 0; + /* + * If we are not running on a hypervisor, force the + * exclude_hv bit to 0 so that we don't care what + * the user set it to. This also means that we don't + * set the MMCR0_FCHV bit, which unconditionally freezes + * the counters on the PPC970 variants used in Apple G5 + * machines (since MSR.HV is always 1 on those machines). + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) + counter->hw_event.exclude_hv = 0; + /* * If this is in a group, check if it can go on with all the * other hardware counters in the group. 
We assume the counter @@ -566,11 +626,13 @@ hw_perf_counter_init(struct perf_counter *counter) if (n < 0) return NULL; } - events[n++] = ev; - if (power_check_constraints(events, n)) + events[n] = ev; + if (check_excludes(ctrs, n, 1)) + return NULL; + if (power_check_constraints(events, n + 1)) return NULL; - counter->hw.config = events[n - 1]; + counter->hw.config = events[n]; atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); return &power_perf_ops; } -- cgit v1.2.3 From d095cd46dac104e4d2a4967c7c19b55a12f78240 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 23 Feb 2009 23:01:28 +1100 Subject: perfcounters/powerpc: Make exclude_kernel bit work on Apple G5 processors Currently, setting hw_event.exclude_kernel does nothing on the PPC970 variants used in Apple G5 machines, because they have the HV (hypervisor) bit in the MSR forced to 1, so as far as the PMU is concerned, the kernel runs in hypervisor mode. Thus we have to use the MMCR0_FCHV (freeze counters in hypervisor mode) bit rather than the MMCR0_FCS (freeze counters in supervisor mode) bit. This checks the MSR.HV bit at startup, and if it is set, we set the freeze_counters_kernel variable to MMCR0_FCHV (it was initialized to MMCR0_FCS). We then use that whenever we need to exclude kernel events. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_counter.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index bd6ba85beb5..6e27913ec0d 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -32,6 +32,15 @@ DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); struct power_pmu *ppmu; +/* + * Normally, to ignore kernel events we set the FCS (freeze counters + * in supervisor mode) bit in MMCR0, but if the kernel runs with the + * hypervisor bit set in the MSR, or if we are running on a processor + * where the hypervisor bit is forced to 1 (as on Apple G5 processors), + * then we need to use the FCHV bit to ignore kernel events. + */ +static unsigned int freeze_counters_kernel = MMCR0_FCS; + void perf_counter_print_debug(void) { } @@ -364,7 +373,7 @@ void hw_perf_restore(u64 disable) if (counter->hw_event.exclude_user) cpuhw->mmcr[0] |= MMCR0_FCP; if (counter->hw_event.exclude_kernel) - cpuhw->mmcr[0] |= MMCR0_FCS; + cpuhw->mmcr[0] |= freeze_counters_kernel; if (counter->hw_event.exclude_hv) cpuhw->mmcr[0] |= MMCR0_FCHV; @@ -606,10 +615,7 @@ hw_perf_counter_init(struct perf_counter *counter) /* * If we are not running on a hypervisor, force the * exclude_hv bit to 0 so that we don't care what - * the user set it to. This also means that we don't - * set the MMCR0_FCHV bit, which unconditionally freezes - * the counters on the PPC970 variants used in Apple G5 - * machines (since MSR.HV is always 1 on those machines). + * the user set it to. */ if (!firmware_has_feature(FW_FEATURE_LPAR)) counter->hw_event.exclude_hv = 0; @@ -841,6 +847,13 @@ static int init_perf_counters(void) ppmu = &power6_pmu; break; } + + /* + * Use FCHV to ignore kernel events if MSR.HV is set. + */ + if (mfmsr() & MSR_HV) + freeze_counters_kernel = MMCR0_FCHV; + return 0; } -- cgit v1.2.3 From 742bd95ba96e19b3f7196c3a0834ebc17c8ba006 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 24 Feb 2009 11:33:56 +1100 Subject: perfcounters/powerpc: Add support for POWER5 processors This adds the back-end for the PMU on the POWER5 processor. 
This knows how to use the fixed-function PMC5 and PMC6 (instructions completed and run cycles). Unlike POWER6, PMC5/6 obey the freeze conditions and can generate interrupts, so their use doesn't impose any extra restrictions. POWER5+ is different and is not supported by this patch. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/Makefile | 3 +- arch/powerpc/kernel/perf_counter.c | 4 + arch/powerpc/kernel/power5-pmu.c | 475 +++++++++++++++++++++++++++++++++++++ 3 files changed, 481 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/kernel/power5-pmu.c (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 7c941ec3b23..b4c6f466164 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -94,7 +94,8 @@ obj-$(CONFIG_AUDIT) += audit.o obj64-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ppc970-pmu.o power6-pmu.o +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ppc970-pmu.o power5-pmu.o \ + power6-pmu.o obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 6e27913ec0d..112332d07fc 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -824,6 +824,7 @@ void hw_perf_counter_setup(int cpu) } extern struct power_pmu ppc970_pmu; +extern struct power_pmu power5_pmu; extern struct power_pmu power6_pmu; static int init_perf_counters(void) @@ -843,6 +844,9 @@ static int init_perf_counters(void) case PV_970MP: ppmu = &ppc970_pmu; break; + case PV_POWER5: + ppmu = &power5_pmu; + break; case 0x3e: ppmu = &power6_pmu; break; diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c new file mode 100644 index 00000000000..379ed1087cc --- /dev/null +++ b/arch/powerpc/kernel/power5-pmu.c @@ -0,0 +1,475 @@ +/* + * Performance counter support for POWER5 (not POWER5++) processors. + * + * Copyright 2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include +#include +#include + +/* + * Bits in event code for POWER5 (not POWER5++) + */ +#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */ +#define PM_PMC_MSK 0xf +#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH) +#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */ +#define PM_UNIT_MSK 0xf +#define PM_BYTE_SH 12 /* Byte number of event bus to use */ +#define PM_BYTE_MSK 7 +#define PM_GRS_SH 8 /* Storage subsystem mux select */ +#define PM_GRS_MSK 7 +#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */ +#define PM_PMCSEL_MSK 0x7f + +/* Values in PM_UNIT field */ +#define PM_FPU 0 +#define PM_ISU0 1 +#define PM_IFU 2 +#define PM_ISU1 3 +#define PM_IDU 4 +#define PM_ISU0_ALT 6 +#define PM_GRS 7 +#define PM_LSU0 8 +#define PM_LSU1 0xc +#define PM_LASTUNIT 0xc + +/* + * Bits in MMCR1 for POWER5 + */ +#define MMCR1_TTM0SEL_SH 62 +#define MMCR1_TTM1SEL_SH 60 +#define MMCR1_TTM2SEL_SH 58 +#define MMCR1_TTM3SEL_SH 56 +#define MMCR1_TTMSEL_MSK 3 +#define MMCR1_TD_CP_DBG0SEL_SH 54 +#define MMCR1_TD_CP_DBG1SEL_SH 52 +#define MMCR1_TD_CP_DBG2SEL_SH 50 +#define MMCR1_TD_CP_DBG3SEL_SH 48 +#define MMCR1_GRS_L2SEL_SH 46 +#define MMCR1_GRS_L2SEL_MSK 3 +#define MMCR1_GRS_L3SEL_SH 44 +#define MMCR1_GRS_L3SEL_MSK 3 +#define MMCR1_GRS_MCSEL_SH 41 +#define MMCR1_GRS_MCSEL_MSK 7 +#define MMCR1_GRS_FABSEL_SH 39 +#define MMCR1_GRS_FABSEL_MSK 3 +#define MMCR1_PMC1_ADDER_SEL_SH 35 +#define MMCR1_PMC2_ADDER_SEL_SH 34 +#define MMCR1_PMC3_ADDER_SEL_SH 33 +#define MMCR1_PMC4_ADDER_SEL_SH 32 +#define MMCR1_PMC1SEL_SH 25 +#define MMCR1_PMC2SEL_SH 17 +#define MMCR1_PMC3SEL_SH 9 +#define MMCR1_PMC4SEL_SH 1 +#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8) +#define MMCR1_PMCSEL_MSK 0x7f + +/* + * Bits in MMCRA + */ + +/* + * Layout of constraint bits: + * 6666555555555544444444443333333333222222222211111111110000000000 + * 3210987654321098765432109876543210987654321098765432109876543210 + * <><>[ ><><>< ><> [ >[ >[ >< >< >< >< ><><><><><><> + * T0T1 NC G0G1G2 G3 UC PS1PS2 B0 B1 B2 B3 P6P5P4P3P2P1 + * + * T0 - TTM0 constraint + * 54-55: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0xc0_0000_0000_0000 + * + * T1 - TTM1 constraint + * 52-53: TTM1SEL value (0=IDU, 3=GRS) 0x30_0000_0000_0000 + * + * NC - number of counters + * 51: NC error 0x0008_0000_0000_0000 + * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000 + * + * G0..G3 - GRS mux constraints + * 46-47: GRS_L2SEL value + * 44-45: GRS_L3SEL value + * 41-44: GRS_MCSEL value + * 39-40: GRS_FABSEL value + * Note that these match up with their bit positions in MMCR1 + * + * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS + * 37: UC3 error 0x20_0000_0000 + * 36: FPU|IFU|ISU1 events needed 0x10_0000_0000 + * 35: ISU0 events needed 0x08_0000_0000 + * 34: IDU|GRS events needed 0x04_0000_0000 + * + * PS1 + * 33: PS1 error 0x2_0000_0000 + * 31-32: count of events needing PMC1/2 0x1_8000_0000 + * + * PS2 + * 30: PS2 error 0x4000_0000 + * 28-29: count of events needing PMC3/4 0x3000_0000 + * + * B0 + * 24-27: Byte 0 event source 0x0f00_0000 + * Encoding as for the event code + * + * B1, B2, B3 + * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources + * + * P1..P6 + * 0-11: Count of events needing PMC1..PMC6 + */ + +static const int grsel_shift[8] = { + MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, + MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, + MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH +}; + +/* Masks and values for using events from the various units */ +static u64 
unit_cons[PM_LASTUNIT+1][2] = { + [PM_FPU] = { 0xc0002000000000ull, 0x00001000000000ull }, + [PM_ISU0] = { 0x00002000000000ull, 0x00000800000000ull }, + [PM_ISU1] = { 0xc0002000000000ull, 0xc0001000000000ull }, + [PM_IFU] = { 0xc0002000000000ull, 0x80001000000000ull }, + [PM_IDU] = { 0x30002000000000ull, 0x00000400000000ull }, + [PM_GRS] = { 0x30002000000000ull, 0x30000400000000ull }, +}; + +static int power5_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +{ + int pmc, byte, unit, sh; + int bit, fmask; + u64 mask = 0, value = 0; + int grp = -1; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc > 6) + return -1; + sh = (pmc - 1) * 2; + mask |= 2 << sh; + value |= 1 << sh; + if (pmc <= 4) + grp = (pmc - 1) >> 1; + else if (event != 0x500009 && event != 0x600005) + return -1; + } + if (event & PM_BUSEVENT_MSK) { + unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; + if (unit > PM_LASTUNIT) + return -1; + if (unit == PM_ISU0_ALT) + unit = PM_ISU0; + mask |= unit_cons[unit][0]; + value |= unit_cons[unit][1]; + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + if (byte >= 4) { + if (unit != PM_LSU1) + return -1; + /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */ + ++unit; + byte &= 3; + } + if (unit == PM_GRS) { + bit = event & 7; + fmask = (bit == 6)? 7: 3; + sh = grsel_shift[bit]; + mask |= (u64)fmask << sh; + value |= (u64)((event >> PM_GRS_SH) & fmask) << sh; + } + /* + * Bus events on bytes 0 and 2 can be counted + * on PMC1/2; bytes 1 and 3 on PMC3/4. + */ + if (!pmc) + grp = byte & 1; + /* Set byte lane select field */ + mask |= 0xfULL << (24 - 4 * byte); + value |= (u64)unit << (24 - 4 * byte); + } + if (grp == 0) { + /* increment PMC1/2 field */ + mask |= 0x200000000ull; + value |= 0x080000000ull; + } else if (grp == 1) { + /* increment PMC3/4 field */ + mask |= 0x40000000ull; + value |= 0x10000000ull; + } + if (pmc < 5) { + /* need a counter from PMC1-4 set */ + mask |= 0x8000000000000ull; + value |= 0x1000000000000ull; + } + *maskp = mask; + *valp = value; + return 0; +} + +#define MAX_ALT 3 /* at most 3 alternatives for any event */ + +static const unsigned int event_alternatives[][MAX_ALT] = { + { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */ + { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */ + { 0x100005, 0x600005 }, /* PM_RUN_CYC */ + { 0x100009, 0x200009, 0x500009 }, /* PM_INST_CMPL */ + { 0x300009, 0x400009 }, /* PM_INST_DISP */ +}; + +/* + * Scan the alternatives table for a match and return the + * index into the alternatives table if found, else -1. + */ +static int find_alternative(unsigned int event) +{ + int i, j; + + for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { + if (event < event_alternatives[i][0]) + break; + for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j) + if (event == event_alternatives[i][j]) + return i; + } + return -1; +} + +static const unsigned char bytedecode_alternatives[4][4] = { + /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 }, + /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e }, + /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 }, + /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e } +}; + +/* + * Some direct events for decodes of event bus byte 3 have alternative + * PMCSEL values on other counters. This returns the alternative + * event code for those that do, or -1 otherwise. 
+ */ +static int find_alternative_bdecode(unsigned int event) +{ + int pmc, altpmc, pp, j; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc == 0 || pmc > 4) + return -1; + altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */ + pp = event & PM_PMCSEL_MSK; + for (j = 0; j < 4; ++j) { + if (bytedecode_alternatives[pmc - 1][j] == pp) { + return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) | + (altpmc << PM_PMC_SH) | + bytedecode_alternatives[altpmc - 1][j]; + } + } + return -1; +} + +static int power5_get_alternatives(unsigned int event, unsigned int alt[]) +{ + int i, j, ae, nalt = 1; + + alt[0] = event; + nalt = 1; + i = find_alternative(event); + if (i >= 0) { + for (j = 0; j < MAX_ALT; ++j) { + ae = event_alternatives[i][j]; + if (ae && ae != event) + alt[nalt++] = ae; + } + } else { + ae = find_alternative_bdecode(event); + if (ae > 0) + alt[nalt++] = ae; + } + return nalt; +} + +static int power5_compute_mmcr(unsigned int event[], int n_ev, + unsigned int hwc[], u64 mmcr[]) +{ + u64 mmcr1 = 0; + unsigned int pmc, unit, byte, psel; + unsigned int ttm, grp; + int i, isbus, bit, grsel; + unsigned int pmc_inuse = 0; + unsigned int pmc_grp_use[2]; + unsigned char busbyte[4]; + unsigned char unituse[16]; + int ttmuse; + + if (n_ev > 6) + return -1; + + /* First pass to count resource use */ + pmc_grp_use[0] = pmc_grp_use[1] = 0; + memset(busbyte, 0, sizeof(busbyte)); + memset(unituse, 0, sizeof(unituse)); + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc > 6) + return -1; + if (pmc_inuse & (1 << (pmc - 1))) + return -1; + pmc_inuse |= 1 << (pmc - 1); + /* count 1/2 vs 3/4 use */ + if (pmc <= 4) + ++pmc_grp_use[(pmc - 1) >> 1]; + } + if (event[i] & PM_BUSEVENT_MSK) { + unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; + byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; + if (unit > PM_LASTUNIT) + return -1; + if (unit == PM_ISU0_ALT) + unit = PM_ISU0; + if (byte >= 4) { + if (unit != PM_LSU1) + return -1; + ++unit; + byte &= 3; + } + if (!pmc) + ++pmc_grp_use[byte & 1]; + if (busbyte[byte] && busbyte[byte] != unit) + return -1; + busbyte[byte] = unit; + unituse[unit] = 1; + } + } + if (pmc_grp_use[0] > 2 || pmc_grp_use[1] > 2) + return -1; + + /* + * Assign resources and set multiplexer selects. + * + * PM_ISU0 can go either on TTM0 or TTM1, but that's the only + * choice we have to deal with. + */ + if (unituse[PM_ISU0] & + (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) { + unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */ + unituse[PM_ISU0] = 0; + } + /* Set TTM[01]SEL fields. */ + ttmuse = 0; + for (i = PM_FPU; i <= PM_ISU1; ++i) { + if (!unituse[i]) + continue; + if (ttmuse++) + return -1; + mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH; + } + ttmuse = 0; + for (; i <= PM_GRS; ++i) { + if (!unituse[i]) + continue; + if (ttmuse++) + return -1; + mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH; + } + if (ttmuse > 1) + return -1; + + /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. 
*/ + for (byte = 0; byte < 4; ++byte) { + unit = busbyte[byte]; + if (!unit) + continue; + if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) { + /* get ISU0 through TTM1 rather than TTM0 */ + unit = PM_ISU0_ALT; + } else if (unit == PM_LSU1 + 1) { + /* select lower word of LSU1 for this byte */ + mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte); + } + ttm = unit >> 2; + mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte); + } + + /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; + byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; + psel = event[i] & PM_PMCSEL_MSK; + isbus = event[i] & PM_BUSEVENT_MSK; + if (!pmc) { + /* Bus event or any-PMC direct event */ + for (pmc = 0; pmc < 4; ++pmc) { + if (pmc_inuse & (1 << pmc)) + continue; + grp = (pmc >> 1) & 1; + if (isbus) { + if (grp == (byte & 1)) + break; + } else if (pmc_grp_use[grp] < 2) { + ++pmc_grp_use[grp]; + break; + } + } + pmc_inuse |= 1 << pmc; + } else if (pmc <= 4) { + /* Direct event */ + --pmc; + if ((psel == 8 || psel == 0x10) && isbus && (byte & 2)) + /* add events on higher-numbered bus */ + mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc); + } else { + /* Instructions or run cycles on PMC5/6 */ + --pmc; + } + if (isbus && unit == PM_GRS) { + bit = psel & 7; + grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK; + mmcr1 |= (u64)grsel << grsel_shift[bit]; + } + if (pmc <= 3) + mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc); + hwc[i] = pmc; + } + + /* Return MMCRx values */ + mmcr[0] = 0; + if (pmc_inuse & 1) + mmcr[0] = MMCR0_PMC1CE; + if (pmc_inuse & 0x3e) + mmcr[0] |= MMCR0_PMCjCE; + mmcr[1] = mmcr1; + mmcr[2] = 0; + return 0; +} + +static void power5_disable_pmc(unsigned int pmc, u64 mmcr[]) +{ + if (pmc <= 3) + mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc)); +} + +static int power5_generic_events[] = { + [PERF_COUNT_CPU_CYCLES] = 0xf, + [PERF_COUNT_INSTRUCTIONS] = 0x100009, + [PERF_COUNT_CACHE_REFERENCES] = 0x4c1090, /* LD_REF_L1 */ + [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ + [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ +}; + +struct power_pmu power5_pmu = { + .n_counter = 6, + .max_alternatives = MAX_ALT, + .add_fields = 0x7000090000555ull, + .test_adder = 0x3000490000000ull, + .compute_mmcr = power5_compute_mmcr, + .get_constraint = power5_get_constraint, + .get_alternatives = power5_get_alternatives, + .disable_pmc = power5_disable_pmc, + .n_generic = ARRAY_SIZE(power5_generic_events), + .generic_events = power5_generic_events, +}; -- cgit v1.2.3 From f3dfd2656deb81a0addee4f4ceff66b50a387388 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 26 Feb 2009 22:43:46 +1100 Subject: perfcounters: fix a few minor cleanliness issues This fixes three issues noticed by Arnd Bergmann: - Add #ifdef __KERNEL__ and move some things around in perf_counter.h to make sure only the bits that userspace needs are exported to userspace. - Use __u64, __s64, __u32 types in the structs exported to userspace rather than u64, s64, u32. - Make the sys_perf_counter_open syscall available to the SPUs on Cell platforms. And one issue that I noticed in looking at the code again: - Wrap the perf_counter_open syscall with SYSCALL_DEFINE4 so we get the proper handling of int arguments on ppc64 (and some other 64-bit architectures). 
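The type change matters because u64/s64/u32 exist only inside the kernel; a header exported to userspace must use the fixed-width double-underscore types from <linux/types.h>. A minimal sketch of the idiom (the struct here is made up, not the real ABI):

    #include <stdio.h>
    #include <linux/types.h>        /* __u64, __u32: safe in exported headers */

    struct example_abi {
            __u64 config;
            __u32 flags;
            __u32 __reserved;       /* explicit padding keeps the layout stable */
    };

    int main(void)
    {
            printf("sizeof(struct example_abi) = %zu\n",
                   sizeof(struct example_abi));    /* 16 on any target */
            return 0;
    }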
Reported-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/systbl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index 4c8095f6bec..d312eec8abb 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -322,4 +322,4 @@ SYSCALL_SPU(epoll_create1) SYSCALL_SPU(dup3) SYSCALL_SPU(pipe2) SYSCALL(inotify_init1) -SYSCALL(perf_counter_open) +SYSCALL_SPU(perf_counter_open) -- cgit v1.2.3 From 86028598de16538f02519141756ccf4accfc29a6 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 5 Mar 2009 14:05:57 +1100 Subject: perfcounters/powerpc: fix oops with multiple counters in a group Impact: fix oops-causing bug This fixes a bug in the powerpc hw_perf_counter_init where the code didn't initialize ctrs[n] before passing the ctrs array to check_excludes, leading to possible oopses and other incorrect behaviour. This fixes it by initializing ctrs[n] correctly. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_counter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 112332d07fc..4fec112386f 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -633,6 +633,7 @@ hw_perf_counter_init(struct perf_counter *counter) return NULL; } events[n] = ev; + ctrs[n] = counter; if (check_excludes(ctrs, n, 1)) return NULL; if (power_check_constraints(events, n + 1)) -- cgit v1.2.3 From aabbaa6036fd847c583f585c6bae82b5a033e6c7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 6 Mar 2009 16:27:10 +1100 Subject: perfcounters/powerpc: add support for POWER5+ processors Impact: more hardware support This adds the back-end for the PMU on the POWER5+ processors (i.e. GS, including GS DD3 aka POWER5++). This doesn't use the fixed-function PMC5 and PMC6 since they don't respect the freeze conditions and don't generate interrupts, as on POWER6. 
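For reference, the event codes this back-end takes pack several selector fields into one integer. A hedged decoder using the field positions #defined in the new file (the sample value is BR_ISSUED from its generic-events table):

    #include <stdio.h>

    #define PM_PMC_SH       20      /* positions as in power5+-pmu.c below */
    #define PM_PMC_MSK      0xf
    #define PM_UNIT_SH      16
    #define PM_UNIT_MSK     0xf
    #define PM_BYTE_SH      12
    #define PM_BYTE_MSK     7
    #define PM_PMCSEL_MSK   0x7f

    int main(void)
    {
            unsigned int event = 0x230e4;   /* BR_ISSUED */

            /* prints pmc=0 unit=2 byte=3 psel=0x64 */
            printf("pmc=%u unit=%u byte=%u psel=0x%x\n",
                   (event >> PM_PMC_SH) & PM_PMC_MSK,
                   (event >> PM_UNIT_SH) & PM_UNIT_MSK,
                   (event >> PM_BYTE_SH) & PM_BYTE_MSK,
                   event & PM_PMCSEL_MSK);
            return 0;
    }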
Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/Makefile | 4 +- arch/powerpc/kernel/perf_counter.c | 4 + arch/powerpc/kernel/power5+-pmu.c | 452 +++++++++++++++++++++++++++++++++++++ 3 files changed, 458 insertions(+), 2 deletions(-) create mode 100644 arch/powerpc/kernel/power5+-pmu.c (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index b4c6f466164..49851e0d8fd 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -94,8 +94,8 @@ obj-$(CONFIG_AUDIT) += audit.o obj64-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ppc970-pmu.o power5-pmu.o \ - power6-pmu.o +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ppc970-pmu.o \ + power5-pmu.o power5+-pmu.o power6-pmu.o obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 4fec112386f..162f3981fa2 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -826,6 +826,7 @@ void hw_perf_counter_setup(int cpu) extern struct power_pmu ppc970_pmu; extern struct power_pmu power5_pmu; +extern struct power_pmu power5p_pmu; extern struct power_pmu power6_pmu; static int init_perf_counters(void) @@ -848,6 +849,9 @@ static int init_perf_counters(void) case PV_POWER5: ppmu = &power5_pmu; break; + case PV_POWER5p: + ppmu = &power5p_pmu; + break; case 0x3e: ppmu = &power6_pmu; break; diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c new file mode 100644 index 00000000000..cec21ea65b0 --- /dev/null +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -0,0 +1,452 @@ +/* + * Performance counter support for POWER5+ and POWER5++ (not POWER5) processors. + * + * Copyright 2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version.
+ */ +#include <linux/kernel.h> +#include <linux/perf_counter.h> +#include <asm/reg.h> + +/* + * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3) + */ +#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */ +#define PM_PMC_MSK 0xf +#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH) +#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */ +#define PM_UNIT_MSK 0xf +#define PM_BYTE_SH 12 /* Byte number of event bus to use */ +#define PM_BYTE_MSK 7 +#define PM_GRS_SH 8 /* Storage subsystem mux select */ +#define PM_GRS_MSK 7 +#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */ +#define PM_PMCSEL_MSK 0x7f + +/* Values in PM_UNIT field */ +#define PM_FPU 0 +#define PM_ISU0 1 +#define PM_IFU 2 +#define PM_ISU1 3 +#define PM_IDU 4 +#define PM_ISU0_ALT 6 +#define PM_GRS 7 +#define PM_LSU0 8 +#define PM_LSU1 0xc +#define PM_LASTUNIT 0xc + +/* + * Bits in MMCR1 for POWER5+ + */ +#define MMCR1_TTM0SEL_SH 62 +#define MMCR1_TTM1SEL_SH 60 +#define MMCR1_TTM2SEL_SH 58 +#define MMCR1_TTM3SEL_SH 56 +#define MMCR1_TTMSEL_MSK 3 +#define MMCR1_TD_CP_DBG0SEL_SH 54 +#define MMCR1_TD_CP_DBG1SEL_SH 52 +#define MMCR1_TD_CP_DBG2SEL_SH 50 +#define MMCR1_TD_CP_DBG3SEL_SH 48 +#define MMCR1_GRS_L2SEL_SH 46 +#define MMCR1_GRS_L2SEL_MSK 3 +#define MMCR1_GRS_L3SEL_SH 44 +#define MMCR1_GRS_L3SEL_MSK 3 +#define MMCR1_GRS_MCSEL_SH 41 +#define MMCR1_GRS_MCSEL_MSK 7 +#define MMCR1_GRS_FABSEL_SH 39 +#define MMCR1_GRS_FABSEL_MSK 3 +#define MMCR1_PMC1_ADDER_SEL_SH 35 +#define MMCR1_PMC2_ADDER_SEL_SH 34 +#define MMCR1_PMC3_ADDER_SEL_SH 33 +#define MMCR1_PMC4_ADDER_SEL_SH 32 +#define MMCR1_PMC1SEL_SH 25 +#define MMCR1_PMC2SEL_SH 17 +#define MMCR1_PMC3SEL_SH 9 +#define MMCR1_PMC4SEL_SH 1 +#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8) +#define MMCR1_PMCSEL_MSK 0x7f + +/* + * Bits in MMCRA + */ + +/* + * Layout of constraint bits: + * 6666555555555544444444443333333333222222222211111111110000000000 + * 3210987654321098765432109876543210987654321098765432109876543210 + * [ ><><>< ><> <><>[ > < >< >< >< ><><><><> + * NC G0G1G2 G3 T0T1 UC B0 B1 B2 B3 P4P3P2P1 + * + * NC - number of counters + * 51: NC error 0x0008_0000_0000_0000 + * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000 + * + * G0..G3 - GRS mux constraints + * 46-47: GRS_L2SEL value + * 44-45: GRS_L3SEL value + * 41-43: GRS_MCSEL value + * 39-40: GRS_FABSEL value + * Note that these match up with their bit positions in MMCR1 + * + * T0 - TTM0 constraint + * 36-37: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0x30_0000_0000 + * + * T1 - TTM1 constraint + * 34-35: TTM1SEL value (0=IDU, 3=GRS) 0x0c_0000_0000 + * + * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS + * 33: UC3 error 0x02_0000_0000 + * 32: FPU|IFU|ISU1 events needed 0x01_0000_0000 + * 31: ISU0 events needed 0x00_8000_0000 + * 30: IDU|GRS events needed 0x00_4000_0000 + * + * B0 + * 20-23: Byte 0 event source 0x00f0_0000 + * Encoding as for the event code + * + * B1, B2, B3 + * 16-19, 12-15, 8-11: Byte 1, 2, 3 event sources + * + * P4 + * 7: P4 error 0x80 + * 6-7: Count of events needing PMC4 + * + * P1..P3 + * 0-5: Count of events needing PMC1..PMC3 + */ + +static const int grsel_shift[8] = { + MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, + MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, + MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH +}; + +/* Masks and values for using events from the various units */ +static u64 unit_cons[PM_LASTUNIT+1][2] = { + [PM_FPU] = { 0x3200000000ull, 0x0100000000ull }, + [PM_ISU0] = { 0x0200000000ull, 0x0080000000ull }, +
[PM_ISU1] = { 0x3200000000ull, 0x3100000000ull }, + [PM_IFU] = { 0x3200000000ull, 0x2100000000ull }, + [PM_IDU] = { 0x0e00000000ull, 0x0040000000ull }, + [PM_GRS] = { 0x0e00000000ull, 0x0c40000000ull }, +}; + +static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +{ + int pmc, byte, unit, sh; + int bit, fmask; + u64 mask = 0, value = 0; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc > 4) + return -1; + sh = (pmc - 1) * 2; + mask |= 2 << sh; + value |= 1 << sh; + } + if (event & PM_BUSEVENT_MSK) { + unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; + if (unit > PM_LASTUNIT) + return -1; + if (unit == PM_ISU0_ALT) + unit = PM_ISU0; + mask |= unit_cons[unit][0]; + value |= unit_cons[unit][1]; + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + if (byte >= 4) { + if (unit != PM_LSU1) + return -1; + /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */ + ++unit; + byte &= 3; + } + if (unit == PM_GRS) { + bit = event & 7; + fmask = (bit == 6)? 7: 3; + sh = grsel_shift[bit]; + mask |= (u64)fmask << sh; + value |= (u64)((event >> PM_GRS_SH) & fmask) << sh; + } + /* Set byte lane select field */ + mask |= 0xfULL << (20 - 4 * byte); + value |= (u64)unit << (20 - 4 * byte); + } + mask |= 0x8000000000000ull; + value |= 0x1000000000000ull; + *maskp = mask; + *valp = value; + return 0; +} + +#define MAX_ALT 3 /* at most 3 alternatives for any event */ + +static const unsigned int event_alternatives[][MAX_ALT] = { + { 0x100c0, 0x40001f }, /* PM_GCT_FULL_CYC */ + { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */ + { 0x230e2, 0x323087 }, /* PM_BR_PRED_CR */ + { 0x230e3, 0x223087, 0x3230a0 }, /* PM_BR_PRED_TA */ + { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */ + { 0x800c4, 0xc20e0 }, /* PM_DTLB_MISS */ + { 0xc50c6, 0xc60e0 }, /* PM_MRK_DTLB_MISS */ + { 0x100009, 0x200009 }, /* PM_INST_CMPL */ + { 0x200015, 0x300015 }, /* PM_LSU_LMQ_SRQ_EMPTY_CYC */ + { 0x300009, 0x400009 }, /* PM_INST_DISP */ +}; + +/* + * Scan the alternatives table for a match and return the + * index into the alternatives table if found, else -1. + */ +static int find_alternative(unsigned int event) +{ + int i, j; + + for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { + if (event < event_alternatives[i][0]) + break; + for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j) + if (event == event_alternatives[i][j]) + return i; + } + return -1; +} + +static const unsigned char bytedecode_alternatives[4][4] = { + /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 }, + /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e }, + /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 }, + /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e } +}; + +/* + * Some direct events for decodes of event bus byte 3 have alternative + * PMCSEL values on other counters. This returns the alternative + * event code for those that do, or -1 otherwise. This also handles + * alternative PCMSEL values for add events. 
+ */ +static int find_alternative_bdecode(unsigned int event) +{ + int pmc, altpmc, pp, j; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc == 0 || pmc > 4) + return -1; + altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */ + pp = event & PM_PMCSEL_MSK; + for (j = 0; j < 4; ++j) { + if (bytedecode_alternatives[pmc - 1][j] == pp) { + return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) | + (altpmc << PM_PMC_SH) | + bytedecode_alternatives[altpmc - 1][j]; + } + } + + /* new decode alternatives for power5+ */ + if (pmc == 1 && (pp == 0x0d || pp == 0x0e)) + return event + (2 << PM_PMC_SH) + (0x2e - 0x0d); + if (pmc == 3 && (pp == 0x2e || pp == 0x2f)) + return event - (2 << PM_PMC_SH) - (0x2e - 0x0d); + + /* alternative add event encodings */ + if (pp == 0x10 || pp == 0x28) + return ((event ^ (0x10 ^ 0x28)) & ~PM_PMC_MSKS) | + (altpmc << PM_PMC_SH); + + return -1; +} + +static int power5p_get_alternatives(unsigned int event, unsigned int alt[]) +{ + int i, j, ae, nalt = 1; + + alt[0] = event; + nalt = 1; + i = find_alternative(event); + if (i >= 0) { + for (j = 0; j < MAX_ALT; ++j) { + ae = event_alternatives[i][j]; + if (ae && ae != event) + alt[nalt++] = ae; + } + } else { + ae = find_alternative_bdecode(event); + if (ae > 0) + alt[nalt++] = ae; + } + return nalt; +} + +static int power5p_compute_mmcr(unsigned int event[], int n_ev, + unsigned int hwc[], u64 mmcr[]) +{ + u64 mmcr1 = 0; + unsigned int pmc, unit, byte, psel; + unsigned int ttm; + int i, isbus, bit, grsel; + unsigned int pmc_inuse = 0; + unsigned char busbyte[4]; + unsigned char unituse[16]; + int ttmuse; + + if (n_ev > 4) + return -1; + + /* First pass to count resource use */ + memset(busbyte, 0, sizeof(busbyte)); + memset(unituse, 0, sizeof(unituse)); + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc > 4) + return -1; + if (pmc_inuse & (1 << (pmc - 1))) + return -1; + pmc_inuse |= 1 << (pmc - 1); + } + if (event[i] & PM_BUSEVENT_MSK) { + unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; + byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; + if (unit > PM_LASTUNIT) + return -1; + if (unit == PM_ISU0_ALT) + unit = PM_ISU0; + if (byte >= 4) { + if (unit != PM_LSU1) + return -1; + ++unit; + byte &= 3; + } + if (busbyte[byte] && busbyte[byte] != unit) + return -1; + busbyte[byte] = unit; + unituse[unit] = 1; + } + } + + /* + * Assign resources and set multiplexer selects. + * + * PM_ISU0 can go either on TTM0 or TTM1, but that's the only + * choice we have to deal with. + */ + if (unituse[PM_ISU0] & + (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) { + unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */ + unituse[PM_ISU0] = 0; + } + /* Set TTM[01]SEL fields. */ + ttmuse = 0; + for (i = PM_FPU; i <= PM_ISU1; ++i) { + if (!unituse[i]) + continue; + if (ttmuse++) + return -1; + mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH; + } + ttmuse = 0; + for (; i <= PM_GRS; ++i) { + if (!unituse[i]) + continue; + if (ttmuse++) + return -1; + mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH; + } + if (ttmuse > 1) + return -1; + + /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. 
*/ + for (byte = 0; byte < 4; ++byte) { + unit = busbyte[byte]; + if (!unit) + continue; + if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) { + /* get ISU0 through TTM1 rather than TTM0 */ + unit = PM_ISU0_ALT; + } else if (unit == PM_LSU1 + 1) { + /* select lower word of LSU1 for this byte */ + mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte); + } + ttm = unit >> 2; + mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte); + } + + /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; + byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; + psel = event[i] & PM_PMCSEL_MSK; + isbus = event[i] & PM_BUSEVENT_MSK; + if (!pmc) { + /* Bus event or any-PMC direct event */ + for (pmc = 0; pmc < 4; ++pmc) { + if (!(pmc_inuse & (1 << pmc))) + break; + } + if (pmc >= 4) + return -1; + pmc_inuse |= 1 << pmc; + } else { + /* Direct event */ + --pmc; + if (isbus && (byte & 2) && + (psel == 8 || psel == 0x10 || psel == 0x28)) + /* add events on higher-numbered bus */ + mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc); + } + if (isbus && unit == PM_GRS) { + bit = psel & 7; + grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK; + mmcr1 |= (u64)grsel << grsel_shift[bit]; + } + if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1)) + /* select alternate byte lane */ + psel |= 0x10; + if (pmc <= 3) + mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc); + hwc[i] = pmc; + } + + /* Return MMCRx values */ + mmcr[0] = 0; + if (pmc_inuse & 1) + mmcr[0] = MMCR0_PMC1CE; + if (pmc_inuse & 0x3e) + mmcr[0] |= MMCR0_PMCjCE; + mmcr[1] = mmcr1; + mmcr[2] = 0; + return 0; +} + +static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[]) +{ + if (pmc <= 3) + mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc)); +} + +static int power5p_generic_events[] = { + [PERF_COUNT_CPU_CYCLES] = 0xf, + [PERF_COUNT_INSTRUCTIONS] = 0x100009, + [PERF_COUNT_CACHE_REFERENCES] = 0x1c10a8, /* LD_REF_L1 */ + [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ + [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ +}; + +struct power_pmu power5p_pmu = { + .n_counter = 4, + .max_alternatives = MAX_ALT, + .add_fields = 0x7000000000055ull, + .test_adder = 0x3000040000000ull, + .compute_mmcr = power5p_compute_mmcr, + .get_constraint = power5p_get_constraint, + .get_alternatives = power5p_get_alternatives, + .disable_pmc = power5p_disable_pmc, + .n_generic = ARRAY_SIZE(power5p_generic_events), + .generic_events = power5p_generic_events, +}; -- cgit v1.2.3 From 880860e392d92c457e8116cdee39ec4d109174ee Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 6 Mar 2009 16:30:52 +1100 Subject: perfcounters/powerpc: add support for POWER4 processors Impact: more hardware support This adds the back-end for the PMU on the POWER4 and POWER4+ processors (GP and GQ). This is quite similar to the PPC970, with 8 PMCs, but has fewer events than the PPC970. 
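Like the other back-ends, this one expresses its scheduling constraints through the add_fields/test_adder words: counts of events per PMC class live in small bit-fields of one 64-bit value, and an over-full field carries into a guard bit. A toy model under simplified assumptions (one 2-bit field, capacity 3):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long count = 0;   /* 2-bit field at bits 0-1 */
            int i;

            for (i = 1; i <= 4; ++i) {
                    count += 1;             /* one more event needs this class */
                    if (count & 4)          /* carry into the guard bit */
                            printf("event %d: over capacity, reject\n", i);
                    else
                            printf("event %d: fits (%llu in use)\n", i, count & 3);
            }
            return 0;
    }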
Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/perf_counter.c | 5 + arch/powerpc/kernel/power4-pmu.c | 557 +++++++++++++++++++++++++++++++++++++ 3 files changed, 563 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/kernel/power4-pmu.c (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 49851e0d8fd..8e5e2c74971 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -94,7 +94,7 @@ obj-$(CONFIG_AUDIT) += audit.o obj64-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ppc970-pmu.o \ +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o power4-pmu.o ppc970-pmu.o \ power5-pmu.o power5+-pmu.o power6-pmu.o obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 162f3981fa2..0e33d27cd46 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -824,6 +824,7 @@ void hw_perf_counter_setup(int cpu) cpuhw->mmcr[0] = MMCR0_FC; } +extern struct power_pmu power4_pmu; extern struct power_pmu ppc970_pmu; extern struct power_pmu power5_pmu; extern struct power_pmu power5p_pmu; @@ -841,6 +842,10 @@ static int init_perf_counters(void) /* XXX should get this from cputable */ pvr = mfspr(SPRN_PVR); switch (PVR_VER(pvr)) { + case PV_POWER4: + case PV_POWER4p: + ppmu = &power4_pmu; + break; case PV_970: case PV_970FX: case PV_970MP: diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c new file mode 100644 index 00000000000..1407b19ab61 --- /dev/null +++ b/arch/powerpc/kernel/power4-pmu.c @@ -0,0 +1,557 @@ +/* + * Performance counter support for POWER4 (GP) and POWER4+ (GQ) processors. + * + * Copyright 2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include +#include +#include + +/* + * Bits in event code for POWER4 + */ +#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */ +#define PM_PMC_MSK 0xf +#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */ +#define PM_UNIT_MSK 0xf +#define PM_LOWER_SH 6 +#define PM_LOWER_MSK 1 +#define PM_LOWER_MSKS 0x40 +#define PM_BYTE_SH 4 /* Byte number of event bus to use */ +#define PM_BYTE_MSK 3 +#define PM_PMCSEL_MSK 7 + +/* + * Unit code values + */ +#define PM_FPU 1 +#define PM_ISU1 2 +#define PM_IFU 3 +#define PM_IDU0 4 +#define PM_ISU1_ALT 6 +#define PM_ISU2 7 +#define PM_IFU_ALT 8 +#define PM_LSU0 9 +#define PM_LSU1 0xc +#define PM_GPS 0xf + +/* + * Bits in MMCR0 for POWER4 + */ +#define MMCR0_PMC1SEL_SH 8 +#define MMCR0_PMC2SEL_SH 1 +#define MMCR_PMCSEL_MSK 0x1f + +/* + * Bits in MMCR1 for POWER4 + */ +#define MMCR1_TTM0SEL_SH 62 +#define MMCR1_TTC0SEL_SH 61 +#define MMCR1_TTM1SEL_SH 59 +#define MMCR1_TTC1SEL_SH 58 +#define MMCR1_TTM2SEL_SH 56 +#define MMCR1_TTC2SEL_SH 55 +#define MMCR1_TTM3SEL_SH 53 +#define MMCR1_TTC3SEL_SH 52 +#define MMCR1_TTMSEL_MSK 3 +#define MMCR1_TD_CP_DBG0SEL_SH 50 +#define MMCR1_TD_CP_DBG1SEL_SH 48 +#define MMCR1_TD_CP_DBG2SEL_SH 46 +#define MMCR1_TD_CP_DBG3SEL_SH 44 +#define MMCR1_DEBUG0SEL_SH 43 +#define MMCR1_DEBUG1SEL_SH 42 +#define MMCR1_DEBUG2SEL_SH 41 +#define MMCR1_DEBUG3SEL_SH 40 +#define MMCR1_PMC1_ADDER_SEL_SH 39 +#define MMCR1_PMC2_ADDER_SEL_SH 38 +#define MMCR1_PMC6_ADDER_SEL_SH 37 +#define MMCR1_PMC5_ADDER_SEL_SH 36 +#define MMCR1_PMC8_ADDER_SEL_SH 35 +#define MMCR1_PMC7_ADDER_SEL_SH 34 +#define MMCR1_PMC3_ADDER_SEL_SH 33 +#define MMCR1_PMC4_ADDER_SEL_SH 32 +#define MMCR1_PMC3SEL_SH 27 +#define MMCR1_PMC4SEL_SH 22 +#define MMCR1_PMC5SEL_SH 17 +#define MMCR1_PMC6SEL_SH 12 +#define MMCR1_PMC7SEL_SH 7 +#define MMCR1_PMC8SEL_SH 2 /* note bit 0 is in MMCRA for GP */ + +static short mmcr1_adder_bits[8] = { + MMCR1_PMC1_ADDER_SEL_SH, + MMCR1_PMC2_ADDER_SEL_SH, + MMCR1_PMC3_ADDER_SEL_SH, + MMCR1_PMC4_ADDER_SEL_SH, + MMCR1_PMC5_ADDER_SEL_SH, + MMCR1_PMC6_ADDER_SEL_SH, + MMCR1_PMC7_ADDER_SEL_SH, + MMCR1_PMC8_ADDER_SEL_SH +}; + +/* + * Bits in MMCRA + */ +#define MMCRA_PMC8SEL0_SH 17 /* PMC8SEL bit 0 for GP */ + +/* + * Layout of constraint bits: + * 6666555555555544444444443333333333222222222211111111110000000000 + * 3210987654321098765432109876543210987654321098765432109876543210 + * |[ >[ >[ >|||[ >[ >< >< >< >< ><><><><><><><><> + * | UC1 UC2 UC3 ||| PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8 + * \SMPL ||\TTC3SEL + * |\TTC_IFU_SEL + * \TTM2SEL0 + * + * SMPL - SAMPLE_ENABLE constraint + * 56: SAMPLE_ENABLE value 0x0100_0000_0000_0000 + * + * UC1 - unit constraint 1: can't have all three of FPU/ISU1/IDU0|ISU2 + * 55: UC1 error 0x0080_0000_0000_0000 + * 54: FPU events needed 0x0040_0000_0000_0000 + * 53: ISU1 events needed 0x0020_0000_0000_0000 + * 52: IDU0|ISU2 events needed 0x0010_0000_0000_0000 + * + * UC2 - unit constraint 2: can't have all three of FPU/IFU/LSU0 + * 51: UC2 error 0x0008_0000_0000_0000 + * 50: FPU events needed 0x0004_0000_0000_0000 + * 49: IFU events needed 0x0002_0000_0000_0000 + * 48: LSU0 events needed 0x0001_0000_0000_0000 + * + * UC3 - unit constraint 3: can't have all four of LSU0/IFU/IDU0|ISU2/ISU1 + * 47: UC3 error 0x8000_0000_0000 + * 46: LSU0 events needed 0x4000_0000_0000 + * 45: IFU events needed 0x2000_0000_0000 + * 44: IDU0|ISU2 events needed 0x1000_0000_0000 + * 43: ISU1 events needed 0x0800_0000_0000 + * + * TTM2SEL0 + * 42: 0 = IDU0 events needed + * 1 = ISU2 events needed 0x0400_0000_0000 + * + * 
TTC_IFU_SEL + * 41: 0 = IFU.U events needed + * 1 = IFU.L events needed 0x0200_0000_0000 + * + * TTC3SEL + * 40: 0 = LSU1.U events needed + * 1 = LSU1.L events needed 0x0100_0000_0000 + * + * PS1 + * 39: PS1 error 0x0080_0000_0000 + * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000 + * + * PS2 + * 35: PS2 error 0x0008_0000_0000 + * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000 + * + * B0 + * 28-31: Byte 0 event source 0xf000_0000 + * 1 = FPU + * 2 = ISU1 + * 3 = IFU + * 4 = IDU0 + * 7 = ISU2 + * 9 = LSU0 + * c = LSU1 + * f = GPS + * + * B1, B2, B3 + * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources + * + * P8 + * 15: P8 error 0x8000 + * 14-15: Count of events needing PMC8 + * + * P1..P7 + * 0-13: Count of events needing PMC1..PMC7 + * + * Note: this doesn't allow events using IFU.U to be combined with events + * using IFU.L, though that is feasible (using TTM0 and TTM2). However + * there are no listed events for IFU.L (they are debug events not + * verified for performance monitoring) so this shouldn't cause a + * problem. + */ + +static struct unitinfo { + u64 value, mask; + int unit; + int lowerbit; +} p4_unitinfo[16] = { + [PM_FPU] = { 0x44000000000000ull, 0x88000000000000ull, PM_FPU, 0 }, + [PM_ISU1] = { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 }, + [PM_ISU1_ALT] = + { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 }, + [PM_IFU] = { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 }, + [PM_IFU_ALT] = + { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 }, + [PM_IDU0] = { 0x10100000000000ull, 0x80840000000000ull, PM_IDU0, 1 }, + [PM_ISU2] = { 0x10140000000000ull, 0x80840000000000ull, PM_ISU2, 0 }, + [PM_LSU0] = { 0x01400000000000ull, 0x08800000000000ull, PM_LSU0, 0 }, + [PM_LSU1] = { 0x00000000000000ull, 0x00010000000000ull, PM_LSU1, 40 }, + [PM_GPS] = { 0x00000000000000ull, 0x00000000000000ull, PM_GPS, 0 } +}; + +static unsigned char direct_marked_event[8] = { + (1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */ + (1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */ + (1<<3), /* PMC3: PM_MRK_ST_CMPL_INT */ + (1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */ + (1<<4) | (1<<5), /* PMC5: PM_MRK_GRP_TIMEO */ + (1<<3) | (1<<4) | (1<<5), + /* PMC6: PM_MRK_ST_GPS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */ + (1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */ + (1<<4), /* PMC8: PM_MRK_LSU_FIN */ +}; + +/* + * Returns 1 if event counts things relating to marked instructions + * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. + */ +static int p4_marked_instr_event(unsigned int event) +{ + int pmc, psel, unit, byte, bit; + unsigned int mask; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + psel = event & PM_PMCSEL_MSK; + if (pmc) { + if (direct_marked_event[pmc - 1] & (1 << psel)) + return 1; + if (psel == 0) /* add events */ + bit = (pmc <= 4)? 
pmc - 1: 8 - pmc; + else if (psel == 6) /* decode events */ + bit = 4; + else + return 0; + } else + bit = psel; + + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; + mask = 0; + switch (unit) { + case PM_LSU1: + if (event & PM_LOWER_MSKS) + mask = 1 << 28; /* byte 7 bit 4 */ + else + mask = 6 << 24; /* byte 3 bits 1 and 2 */ + break; + case PM_LSU0: + /* byte 3, bit 3; byte 2 bits 0,2,3,4,5; byte 1 */ + mask = 0x083dff00; + } + return (mask >> (byte * 8 + bit)) & 1; +} + +static int p4_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +{ + int pmc, byte, unit, lower, sh; + u64 mask = 0, value = 0; + int grp = -1; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc > 8) + return -1; + sh = (pmc - 1) * 2; + mask |= 2 << sh; + value |= 1 << sh; + grp = ((pmc - 1) >> 1) & 1; + } + unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + if (unit) { + lower = (event >> PM_LOWER_SH) & PM_LOWER_MSK; + + /* + * Bus events on bytes 0 and 2 can be counted + * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8. + */ + if (!pmc) + grp = byte & 1; + + if (!p4_unitinfo[unit].unit) + return -1; + mask |= p4_unitinfo[unit].mask; + value |= p4_unitinfo[unit].value; + sh = p4_unitinfo[unit].lowerbit; + if (sh > 1) + value |= (u64)lower << sh; + else if (lower != sh) + return -1; + unit = p4_unitinfo[unit].unit; + + /* Set byte lane select field */ + mask |= 0xfULL << (28 - 4 * byte); + value |= (u64)unit << (28 - 4 * byte); + } + if (grp == 0) { + /* increment PMC1/2/5/6 field */ + mask |= 0x8000000000ull; + value |= 0x1000000000ull; + } else { + /* increment PMC3/4/7/8 field */ + mask |= 0x800000000ull; + value |= 0x100000000ull; + } + + /* Marked instruction events need sample_enable set */ + if (p4_marked_instr_event(event)) { + mask |= 1ull << 56; + value |= 1ull << 56; + } + + /* PMCSEL=6 decode events on byte 2 need sample_enable clear */ + if (pmc && (event & PM_PMCSEL_MSK) == 6 && byte == 2) + mask |= 1ull << 56; + + *maskp = mask; + *valp = value; + return 0; +} + +static unsigned int ppc_inst_cmpl[] = { + 0x1001, 0x4001, 0x6001, 0x7001, 0x8001 +}; + +static int p4_get_alternatives(unsigned int event, unsigned int alt[]) +{ + int i, j, na; + + alt[0] = event; + na = 1; + + /* 2 possibilities for PM_GRP_DISP_REJECT */ + if (event == 0x8003 || event == 0x0224) { + alt[1] = event ^ (0x8003 ^ 0x0224); + return 2; + } + + /* 2 possibilities for PM_ST_MISS_L1 */ + if (event == 0x0c13 || event == 0x0c23) { + alt[1] = event ^ (0x0c13 ^ 0x0c23); + return 2; + } + + /* several possibilities for PM_INST_CMPL */ + for (i = 0; i < ARRAY_SIZE(ppc_inst_cmpl); ++i) { + if (event == ppc_inst_cmpl[i]) { + for (j = 0; j < ARRAY_SIZE(ppc_inst_cmpl); ++j) + if (j != i) + alt[na++] = ppc_inst_cmpl[j]; + break; + } + } + + return na; +} + +static int p4_compute_mmcr(unsigned int event[], int n_ev, + unsigned int hwc[], u64 mmcr[]) +{ + u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0; + unsigned int pmc, unit, byte, psel, lower; + unsigned int ttm, grp; + unsigned int pmc_inuse = 0; + unsigned int pmc_grp_use[2]; + unsigned char busbyte[4]; + unsigned char unituse[16]; + unsigned int unitlower = 0; + int i; + + if (n_ev > 8) + return -1; + + /* First pass to count resource use */ + pmc_grp_use[0] = pmc_grp_use[1] = 0; + memset(busbyte, 0, sizeof(busbyte)); + memset(unituse, 0, sizeof(unituse)); + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc_inuse & (1 << (pmc - 1))) + return -1; + 
pmc_inuse |= 1 << (pmc - 1); + /* count 1/2/5/6 vs 3/4/7/8 use */ + ++pmc_grp_use[((pmc - 1) >> 1) & 1]; + } + unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; + byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; + lower = (event[i] >> PM_LOWER_SH) & PM_LOWER_MSK; + if (unit) { + if (!pmc) + ++pmc_grp_use[byte & 1]; + if (unit == 6 || unit == 8) + /* map alt ISU1/IFU codes: 6->2, 8->3 */ + unit = (unit >> 1) - 1; + if (busbyte[byte] && busbyte[byte] != unit) + return -1; + busbyte[byte] = unit; + lower <<= unit; + if (unituse[unit] && lower != (unitlower & lower)) + return -1; + unituse[unit] = 1; + unitlower |= lower; + } + } + if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4) + return -1; + + /* + * Assign resources and set multiplexer selects. + * + * Units 1,2,3 are on TTM0, 4,6,7 on TTM1, 8,10 on TTM2. + * Each TTMx can only select one unit, but since + * units 2 and 6 are both ISU1, and 3 and 8 are both IFU, + * we have some choices. + */ + if (unituse[2] & (unituse[1] | (unituse[3] & unituse[9]))) { + unituse[6] = 1; /* Move 2 to 6 */ + unituse[2] = 0; + } + if (unituse[3] & (unituse[1] | unituse[2])) { + unituse[8] = 1; /* Move 3 to 8 */ + unituse[3] = 0; + unitlower = (unitlower & ~8) | ((unitlower & 8) << 5); + } + /* Check only one unit per TTMx */ + if (unituse[1] + unituse[2] + unituse[3] > 1 || + unituse[4] + unituse[6] + unituse[7] > 1 || + unituse[8] + unituse[9] > 1 || + (unituse[5] | unituse[10] | unituse[11] | + unituse[13] | unituse[14])) + return -1; + + /* Set TTMxSEL fields. Note, units 1-3 => TTM0SEL codes 0-2 */ + mmcr1 |= (u64)(unituse[3] * 2 + unituse[2]) << MMCR1_TTM0SEL_SH; + mmcr1 |= (u64)(unituse[7] * 3 + unituse[6] * 2) << MMCR1_TTM1SEL_SH; + mmcr1 |= (u64)unituse[9] << MMCR1_TTM2SEL_SH; + + /* Set TTCxSEL fields. */ + if (unitlower & 0xe) + mmcr1 |= 1ull << MMCR1_TTC0SEL_SH; + if (unitlower & 0xf0) + mmcr1 |= 1ull << MMCR1_TTC1SEL_SH; + if (unitlower & 0xf00) + mmcr1 |= 1ull << MMCR1_TTC2SEL_SH; + if (unitlower & 0x7000) + mmcr1 |= 1ull << MMCR1_TTC3SEL_SH; + + /* Set byte lane select fields. 
*/ + for (byte = 0; byte < 4; ++byte) { + unit = busbyte[byte]; + if (!unit) + continue; + if (unit == 0xf) { + /* special case for GPS */ + mmcr1 |= 1ull << (MMCR1_DEBUG0SEL_SH - byte); + } else { + if (!unituse[unit]) + ttm = unit - 1; /* 2->1, 3->2 */ + else + ttm = unit >> 2; + mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2*byte); + } + } + + /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; + byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; + psel = event[i] & PM_PMCSEL_MSK; + if (!pmc) { + /* Bus event or 00xxx direct event (off or cycles) */ + if (unit) + psel |= 0x10 | ((byte & 2) << 2); + for (pmc = 0; pmc < 8; ++pmc) { + if (pmc_inuse & (1 << pmc)) + continue; + grp = (pmc >> 1) & 1; + if (unit) { + if (grp == (byte & 1)) + break; + } else if (pmc_grp_use[grp] < 4) { + ++pmc_grp_use[grp]; + break; + } + } + pmc_inuse |= 1 << pmc; + } else { + /* Direct event */ + --pmc; + if (psel == 0 && (byte & 2)) + /* add events on higher-numbered bus */ + mmcr1 |= 1ull << mmcr1_adder_bits[pmc]; + else if (psel == 6 && byte == 3) + /* seem to need to set sample_enable here */ + mmcra |= MMCRA_SAMPLE_ENABLE; + psel |= 8; + } + if (pmc <= 1) + mmcr0 |= psel << (MMCR0_PMC1SEL_SH - 7 * pmc); + else + mmcr1 |= psel << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)); + if (pmc == 7) /* PMC8 */ + mmcra |= (psel & 1) << MMCRA_PMC8SEL0_SH; + hwc[i] = pmc; + if (p4_marked_instr_event(event[i])) + mmcra |= MMCRA_SAMPLE_ENABLE; + } + + if (pmc_inuse & 1) + mmcr0 |= MMCR0_PMC1CE; + if (pmc_inuse & 0xfe) + mmcr0 |= MMCR0_PMCjCE; + + mmcra |= 0x2000; /* mark only one IOP per PPC instruction */ + + /* Return MMCRx values */ + mmcr[0] = mmcr0; + mmcr[1] = mmcr1; + mmcr[2] = mmcra; + return 0; +} + +static void p4_disable_pmc(unsigned int pmc, u64 mmcr[]) +{ + /* + * Setting the PMCxSEL field to 0 disables PMC x. + * (Note that pmc is 0-based here, not 1-based.) + */ + if (pmc <= 1) { + mmcr[0] &= ~(0x1fUL << (MMCR0_PMC1SEL_SH - 7 * pmc)); + } else { + mmcr[1] &= ~(0x1fUL << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2))); + if (pmc == 7) + mmcr[2] &= ~(1UL << MMCRA_PMC8SEL0_SH); + } +} + +static int p4_generic_events[] = { + [PERF_COUNT_CPU_CYCLES] = 7, + [PERF_COUNT_INSTRUCTIONS] = 0x1001, + [PERF_COUNT_CACHE_REFERENCES] = 0x8c10, /* PM_LD_REF_L1 */ + [PERF_COUNT_CACHE_MISSES] = 0x3c10, /* PM_LD_MISS_L1 */ + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x330, /* PM_BR_ISSUED */ + [PERF_COUNT_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */ +}; + +struct power_pmu power4_pmu = { + .n_counter = 8, + .max_alternatives = 5, + .add_fields = 0x0000001100005555ull, + .test_adder = 0x0011083300000000ull, + .compute_mmcr = p4_compute_mmcr, + .get_constraint = p4_get_constraint, + .get_alternatives = p4_get_alternatives, + .disable_pmc = p4_disable_pmc, + .n_generic = ARRAY_SIZE(p4_generic_events), + .generic_events = p4_generic_events, +}; -- cgit v1.2.3 From 7dd1fcc258b65da718f01e4684a7b9244501a9fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:33 +0100 Subject: perf_counter: provide pagefault software events We use the generic software counter infrastructure to provide page fault events. 
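From the userspace side, the events this hook counts are easy to provoke: each first touch of a freshly mapped page takes one fault (assuming 4 KiB pages and that the allocation below comes back as untouched pages from the kernel):

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            size_t page = 4096, npages = 64, i;
            char *buf = malloc(page * npages);

            if (!buf)
                    return 1;
            for (i = 0; i < npages; ++i)
                    buf[i * page] = 1;      /* first touch: one minor fault */
            printf("touched %zu pages\n", npages);
            free(buf);
            return 0;
    }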
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/powerpc/mm/fault.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 76993941cac..eda5b0ca4af 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -29,6 +29,7 @@ #include #include #include +#include <linux/perf_counter.h> #include #include @@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); + /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an -- cgit v1.2.3 From ac17dc8e58f3069ea895cfff963adf98ff3cf6b2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:34 +0100 Subject: perf_counter: provide major/minor page fault software events Provide separate sw counters for major and minor page faults. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/powerpc/mm/fault.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index eda5b0ca4af..17bbf6f91fb 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -312,6 +312,7 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { preempt_disable(); @@ -319,8 +320,10 @@ good_area: preempt_enable(); } #endif - } else + } else { current->min_flt++; + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + } up_read(&mm->mmap_sem); return 0; -- cgit v1.2.3 From b6c5a71da1477d261bc36254fe1f20d32b57598d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 16 Mar 2009 21:00:00 +1100 Subject: perf_counter: abstract wakeup flag setting in core to fix powerpc build Impact: build fix for powerpc Commit bd753921015e7905 ("perf_counter: software counter event infrastructure") introduced a use of TIF_PERF_COUNTERS into the core perfcounter code. This breaks the build on powerpc because we use a flag in a per-cpu area to signal wakeups on powerpc rather than a thread_info flag, because the thread_info flags have to be manipulated with atomic operations and are thus slower than per-cpu flags. This fixes the problem by changing the core to use an abstracted set_perf_counter_pending() function, which is defined on x86 to set the TIF_PERF_COUNTERS flag and on powerpc to set the per-cpu flag (paca->perf_counter_pending). It changes the previous powerpc definition of set_perf_counter_pending to not take an argument and adds a clear_perf_counter_pending, so as to simplify the definition on x86. On x86, set_perf_counter_pending() is defined as a macro. Defining it as a static inline in arch/x86/include/asm/perf_counters.h causes compile failures because <asm/perf_counters.h> gets included early in <linux/sched.h>, and the definitions of set_tsk_thread_flag etc. are therefore not available in <asm/perf_counters.h>. (On powerpc this problem is avoided by defining set_perf_counter_pending etc. in <asm/hw_irq.h>.)
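The x86 counterpart mentioned above is not part of this arch/powerpc-limited log; as a hedged sketch from the description, it reduces to thread_info flag macros:

	/* Assumed x86 side, per the message above: macros rather than
	 * static inlines, so the header need not see the definition of
	 * set_tsk_thread_flag() at include time.
	 */
	#define set_perf_counter_pending() \
		set_tsk_thread_flag(current, TIF_PERF_COUNTERS)
	#define clear_perf_counter_pending() \
		clear_tsk_thread_flag(current, TIF_PERF_COUNTERS)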
Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/hw_irq.h | 14 +++++++++++--- arch/powerpc/kernel/irq.c | 11 +++-------- arch/powerpc/kernel/perf_counter.c | 3 +-- 3 files changed, 15 insertions(+), 13 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index b43076ff92c..cb32d571c9c 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -142,10 +142,17 @@ static inline unsigned long get_perf_counter_pending(void) return x; } -static inline void set_perf_counter_pending(int x) +static inline void set_perf_counter_pending(void) { asm volatile("stb %0,%1(13)" : : - "r" (x), + "r" (1), + "i" (offsetof(struct paca_struct, perf_counter_pending))); +} + +static inline void clear_perf_counter_pending(void) +{ + asm volatile("stb %0,%1(13)" : : + "r" (0), "i" (offsetof(struct paca_struct, perf_counter_pending))); } @@ -158,7 +165,8 @@ static inline unsigned long get_perf_counter_pending(void) return 0; } -static inline void set_perf_counter_pending(int x) {} +static inline void set_perf_counter_pending(void) {} +static inline void clear_perf_counter_pending(void) {} static inline void perf_counter_do_pending(void) {} #endif /* CONFIG_PERF_COUNTERS */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 0d2e37c5773..469e9635ff0 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -104,13 +104,6 @@ static inline notrace void set_soft_enabled(unsigned long enable) : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); } -#ifdef CONFIG_PERF_COUNTERS -notrace void __weak perf_counter_do_pending(void) -{ - set_perf_counter_pending(0); -} -#endif - notrace void raw_local_irq_restore(unsigned long en) { /* @@ -142,8 +135,10 @@ notrace void raw_local_irq_restore(unsigned long en) iseries_handle_interrupts(); } - if (get_perf_counter_pending()) + if (get_perf_counter_pending()) { + clear_perf_counter_pending(); perf_counter_do_pending(); + } /* * if (get_paca()->hard_enabled) return; diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 0e33d27cd46..5008762e8bf 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -653,7 +653,6 @@ void perf_counter_do_pending(void) struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); struct perf_counter *counter; - set_perf_counter_pending(0); for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; if (counter && counter->wakeup_pending) { @@ -811,7 +810,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) perf_counter_do_pending(); irq_exit(); } else { - set_perf_counter_pending(1); + set_perf_counter_pending(); } } } -- cgit v1.2.3 From b8e83514b64577b48bfb794fe85fcde40a9343ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:18 +0100 Subject: perf_counter: revamp syscall input ABI Impact: modify ABI The hardware/software classification in hw_event->type became a little strained due to the addition of tracepoint tracing. Instead split up the field and provide a type field to explicitly specify the counter type, while using the event_id field to specify which event to use. Raw counters still work as before, only the raw config now goes into raw_event. 
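In userspace terms the split ABI reads roughly as follows. This is a sketch: the raw_type/raw_event_id names match the big-endian fix later in this log, and the raw code value is invented.

	/* Generic event: class in type, event within the class in event_id */
	struct perf_counter_hw_event hw_event = {
		.type     = PERF_TYPE_SOFTWARE,
		.event_id = PERF_COUNT_PAGE_FAULTS,
	};

	/* Raw hardware event: the config goes in the raw fields instead */
	struct perf_counter_hw_event raw = {
		.raw_type     = 1,
		.raw_event_id = 0x4f80,	/* hypothetical raw PMU code */
	};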
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.836807573@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 5008762e8bf..26f69dc7130 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -602,7 +602,7 @@ hw_perf_counter_init(struct perf_counter *counter) return NULL; if ((s64)counter->hw_event.irq_period < 0) return NULL; - ev = counter->hw_event.type; + ev = counter->hw_event.event_id; if (!counter->hw_event.raw) { if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) @@ -692,7 +692,7 @@ static void perf_handle_group(struct perf_counter *counter) list_for_each_entry(sub, &leader->sibling_list, list_entry) { if (sub != counter) sub->hw_ops->read(sub); - perf_store_irq_data(counter, sub->hw_event.type); + perf_store_irq_data(counter, sub->hw_event.event_config); perf_store_irq_data(counter, atomic64_read(&sub->count)); } } -- cgit v1.2.3 From 0322cd6ec504b0bf08ca7b2c3d7f43bda37d79c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:19 +0100 Subject: perf_counter: unify irq output code Impact: cleanup Having 3 slightly different copies of the same code around does nobody any good. First step in revamping the output format. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.929962222@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 51 ++------------------------------------ 1 file changed, 2 insertions(+), 49 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 26f69dc7130..88b72eb4af1 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -662,41 +662,6 @@ void perf_counter_do_pending(void) } } -/* - * Record data for an irq counter. - * This function was lifted from the x86 code; maybe it should - * go in the core? - */ -static void perf_store_irq_data(struct perf_counter *counter, u64 data) -{ - struct perf_data *irqdata = counter->irqdata; - - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; - - *p = data; - irqdata->len += sizeof(u64); - } -} - -/* - * Record all the values of the counters in a group - */ -static void perf_handle_group(struct perf_counter *counter) -{ - struct perf_counter *leader, *sub; - - leader = counter->group_leader; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - if (sub != counter) - sub->hw_ops->read(sub); - perf_store_irq_data(counter, sub->hw_event.event_config); - perf_store_irq_data(counter, atomic64_read(&sub->count)); - } -} - /* * A counter has overflowed; update its count and record * things if requested. Note that interrupts are hard-disabled @@ -736,20 +701,8 @@ static void record_and_restart(struct perf_counter *counter, long val, /* * Finally record data if requested. 
*/ - if (record) { - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - break; - case PERF_RECORD_IRQ: - perf_store_irq_data(counter, instruction_pointer(regs)); - counter->wakeup_pending = 1; - break; - case PERF_RECORD_GROUP: - perf_handle_group(counter); - counter->wakeup_pending = 1; - break; - } - } + if (record) + perf_counter_output(counter, 1, regs); } /* -- cgit v1.2.3 From db4fb5acf20295063d1d5105e67724eb51440207 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 19 Mar 2009 20:26:20 +0100 Subject: perf_counter: powerpc: clean up perf_counter_interrupt Impact: cleanup This updates the powerpc perf_counter_interrupt following on from the "perf_counter: unify irq output code" patch. Since we now use the generic perf_counter_output code, which sets the perf_counter_pending flag directly, we no longer need the need_wakeup variable. This removes need_wakeup and makes perf_counter_interrupt use get_perf_counter_pending() instead. Signed-off-by: Paul Mackerras Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Orig-LKML-Reference: <20090319194234.024464535@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 88b72eb4af1..830ca9c4494 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -723,8 +723,6 @@ static void perf_counter_interrupt(struct pt_regs *regs) /* counter has overflowed */ found = 1; record_and_restart(counter, val, regs); - if (counter->wakeup_pending) - need_wakeup = 1; } } @@ -754,17 +752,14 @@ static void perf_counter_interrupt(struct pt_regs *regs) /* * If we need a wakeup, check whether interrupts were soft-enabled * when we took the interrupt. If they were, we can wake stuff up - * immediately; otherwise we'll have to set a flag and do the - * wakeup when interrupts get soft-enabled. + * immediately; otherwise we'll have do the wakeup when interrupts + * get soft-enabled. */ - if (need_wakeup) { - if (regs->softe) { - irq_enter(); - perf_counter_do_pending(); - irq_exit(); - } else { - set_perf_counter_pending(); - } + if (get_perf_counter_pending() && regs->softe) { + irq_enter(); + clear_perf_counter_pending(); + perf_counter_do_pending(); + irq_exit(); } } -- cgit v1.2.3 From 9aaa131a279834dff75c290c91f0058f62d72d46 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 21 Mar 2009 15:31:47 +1100 Subject: perf_counter: fix type/event_id layout on big-endian systems Impact: build fix for powerpc Commit db3a944aca35ae61 ("perf_counter: revamp syscall input ABI") expanded the hw_event.type field into a union of structs containing bitfields. In particular it introduced a type field and a raw_type field, with the intention that the 1-bit raw_type field should overlay the most-significant bit of the 8-bit type field, and in fact perf_counter_alloc() now assumes that (or at least, assumes that raw_type doesn't overlay any of the bits that are 1 in the values of PERF_TYPE_{HARDWARE,SOFTWARE,TRACEPOINT}). Unfortunately this is not true on big-endian systems such as PowerPC, where bitfields are laid out from left to right, i.e. from most significant bit to least significant. This means that setting hw_event.type = PERF_TYPE_SOFTWARE will set hw_event.raw_type to 1. This fixes it by making the layout depend on whether or not __BIG_ENDIAN_BITFIELD is defined.
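Concretely, the fix amounts to an endian-conditional bitfield layout along these lines (a sketch, with widths assumed from the 8-bit type field whose most-significant bit the 1-bit raw_type overlays):

	#ifdef __BIG_ENDIAN_BITFIELD
		__u64	raw_type : 1, type : 7, event_id : 56;
	#else
		__u64	event_id : 56, type : 7, raw_type : 1;
	#endif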
It's a bit ugly, but that's what we get for using bitfields in a user/kernel ABI. Also, that commit didn't fix up some places in arch/powerpc/kernel/ perf_counter.c where hw_event.raw and hw_event.event_id were used. This fixes them too. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_counter.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 830ca9c4494..6413d9c0313 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -602,12 +602,13 @@ hw_perf_counter_init(struct perf_counter *counter) return NULL; if ((s64)counter->hw_event.irq_period < 0) return NULL; - ev = counter->hw_event.event_id; - if (!counter->hw_event.raw) { - if (ev >= ppmu->n_generic || - ppmu->generic_events[ev] == 0) + if (!counter->hw_event.raw_type) { + ev = counter->hw_event.event_id; + if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) return NULL; ev = ppmu->generic_events[ev]; + } else { + ev = counter->hw_event.raw_event_id; } counter->hw.config_base = ev; counter->hw.idx = 0; -- cgit v1.2.3 From f4a2deb4860497f4332cf6a1acddab3dd628ddf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:06 +0100 Subject: perf_counter: remove the event config bitfields Since the bitfields turned into a bit of a mess, remove them and rely on good old masks. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.059499915@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 6413d9c0313..d05651584d4 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -602,13 +602,13 @@ hw_perf_counter_init(struct perf_counter *counter) return NULL; if ((s64)counter->hw_event.irq_period < 0) return NULL; - if (!counter->hw_event.raw_type) { - ev = counter->hw_event.event_id; + if (!perf_event_raw(&counter->hw_event)) { + ev = perf_event_id(&counter->hw_event); if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) return NULL; ev = ppmu->generic_events[ev]; } else { - ev = counter->hw_event.raw_event_id; + ev = perf_event_config(&counter->hw_event); } counter->hw.config_base = ev; counter->hw.idx = 0; -- cgit v1.2.3 From 37d81828385f8ff823caaaf1a83e72d065b6cfa1 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 23 Mar 2009 18:22:08 +0100 Subject: perf_counter: add an mmap method to allow userspace to read hardware counters Impact: new feature giving performance improvement This adds the ability for userspace to do an mmap on a hardware counter fd and get access to a read-only page that contains the information needed to translate a hardware counter value to the full 64-bit counter value that would be returned by a read on the fd. This is useful on architectures that allow user programs to read the hardware counters, such as PowerPC. The mmap will only succeed if the counter is a hardware counter monitoring the current process. On my quad 2.5GHz PowerPC 970MP machine, userspace can read a counter and translate it to the full 64-bit value in about 30ns using the mmapped page, compared to about 830ns for the read syscall on the counter, so this does give a significant performance improvement. 
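A hedged sketch of the userspace read loop this page enables: the struct name and the lock/index/offset fields are assumptions about the mapped page's layout, and read_pmc() stands in for the architecture's user-level counter read, e.g. an mfspr on PowerPC.

	struct perf_counter_mmap_page *pc = mapped_page;	/* from mmap() */
	__u32 seq;
	__u64 count;

	do {
		seq = pc->lock;		/* seqlock-style: retry if it changes */
		rmb();
		count = pc->offset;	/* 64-bit base kept by the kernel */
		if (pc->index)		/* nonzero: counter live on PMC index-1 */
			count += read_pmc(pc->index);
		rmb();
	} while (pc->lock != seq);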
Signed-off-by: Paul Mackerras Signed-off-by: Peter Zijlstra Orig-LKML-Reference: <20090323172417.297057964@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index d05651584d4..e4349281b07 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -417,6 +417,8 @@ void hw_perf_restore(u64 disable) atomic64_set(&counter->hw.prev_count, val); counter->hw.idx = hwc_index[i] + 1; write_pmc(counter->hw.idx, val); + if (counter->user_page) + perf_counter_update_userpage(counter); } mb(); cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; @@ -572,6 +574,8 @@ static void power_perf_disable(struct perf_counter *counter) ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); write_pmc(counter->hw.idx, 0); counter->hw.idx = 0; + if (counter->user_page) + perf_counter_update_userpage(counter); break; } } @@ -698,6 +702,8 @@ static void record_and_restart(struct perf_counter *counter, long val, write_pmc(counter->hw.idx, val); atomic64_set(&counter->hw.prev_count, val); atomic64_set(&counter->hw.period_left, left); + if (counter->user_page) + perf_counter_update_userpage(counter); /* * Finally record data if requested. -- cgit v1.2.3 From 7b732a75047738e4f85438ed2f9cd34bf5f2a19a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:10 +0100 Subject: perf_counter: new output ABI - part 1 Impact: Rework the perfcounter output ABI. Use sys_read() only for instant data and provide mmap() output for all async overflow data. The first mmap() determines the size of the output buffer. The mmap() size must be a PAGE_SIZE multiple of 1+pages, where pages must be a power of 2 or 0. Further mmap()s of the same fd must have the same size. Once all maps are gone, you can again mmap() with a new size. In case of 0 extra pages there is no data output and the first page only contains meta data. When there are data pages, a poll() event will be generated for each full page of data. Furthermore, the output is circular. This means that although 1 page is a valid configuration, it's useless, since we'll start overwriting it the instant we report a full page. Future work will focus on the output format (currently maintained) where we'll likely want each entry denoted by a header which includes a type and length. Further future work will allow to splice() the fd, also containing the async overflow data -- splice() would be mutually exclusive with mmap() of the data.
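Following those sizing rules, a minimal consumer sketch (assuming fd comes from perf_counter_open; needs <sys/mman.h>, <poll.h>, <unistd.h>; error handling omitted):

	/* 1 metadata page plus 2^3 data pages, per the 1+pages rule above */
	int pages = 8;
	size_t len = (1 + pages) * sysconf(_SC_PAGESIZE);
	void *buf = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);

	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	poll(&pfd, 1, -1);	/* wakes once a full page of data is ready */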
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.470536358@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index e4349281b07..d48596ab655 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -417,8 +417,7 @@ void hw_perf_restore(u64 disable) atomic64_set(&counter->hw.prev_count, val); counter->hw.idx = hwc_index[i] + 1; write_pmc(counter->hw.idx, val); - if (counter->user_page) - perf_counter_update_userpage(counter); + perf_counter_update_userpage(counter); } mb(); cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; @@ -574,8 +573,7 @@ static void power_perf_disable(struct perf_counter *counter) ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); write_pmc(counter->hw.idx, 0); counter->hw.idx = 0; - if (counter->user_page) - perf_counter_update_userpage(counter); + perf_counter_update_userpage(counter); break; } } @@ -702,8 +700,7 @@ static void record_and_restart(struct perf_counter *counter, long val, write_pmc(counter->hw.idx, val); atomic64_set(&counter->hw.prev_count, val); atomic64_set(&counter->hw.period_left, left); - if (counter->user_page) - perf_counter_update_userpage(counter); + perf_counter_update_userpage(counter); /* * Finally record data if requested. -- cgit v1.2.3 From 53cfbf593758916aac41db728f029986a62f1254 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 25 Mar 2009 22:46:58 +1100 Subject: perf_counter: record time running and time enabled for each counter Impact: new functionality Currently, if there are more counters enabled than can fit on the CPU, the kernel will multiplex the counters on to the hardware using round-robin scheduling. That isn't too bad for sampling counters, but for counting counters it means that the value read from a counter represents some unknown fraction of the true count of events that occurred while the counter was enabled. This remedies the situation by keeping track of how long each counter is enabled for, and how long it is actually on the cpu and counting events. These times are recorded in nanoseconds using the task clock for per-task counters and the cpu clock for per-cpu counters. These values can be supplied to userspace on a read from the counter. Userspace requests that they be supplied after the counter value by setting the PERF_FORMAT_TOTAL_TIME_ENABLED and/or PERF_FORMAT_TOTAL_TIME_RUNNING bits in the hw_event.read_format field when creating the counter. (There is no way to change the read format after the counter is created, though it would be possible to add some way to do that.) Using this information it is possible for userspace to scale the count it reads from the counter to get an estimate of the true count: true_count_estimate = count * total_time_enabled / total_time_running This also lets userspace detect the situation where the counter never got to go on the cpu: total_time_running == 0. This functionality has been requested by the PAPI developers, and will be generally needed for interpreting the count values from counting counters correctly. In the implementation, this keeps 5 time values (in nanoseconds) for each counter: total_time_enabled and total_time_running are used when the counter is in state OFF or ERROR and for reporting back to userspace. 
When the counter is in state INACTIVE or ACTIVE, it is the tstamp_enabled, tstamp_running and tstamp_stopped values that are relevant, and total_time_enabled and total_time_running are determined from them. (tstamp_stopped is only used in INACTIVE state.) The reason for doing it like this is that it means that only counters being enabled or disabled at sched-in and sched-out time need to be updated. There are no new loops that iterate over all counters to update total_time_enabled or total_time_running. This also keeps separate child_total_time_running and child_total_time_enabled fields that get added in when reporting the totals to userspace. They are separate fields so that they can be atomic. We don't want to use atomics for total_time_running, total_time_enabled etc., because then we would have to use atomic sequences to update them, which are slower than regular arithmetic and memory accesses. It is possible to measure total_time_running by adding a task_clock counter to each group of counters, and total_time_enabled can be measured approximately with a top-level task_clock counter (though inaccuracies will creep in if you need to disable and enable groups since it is not possible in general to disable/enable the top-level task_clock counter simultaneously with another group). However, that adds extra overhead - I measured around 15% increase in the context switch latency reported by lat_ctx (from lmbench) when a task_clock counter was added to each of 2 groups, and around 25% increase when a task_clock counter was added to each of 4 groups. (In both cases a top-level task-clock counter was also added.) In contrast, the code added in this commit gives better information with no overhead that I could measure (in fact in some cases I measured lower times with this code, but the differences were all less than one standard deviation). [ v2: address review comments by Andrew Morton. ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Andrew Morton Orig-LKML-Reference: <18890.6578.728637.139402@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index d48596ab655..df007fe0cc0 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -455,6 +455,8 @@ static void counter_sched_in(struct perf_counter *counter, int cpu) { counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; + counter->tstamp_running += counter->ctx->time_now - + counter->tstamp_stopped; if (is_software_counter(counter)) counter->hw_ops->enable(counter); } -- cgit v1.2.3 From 925d519ab82b6dd7aca9420d809ee83819c08db2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:02 +0200 Subject: perf_counter: unify and fix delayed counter wakeup While going over the wakeup code I noticed delayed wakeups only work for hardware counters but basically all software counters rely on them. This patch unifies and generalizes the delayed wakeup to fix this issue. Since we're dealing with NMI context bits here, use a cmpxchg() based single link list implementation to track counters that have pending wakeups. [ This should really be generic code for delayed wakeups, but since we cannot use cmpxchg()/xchg() in generic code, I've let it live in the perf_counter code. -- Eric Dumazet could use it to aggregate the network wakeups. 
] Furthermore, the x86 method of using TIF flags was flawed in that it's quite possible to end up setting the bit on the idle task, losing the wakeup. The powerpc method uses per-cpu storage and does appear to be sufficient. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.153932974@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/hw_irq.h | 4 ++-- arch/powerpc/kernel/irq.c | 2 +- arch/powerpc/kernel/perf_counter.c | 22 ++-------------------- 3 files changed, 5 insertions(+), 23 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index cb32d571c9c..20a44d0c9fd 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -132,7 +132,7 @@ static inline int irqs_disabled_flags(unsigned long flags) struct irq_chip; #ifdef CONFIG_PERF_COUNTERS -static inline unsigned long get_perf_counter_pending(void) +static inline unsigned long test_perf_counter_pending(void) { unsigned long x; @@ -160,7 +160,7 @@ extern void perf_counter_do_pending(void); #else -static inline unsigned long get_perf_counter_pending(void) +static inline unsigned long test_perf_counter_pending(void) { return 0; } diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 469e9635ff0..2cd471f92fe 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -135,7 +135,7 @@ notrace void raw_local_irq_restore(unsigned long en) iseries_handle_interrupts(); } - if (get_perf_counter_pending()) { + if (test_perf_counter_pending()) { clear_perf_counter_pending(); perf_counter_do_pending(); } diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index df007fe0cc0..cde720fc495 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -649,24 +649,6 @@ hw_perf_counter_init(struct perf_counter *counter) return &power_perf_ops; } -/* - * Handle wakeups. - */ -void perf_counter_do_pending(void) -{ - int i; - struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); - struct perf_counter *counter; - - for (i = 0; i < cpuhw->n_counters; ++i) { - counter = cpuhw->counter[i]; - if (counter && counter->wakeup_pending) { - counter->wakeup_pending = 0; - wake_up(&counter->waitq); - } - } -} - /* * A counter has overflowed; update its count and record * things if requested. Note that interrupts are hard-disabled @@ -720,7 +702,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); struct perf_counter *counter; long val; - int need_wakeup = 0, found = 0; + int found = 0; for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; @@ -761,7 +743,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) * immediately; otherwise we'll have do the wakeup when interrupts * get soft-enabled. */ - if (get_perf_counter_pending() && regs->softe) { + if (test_perf_counter_pending() && regs->softe) { irq_enter(); clear_perf_counter_pending(); perf_counter_do_pending(); -- cgit v1.2.3 From 7595d63b3a9ce65d14c4fbd0e7de448a343d7215 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Mar 2009 19:07:07 +0200 Subject: perf_counter: powerpc: only reserve PMU hardware when we need it Impact: cooperate with oprofile At present, on PowerPC, if you have perf_counters compiled in, oprofile doesn't work.
There is code to allow the PMU to be shared between competing subsystems, such as perf_counters and oprofile, but currently the perf_counter subsystem reserves the PMU for itself at boot time, and never releases it. This makes perf_counter play nicely with oprofile. Now we keep a count of how many perf_counter instances are counting hardware events, and reserve the PMU when that count becomes non-zero, and release the PMU when that count becomes zero. This means that it is possible to have perf_counters compiled in and still use oprofile, as long as there are no hardware perf_counters active. This also means that if oprofile is active, sys_perf_counter_open will fail if the hw_event specifies a hardware event. To avoid races with other tasks creating and destroying perf_counters, we use a mutex. We use atomic_inc_not_zero and atomic_add_unless to avoid having to take the mutex unless there is a possibility of the count going between 0 and 1. Signed-off-by: Paul Mackerras Signed-off-by: Peter Zijlstra Orig-LKML-Reference: <20090330171023.627912475@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 47 ++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 5 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index cde720fc495..560dd1e7b52 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -41,6 +41,8 @@ struct power_pmu *ppmu; */ static unsigned int freeze_counters_kernel = MMCR0_FCS; +static void perf_counter_interrupt(struct pt_regs *regs); + void perf_counter_print_debug(void) { } @@ -594,6 +596,24 @@ struct hw_perf_counter_ops power_perf_ops = { .read = power_perf_read }; +/* Number of perf_counters counting hardware events */ +static atomic_t num_counters; +/* Used to avoid races in calling reserve/release_pmc_hardware */ +static DEFINE_MUTEX(pmc_reserve_mutex); + +/* + * Release the PMU if this is the last perf_counter. + */ +static void hw_perf_counter_destroy(struct perf_counter *counter) +{ + if (!atomic_add_unless(&num_counters, -1, 1)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_dec_return(&num_counters) == 0) + release_pmc_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter) { @@ -601,6 +621,7 @@ hw_perf_counter_init(struct perf_counter *counter) struct perf_counter *ctrs[MAX_HWCOUNTERS]; unsigned int events[MAX_HWCOUNTERS]; int n; + int err; if (!ppmu) return NULL; @@ -646,6 +667,27 @@ hw_perf_counter_init(struct perf_counter *counter) counter->hw.config = events[n]; atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); + + /* + * See if we need to reserve the PMU. + * If no counters are currently in use, then we have to take a + * mutex to ensure that we don't race with another task doing + * reserve_pmc_hardware or release_pmc_hardware. 
+ */ + err = 0; + if (!atomic_inc_not_zero(&num_counters)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&num_counters) == 0 && + reserve_pmc_hardware(perf_counter_interrupt)) + err = -EBUSY; + else + atomic_inc(&num_counters); + mutex_unlock(&pmc_reserve_mutex); + } + counter->destroy = hw_perf_counter_destroy; + + if (err) + return NULL; return &power_perf_ops; } @@ -769,11 +811,6 @@ static int init_perf_counters(void) { unsigned long pvr; - if (reserve_pmc_hardware(perf_counter_interrupt)) { - printk(KERN_ERR "Couldn't init performance monitor subsystem\n"); - return -EBUSY; - } - /* XXX should get this from cputable */ pvr = mfspr(SPRN_PVR); switch (PVR_VER(pvr)) { -- cgit v1.2.3 From d5d2bc0dd0379deddb9ede66fec90a3083eaec57 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Mar 2009 19:07:08 +0200 Subject: perf_counter: make it possible for hw_perf_counter_init to return error codes Impact: better error reporting At present, if hw_perf_counter_init encounters an error, all it can do is return NULL, which causes sys_perf_counter_open to return an EINVAL error to userspace. This isn't very informative for userspace; it means that userspace can't tell the difference between "sorry, oprofile is already using the PMU" and "we don't support this CPU" and "this CPU doesn't support the requested generic hardware event". This commit uses the PTR_ERR/ERR_PTR/IS_ERR set of macros to let hw_perf_counter_init return an error code on error rather than just NULL if it wishes. If it does so, that error code will be returned from sys_perf_counter_open to userspace. If it returns NULL, an EINVAL error will be returned to userspace, as before. This also adapts the powerpc hw_perf_counter_init to make use of this to return ENXIO, EINVAL, EBUSY, or EOPNOTSUPP as appropriate. It would be good to add extra error numbers in future to allow userspace to distinguish the various errors that are currently reported as EINVAL, i.e. irq_period < 0, too many events in a group, conflict between exclude_* settings in a group, and PMU resource conflict in a group. [ v2: fix a bug pointed out by Corey Ashford where error returns from hw_perf_counter_init were not handled correctly in the case of raw hardware events.] 
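The resulting calling convention in the core looks roughly like this (a sketch of the described contract; the actual core hunk is outside this arch/powerpc-limited log):

	const struct hw_perf_counter_ops *hw_ops;
	int err = 0;

	hw_ops = hw_perf_counter_init(counter);
	if (IS_ERR(hw_ops)) {
		err = PTR_ERR(hw_ops);	/* e.g. -ENXIO, -EBUSY, -EOPNOTSUPP */
		hw_ops = NULL;
	} else if (!hw_ops) {
		err = -EINVAL;		/* a NULL return keeps its old meaning */
	}
	/* err is what sys_perf_counter_open ultimately returns to userspace */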
Signed-off-by: Paul Mackerras Signed-off-by: Peter Zijlstra Orig-LKML-Reference: <20090330171023.682428180@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 560dd1e7b52..0a4d14f279a 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -624,13 +624,13 @@ hw_perf_counter_init(struct perf_counter *counter) int err; if (!ppmu) - return NULL; + return ERR_PTR(-ENXIO); if ((s64)counter->hw_event.irq_period < 0) - return NULL; + return ERR_PTR(-EINVAL); if (!perf_event_raw(&counter->hw_event)) { ev = perf_event_id(&counter->hw_event); if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) - return NULL; + return ERR_PTR(-EOPNOTSUPP); ev = ppmu->generic_events[ev]; } else { ev = perf_event_config(&counter->hw_event); @@ -656,14 +656,14 @@ hw_perf_counter_init(struct perf_counter *counter) n = collect_events(counter->group_leader, ppmu->n_counter - 1, ctrs, events); if (n < 0) - return NULL; + return ERR_PTR(-EINVAL); events[n] = ev; ctrs[n] = counter; if (check_excludes(ctrs, n, 1)) - return NULL; + return ERR_PTR(-EINVAL); if (power_check_constraints(events, n + 1)) - return NULL; + return ERR_PTR(-EINVAL); counter->hw.config = events[n]; atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); @@ -687,7 +687,7 @@ hw_perf_counter_init(struct perf_counter *counter) counter->destroy = hw_perf_counter_destroy; if (err) - return NULL; + return ERR_PTR(err); return &power_perf_ops; } -- cgit v1.2.3 From f6c7d5fe58b4846ee0cb4b98b6042489705eced4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:04 +0200 Subject: perf_counter: there's more to overflow than writing events Prepare for more generic overflow handling. The new perf_counter_overflow() method will handle the generic bits of the counter overflow, and can return a !0 value, in which case the counter should be (soft) disabled, so that it won't count until it's properly disabled. XXX: do powerpc and swcounter Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.812109629@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 0a4d14f279a..f88c35d0710 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -732,7 +732,7 @@ static void record_and_restart(struct perf_counter *counter, long val, * Finally record data if requested. */ if (record) - perf_counter_output(counter, 1, regs); + perf_counter_overflow(counter, 1, regs); } /* -- cgit v1.2.3 From dc66270b51a62b1a6888d5309229e638a305c47b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 8 Apr 2009 20:30:10 +1000 Subject: perf_counter: fix powerpc build Commit 4af4998b ("perf_counter: rework context time") changed struct perf_counter_context to have a 'time' field instead of a 'time_now' field, but neglected to fix the place in the powerpc perf_counter.c where the time_now field was accessed. This fixes it.
Signed-off-by: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <18908.31922.411398.147810@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index f88c35d0710..0e5651385dd 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -457,8 +457,7 @@ static void counter_sched_in(struct perf_counter *counter, int cpu) { counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; - counter->tstamp_running += counter->ctx->time_now - - counter->tstamp_stopped; + counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped; if (is_software_counter(counter)) counter->hw_ops->enable(counter); } -- cgit v1.2.3 From f708223d49ac39f5af1643985056206c98033f5b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 8 Apr 2009 20:30:18 +1000 Subject: perf_counter: powerpc: set sample enable bit for marked instruction events Impact: enable access to hardware feature POWER processors have the ability to "mark" a subset of the instructions and provide more detailed information on what happens to the marked instructions as they flow through the pipeline. This marking is enabled by the "sample enable" bit in MMCRA, and there are synchronization requirements around setting and clearing the bit. This adds logic to the processor-specific back-ends so that they know which events relate to marked instructions and set the sampling enable bit if any event that we want to put on the PMU is a marked instruction event. It also adds logic to the generic powerpc code to do the necessary synchronization if that bit is set. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <18908.31930.1024.228867@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 28 +++++++-- arch/powerpc/kernel/power5+-pmu.c | 103 +++++++++++++++++++++++++++++- arch/powerpc/kernel/power5-pmu.c | 96 +++++++++++++++++++++++++++- arch/powerpc/kernel/power6-pmu.c | 126 ++++++++++++++++++++++++++++++++++++- arch/powerpc/kernel/ppc970-pmu.c | 72 ++++++++++++++++++++- 5 files changed, 413 insertions(+), 12 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 0e5651385dd..0697ade84dd 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -306,6 +306,15 @@ u64 hw_perf_save_disable(void) cpuhw->pmcs_enabled = 1; } + /* + * Disable instruction sampling if it was enabled + */ + if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { + mtspr(SPRN_MMCRA, + cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); + mb(); + } + /* * Set the 'freeze counters' bit. * The barrier is to make sure the mtspr has been @@ -347,12 +356,11 @@ void hw_perf_restore(u64 disable) * (possibly updated for removal of counters). */ if (!cpuhw->n_added) { - mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); + mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); - mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); if (cpuhw->n_counters == 0) get_lppaca()->pmcregs_in_use = 0; - goto out; + goto out_enable; } /* @@ -385,7 +393,7 @@ void hw_perf_restore(u64 disable) * Then unfreeze the counters. 
*/ get_lppaca()->pmcregs_in_use = 1; - mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); + mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)) | MMCR0_FC); @@ -421,10 +429,20 @@ void hw_perf_restore(u64 disable) write_pmc(counter->hw.idx, val); perf_counter_update_userpage(counter); } - mb(); cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; + + out_enable: + mb(); mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); + /* + * Enable instruction sampling if necessary + */ + if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { + mb(); + mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); + } + out: local_irq_restore(flags); } diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index cec21ea65b0..1222c8ea3c2 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -1,5 +1,5 @@ /* - * Performance counter support for POWER5 (not POWER5++) processors. + * Performance counter support for POWER5+/++ (not POWER5) processors. * * Copyright 2009 Paul Mackerras, IBM Corporation. * @@ -281,10 +281,107 @@ static int power5p_get_alternatives(unsigned int event, unsigned int alt[]) return nalt; } +/* + * Map of which direct events on which PMCs are marked instruction events. + * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event. + * Bit 0 is set if it is marked for all PMCs. + * The 0x80 bit indicates a byte decode PMCSEL value. + */ +static unsigned char direct_event_is_marked[0x28] = { + 0, /* 00 */ + 0x1f, /* 01 PM_IOPS_CMPL */ + 0x2, /* 02 PM_MRK_GRP_DISP */ + 0xe, /* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */ + 0, /* 04 */ + 0x1c, /* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */ + 0x80, /* 06 */ + 0x80, /* 07 */ + 0, 0, 0,/* 08 - 0a */ + 0x18, /* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */ + 0, /* 0c */ + 0x80, /* 0d */ + 0x80, /* 0e */ + 0, /* 0f */ + 0, /* 10 */ + 0x14, /* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */ + 0, /* 12 */ + 0x10, /* 13 PM_MRK_GRP_CMPL */ + 0x1f, /* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */ + 0x2, /* 15 PM_MRK_GRP_ISSUED */ + 0x80, /* 16 */ + 0x80, /* 17 */ + 0, 0, 0, 0, 0, + 0x80, /* 1d */ + 0x80, /* 1e */ + 0, /* 1f */ + 0x80, /* 20 */ + 0x80, /* 21 */ + 0x80, /* 22 */ + 0x80, /* 23 */ + 0x80, /* 24 */ + 0x80, /* 25 */ + 0x80, /* 26 */ + 0x80, /* 27 */ +}; + +/* + * Returns 1 if event counts things relating to marked instructions + * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. 
+ */ +static int power5p_marked_instr_event(unsigned int event) +{ + int pmc, psel; + int bit, byte, unit; + u32 mask; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + psel = event & PM_PMCSEL_MSK; + if (pmc >= 5) + return 0; + + bit = -1; + if (psel < sizeof(direct_event_is_marked)) { + if (direct_event_is_marked[psel] & (1 << pmc)) + return 1; + if (direct_event_is_marked[psel] & 0x80) + bit = 4; + else if (psel == 0x08) + bit = pmc - 1; + else if (psel == 0x10) + bit = 4 - pmc; + else if (psel == 0x1b && (pmc == 1 || pmc == 3)) + bit = 4; + } else if ((psel & 0x48) == 0x40) { + bit = psel & 7; + } else if (psel == 0x28) { + bit = pmc - 1; + } else if (pmc == 3 && (psel == 0x2e || psel == 0x2f)) { + bit = 4; + } + + if (!(event & PM_BUSEVENT_MSK) || bit == -1) + return 0; + + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; + if (unit == PM_LSU0) { + /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */ + mask = 0x5dff00; + } else if (unit == PM_LSU1 && byte >= 4) { + byte -= 4; + /* byte 5 bits 6-7, byte 6 bits 0,4, byte 7 bits 0-4,6 */ + mask = 0x5f11c000; + } else + return 0; + + return (mask >> (byte * 8 + bit)) & 1; +} + static int power5p_compute_mmcr(unsigned int event[], int n_ev, unsigned int hwc[], u64 mmcr[]) { u64 mmcr1 = 0; + u64 mmcra = 0; unsigned int pmc, unit, byte, psel; unsigned int ttm; int i, isbus, bit, grsel; @@ -404,6 +501,8 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev, grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK; mmcr1 |= (u64)grsel << grsel_shift[bit]; } + if (power5p_marked_instr_event(event[i])) + mmcra |= MMCRA_SAMPLE_ENABLE; if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1)) /* select alternate byte lane */ psel |= 0x10; @@ -419,7 +518,7 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev, if (pmc_inuse & 0x3e) mmcr[0] |= MMCR0_PMCjCE; mmcr[1] = mmcr1; - mmcr[2] = 0; + mmcr[2] = mmcra; return 0; } diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c index 379ed1087cc..116c4bb1809 100644 --- a/arch/powerpc/kernel/power5-pmu.c +++ b/arch/powerpc/kernel/power5-pmu.c @@ -290,10 +290,102 @@ static int power5_get_alternatives(unsigned int event, unsigned int alt[]) return nalt; } +/* + * Map of which direct events on which PMCs are marked instruction events. + * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event. + * Bit 0 is set if it is marked for all PMCs. + * The 0x80 bit indicates a byte decode PMCSEL value. + */ +static unsigned char direct_event_is_marked[0x28] = { + 0, /* 00 */ + 0x1f, /* 01 PM_IOPS_CMPL */ + 0x2, /* 02 PM_MRK_GRP_DISP */ + 0xe, /* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */ + 0, /* 04 */ + 0x1c, /* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */ + 0x80, /* 06 */ + 0x80, /* 07 */ + 0, 0, 0,/* 08 - 0a */ + 0x18, /* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */ + 0, /* 0c */ + 0x80, /* 0d */ + 0x80, /* 0e */ + 0, /* 0f */ + 0, /* 10 */ + 0x14, /* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */ + 0, /* 12 */ + 0x10, /* 13 PM_MRK_GRP_CMPL */ + 0x1f, /* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */ + 0x2, /* 15 PM_MRK_GRP_ISSUED */ + 0x80, /* 16 */ + 0x80, /* 17 */ + 0, 0, 0, 0, 0, + 0x80, /* 1d */ + 0x80, /* 1e */ + 0, /* 1f */ + 0x80, /* 20 */ + 0x80, /* 21 */ + 0x80, /* 22 */ + 0x80, /* 23 */ + 0x80, /* 24 */ + 0x80, /* 25 */ + 0x80, /* 26 */ + 0x80, /* 27 */ +}; + +/* + * Returns 1 if event counts things relating to marked instructions + * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. 
+ */ +static int power5_marked_instr_event(unsigned int event) +{ + int pmc, psel; + int bit, byte, unit; + u32 mask; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + psel = event & PM_PMCSEL_MSK; + if (pmc >= 5) + return 0; + + bit = -1; + if (psel < sizeof(direct_event_is_marked)) { + if (direct_event_is_marked[psel] & (1 << pmc)) + return 1; + if (direct_event_is_marked[psel] & 0x80) + bit = 4; + else if (psel == 0x08) + bit = pmc - 1; + else if (psel == 0x10) + bit = 4 - pmc; + else if (psel == 0x1b && (pmc == 1 || pmc == 3)) + bit = 4; + } else if ((psel & 0x58) == 0x40) + bit = psel & 7; + + if (!(event & PM_BUSEVENT_MSK)) + return 0; + + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; + if (unit == PM_LSU0) { + /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */ + mask = 0x5dff00; + } else if (unit == PM_LSU1 && byte >= 4) { + byte -= 4; + /* byte 4 bits 1,3,5,7, byte 5 bits 6-7, byte 7 bits 0-4,6 */ + mask = 0x5f00c0aa; + } else + return 0; + + return (mask >> (byte * 8 + bit)) & 1; +} + static int power5_compute_mmcr(unsigned int event[], int n_ev, unsigned int hwc[], u64 mmcr[]) { u64 mmcr1 = 0; + u64 mmcra = 0; unsigned int pmc, unit, byte, psel; unsigned int ttm, grp; int i, isbus, bit, grsel; @@ -430,6 +522,8 @@ static int power5_compute_mmcr(unsigned int event[], int n_ev, grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK; mmcr1 |= (u64)grsel << grsel_shift[bit]; } + if (power5_marked_instr_event(event[i])) + mmcra |= MMCRA_SAMPLE_ENABLE; if (pmc <= 3) mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc); hwc[i] = pmc; @@ -442,7 +536,7 @@ static int power5_compute_mmcr(unsigned int event[], int n_ev, if (pmc_inuse & 0x3e) mmcr[0] |= MMCR0_PMCjCE; mmcr[1] = mmcr1; - mmcr[2] = 0; + mmcr[2] = mmcra; return 0; } diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index b1f61f3c97b..fce1fc290a1 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -48,6 +48,127 @@ #define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8) #define MMCR1_PMCSEL_MSK 0xff +/* + * Map of which direct events on which PMCs are marked instruction events. + * Indexed by PMCSEL value >> 1. + * Bottom 4 bits are a map of which PMCs are interesting, + * top 4 bits say what sort of event: + * 0 = direct marked event, + * 1 = byte decode event, + * 4 = add/and event (PMC1 -> bits 0 & 4), + * 5 = add/and event (PMC1 -> bits 1 & 5), + * 6 = add/and event (PMC1 -> bits 2 & 6), + * 7 = add/and event (PMC1 -> bits 3 & 7). 
+ */ +static unsigned char direct_event_is_marked[0x60 >> 1] = { + 0, /* 00 */ + 0, /* 02 */ + 0, /* 04 */ + 0x07, /* 06 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */ + 0x04, /* 08 PM_MRK_DFU_FIN */ + 0x06, /* 0a PM_MRK_IFU_FIN, PM_MRK_INST_FIN */ + 0, /* 0c */ + 0, /* 0e */ + 0x02, /* 10 PM_MRK_INST_DISP */ + 0x08, /* 12 PM_MRK_LSU_DERAT_MISS */ + 0, /* 14 */ + 0, /* 16 */ + 0x0c, /* 18 PM_THRESH_TIMEO, PM_MRK_INST_FIN */ + 0x0f, /* 1a PM_MRK_INST_DISP, PM_MRK_{FXU,FPU,LSU}_FIN */ + 0x01, /* 1c PM_MRK_INST_ISSUED */ + 0, /* 1e */ + 0, /* 20 */ + 0, /* 22 */ + 0, /* 24 */ + 0, /* 26 */ + 0x15, /* 28 PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L3MISS */ + 0, /* 2a */ + 0, /* 2c */ + 0, /* 2e */ + 0x4f, /* 30 */ + 0x7f, /* 32 */ + 0x4f, /* 34 */ + 0x5f, /* 36 */ + 0x6f, /* 38 */ + 0x4f, /* 3a */ + 0, /* 3c */ + 0x08, /* 3e PM_MRK_INST_TIMEO */ + 0x1f, /* 40 */ + 0x1f, /* 42 */ + 0x1f, /* 44 */ + 0x1f, /* 46 */ + 0x1f, /* 48 */ + 0x1f, /* 4a */ + 0x1f, /* 4c */ + 0x1f, /* 4e */ + 0, /* 50 */ + 0x05, /* 52 PM_MRK_BR_TAKEN, PM_MRK_BR_MPRED */ + 0x1c, /* 54 PM_MRK_PTEG_FROM_L3MISS, PM_MRK_PTEG_FROM_L2MISS */ + 0x02, /* 56 PM_MRK_LD_MISS_L1 */ + 0, /* 58 */ + 0, /* 5a */ + 0, /* 5c */ + 0, /* 5e */ +}; + +/* + * Masks showing for each unit which bits are marked events. + * These masks are in LE order, i.e. 0x00000001 is byte 0, bit 0. + */ +static u32 marked_bus_events[16] = { + 0x01000000, /* direct events set 1: byte 3 bit 0 */ + 0x00010000, /* direct events set 2: byte 2 bit 0 */ + 0, 0, 0, 0, /* IDU, IFU, nest: nothing */ + 0x00000088, /* VMX set 1: byte 0 bits 3, 7 */ + 0x000000c0, /* VMX set 2: byte 0 bits 4-7 */ + 0x04010000, /* LSU set 1: byte 2 bit 0, byte 3 bit 2 */ + 0xff010000u, /* LSU set 2: byte 2 bit 0, all of byte 3 */ + 0, /* LSU set 3 */ + 0x00000010, /* VMX set 3: byte 0 bit 4 */ + 0, /* BFP set 1 */ + 0x00000022, /* BFP set 2: byte 0 bits 1, 5 */ + 0, 0 +}; + +/* + * Returns 1 if event counts things relating to marked instructions + * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. 
+ */ +static int power6_marked_instr_event(unsigned int event) +{ + int pmc, psel, ptype; + int bit, byte, unit; + u32 mask; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + psel = (event & PM_PMCSEL_MSK) >> 1; /* drop edge/level bit */ + if (pmc >= 5) + return 0; + + bit = -1; + if (psel < sizeof(direct_event_is_marked)) { + ptype = direct_event_is_marked[psel]; + if (pmc == 0 || !(ptype & (1 << (pmc - 1)))) + return 0; + ptype >>= 4; + if (ptype == 0) + return 1; + if (ptype == 1) + bit = 0; + else + bit = ptype ^ (pmc - 1); + } else if ((psel & 0x48) == 0x40) + bit = psel & 7; + + if (!(event & PM_BUSEVENT_MSK) || bit == -1) + return 0; + + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; + mask = marked_bus_events[unit]; + return (mask >> (byte * 8 + bit)) & 1; +} + /* * Assign PMC numbers and compute MMCR1 value for a set of events */ @@ -55,6 +176,7 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev, unsigned int hwc[], u64 mmcr[]) { u64 mmcr1 = 0; + u64 mmcra = 0; int i; unsigned int pmc, ev, b, u, s, psel; unsigned int ttmset = 0; @@ -116,6 +238,8 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev, if (ev & PM_LLAV) mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc; } + if (power6_marked_instr_event(event[i])) + mmcra |= MMCRA_SAMPLE_ENABLE; mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc); } mmcr[0] = 0; @@ -124,7 +248,7 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev, if (pmc_inuse & 0xe) mmcr[0] |= MMCR0_PMCjCE; mmcr[1] = mmcr1; - mmcr[2] = 0; + mmcr[2] = mmcra; return 0; } diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c index c3256580be1..aed8ccd7c07 100644 --- a/arch/powerpc/kernel/ppc970-pmu.c +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -19,6 +19,8 @@ #define PM_PMC_MSK 0xf #define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */ #define PM_UNIT_MSK 0xf +#define PM_SPCSEL_SH 6 +#define PM_SPCSEL_MSK 3 #define PM_BYTE_SH 4 /* Byte number of event bus to use */ #define PM_BYTE_MSK 3 #define PM_PMCSEL_MSK 0xf @@ -88,8 +90,11 @@ static short mmcr1_adder_bits[8] = { * Layout of constraint bits: * 6666555555555544444444443333333333222222222211111111110000000000 * 3210987654321098765432109876543210987654321098765432109876543210 - * <><>[ >[ >[ >< >< >< >< ><><><><><><><><> - * T0T1 UC PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8 + * <><><>[ >[ >[ >< >< >< >< ><><><><><><><><> + * SPT0T1 UC PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8 + * + * SP - SPCSEL constraint + * 48-49: SPCSEL value 0x3_0000_0000_0000 * * T0 - TTM0 constraint * 46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000 @@ -126,6 +131,57 @@ static short mmcr1_adder_bits[8] = { * 0-13: Count of events needing PMC2..PMC8 */ +static unsigned char direct_marked_event[8] = { + (1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */ + (1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */ + (1<<3) | (1<<5), /* PMC3: PM_MRK_ST_CMPL_INT, PM_MRK_VMX_FIN */ + (1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */ + (1<<4) | (1<<5), /* PMC5: PM_GRP_MRK, PM_MRK_GRP_TIMEO */ + (1<<3) | (1<<4) | (1<<5), + /* PMC6: PM_MRK_ST_STS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */ + (1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */ + (1<<4) /* PMC8: PM_MRK_LSU_FIN */ +}; + +/* + * Returns 1 if event counts things relating to marked instructions + * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. 
+ */ +static int p970_marked_instr_event(unsigned int event) +{ + int pmc, psel, unit, byte, bit; + unsigned int mask; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + psel = event & PM_PMCSEL_MSK; + if (pmc) { + if (direct_marked_event[pmc - 1] & (1 << psel)) + return 1; + if (psel == 0) /* add events */ + bit = (pmc <= 4)? pmc - 1: 8 - pmc; + else if (psel == 7 || psel == 13) /* decode events */ + bit = 4; + else + return 0; + } else + bit = psel; + + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; + mask = 0; + switch (unit) { + case PM_VPU: + mask = 0x4c; /* byte 0 bits 2,3,6 */ + break; + case PM_LSU0: + /* byte 2 bits 0,2,3,4,6; all of byte 1 */ + mask = 0x085dff00; + break; + case PM_LSU1L: + mask = 0x50 << 24; /* byte 3 bits 4,6 */ + break; + } + return (mask >> (byte * 8 + bit)) & 1; +} + /* Masks and values for using events from the various units */ static u64 unit_cons[PM_LASTUNIT+1][2] = { [PM_FPU] = { 0xc80000000000ull, 0x040000000000ull }, @@ -138,7 +194,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = { static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp) { - int pmc, byte, unit, sh; + int pmc, byte, unit, sh, spcsel; u64 mask = 0, value = 0; int grp = -1; @@ -177,6 +233,11 @@ static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp) mask |= 0x800000000ull; value |= 0x100000000ull; } + spcsel = (event >> PM_SPCSEL_SH) & PM_SPCSEL_MSK; + if (spcsel) { + mask |= 3ull << 48; + value |= (u64)spcsel << 48; + } *maskp = mask; *valp = value; return 0; @@ -209,6 +270,7 @@ static int p970_compute_mmcr(unsigned int event[], int n_ev, unsigned char ttmuse[2]; unsigned char pmcsel[8]; int i; + int spcsel; if (n_ev > 8) return -1; @@ -316,6 +378,10 @@ static int p970_compute_mmcr(unsigned int event[], int n_ev, } pmcsel[pmc] = psel; hwc[i] = pmc; + spcsel = (event[i] >> PM_SPCSEL_SH) & PM_SPCSEL_MSK; + mmcr1 |= spcsel; + if (p970_marked_instr_event(event[i])) + mmcra |= MMCRA_SAMPLE_ENABLE; } for (pmc = 0; pmc < 2; ++pmc) mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc); -- cgit v1.2.3 From 78f13e9525ba777da25c4ddab89f28e9366a8b7c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:33 +0200 Subject: perf_counter: allow for data addresses to be recorded Paul suggested we allow for data addresses to be recorded along with the traditional IPs as power can provide these. For now, only the software pagefault events provide data addresses, but in the future power might as well for some events. x86 doesn't seem capable of providing this atm. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.394816925@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 2 +- arch/powerpc/mm/fault.c | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 0697ade84dd..c9d019f1907 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -749,7 +749,7 @@ static void record_and_restart(struct perf_counter *counter, long val, * Finally record data if requested.
*/ if (record) - perf_counter_overflow(counter, 1, regs); + perf_counter_overflow(counter, 1, regs, 0); } /* diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 17bbf6f91fb..ac0e112031b 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address); /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -312,7 +312,8 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, + regs, address); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { preempt_disable(); @@ -322,7 +323,8 @@ good_area: #endif } else { current->min_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, + regs, address); } up_read(&mm->mmap_sem); return 0; -- cgit v1.2.3 From ca8f2d7f019a8547f39ddb9ed0144932f12807f2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 9 Apr 2009 14:42:56 +1000 Subject: perf_counter: powerpc: add nmi_enter/nmi_exit calls Impact: fix potential deadlocks on powerpc Now that the core is using in_nmi() (added in e30e08f6, "perf_counter: fix NMI race in task clock"), we need the powerpc perf_counter_interrupt to call nmi_enter() and nmi_exit() in those cases where the interrupt happens when interrupts are soft-disabled. If interrupts were soft-enabled, we can treat it as a regular interrupt and do irq_enter/irq_exit around the whole routine. This lets us get rid of the test_perf_counter_pending() call at the end of perf_counter_interrupt, thus simplifying things a little. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <18909.31952.873098.336615@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index c9d019f1907..bd76d0fa2c3 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -714,7 +714,7 @@ hw_perf_counter_init(struct perf_counter *counter) * here so there is no possibility of being interrupted. */ static void record_and_restart(struct perf_counter *counter, long val, - struct pt_regs *regs) + struct pt_regs *regs, int nmi) { s64 prev, delta, left; int record = 0; @@ -749,7 +749,7 @@ static void record_and_restart(struct perf_counter *counter, long val, * Finally record data if requested. */ if (record) - perf_counter_overflow(counter, 1, regs, 0); + perf_counter_overflow(counter, nmi, regs, 0); } /* @@ -762,6 +762,17 @@ static void perf_counter_interrupt(struct pt_regs *regs) struct perf_counter *counter; long val; int found = 0; + int nmi; + + /* + * If interrupts were soft-disabled when this PMU interrupt + * occurred, treat it as an NMI. 
+ */ + nmi = !regs->softe; + if (nmi) + nmi_enter(); + else + irq_enter(); for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; @@ -769,7 +780,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) if ((int)val < 0) { /* counter has overflowed */ found = 1; - record_and_restart(counter, val, regs); + record_and_restart(counter, val, regs, nmi); } } @@ -796,18 +807,10 @@ static void perf_counter_interrupt(struct pt_regs *regs) */ mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); - /* - * If we need a wakeup, check whether interrupts were soft-enabled - * when we took the interrupt. If they were, we can wake stuff up - * immediately; otherwise we'll have do the wakeup when interrupts - * get soft-enabled. - */ - if (test_perf_counter_pending() && regs->softe) { - irq_enter(); - clear_perf_counter_pending(); - perf_counter_do_pending(); + if (nmi) + nmi_exit(); + else irq_exit(); - } } void hw_perf_counter_setup(int cpu) -- cgit v1.2.3 From 4aeb0b4239bb3b67ed402cb9cef3e000c892cadf Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:03 +0200 Subject: perfcounters: rename struct hw_perf_counter_ops into struct pmu This patch renames struct hw_perf_counter_ops into struct pmu. It introduces a structure to describe a cpu specific pmu (performance monitoring unit). It may contain ops and data. The new name of the structure fits better, is shorter, and thus better to handle. Where it was appropriate, names of function and variable have been changed too. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-7-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index bd76d0fa2c3..d9bbe5efc64 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -256,7 +256,7 @@ static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new) return 0; } -static void power_perf_read(struct perf_counter *counter) +static void power_pmu_read(struct perf_counter *counter) { long val, delta, prev; @@ -405,7 +405,7 @@ void hw_perf_restore(u64 disable) for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) { - power_perf_read(counter); + power_pmu_read(counter); write_pmc(counter->hw.idx, 0); counter->hw.idx = 0; } @@ -477,7 +477,7 @@ static void counter_sched_in(struct perf_counter *counter, int cpu) counter->oncpu = cpu; counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped; if (is_software_counter(counter)) - counter->hw_ops->enable(counter); + counter->pmu->enable(counter); } /* @@ -533,7 +533,7 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, * re-enable the PMU in order to get hw_perf_restore to do the * actual work of reconfiguring the PMU. */ -static int power_perf_enable(struct perf_counter *counter) +static int power_pmu_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; unsigned long flags; @@ -573,7 +573,7 @@ static int power_perf_enable(struct perf_counter *counter) /* * Remove a counter from the PMU. 
*/ -static void power_perf_disable(struct perf_counter *counter) +static void power_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; long i; @@ -583,7 +583,7 @@ static void power_perf_disable(struct perf_counter *counter) local_irq_save(flags); pmudis = hw_perf_save_disable(); - power_perf_read(counter); + power_pmu_read(counter); cpuhw = &__get_cpu_var(cpu_hw_counters); for (i = 0; i < cpuhw->n_counters; ++i) { @@ -607,10 +607,10 @@ static void power_perf_disable(struct perf_counter *counter) local_irq_restore(flags); } -struct hw_perf_counter_ops power_perf_ops = { - .enable = power_perf_enable, - .disable = power_perf_disable, - .read = power_perf_read +struct pmu power_pmu = { + .enable = power_pmu_enable, + .disable = power_pmu_disable, + .read = power_pmu_read, }; /* Number of perf_counters counting hardware events */ @@ -631,8 +631,7 @@ static void hw_perf_counter_destroy(struct perf_counter *counter) } } -const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter) +const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { unsigned long ev; struct perf_counter *ctrs[MAX_HWCOUNTERS]; @@ -705,7 +704,7 @@ hw_perf_counter_init(struct perf_counter *counter) if (err) return ERR_PTR(err); - return &power_perf_ops; + return &power_pmu; } /* -- cgit v1.2.3 From ab7ef2e50a557af92f4f90689f51fadadafc16b2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 29 Apr 2009 22:38:51 +1000 Subject: perf_counter: powerpc: allow use of limited-function counters POWER5+ and POWER6 have two hardware counters with limited functionality: PMC5 counts instructions completed in run state and PMC6 counts cycles in run state. (Run state is the state when a hardware RUN bit is 1; the idle task clears RUN while waiting for work to do and sets it when there is work to do.) These counters can't be written to by the kernel, can't generate interrupts, and don't obey the freeze conditions. That means we can only use them for per-task counters (where we know we'll always be in run state; we can't put a per-task counter on an idle task), and only if we don't want interrupts and we do want to count in all processor modes. Obviously some counters can't go on a limited hardware counter, but there are also situations where we can only put a counter on a limited hardware counter - if there are already counters on that exclude some processor modes and we want to put on a per-task cycle or instruction counter that doesn't exclude any processor mode, it could go on if it can use a limited hardware counter. To keep track of these constraints, this adds a flags argument to the processor-specific get_alternatives() functions, with three bits defined: one to say that we can accept alternative event codes that go on limited counters, one to say we only want alternatives on limited counters, and one to say that this is a per-task counter and therefore events that are gated by run state are equivalent to those that aren't (e.g. a "cycles" event is equivalent to a "cycles in run state" event). These flags are computed for each counter and stored in the counter->hw.counter_base field (slightly wonky name for what it does, but it was an existing unused field). Since the limited counters don't freeze when we freeze the other counters, we need some special handling to avoid getting skew between things counted on the limited counters and those counted on normal counters. 
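In outline, that handling pairs every write of the freeze bit with an
immediate read of the two limited counters; a condensed sketch of the
write_mmcr0() helper this patch adds, not the verbatim code (the real
version folds all three operations into a single asm, as described
below):

	mtspr(SPRN_MMCR0, mmcr0);	/* freeze or unfreeze normal PMCs */
	pmc5 = mfspr(SPRN_PMC5);	/* sample the limited PMCs at once */
	pmc6 = mfspr(SPRN_PMC6);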
To minimize this skew, if we are using any limited counters, we read PMC5 and PMC6 immediately after setting and clearing the freeze bit. This is done in a single asm in the new write_mmcr0() function. The code here is specific to PMC5 and PMC6 being the limited hardware counters. Being more general (e.g. having a bitmap of limited hardware counter numbers) would have meant more complex code to read the limited counters when freezing and unfreezing the normal counters, with conditional branches, which would have increased the skew. Since it isn't necessary for the code to be more general at this stage, it isn't. This also extends the back-ends for POWER5+ and POWER6 to be able to handle up to 6 counters rather than the 4 they previously handled. Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Robert Richter LKML-Reference: <18936.19035.163066.892208@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/perf_counter.h | 13 +- arch/powerpc/kernel/perf_counter.c | 297 ++++++++++++++++++++++++++++---- arch/powerpc/kernel/power4-pmu.c | 3 +- arch/powerpc/kernel/power5+-pmu.c | 117 +++++++++++-- arch/powerpc/kernel/power5-pmu.c | 3 +- arch/powerpc/kernel/power6-pmu.c | 119 +++++++++++-- arch/powerpc/kernel/ppc970-pmu.c | 3 +- 7 files changed, 479 insertions(+), 76 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h index 9d7ff6d7fb5..56d66c38143 100644 --- a/arch/powerpc/include/asm/perf_counter.h +++ b/arch/powerpc/include/asm/perf_counter.h @@ -12,6 +12,7 @@ #define MAX_HWCOUNTERS 8 #define MAX_EVENT_ALTERNATIVES 8 +#define MAX_LIMITED_HWCOUNTERS 2 /* * This struct provides the constants and functions needed to @@ -25,14 +26,24 @@ struct power_pmu { int (*compute_mmcr)(unsigned int events[], int n_ev, unsigned int hwc[], u64 mmcr[]); int (*get_constraint)(unsigned int event, u64 *mskp, u64 *valp); - int (*get_alternatives)(unsigned int event, unsigned int alt[]); + int (*get_alternatives)(unsigned int event, unsigned int flags, + unsigned int alt[]); void (*disable_pmc)(unsigned int pmc, u64 mmcr[]); + int (*limited_pmc_event)(unsigned int event); + int limited_pmc5_6; /* PMC5 and PMC6 have limited function */ int n_generic; int *generic_events; }; extern struct power_pmu *ppmu; +/* + * Values for flags to get_alternatives() + */ +#define PPMU_LIMITED_PMC_OK 1 /* can put this on a limited PMC */ +#define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */ +#define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */ + /* * The power_pmu.get_constraint function returns a 64-bit value and * a 64-bit mask that express the constraints between this event and diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index d9bbe5efc64..15cdc8e6722 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -23,10 +23,14 @@ struct cpu_hw_counters { int n_percpu; int disabled; int n_added; + int n_limited; + u8 pmcs_enabled; struct perf_counter *counter[MAX_HWCOUNTERS]; unsigned int events[MAX_HWCOUNTERS]; + unsigned int flags[MAX_HWCOUNTERS]; u64 mmcr[3]; - u8 pmcs_enabled; + struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS]; + u8 limited_hwidx[MAX_LIMITED_HWCOUNTERS]; }; DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); @@ -127,7 +131,8 @@ static void write_pmc(int idx, unsigned long val) * and see if any combination of alternative codes is feasible. * The feasible set is returned in event[]. 
*/ -static int power_check_constraints(unsigned int event[], int n_ev) +static int power_check_constraints(unsigned int event[], unsigned int cflags[], + int n_ev) { u64 mask, value, nv; unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; @@ -144,11 +149,15 @@ static int power_check_constraints(unsigned int event[], int n_ev) /* First see if the events will go on as-is */ for (i = 0; i < n_ev; ++i) { - alternatives[i][0] = event[i]; + if ((cflags[i] & PPMU_LIMITED_PMC_REQD) + && !ppmu->limited_pmc_event(event[i])) { + ppmu->get_alternatives(event[i], cflags[i], + alternatives[i]); + event[i] = alternatives[i][0]; + } if (ppmu->get_constraint(event[i], &amasks[i][0], &avalues[i][0])) return -1; - choice[i] = 0; } value = mask = 0; for (i = 0; i < n_ev; ++i) { @@ -166,7 +175,9 @@ static int power_check_constraints(unsigned int event[], int n_ev) if (!ppmu->get_alternatives) return -1; for (i = 0; i < n_ev; ++i) { - n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]); + choice[i] = 0; + n_alt[i] = ppmu->get_alternatives(event[i], cflags[i], + alternatives[i]); for (j = 1; j < n_alt[i]; ++j) ppmu->get_constraint(alternatives[i][j], &amasks[i][j], &avalues[i][j]); @@ -231,28 +242,41 @@ static int power_check_constraints(unsigned int event[], int n_ev) * exclude_{user,kernel,hv} with each other and any previously * added counters. */ -static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new) +static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[], + int n_prev, int n_new) { - int eu, ek, eh; - int i, n; + int eu = 0, ek = 0, eh = 0; + int i, n, first; struct perf_counter *counter; n = n_prev + n_new; if (n <= 1) return 0; - eu = ctrs[0]->hw_event.exclude_user; - ek = ctrs[0]->hw_event.exclude_kernel; - eh = ctrs[0]->hw_event.exclude_hv; - if (n_prev == 0) - n_prev = 1; - for (i = n_prev; i < n; ++i) { + first = 1; + for (i = 0; i < n; ++i) { + if (cflags[i] & PPMU_LIMITED_PMC_OK) { + cflags[i] &= ~PPMU_LIMITED_PMC_REQD; + continue; + } counter = ctrs[i]; - if (counter->hw_event.exclude_user != eu || - counter->hw_event.exclude_kernel != ek || - counter->hw_event.exclude_hv != eh) + if (first) { + eu = counter->hw_event.exclude_user; + ek = counter->hw_event.exclude_kernel; + eh = counter->hw_event.exclude_hv; + first = 0; + } else if (counter->hw_event.exclude_user != eu || + counter->hw_event.exclude_kernel != ek || + counter->hw_event.exclude_hv != eh) { return -EAGAIN; + } } + + if (eu || ek || eh) + for (i = 0; i < n; ++i) + if (cflags[i] & PPMU_LIMITED_PMC_OK) + cflags[i] |= PPMU_LIMITED_PMC_REQD; + return 0; } @@ -279,6 +303,85 @@ static void power_pmu_read(struct perf_counter *counter) atomic64_sub(delta, &counter->hw.period_left); } +/* + * On some machines, PMC5 and PMC6 can't be written, don't respect + * the freeze conditions, and don't generate interrupts. This tells + * us if `counter' is using such a PMC. + */ +static int is_limited_pmc(int pmcnum) +{ + return ppmu->limited_pmc5_6 && (pmcnum == 5 || pmcnum == 6); +} + +static void freeze_limited_counters(struct cpu_hw_counters *cpuhw, + unsigned long pmc5, unsigned long pmc6) +{ + struct perf_counter *counter; + u64 val, prev, delta; + int i; + + for (i = 0; i < cpuhw->n_limited; ++i) { + counter = cpuhw->limited_counter[i]; + if (!counter->hw.idx) + continue; + val = (counter->hw.idx == 5) ? 
pmc5 : pmc6; + prev = atomic64_read(&counter->hw.prev_count); + counter->hw.idx = 0; + delta = (val - prev) & 0xfffffffful; + atomic64_add(delta, &counter->count); + } +} + +static void thaw_limited_counters(struct cpu_hw_counters *cpuhw, + unsigned long pmc5, unsigned long pmc6) +{ + struct perf_counter *counter; + u64 val; + int i; + + for (i = 0; i < cpuhw->n_limited; ++i) { + counter = cpuhw->limited_counter[i]; + counter->hw.idx = cpuhw->limited_hwidx[i]; + val = (counter->hw.idx == 5) ? pmc5 : pmc6; + atomic64_set(&counter->hw.prev_count, val); + perf_counter_update_userpage(counter); + } +} + +/* + * Since limited counters don't respect the freeze conditions, we + * have to read them immediately after freezing or unfreezing the + * other counters. We try to keep the values from the limited + * counters as consistent as possible by keeping the delay (in + * cycles and instructions) between freezing/unfreezing and reading + * the limited counters as small and consistent as possible. + * Therefore, if any limited counters are in use, we read them + * both, and always in the same order, to minimize variability, + * and do it inside the same asm that writes MMCR0. + */ +static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0) +{ + unsigned long pmc5, pmc6; + + if (!cpuhw->n_limited) { + mtspr(SPRN_MMCR0, mmcr0); + return; + } + + /* + * Write MMCR0, then read PMC5 and PMC6 immediately. + */ + asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5" + : "=&r" (pmc5), "=&r" (pmc6) + : "r" (mmcr0), "i" (SPRN_MMCR0), + "i" (SPRN_PMC5), "i" (SPRN_PMC6)); + + if (mmcr0 & MMCR0_FC) + freeze_limited_counters(cpuhw, pmc5, pmc6); + else + thaw_limited_counters(cpuhw, pmc5, pmc6); +} + /* * Disable all counters to prevent PMU interrupts and to allow * counters to be added or removed. @@ -321,7 +424,7 @@ u64 hw_perf_save_disable(void) * executed and the PMU has frozen the counters * before we return. */ - mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC); + write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC); mb(); } local_irq_restore(flags); @@ -342,6 +445,8 @@ void hw_perf_restore(u64 disable) unsigned long val; s64 left; unsigned int hwc_index[MAX_HWCOUNTERS]; + int n_lim; + int idx; if (disable) return; @@ -414,10 +519,18 @@ void hw_perf_restore(u64 disable) /* * Initialize the PMCs for all the new and moved counters. 
*/ + cpuhw->n_limited = n_lim = 0; for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; if (counter->hw.idx) continue; + idx = hwc_index[i] + 1; + if (is_limited_pmc(idx)) { + cpuhw->limited_counter[n_lim] = counter; + cpuhw->limited_hwidx[n_lim] = idx; + ++n_lim; + continue; + } val = 0; if (counter->hw_event.irq_period) { left = atomic64_read(&counter->hw.period_left); @@ -425,15 +538,16 @@ void hw_perf_restore(u64 disable) val = 0x80000000L - left; } atomic64_set(&counter->hw.prev_count, val); - counter->hw.idx = hwc_index[i] + 1; - write_pmc(counter->hw.idx, val); + counter->hw.idx = idx; + write_pmc(idx, val); perf_counter_update_userpage(counter); } + cpuhw->n_limited = n_lim; cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; out_enable: mb(); - mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); + write_mmcr0(cpuhw, cpuhw->mmcr[0]); /* * Enable instruction sampling if necessary @@ -448,7 +562,8 @@ void hw_perf_restore(u64 disable) } static int collect_events(struct perf_counter *group, int max_count, - struct perf_counter *ctrs[], unsigned int *events) + struct perf_counter *ctrs[], unsigned int *events, + unsigned int *flags) { int n = 0; struct perf_counter *counter; @@ -457,6 +572,7 @@ static int collect_events(struct perf_counter *group, int max_count, if (n >= max_count) return -1; ctrs[n] = group; + flags[n] = group->hw.counter_base; events[n++] = group->hw.config; } list_for_each_entry(counter, &group->sibling_list, list_entry) { @@ -465,6 +581,7 @@ static int collect_events(struct perf_counter *group, int max_count, if (n >= max_count) return -1; ctrs[n] = counter; + flags[n] = counter->hw.counter_base; events[n++] = counter->hw.config; } } @@ -497,12 +614,14 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, cpuhw = &__get_cpu_var(cpu_hw_counters); n0 = cpuhw->n_counters; n = collect_events(group_leader, ppmu->n_counter - n0, - &cpuhw->counter[n0], &cpuhw->events[n0]); + &cpuhw->counter[n0], &cpuhw->events[n0], + &cpuhw->flags[n0]); if (n < 0) return -EAGAIN; - if (check_excludes(cpuhw->counter, n0, n)) + if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n)) return -EAGAIN; - if (power_check_constraints(cpuhw->events, n + n0)) + i = power_check_constraints(cpuhw->events, cpuhw->flags, n + n0); + if (i < 0) return -EAGAIN; cpuhw->n_counters = n0 + n; cpuhw->n_added += n; @@ -554,9 +673,10 @@ static int power_pmu_enable(struct perf_counter *counter) goto out; cpuhw->counter[n0] = counter; cpuhw->events[n0] = counter->hw.config; - if (check_excludes(cpuhw->counter, n0, 1)) + cpuhw->flags[n0] = counter->hw.counter_base; + if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1)) goto out; - if (power_check_constraints(cpuhw->events, n0 + 1)) + if (power_check_constraints(cpuhw->events, cpuhw->flags, n0 + 1)) goto out; counter->hw.config = cpuhw->events[n0]; @@ -592,12 +712,24 @@ static void power_pmu_disable(struct perf_counter *counter) cpuhw->counter[i-1] = cpuhw->counter[i]; --cpuhw->n_counters; ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); - write_pmc(counter->hw.idx, 0); - counter->hw.idx = 0; + if (counter->hw.idx) { + write_pmc(counter->hw.idx, 0); + counter->hw.idx = 0; + } perf_counter_update_userpage(counter); break; } } + for (i = 0; i < cpuhw->n_limited; ++i) + if (counter == cpuhw->limited_counter[i]) + break; + if (i < cpuhw->n_limited) { + while (++i < cpuhw->n_limited) { + cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i]; + cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i]; + } + --cpuhw->n_limited; + } if (cpuhw->n_counters == 0) { /* 
disable exceptions if no counters are running */ cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); @@ -613,6 +745,61 @@ struct pmu power_pmu = { .read = power_pmu_read, }; +/* + * Return 1 if we might be able to put counter on a limited PMC, + * or 0 if not. + * A counter can only go on a limited PMC if it counts something + * that a limited PMC can count, doesn't require interrupts, and + * doesn't exclude any processor mode. + */ +static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev, + unsigned int flags) +{ + int n; + unsigned int alt[MAX_EVENT_ALTERNATIVES]; + + if (counter->hw_event.exclude_user + || counter->hw_event.exclude_kernel + || counter->hw_event.exclude_hv + || counter->hw_event.irq_period) + return 0; + + if (ppmu->limited_pmc_event(ev)) + return 1; + + /* + * The requested event isn't on a limited PMC already; + * see if any alternative code goes on a limited PMC. + */ + if (!ppmu->get_alternatives) + return 0; + + flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD; + n = ppmu->get_alternatives(ev, flags, alt); + if (n) + return alt[0]; + + return 0; +} + +/* + * Find an alternative event that goes on a normal PMC, if possible, + * and return the event code, or 0 if there is no such alternative. + * (Note: event code 0 is "don't count" on all machines.) + */ +static unsigned long normal_pmc_alternative(unsigned long ev, + unsigned long flags) +{ + unsigned int alt[MAX_EVENT_ALTERNATIVES]; + int n; + + flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD); + n = ppmu->get_alternatives(ev, flags, alt); + if (!n) + return 0; + return alt[0]; +} + /* Number of perf_counters counting hardware events */ static atomic_t num_counters; /* Used to avoid races in calling reserve/release_pmc_hardware */ @@ -633,9 +820,10 @@ static void hw_perf_counter_destroy(struct perf_counter *counter) const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { - unsigned long ev; + unsigned long ev, flags; struct perf_counter *ctrs[MAX_HWCOUNTERS]; unsigned int events[MAX_HWCOUNTERS]; + unsigned int cflags[MAX_HWCOUNTERS]; int n; int err; @@ -661,7 +849,36 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) */ if (!firmware_has_feature(FW_FEATURE_LPAR)) counter->hw_event.exclude_hv = 0; - + + /* + * If this is a per-task counter, then we can use + * PM_RUN_* events interchangeably with their non RUN_* + * equivalents, e.g. PM_RUN_CYC instead of PM_CYC. + * XXX we should check if the task is an idle task. + */ + flags = 0; + if (counter->ctx->task) + flags |= PPMU_ONLY_COUNT_RUN; + + /* + * If this machine has limited counters, check whether this + * event could go on a limited counter. + */ + if (ppmu->limited_pmc5_6) { + if (can_go_on_limited_pmc(counter, ev, flags)) { + flags |= PPMU_LIMITED_PMC_OK; + } else if (ppmu->limited_pmc_event(ev)) { + /* + * The requested event is on a limited PMC, + * but we can't use a limited PMC; see if any + * alternative goes on a normal PMC. + */ + ev = normal_pmc_alternative(ev, flags); + if (!ev) + return ERR_PTR(-EINVAL); + } + } + /* * If this is in a group, check if it can go on with all the * other hardware counters in the group. 
We assume the counter @@ -670,18 +887,20 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) n = 0; if (counter->group_leader != counter) { n = collect_events(counter->group_leader, ppmu->n_counter - 1, - ctrs, events); + ctrs, events, cflags); if (n < 0) return ERR_PTR(-EINVAL); } events[n] = ev; ctrs[n] = counter; - if (check_excludes(ctrs, n, 1)) + cflags[n] = flags; + if (check_excludes(ctrs, cflags, n, 1)) return ERR_PTR(-EINVAL); - if (power_check_constraints(events, n + 1)) + if (power_check_constraints(events, cflags, n + 1)) return ERR_PTR(-EINVAL); counter->hw.config = events[n]; + counter->hw.counter_base = cflags[n]; atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); /* @@ -763,6 +982,10 @@ static void perf_counter_interrupt(struct pt_regs *regs) int found = 0; int nmi; + if (cpuhw->n_limited) + freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5), + mfspr(SPRN_PMC6)); + /* * If interrupts were soft-disabled when this PMU interrupt * occurred, treat it as an NMI. @@ -775,6 +998,8 @@ static void perf_counter_interrupt(struct pt_regs *regs) for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; + if (is_limited_pmc(counter->hw.idx)) + continue; val = read_pmc(counter->hw.idx); if ((int)val < 0) { /* counter has overflowed */ @@ -791,6 +1016,8 @@ static void perf_counter_interrupt(struct pt_regs *regs) */ if (!found) { for (i = 0; i < ppmu->n_counter; ++i) { + if (is_limited_pmc(i + 1)) + continue; val = read_pmc(i + 1); if ((int)val < 0) write_pmc(i + 1, 0); @@ -804,7 +1031,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) * XXX might want to use MSR.PM to keep the counters frozen until * we get back out of this interrupt. */ - mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); + write_mmcr0(cpuhw, cpuhw->mmcr[0]); if (nmi) nmi_exit(); diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c index 1407b19ab61..744a2756958 100644 --- a/arch/powerpc/kernel/power4-pmu.c +++ b/arch/powerpc/kernel/power4-pmu.c @@ -320,7 +320,8 @@ static unsigned int ppc_inst_cmpl[] = { 0x1001, 0x4001, 0x6001, 0x7001, 0x8001 }; -static int p4_get_alternatives(unsigned int event, unsigned int alt[]) +static int p4_get_alternatives(unsigned int event, unsigned int flags, + unsigned int alt[]) { int i, j, na; diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index 1222c8ea3c2..8154eaa2404 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -78,8 +78,8 @@ * Layout of constraint bits: * 6666555555555544444444443333333333222222222211111111110000000000 * 3210987654321098765432109876543210987654321098765432109876543210 - * [ ><><>< ><> <><>[ > < >< >< >< ><><><><> - * NC G0G1G2 G3 T0T1 UC B0 B1 B2 B3 P4P3P2P1 + * [ ><><>< ><> <><>[ > < >< >< >< ><><><><><><> + * NC G0G1G2 G3 T0T1 UC B0 B1 B2 B3 P6P5P4P3P2P1 * * NC - number of counters * 51: NC error 0x0008_0000_0000_0000 @@ -105,18 +105,18 @@ * 30: IDU|GRS events needed 0x00_4000_0000 * * B0 - * 20-23: Byte 0 event source 0x00f0_0000 + * 24-27: Byte 0 event source 0x0f00_0000 * Encoding as for the event code * * B1, B2, B3 - * 16-19, 12-15, 8-11: Byte 1, 2, 3 event sources + * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources * - * P4 - * 7: P1 error 0x80 - * 6-7: Count of events needing PMC4 + * P6 + * 11: P6 error 0x800 + * 10-11: Count of events needing PMC6 * - * P1..P3 - * 0-6: Count of events needing PMC1..PMC3 + * P1..P5 + * 0-9: Count of events needing PMC1..PMC5 */ static const int grsel_shift[8] = { @@ -143,11 
+143,13 @@ static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp) pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; if (pmc) { - if (pmc > 4) + if (pmc > 6) return -1; sh = (pmc - 1) * 2; mask |= 2 << sh; value |= 1 << sh; + if (pmc >= 5 && !(event == 0x500009 || event == 0x600005)) + return -1; } if (event & PM_BUSEVENT_MSK) { unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; @@ -173,16 +175,26 @@ static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp) value |= (u64)((event >> PM_GRS_SH) & fmask) << sh; } /* Set byte lane select field */ - mask |= 0xfULL << (20 - 4 * byte); - value |= (u64)unit << (20 - 4 * byte); + mask |= 0xfULL << (24 - 4 * byte); + value |= (u64)unit << (24 - 4 * byte); + } + if (pmc < 5) { + /* need a counter from PMC1-4 set */ + mask |= 0x8000000000000ull; + value |= 0x1000000000000ull; } - mask |= 0x8000000000000ull; - value |= 0x1000000000000ull; *maskp = mask; *valp = value; return 0; } +static int power5p_limited_pmc_event(unsigned int event) +{ + int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + + return pmc == 5 || pmc == 6; +} + #define MAX_ALT 3 /* at most 3 alternatives for any event */ static const unsigned int event_alternatives[][MAX_ALT] = { @@ -193,6 +205,7 @@ static const unsigned int event_alternatives[][MAX_ALT] = { { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */ { 0x800c4, 0xc20e0 }, /* PM_DTLB_MISS */ { 0xc50c6, 0xc60e0 }, /* PM_MRK_DTLB_MISS */ + { 0x100005, 0x600005 }, /* PM_RUN_CYC */ { 0x100009, 0x200009 }, /* PM_INST_CMPL */ { 0x200015, 0x300015 }, /* PM_LSU_LMQ_SRQ_EMPTY_CYC */ { 0x300009, 0x400009 }, /* PM_INST_DISP */ @@ -260,24 +273,85 @@ static int find_alternative_bdecode(unsigned int event) return -1; } -static int power5p_get_alternatives(unsigned int event, unsigned int alt[]) +static int power5p_get_alternatives(unsigned int event, unsigned int flags, + unsigned int alt[]) { int i, j, ae, nalt = 1; + int nlim; alt[0] = event; nalt = 1; + nlim = power5p_limited_pmc_event(event); i = find_alternative(event); if (i >= 0) { for (j = 0; j < MAX_ALT; ++j) { ae = event_alternatives[i][j]; if (ae && ae != event) alt[nalt++] = ae; + nlim += power5p_limited_pmc_event(ae); } } else { ae = find_alternative_bdecode(event); if (ae > 0) alt[nalt++] = ae; } + + if (flags & PPMU_ONLY_COUNT_RUN) { + /* + * We're only counting in RUN state, + * so PM_CYC is equivalent to PM_RUN_CYC + * and PM_INST_CMPL === PM_RUN_INST_CMPL. + * This doesn't include alternatives that don't provide + * any extra flexibility in assigning PMCs (e.g. + * 0x100005 for PM_RUN_CYC vs. 0xf for PM_CYC). + * Note that even with these additional alternatives + * we never end up with more than 3 alternatives for any event. 
+ */ + j = nalt; + for (i = 0; i < nalt; ++i) { + switch (alt[i]) { + case 0xf: /* PM_CYC */ + alt[j++] = 0x600005; /* PM_RUN_CYC */ + ++nlim; + break; + case 0x600005: /* PM_RUN_CYC */ + alt[j++] = 0xf; + break; + case 0x100009: /* PM_INST_CMPL */ + alt[j++] = 0x500009; /* PM_RUN_INST_CMPL */ + ++nlim; + break; + case 0x500009: /* PM_RUN_INST_CMPL */ + alt[j++] = 0x100009; /* PM_INST_CMPL */ + alt[j++] = 0x200009; + break; + } + } + nalt = j; + } + + if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) { + /* remove the limited PMC events */ + j = 0; + for (i = 0; i < nalt; ++i) { + if (!power5p_limited_pmc_event(alt[i])) { + alt[j] = alt[i]; + ++j; + } + } + nalt = j; + } else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) { + /* remove all but the limited PMC events */ + j = 0; + for (i = 0; i < nalt; ++i) { + if (power5p_limited_pmc_event(alt[i])) { + alt[j] = alt[i]; + ++j; + } + } + nalt = j; + } + return nalt; } @@ -390,7 +464,7 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev, unsigned char unituse[16]; int ttmuse; - if (n_ev > 4) + if (n_ev > 6) return -1; /* First pass to count resource use */ @@ -399,7 +473,7 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev, for (i = 0; i < n_ev; ++i) { pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; if (pmc) { - if (pmc > 4) + if (pmc > 6) return -1; if (pmc_inuse & (1 << (pmc - 1))) return -1; @@ -488,13 +562,16 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev, if (pmc >= 4) return -1; pmc_inuse |= 1 << pmc; - } else { + } else if (pmc <= 4) { /* Direct event */ --pmc; if (isbus && (byte & 2) && (psel == 8 || psel == 0x10 || psel == 0x28)) /* add events on higher-numbered bus */ mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc); + } else { + /* Instructions or run cycles on PMC5/6 */ + --pmc; } if (isbus && unit == PM_GRS) { bit = psel & 7; @@ -538,7 +615,7 @@ static int power5p_generic_events[] = { }; struct power_pmu power5p_pmu = { - .n_counter = 4, + .n_counter = 6, .max_alternatives = MAX_ALT, .add_fields = 0x7000000000055ull, .test_adder = 0x3000040000000ull, @@ -548,4 +625,6 @@ struct power_pmu power5p_pmu = { .disable_pmc = power5p_disable_pmc, .n_generic = ARRAY_SIZE(power5p_generic_events), .generic_events = power5p_generic_events, + .limited_pmc5_6 = 1, + .limited_pmc_event = power5p_limited_pmc_event, }; diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c index 116c4bb1809..6e667dc8647 100644 --- a/arch/powerpc/kernel/power5-pmu.c +++ b/arch/powerpc/kernel/power5-pmu.c @@ -269,7 +269,8 @@ static int find_alternative_bdecode(unsigned int event) return -1; } -static int power5_get_alternatives(unsigned int event, unsigned int alt[]) +static int power5_get_alternatives(unsigned int event, unsigned int flags, + unsigned int alt[]) { int i, j, ae, nalt = 1; diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index fce1fc290a1..d44049f0ae2 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -182,7 +182,7 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev, unsigned int ttmset = 0; unsigned int pmc_inuse = 0; - if (n_ev > 4) + if (n_ev > 6) return -1; for (i = 0; i < n_ev; ++i) { pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; @@ -202,6 +202,8 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev, for (pmc = 0; pmc < 4; ++pmc) if (!(pmc_inuse & (1 << pmc))) break; + if (pmc >= 4) + return -1; pmc_inuse |= 1 << pmc; } hwc[i] = pmc; @@ -240,7 +242,8 @@ static int p6_compute_mmcr(unsigned int 
event[], int n_ev, } if (power6_marked_instr_event(event[i])) mmcra |= MMCRA_SAMPLE_ENABLE; - mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc); + if (pmc < 4) + mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc); } mmcr[0] = 0; if (pmc_inuse & 1) @@ -256,19 +259,20 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev, * Layout of constraint bits: * * 0-1 add field: number of uses of PMC1 (max 1) - * 2-3, 4-5, 6-7: ditto for PMC2, 3, 4 - * 8-10 select field: nest (subunit) event selector + * 2-3, 4-5, 6-7, 8-9, 10-11: ditto for PMC2, 3, 4, 5, 6 + * 12-15 add field: number of uses of PMC1-4 (max 4) * 16-19 select field: unit on byte 0 of event bus * 20-23, 24-27, 28-31 ditto for bytes 1, 2, 3 + * 32-34 select field: nest (subunit) event selector */ static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp) { - int pmc, byte, sh; - unsigned int mask = 0, value = 0; + int pmc, byte, sh, subunit; + u64 mask = 0, value = 0; pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; if (pmc) { - if (pmc > 4) + if (pmc > 4 && !(event == 0x500009 || event == 0x600005)) return -1; sh = (pmc - 1) * 2; mask |= 2 << sh; @@ -276,26 +280,38 @@ static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp) } if (event & PM_BUSEVENT_MSK) { byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; - sh = byte * 4; + sh = byte * 4 + (16 - PM_UNIT_SH); mask |= PM_UNIT_MSKS << sh; - value |= (event & PM_UNIT_MSKS) << sh; + value |= (u64)(event & PM_UNIT_MSKS) << sh; if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) { - mask |= PM_SUBUNIT_MSKS; - value |= event & PM_SUBUNIT_MSKS; + subunit = (event >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK; + mask |= (u64)PM_SUBUNIT_MSK << 32; + value |= (u64)subunit << 32; } } + if (pmc <= 4) { + mask |= 0x8000; /* add field for count of PMC1-4 uses */ + value |= 0x1000; + } *maskp = mask; *valp = value; return 0; } +static int p6_limited_pmc_event(unsigned int event) +{ + int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + + return pmc == 5 || pmc == 6; +} + #define MAX_ALT 4 /* at most 4 alternatives for any event */ static const unsigned int event_alternatives[][MAX_ALT] = { { 0x0130e8, 0x2000f6, 0x3000fc }, /* PM_PTEG_RELOAD_VALID */ { 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */ { 0x080088, 0x200054, 0x3000f0 }, /* PM_ST_MISS_L1 */ - { 0x10000a, 0x2000f4 }, /* PM_RUN_CYC */ + { 0x10000a, 0x2000f4, 0x600005 }, /* PM_RUN_CYC */ { 0x10000b, 0x2000f5 }, /* PM_RUN_COUNT */ { 0x10000e, 0x400010 }, /* PM_PURR */ { 0x100010, 0x4000f8 }, /* PM_FLUSH */ @@ -340,13 +356,15 @@ static int find_alternatives_list(unsigned int event) return -1; } -static int p6_get_alternatives(unsigned int event, unsigned int alt[]) +static int p6_get_alternatives(unsigned int event, unsigned int flags, + unsigned int alt[]) { - int i, j; + int i, j, nlim; unsigned int aevent, psel, pmc; unsigned int nalt = 1; alt[0] = event; + nlim = p6_limited_pmc_event(event); /* check the alternatives table */ i = find_alternatives_list(event); @@ -358,6 +376,7 @@ static int p6_get_alternatives(unsigned int event, unsigned int alt[]) break; if (aevent != event) alt[nalt++] = aevent; + nlim += p6_limited_pmc_event(aevent); } } else { @@ -375,13 +394,75 @@ static int p6_get_alternatives(unsigned int event, unsigned int alt[]) ((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH); } + if (flags & PPMU_ONLY_COUNT_RUN) { + /* + * We're only counting in RUN state, + * so PM_CYC is equivalent to PM_RUN_CYC, + * PM_INST_CMPL === PM_RUN_INST_CMPL, PM_PURR === PM_RUN_PURR. 
+ * This doesn't include alternatives that don't provide + * any extra flexibility in assigning PMCs (e.g. + * 0x10000a for PM_RUN_CYC vs. 0x1e for PM_CYC). + * Note that even with these additional alternatives + * we never end up with more than 4 alternatives for any event. + */ + j = nalt; + for (i = 0; i < nalt; ++i) { + switch (alt[i]) { + case 0x1e: /* PM_CYC */ + alt[j++] = 0x600005; /* PM_RUN_CYC */ + ++nlim; + break; + case 0x10000a: /* PM_RUN_CYC */ + alt[j++] = 0x1e; /* PM_CYC */ + break; + case 2: /* PM_INST_CMPL */ + alt[j++] = 0x500009; /* PM_RUN_INST_CMPL */ + ++nlim; + break; + case 0x500009: /* PM_RUN_INST_CMPL */ + alt[j++] = 2; /* PM_INST_CMPL */ + break; + case 0x10000e: /* PM_PURR */ + alt[j++] = 0x4000f4; /* PM_RUN_PURR */ + break; + case 0x4000f4: /* PM_RUN_PURR */ + alt[j++] = 0x10000e; /* PM_PURR */ + break; + } + } + nalt = j; + } + + if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) { + /* remove the limited PMC events */ + j = 0; + for (i = 0; i < nalt; ++i) { + if (!p6_limited_pmc_event(alt[i])) { + alt[j] = alt[i]; + ++j; + } + } + nalt = j; + } else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) { + /* remove all but the limited PMC events */ + j = 0; + for (i = 0; i < nalt; ++i) { + if (p6_limited_pmc_event(alt[i])) { + alt[j] = alt[i]; + ++j; + } + } + nalt = j; + } + return nalt; } static void p6_disable_pmc(unsigned int pmc, u64 mmcr[]) { /* Set PMCxSEL to 0 to disable PMCx */ - mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc)); + if (pmc <= 3) + mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc)); } static int power6_generic_events[] = { @@ -394,14 +475,16 @@ static int power6_generic_events[] = { }; struct power_pmu power6_pmu = { - .n_counter = 4, + .n_counter = 6, .max_alternatives = MAX_ALT, - .add_fields = 0x55, - .test_adder = 0, + .add_fields = 0x1555, + .test_adder = 0x3000, .compute_mmcr = p6_compute_mmcr, .get_constraint = p6_get_constraint, .get_alternatives = p6_get_alternatives, .disable_pmc = p6_disable_pmc, .n_generic = ARRAY_SIZE(power6_generic_events), .generic_events = power6_generic_events, + .limited_pmc5_6 = 1, + .limited_pmc_event = p6_limited_pmc_event, }; diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c index aed8ccd7c07..af2d1884058 100644 --- a/arch/powerpc/kernel/ppc970-pmu.c +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -243,7 +243,8 @@ static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp) return 0; } -static int p970_get_alternatives(unsigned int event, unsigned int alt[]) +static int p970_get_alternatives(unsigned int event, unsigned int flags, + unsigned int alt[]) { alt[0] = event; -- cgit v1.2.3 From 9e35ad388bea89f7d6f375af4c0ae98803688666 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 16:21:38 +0200 Subject: perf_counter: Rework the perf counter disable/enable The current disable/enable mechanism is: token = hw_perf_save_disable(); ... /* do bits */ ... hw_perf_restore(token); This works well, provided that the use nests properly. Except we don't. x86 NMI/INT throttling has non-nested use of this, breaking things. Therefore provide a reference counter disable/enable interface, where the first disable disables the hardware, and the last enable enables the hardware again. 
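Nested use then behaves like this (a sketch of the intended
reference-count semantics; the counting itself lives in the core code,
not in this powerpc diff):

	perf_disable();		/* count 0 -> 1: hardware disabled */
	...
	perf_disable();		/* count 1 -> 2: no hardware access */
	...
	perf_enable();		/* count 2 -> 1: hardware stays disabled */
	...
	perf_enable();		/* count 1 -> 0: hardware enabled again */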
[ Impact: refactor, simplify the PMU disable/enable logic ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 15cdc8e6722..bb1b463c136 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -386,7 +386,7 @@ static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0) * Disable all counters to prevent PMU interrupts and to allow * counters to be added or removed. */ -u64 hw_perf_save_disable(void) +void hw_perf_disable(void) { struct cpu_hw_counters *cpuhw; unsigned long ret; @@ -428,7 +428,6 @@ u64 hw_perf_save_disable(void) mb(); } local_irq_restore(flags); - return ret; } /* @@ -436,7 +435,7 @@ u64 hw_perf_save_disable(void) * If we were previously disabled and counters were added, then * put the new config on the PMU. */ -void hw_perf_restore(u64 disable) +void hw_perf_enable(void) { struct perf_counter *counter; struct cpu_hw_counters *cpuhw; @@ -448,9 +447,12 @@ void hw_perf_restore(u64 disable) int n_lim; int idx; - if (disable) - return; local_irq_save(flags); + if (!cpuhw->disabled) { + local_irq_restore(flags); + return; + } + cpuhw = &__get_cpu_var(cpu_hw_counters); cpuhw->disabled = 0; @@ -649,19 +651,18 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, /* * Add a counter to the PMU. * If all counters are not already frozen, then we disable and - * re-enable the PMU in order to get hw_perf_restore to do the + * re-enable the PMU in order to get hw_perf_enable to do the * actual work of reconfiguring the PMU. */ static int power_pmu_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; unsigned long flags; - u64 pmudis; int n0; int ret = -EAGAIN; local_irq_save(flags); - pmudis = hw_perf_save_disable(); + perf_disable(); /* * Add the counter to the list (if there is room) @@ -685,7 +686,7 @@ static int power_pmu_enable(struct perf_counter *counter) ret = 0; out: - hw_perf_restore(pmudis); + perf_enable(); local_irq_restore(flags); return ret; } @@ -697,11 +698,10 @@ static void power_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; long i; - u64 pmudis; unsigned long flags; local_irq_save(flags); - pmudis = hw_perf_save_disable(); + perf_disable(); power_pmu_read(counter); @@ -735,7 +735,7 @@ static void power_pmu_disable(struct perf_counter *counter) cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); } - hw_perf_restore(pmudis); + perf_enable(); local_irq_restore(flags); } -- cgit v1.2.3 From 60db5e09c13109b13830cc9dcae688003fd39e79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:19:28 +0200 Subject: perf_counter: frequency based adaptive irq_period Instead of specifying the irq_period for a counter, provide a target interrupt frequency and dynamically adapt the irq_period to match this frequency. 
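The adaptation itself happens in the core, not in this diff;
schematically (illustrative pseudo-code only, variable names
hypothetical), each timer tick re-derives the period from the observed
event rate:

	delta = count_now - count_last_tick;	/* events since last tick */
	irq_period = delta * HZ / irq_freq;	/* aim for irq_freq irqs/sec */
	if (irq_period < 1)
		irq_period = 1;

On the powerpc side this reduces to reading counter->hw.irq_period,
which the core keeps up to date, instead of the user-supplied hw_event
field, as the diff below shows.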
[ Impact: new perf-counter attribute/feature ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090515132018.646195868@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index bb1b463c136..db8d5cafc15 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -534,7 +534,7 @@ void hw_perf_enable(void) continue; } val = 0; - if (counter->hw_event.irq_period) { + if (counter->hw.irq_period) { left = atomic64_read(&counter->hw.period_left); if (left < 0x80000000L) val = 0x80000000L - left; @@ -829,8 +829,6 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) if (!ppmu) return ERR_PTR(-ENXIO); - if ((s64)counter->hw_event.irq_period < 0) - return ERR_PTR(-EINVAL); if (!perf_event_raw(&counter->hw_event)) { ev = perf_event_id(&counter->hw_event); if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) @@ -901,7 +899,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) counter->hw.config = events[n]; counter->hw.counter_base = cflags[n]; - atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); + atomic64_set(&counter->hw.period_left, counter->hw.irq_period); /* * See if we need to reserve the PMU. @@ -934,6 +932,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) static void record_and_restart(struct perf_counter *counter, long val, struct pt_regs *regs, int nmi) { + u64 period = counter->hw.irq_period; s64 prev, delta, left; int record = 0; @@ -948,11 +947,11 @@ static void record_and_restart(struct perf_counter *counter, long val, */ val = 0; left = atomic64_read(&counter->hw.period_left) - delta; - if (counter->hw_event.irq_period) { + if (period) { if (left <= 0) { - left += counter->hw_event.irq_period; + left += period; if (left <= 0) - left = counter->hw_event.irq_period; + left = period; record = 1; } if (left < 0x80000000L) -- cgit v1.2.3 From ef923214a4816c289e4af2d67a9ebb1a31e4ac61 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 14 May 2009 13:29:14 +1000 Subject: perf_counter: powerpc: use u64 for event codes internally Although the perf_counter API allows 63-bit raw event codes, internally in the powerpc back-end we had been using 32-bit event codes. This expands them to 64 bits so that we can add bits for specifying threshold start/stop events and instruction sampling modes later. This also corrects the return value of can_go_on_limited_pmc; we were returning an event code rather than just a 0/1 value in some circumstances. That didn't particularly matter while event codes were 32-bit, but now that event codes are 64-bit it might, so this fixes it. 
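To illustrate the truncation hazard (hypothetical raw code, not a real
event):

	u64 raw = 0x400001001eULL;	/* raw code using bits above bit 31 */
	unsigned int ev = raw;		/* old back-end: silently becomes 0x1001e */
	u64 ev64 = raw;			/* with this patch: all 63 bits preserved */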
[ Impact: extend PowerPC perfcounter interfaces from u32 to u64 ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <18955.36874.472452.353104@drongo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/perf_counter.h | 10 +++++----- arch/powerpc/kernel/perf_counter.c | 26 ++++++++++++-------------- arch/powerpc/kernel/power4-pmu.c | 9 ++++----- arch/powerpc/kernel/power5+-pmu.c | 14 +++++++------- arch/powerpc/kernel/power5-pmu.c | 16 ++++++++-------- arch/powerpc/kernel/power6-pmu.c | 16 ++++++++-------- arch/powerpc/kernel/ppc970-pmu.c | 9 ++++----- 7 files changed, 48 insertions(+), 52 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h index 56d66c38143..ceea76a48e3 100644 --- a/arch/powerpc/include/asm/perf_counter.h +++ b/arch/powerpc/include/asm/perf_counter.h @@ -23,13 +23,13 @@ struct power_pmu { int max_alternatives; u64 add_fields; u64 test_adder; - int (*compute_mmcr)(unsigned int events[], int n_ev, + int (*compute_mmcr)(u64 events[], int n_ev, unsigned int hwc[], u64 mmcr[]); - int (*get_constraint)(unsigned int event, u64 *mskp, u64 *valp); - int (*get_alternatives)(unsigned int event, unsigned int flags, - unsigned int alt[]); + int (*get_constraint)(u64 event, u64 *mskp, u64 *valp); + int (*get_alternatives)(u64 event, unsigned int flags, + u64 alt[]); void (*disable_pmc)(unsigned int pmc, u64 mmcr[]); - int (*limited_pmc_event)(unsigned int event); + int (*limited_pmc_event)(u64 event); int limited_pmc5_6; /* PMC5 and PMC6 have limited function */ int n_generic; int *generic_events; diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index db8d5cafc15..8d4cafc84b8 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -26,7 +26,7 @@ struct cpu_hw_counters { int n_limited; u8 pmcs_enabled; struct perf_counter *counter[MAX_HWCOUNTERS]; - unsigned int events[MAX_HWCOUNTERS]; + u64 events[MAX_HWCOUNTERS]; unsigned int flags[MAX_HWCOUNTERS]; u64 mmcr[3]; struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS]; @@ -131,11 +131,11 @@ static void write_pmc(int idx, unsigned long val) * and see if any combination of alternative codes is feasible. * The feasible set is returned in event[]. */ -static int power_check_constraints(unsigned int event[], unsigned int cflags[], +static int power_check_constraints(u64 event[], unsigned int cflags[], int n_ev) { u64 mask, value, nv; - unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; + u64 alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS]; @@ -564,7 +564,7 @@ void hw_perf_enable(void) } static int collect_events(struct perf_counter *group, int max_count, - struct perf_counter *ctrs[], unsigned int *events, + struct perf_counter *ctrs[], u64 *events, unsigned int *flags) { int n = 0; @@ -752,11 +752,11 @@ struct pmu power_pmu = { * that a limited PMC can count, doesn't require interrupts, and * doesn't exclude any processor mode. 
*/ -static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev, +static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev, unsigned int flags) { int n; - unsigned int alt[MAX_EVENT_ALTERNATIVES]; + u64 alt[MAX_EVENT_ALTERNATIVES]; if (counter->hw_event.exclude_user || counter->hw_event.exclude_kernel @@ -776,10 +776,8 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev, flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD; n = ppmu->get_alternatives(ev, flags, alt); - if (n) - return alt[0]; - return 0; + return n > 0; } /* @@ -787,10 +785,9 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev, * and return the event code, or 0 if there is no such alternative. * (Note: event code 0 is "don't count" on all machines.) */ -static unsigned long normal_pmc_alternative(unsigned long ev, - unsigned long flags) +static u64 normal_pmc_alternative(u64 ev, unsigned long flags) { - unsigned int alt[MAX_EVENT_ALTERNATIVES]; + u64 alt[MAX_EVENT_ALTERNATIVES]; int n; flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD); @@ -820,9 +817,10 @@ static void hw_perf_counter_destroy(struct perf_counter *counter) const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { - unsigned long ev, flags; + u64 ev; + unsigned long flags; struct perf_counter *ctrs[MAX_HWCOUNTERS]; - unsigned int events[MAX_HWCOUNTERS]; + u64 events[MAX_HWCOUNTERS]; unsigned int cflags[MAX_HWCOUNTERS]; int n; int err; diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c index 744a2756958..836fa118eb1 100644 --- a/arch/powerpc/kernel/power4-pmu.c +++ b/arch/powerpc/kernel/power4-pmu.c @@ -213,7 +213,7 @@ static unsigned char direct_marked_event[8] = { * Returns 1 if event counts things relating to marked instructions * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. 
*/ -static int p4_marked_instr_event(unsigned int event) +static int p4_marked_instr_event(u64 event) { int pmc, psel, unit, byte, bit; unsigned int mask; @@ -249,7 +249,7 @@ static int p4_marked_instr_event(unsigned int event) return (mask >> (byte * 8 + bit)) & 1; } -static int p4_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +static int p4_get_constraint(u64 event, u64 *maskp, u64 *valp) { int pmc, byte, unit, lower, sh; u64 mask = 0, value = 0; @@ -320,8 +320,7 @@ static unsigned int ppc_inst_cmpl[] = { 0x1001, 0x4001, 0x6001, 0x7001, 0x8001 }; -static int p4_get_alternatives(unsigned int event, unsigned int flags, - unsigned int alt[]) +static int p4_get_alternatives(u64 event, unsigned int flags, u64 alt[]) { int i, j, na; @@ -353,7 +352,7 @@ static int p4_get_alternatives(unsigned int event, unsigned int flags, return na; } -static int p4_compute_mmcr(unsigned int event[], int n_ev, +static int p4_compute_mmcr(u64 event[], int n_ev, unsigned int hwc[], u64 mmcr[]) { u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0; diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index 8154eaa2404..3ac0654372a 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -135,7 +135,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = { [PM_GRS] = { 0x0e00000000ull, 0x0c40000000ull }, }; -static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +static int power5p_get_constraint(u64 event, u64 *maskp, u64 *valp) { int pmc, byte, unit, sh; int bit, fmask; @@ -188,7 +188,7 @@ static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp) return 0; } -static int power5p_limited_pmc_event(unsigned int event) +static int power5p_limited_pmc_event(u64 event) { int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; @@ -273,11 +273,11 @@ static int find_alternative_bdecode(unsigned int event) return -1; } -static int power5p_get_alternatives(unsigned int event, unsigned int flags, - unsigned int alt[]) +static int power5p_get_alternatives(u64 event, unsigned int flags, u64 alt[]) { - int i, j, ae, nalt = 1; + int i, j, nalt = 1; int nlim; + u64 ae; alt[0] = event; nalt = 1; @@ -402,7 +402,7 @@ static unsigned char direct_event_is_marked[0x28] = { * Returns 1 if event counts things relating to marked instructions * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. */ -static int power5p_marked_instr_event(unsigned int event) +static int power5p_marked_instr_event(u64 event) { int pmc, psel; int bit, byte, unit; @@ -451,7 +451,7 @@ static int power5p_marked_instr_event(unsigned int event) return (mask >> (byte * 8 + bit)) & 1; } -static int power5p_compute_mmcr(unsigned int event[], int n_ev, +static int power5p_compute_mmcr(u64 event[], int n_ev, unsigned int hwc[], u64 mmcr[]) { u64 mmcr1 = 0; diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c index 6e667dc8647..d5344968ee9 100644 --- a/arch/powerpc/kernel/power5-pmu.c +++ b/arch/powerpc/kernel/power5-pmu.c @@ -139,7 +139,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = { [PM_GRS] = { 0x30002000000000ull, 0x30000400000000ull }, }; -static int power5_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +static int power5_get_constraint(u64 event, u64 *maskp, u64 *valp) { int pmc, byte, unit, sh; int bit, fmask; @@ -224,7 +224,7 @@ static const unsigned int event_alternatives[][MAX_ALT] = { * Scan the alternatives table for a match and return the * index into the alternatives table if found, else -1. 
*/ -static int find_alternative(unsigned int event) +static int find_alternative(u64 event) { int i, j; @@ -250,7 +250,7 @@ static const unsigned char bytedecode_alternatives[4][4] = { * PMCSEL values on other counters. This returns the alternative * event code for those that do, or -1 otherwise. */ -static int find_alternative_bdecode(unsigned int event) +static u64 find_alternative_bdecode(u64 event) { int pmc, altpmc, pp, j; @@ -269,10 +269,10 @@ static int find_alternative_bdecode(unsigned int event) return -1; } -static int power5_get_alternatives(unsigned int event, unsigned int flags, - unsigned int alt[]) +static int power5_get_alternatives(u64 event, unsigned int flags, u64 alt[]) { - int i, j, ae, nalt = 1; + int i, j, nalt = 1; + u64 ae; alt[0] = event; nalt = 1; @@ -338,7 +338,7 @@ static unsigned char direct_event_is_marked[0x28] = { * Returns 1 if event counts things relating to marked instructions * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. */ -static int power5_marked_instr_event(unsigned int event) +static int power5_marked_instr_event(u64 event) { int pmc, psel; int bit, byte, unit; @@ -382,7 +382,7 @@ static int power5_marked_instr_event(unsigned int event) return (mask >> (byte * 8 + bit)) & 1; } -static int power5_compute_mmcr(unsigned int event[], int n_ev, +static int power5_compute_mmcr(u64 event[], int n_ev, unsigned int hwc[], u64 mmcr[]) { u64 mmcr1 = 0; diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index d44049f0ae2..ab7c615c458 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -134,7 +134,7 @@ static u32 marked_bus_events[16] = { * Returns 1 if event counts things relating to marked instructions * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. 
*/ -static int power6_marked_instr_event(unsigned int event) +static int power6_marked_instr_event(u64 event) { int pmc, psel, ptype; int bit, byte, unit; @@ -172,7 +172,7 @@ static int power6_marked_instr_event(unsigned int event) /* * Assign PMC numbers and compute MMCR1 value for a set of events */ -static int p6_compute_mmcr(unsigned int event[], int n_ev, +static int p6_compute_mmcr(u64 event[], int n_ev, unsigned int hwc[], u64 mmcr[]) { u64 mmcr1 = 0; @@ -265,7 +265,7 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev, * 20-23, 24-27, 28-31 ditto for bytes 1, 2, 3 * 32-34 select field: nest (subunit) event selector */ -static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +static int p6_get_constraint(u64 event, u64 *maskp, u64 *valp) { int pmc, byte, sh, subunit; u64 mask = 0, value = 0; @@ -298,7 +298,7 @@ static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp) return 0; } -static int p6_limited_pmc_event(unsigned int event) +static int p6_limited_pmc_event(u64 event) { int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; @@ -337,7 +337,7 @@ static const unsigned int event_alternatives[][MAX_ALT] = { * This could be made more efficient with a binary search on * a presorted list, if necessary */ -static int find_alternatives_list(unsigned int event) +static int find_alternatives_list(u64 event) { int i, j; unsigned int alt; @@ -356,12 +356,12 @@ static int find_alternatives_list(unsigned int event) return -1; } -static int p6_get_alternatives(unsigned int event, unsigned int flags, - unsigned int alt[]) +static int p6_get_alternatives(u64 event, unsigned int flags, u64 alt[]) { int i, j, nlim; - unsigned int aevent, psel, pmc; + unsigned int psel, pmc; unsigned int nalt = 1; + u64 aevent; alt[0] = event; nlim = p6_limited_pmc_event(event); diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c index af2d1884058..eed47c4523f 100644 --- a/arch/powerpc/kernel/ppc970-pmu.c +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -147,7 +147,7 @@ static unsigned char direct_marked_event[8] = { * Returns 1 if event counts things relating to marked instructions * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. 
*/ -static int p970_marked_instr_event(unsigned int event) +static int p970_marked_instr_event(u64 event) { int pmc, psel, unit, byte, bit; unsigned int mask; @@ -192,7 +192,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = { [PM_STS] = { 0x380000000000ull, 0x310000000000ull }, }; -static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +static int p970_get_constraint(u64 event, u64 *maskp, u64 *valp) { int pmc, byte, unit, sh, spcsel; u64 mask = 0, value = 0; @@ -243,8 +243,7 @@ static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp) return 0; } -static int p970_get_alternatives(unsigned int event, unsigned int flags, - unsigned int alt[]) +static int p970_get_alternatives(u64 event, unsigned int flags, u64 alt[]) { alt[0] = event; @@ -257,7 +256,7 @@ static int p970_get_alternatives(unsigned int event, unsigned int flags, return 1; } -static int p970_compute_mmcr(unsigned int event[], int n_ev, +static int p970_compute_mmcr(u64 event[], int n_ev, unsigned int hwc[], u64 mmcr[]) { u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0; -- cgit v1.2.3 From 0bbd0d4be8d5d3676c126e06e3c75c16def00441 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 14 May 2009 13:31:48 +1000 Subject: perf_counter: powerpc: supply more precise information on counter overflow events This uses values from the MMCRA, SIAR and SDAR registers on powerpc to supply more precise information for overflow events, including a data address when PERF_RECORD_ADDR is specified. Since POWER6 uses different bit positions in MMCRA from earlier processors, this converts the struct power_pmu limited_pmc5_6 field, which only had 0/1 values, into a flags field and defines bit values for its previous use (PPMU_LIMITED_PMC5_6) and a new flag (PPMU_ALT_SIPR) to indicate that the processor uses the POWER6 bit positions rather than the earlier positions. It also adds definitions in reg.h for the new and old positions of the bit that indicates that the SIAR and SDAR values come from the same instruction. For the data address, the SDAR value is supplied if we are not doing instruction sampling. In that case there is no guarantee that the address given in the PERF_RECORD_ADDR subrecord will correspond to the instruction whose address is given in the PERF_RECORD_IP subrecord. If instruction sampling is enabled (e.g. because this counter is counting a marked instruction event), then we only supply the SDAR value for the PERF_RECORD_ADDR subrecord if it corresponds to the instruction whose address is in the PERF_RECORD_IP subrecord. Otherwise we supply 0. 
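Distilled to a short sketch (using the flag and register names this patch introduces; the real code is in record_and_restart(), which reads MMCRA out of regs->dsisr as saved by the interrupt handler), the address-selection rule is:

        mmcra = regs->dsisr;            /* MMCRA saved at interrupt time */
        sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
                        POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
        addr = 0;
        if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
                addr = mfspr(SPRN_SDAR);        /* SDAR is safe to report */

That is, the SDAR is reported either when we are not doing instruction sampling at all, or when the hardware indicates that SDAR and SIAR refer to the same instruction.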
[ Impact: support more PMU hardware features on PowerPC ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <18955.37028.48861.555309@drongo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/perf_counter.h | 14 +++++- arch/powerpc/include/asm/reg.h | 2 + arch/powerpc/kernel/perf_counter.c | 84 +++++++++++++++++++++++++++++++-- arch/powerpc/kernel/power5+-pmu.c | 2 +- arch/powerpc/kernel/power6-pmu.c | 2 +- 5 files changed, 97 insertions(+), 7 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h index ceea76a48e3..1c60f0ca792 100644 --- a/arch/powerpc/include/asm/perf_counter.h +++ b/arch/powerpc/include/asm/perf_counter.h @@ -30,13 +30,19 @@ struct power_pmu { u64 alt[]); void (*disable_pmc)(unsigned int pmc, u64 mmcr[]); int (*limited_pmc_event)(u64 event); - int limited_pmc5_6; /* PMC5 and PMC6 have limited function */ + u32 flags; int n_generic; int *generic_events; }; extern struct power_pmu *ppmu; +/* + * Values for power_pmu.flags + */ +#define PPMU_LIMITED_PMC5_6 1 /* PMC5/6 have limited function */ +#define PPMU_ALT_SIPR 2 /* uses alternate posn for SIPR/HV */ + /* * Values for flags to get_alternatives() */ @@ -44,6 +50,12 @@ extern struct power_pmu *ppmu; #define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */ #define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */ +struct pt_regs; +extern unsigned long perf_misc_flags(struct pt_regs *regs); +#define perf_misc_flags(regs) perf_misc_flags(regs) + +extern unsigned long perf_instruction_pointer(struct pt_regs *regs); + /* * The power_pmu.get_constraint function returns a 64-bit value and * a 64-bit mask that express the constraints between this event and diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index e8018d540e8..fb359b0a693 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -492,11 +492,13 @@ #define MMCR0_FCHV 0x00000001UL /* freeze conditions in hypervisor mode */ #define SPRN_MMCR1 798 #define SPRN_MMCRA 0x312 +#define MMCRA_SDSYNC 0x80000000UL /* SDAR synced with SIAR */ #define MMCRA_SIHV 0x10000000UL /* state of MSR HV when SIAR set */ #define MMCRA_SIPR 0x08000000UL /* state of MSR PR when SIAR set */ #define MMCRA_SLOT 0x07000000UL /* SLOT bits (37-39) */ #define MMCRA_SLOT_SHIFT 24 #define MMCRA_SAMPLE_ENABLE 0x00000001UL /* enable sampling */ +#define POWER6_MMCRA_SDSYNC 0x0000080000000000ULL /* SDAR/SIAR synced */ #define POWER6_MMCRA_SIHV 0x0000040000000000ULL #define POWER6_MMCRA_SIPR 0x0000020000000000ULL #define POWER6_MMCRA_THRM 0x00000020UL diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 8d4cafc84b8..6baae5a5c33 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -17,6 +17,7 @@ #include #include #include +#include struct cpu_hw_counters { int n_counters; @@ -310,7 +311,8 @@ static void power_pmu_read(struct perf_counter *counter) */ static int is_limited_pmc(int pmcnum) { - return ppmu->limited_pmc5_6 && (pmcnum == 5 || pmcnum == 6); + return (ppmu->flags & PPMU_LIMITED_PMC5_6) + && (pmcnum == 5 || pmcnum == 6); } static void freeze_limited_counters(struct cpu_hw_counters *cpuhw, @@ -860,7 +862,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) * If this machine has limited counters, check whether this * event could go on a limited counter. 
*/ - if (ppmu->limited_pmc5_6) { + if (ppmu->flags & PPMU_LIMITED_PMC5_6) { if (can_go_on_limited_pmc(counter, ev, flags)) { flags |= PPMU_LIMITED_PMC_OK; } else if (ppmu->limited_pmc_event(ev)) { @@ -933,6 +935,7 @@ static void record_and_restart(struct perf_counter *counter, long val, u64 period = counter->hw.irq_period; s64 prev, delta, left; int record = 0; + u64 addr, mmcra, sdsync; /* we don't have to worry about interrupts here */ prev = atomic64_read(&counter->hw.prev_count); @@ -963,8 +966,76 @@ static void record_and_restart(struct perf_counter *counter, long val, /* * Finally record data if requested. */ - if (record) - perf_counter_overflow(counter, nmi, regs, 0); + if (record) { + addr = 0; + if (counter->hw_event.record_type & PERF_RECORD_ADDR) { + /* + * The user wants a data address recorded. + * If we're not doing instruction sampling, + * give them the SDAR (sampled data address). + * If we are doing instruction sampling, then only + * give them the SDAR if it corresponds to the + * instruction pointed to by SIAR; this is indicated + * by the [POWER6_]MMCRA_SDSYNC bit in MMCRA. + */ + mmcra = regs->dsisr; + sdsync = (ppmu->flags & PPMU_ALT_SIPR) ? + POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC; + if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync)) + addr = mfspr(SPRN_SDAR); + } + perf_counter_overflow(counter, nmi, regs, addr); + } +} + +/* + * Called from generic code to get the misc flags (i.e. processor mode) + * for an event. + */ +unsigned long perf_misc_flags(struct pt_regs *regs) +{ + unsigned long mmcra; + + if (TRAP(regs) != 0xf00) { + /* not a PMU interrupt */ + return user_mode(regs) ? PERF_EVENT_MISC_USER : + PERF_EVENT_MISC_KERNEL; + } + + mmcra = regs->dsisr; + if (ppmu->flags & PPMU_ALT_SIPR) { + if (mmcra & POWER6_MMCRA_SIHV) + return PERF_EVENT_MISC_HYPERVISOR; + return (mmcra & POWER6_MMCRA_SIPR) ? PERF_EVENT_MISC_USER : + PERF_EVENT_MISC_KERNEL; + } + if (mmcra & MMCRA_SIHV) + return PERF_EVENT_MISC_HYPERVISOR; + return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER : + PERF_EVENT_MISC_KERNEL; +} + +/* + * Called from generic code to get the instruction pointer + * for an event. + */ +unsigned long perf_instruction_pointer(struct pt_regs *regs) +{ + unsigned long mmcra; + unsigned long ip; + unsigned long slot; + + if (TRAP(regs) != 0xf00) + return regs->nip; /* not a PMU interrupt */ + + ip = mfspr(SPRN_SIAR); + mmcra = regs->dsisr; + if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) { + slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT; + if (slot > 1) + ip += 4 * (slot - 1); + } + return ip; } /* @@ -983,6 +1054,11 @@ static void perf_counter_interrupt(struct pt_regs *regs) freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5), mfspr(SPRN_PMC6)); + /* + * Overload regs->dsisr to store MMCRA so we only need to read it once. + */ + regs->dsisr = mfspr(SPRN_MMCRA); + /* * If interrupts were soft-disabled when this PMU interrupt * occurred, treat it as an NMI. 
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index 3ac0654372a..c6cdfc165d6 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -625,6 +625,6 @@ struct power_pmu power5p_pmu = { .disable_pmc = power5p_disable_pmc, .n_generic = ARRAY_SIZE(power5p_generic_events), .generic_events = power5p_generic_events, - .limited_pmc5_6 = 1, + .flags = PPMU_LIMITED_PMC5_6, .limited_pmc_event = power5p_limited_pmc_event, }; diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index ab7c615c458..cd4fbe06c35 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -485,6 +485,6 @@ struct power_pmu power6_pmu = { .disable_pmc = p6_disable_pmc, .n_generic = ARRAY_SIZE(power6_generic_events), .generic_events = power6_generic_events, - .limited_pmc5_6 = 1, + .flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR, .limited_pmc_event = p6_limited_pmc_event, }; -- cgit v1.2.3 From c0daaf3f1f672defa3a45ca449b76d0e86c55892 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 18 May 2009 14:02:12 +1000 Subject: perf_counter: powerpc: initialize cpuhw pointer before use Commit 9e35ad38 ("perf_counter: Rework the perf counter disable/enable") added code to the powerpc hw_perf_enable (renamed from hw_perf_restore) to test cpuhw->disabled and return immediately if it is not set (i.e. if the PMU is already enabled). Unfortunately the test got added before cpuhw was initialized, resulting in an oops the first time hw_perf_enable got called. This fixes it by moving the initialization of cpuhw to before cpuhw->disabled is tested. [ Impact: fix oops-causing bug on powerpc ] Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Corey Ashford LKML-Reference: <18960.56772.869734.304631@drongo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 6baae5a5c33..fe21b2440f2 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -450,12 +450,11 @@ void hw_perf_enable(void) int idx; local_irq_save(flags); + cpuhw = &__get_cpu_var(cpu_hw_counters); if (!cpuhw->disabled) { local_irq_restore(flags); return; } - - cpuhw = &__get_cpu_var(cpu_hw_counters); cpuhw->disabled = 0; /* -- cgit v1.2.3 From 8a7b8cb91f26a671f22cedc7fd54508667f2d9b9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 26 May 2009 16:27:59 +1000 Subject: perf_counter: powerpc: Implement interrupt throttling This implements interrupt throttling on powerpc. Since we don't have individual count enable/disable or interrupt enable/disable controls per counter, this simply sets the hardware counter to 0, meaning that it will not interrupt again until it has counted 2^31 counts, which will take at least 2^30 cycles assuming a maximum of 2 counts per cycle. Also, we set counter->hw.period_left to the maximum possible value (2^63 - 1), so we won't report overflows for this counter for the foreseeable future. The unthrottle operation restores counter->hw.period_left and the hardware counter so that we will once again report a counter overflow after counter->hw.irq_period counts.
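In outline, the change to record_and_restart() (shown in the diff below) is:

        if (perf_counter_overflow(counter, nmi, regs, addr)) {
                /* throttle: no further interrupt until the PMC wraps */
                val = 0;                /* hardware counter restarts from 0 */
                left = ~0ULL >> 1;      /* period_left = 2^63 - 1 */
        }
        write_pmc(counter->hw.idx, val);
        atomic64_set(&counter->hw.period_left, left);

so with at most 2 counts per cycle, the next interrupt is at least 2^31 / 2 = 2^30 cycles away.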
[ Impact: new perfcounters robustness feature on PowerPC ] Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Corey Ashford LKML-Reference: <18971.35823.643362.446774@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 48 ++++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 5 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index fe21b2440f2..f96d55f55bd 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -740,10 +740,37 @@ static void power_pmu_disable(struct perf_counter *counter) local_irq_restore(flags); } +/* + * Re-enable interrupts on a counter after they were throttled + * because they were coming too fast. + */ +static void power_pmu_unthrottle(struct perf_counter *counter) +{ + s64 val, left; + unsigned long flags; + + if (!counter->hw.idx || !counter->hw.irq_period) + return; + local_irq_save(flags); + perf_disable(); + power_pmu_read(counter); + left = counter->hw.irq_period; + val = 0; + if (left < 0x80000000L) + val = 0x80000000L - left; + write_pmc(counter->hw.idx, val); + atomic64_set(&counter->hw.prev_count, val); + atomic64_set(&counter->hw.period_left, left); + perf_counter_update_userpage(counter); + perf_enable(); + local_irq_restore(flags); +} + struct pmu power_pmu = { .enable = power_pmu_enable, .disable = power_pmu_disable, .read = power_pmu_read, + .unthrottle = power_pmu_unthrottle, }; /* @@ -957,10 +984,6 @@ static void record_and_restart(struct perf_counter *counter, long val, if (left < 0x80000000L) val = 0x80000000L - left; } - write_pmc(counter->hw.idx, val); - atomic64_set(&counter->hw.prev_count, val); - atomic64_set(&counter->hw.period_left, left); - perf_counter_update_userpage(counter); /* * Finally record data if requested. @@ -983,8 +1006,23 @@ static void record_and_restart(struct perf_counter *counter, long val, if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync)) addr = mfspr(SPRN_SDAR); } - perf_counter_overflow(counter, nmi, regs, addr); + if (perf_counter_overflow(counter, nmi, regs, addr)) { + /* + * Interrupts are coming too fast - throttle them + * by setting the counter to 0, so it will be + * at least 2^30 cycles until the next interrupt + * (assuming each counter counts at most 2 counts + * per cycle). + */ + val = 0; + left = ~0ULL >> 1; + } } + + write_pmc(counter->hw.idx, val); + atomic64_set(&counter->hw.prev_count, val); + atomic64_set(&counter->hw.period_left, left); + perf_counter_update_userpage(counter); } /* -- cgit v1.2.3 From b23f3325ed465f1bd914384884269af0d106778c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:13:03 +0200 Subject: perf_counter: Rename various fields A few renames: s/irq_period/sample_period/ s/irq_freq/sample_freq/ s/PERF_RECORD_/PERF_SAMPLE_/ s/record_type/sample_type/ And change both the new sample_type and read_format to u64. 
Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index f96d55f55bd..c9633321e7a 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -535,7 +535,7 @@ void hw_perf_enable(void) continue; } val = 0; - if (counter->hw.irq_period) { + if (counter->hw.sample_period) { left = atomic64_read(&counter->hw.period_left); if (left < 0x80000000L) val = 0x80000000L - left; @@ -749,12 +749,12 @@ static void power_pmu_unthrottle(struct perf_counter *counter) s64 val, left; unsigned long flags; - if (!counter->hw.idx || !counter->hw.irq_period) + if (!counter->hw.idx || !counter->hw.sample_period) return; local_irq_save(flags); perf_disable(); power_pmu_read(counter); - left = counter->hw.irq_period; + left = counter->hw.sample_period; val = 0; if (left < 0x80000000L) val = 0x80000000L - left; @@ -789,7 +789,7 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev, if (counter->hw_event.exclude_user || counter->hw_event.exclude_kernel || counter->hw_event.exclude_hv - || counter->hw_event.irq_period) + || counter->hw_event.sample_period) return 0; if (ppmu->limited_pmc_event(ev)) @@ -925,7 +925,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) counter->hw.config = events[n]; counter->hw.counter_base = cflags[n]; - atomic64_set(&counter->hw.period_left, counter->hw.irq_period); + atomic64_set(&counter->hw.period_left, counter->hw.sample_period); /* * See if we need to reserve the PMU. @@ -958,7 +958,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) static void record_and_restart(struct perf_counter *counter, long val, struct pt_regs *regs, int nmi) { - u64 period = counter->hw.irq_period; + u64 period = counter->hw.sample_period; s64 prev, delta, left; int record = 0; u64 addr, mmcra, sdsync; -- cgit v1.2.3 From 0d48696f87e3618b0d35bd3e4e9d7c188d51e7de Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 19:22:16 +0200 Subject: perf_counter: Rename perf_counter_hw_event => perf_counter_attr The structure isn't hw only and when I read event, I think about those things that fall out the other end. Rename the thing. 
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index c9633321e7a..ea54686cb78 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -262,13 +262,13 @@ static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[], } counter = ctrs[i]; if (first) { - eu = counter->hw_event.exclude_user; - ek = counter->hw_event.exclude_kernel; - eh = counter->hw_event.exclude_hv; + eu = counter->attr.exclude_user; + ek = counter->attr.exclude_kernel; + eh = counter->attr.exclude_hv; first = 0; - } else if (counter->hw_event.exclude_user != eu || - counter->hw_event.exclude_kernel != ek || - counter->hw_event.exclude_hv != eh) { + } else if (counter->attr.exclude_user != eu || + counter->attr.exclude_kernel != ek || + counter->attr.exclude_hv != eh) { return -EAGAIN; } } @@ -483,16 +483,16 @@ void hw_perf_enable(void) /* * Add in MMCR0 freeze bits corresponding to the - * hw_event.exclude_* bits for the first counter. + * attr.exclude_* bits for the first counter. * We have already checked that all counters have the * same values for these bits as the first counter. */ counter = cpuhw->counter[0]; - if (counter->hw_event.exclude_user) + if (counter->attr.exclude_user) cpuhw->mmcr[0] |= MMCR0_FCP; - if (counter->hw_event.exclude_kernel) + if (counter->attr.exclude_kernel) cpuhw->mmcr[0] |= freeze_counters_kernel; - if (counter->hw_event.exclude_hv) + if (counter->attr.exclude_hv) cpuhw->mmcr[0] |= MMCR0_FCHV; /* @@ -786,10 +786,10 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev, int n; u64 alt[MAX_EVENT_ALTERNATIVES]; - if (counter->hw_event.exclude_user - || counter->hw_event.exclude_kernel - || counter->hw_event.exclude_hv - || counter->hw_event.sample_period) + if (counter->attr.exclude_user + || counter->attr.exclude_kernel + || counter->attr.exclude_hv + || counter->attr.sample_period) return 0; if (ppmu->limited_pmc_event(ev)) @@ -855,13 +855,13 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) if (!ppmu) return ERR_PTR(-ENXIO); - if (!perf_event_raw(&counter->hw_event)) { - ev = perf_event_id(&counter->hw_event); + if (!perf_event_raw(&counter->attr)) { + ev = perf_event_id(&counter->attr); if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) return ERR_PTR(-EOPNOTSUPP); ev = ppmu->generic_events[ev]; } else { - ev = perf_event_config(&counter->hw_event); + ev = perf_event_config(&counter->attr); } counter->hw.config_base = ev; counter->hw.idx = 0; @@ -872,7 +872,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) * the user set it to. */ if (!firmware_has_feature(FW_FEATURE_LPAR)) - counter->hw_event.exclude_hv = 0; + counter->attr.exclude_hv = 0; /* * If this is a per-task counter, then we can use @@ -990,7 +990,7 @@ static void record_and_restart(struct perf_counter *counter, long val, */ if (record) { addr = 0; - if (counter->hw_event.record_type & PERF_RECORD_ADDR) { + if (counter->attr.record_type & PERF_RECORD_ADDR) { /* * The user wants a data address recorded. 
* If we're not doing instruction sampling, -- cgit v1.2.3 From 6984efb692e97ce5f75f26e595685c04c2061bac Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 3 Jun 2009 19:38:58 +1000 Subject: perf_counter: powerpc: Fix event alternative code generation on POWER5/5+ Commit ef923214 ("perf_counter: powerpc: use u64 for event codes internally") introduced a bug where the return value from function find_alternative_bdecode gets put into a u64 variable and later tested to see if it is < 0. The effect is that we get extra, bogus event code alternatives on POWER5 and POWER5+, leading to error messages such as "oops compute_mmcr failed" being printed and counters not counting properly. This fixes it by using s64 for the return type of find_alternative_bdecode and for the local variable that the caller puts the value in. It also makes the event argument a u64 on POWER5+ for consistency. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur Cc: Stephane Eranian LKML-Reference: <18982.17586.666132.90983@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/power5+-pmu.c | 4 ++-- arch/powerpc/kernel/power5-pmu.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index c6cdfc165d6..8471e3c2e46 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -242,7 +242,7 @@ static const unsigned char bytedecode_alternatives[4][4] = { * event code for those that do, or -1 otherwise. This also handles * alternative PCMSEL values for add events. */ -static int find_alternative_bdecode(unsigned int event) +static s64 find_alternative_bdecode(u64 event) { int pmc, altpmc, pp, j; @@ -277,7 +277,7 @@ static int power5p_get_alternatives(u64 event, unsigned int flags, u64 alt[]) { int i, j, nalt = 1; int nlim; - u64 ae; + s64 ae; alt[0] = event; nalt = 1; diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c index d5344968ee9..1b44c5fca18 100644 --- a/arch/powerpc/kernel/power5-pmu.c +++ b/arch/powerpc/kernel/power5-pmu.c @@ -250,7 +250,7 @@ static const unsigned char bytedecode_alternatives[4][4] = { * PMCSEL values on other counters. This returns the alternative * event code for those that do, or -1 otherwise. */ -static u64 find_alternative_bdecode(u64 event) +static s64 find_alternative_bdecode(u64 event) { int pmc, altpmc, pp, j; @@ -272,7 +272,7 @@ static u64 find_alternative_bdecode(u64 event) static int power5_get_alternatives(u64 event, unsigned int flags, u64 alt[]) { int i, j, nalt = 1; - u64 ae; + s64 ae; alt[0] = event; nalt = 1; -- cgit v1.2.3 From dcd945e0d8a6d654e3e1de51faea9f98f1504aa5 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 3 Jun 2009 19:40:36 +1000 Subject: perf_counter: powerpc: Fix race causing "oops trying to read PMC0" errors When using interrupting counters and limited (non-interrupting) counters at the same time, it's possible that we get an interrupt in write_mmcr0() after writing MMCR0 but before we have set up the counters using limited PMCs. What happens then is that we get into perf_counter_interrupt() with counter->hw.idx = 0 for the limited counters, leading to the "oops trying to read PMC0" error message being printed. 
This fixes the problem by making perf_counter_interrupt() robust against counter->hw.idx being zero (the counter is just ignored in that case) and also by changing write_mmcr0() to write MMCR0 initially with the counter overflow interrupt enable bits masked (set to 0). If the MMCR0 value requested by the caller has either of those bits set, we write MMCR0 again with the requested value of those bits after setting up the limited counters properly. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur Cc: Stephane Eranian LKML-Reference: <18982.17684.138182.954599@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index ea54686cb78..4cc4ac5c791 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -372,16 +372,28 @@ static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0) /* * Write MMCR0, then read PMC5 and PMC6 immediately. + * To ensure we don't get a performance monitor interrupt + * between writing MMCR0 and freezing/thawing the limited + * counters, we first write MMCR0 with the counter overflow + * interrupt enable bits turned off. */ asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5" : "=&r" (pmc5), "=&r" (pmc6) - : "r" (mmcr0), "i" (SPRN_MMCR0), + : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)), + "i" (SPRN_MMCR0), "i" (SPRN_PMC5), "i" (SPRN_PMC6)); if (mmcr0 & MMCR0_FC) freeze_limited_counters(cpuhw, pmc5, pmc6); else thaw_limited_counters(cpuhw, pmc5, pmc6); + + /* + * Write the full MMCR0 including the counter overflow interrupt + * enable bits, if necessary. + */ + if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE)) + mtspr(SPRN_MMCR0, mmcr0); } /* @@ -1108,7 +1120,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; - if (is_limited_pmc(counter->hw.idx)) + if (!counter->hw.idx || is_limited_pmc(counter->hw.idx)) continue; val = read_pmc(counter->hw.idx); if ((int)val < 0) { -- cgit v1.2.3 From 1b58c2515be48d5df79d20210ac5a86e30094de2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 4 Jun 2009 09:49:59 +1000 Subject: perf_counter: powerpc: Use new identifier names in powerpc-specific code Commit b23f3325 ("perf_counter: Rename various fields") fixed up most of the uses of the renamed fields, but missed one instance of "record_type" in powerpc-specific code which needs to be changed to "sample_type", and a "PERF_RECORD_ADDR" in the same statement that needs to be changed to "PERF_SAMPLE_ADDR", causing compilation errors on powerpc. This fixes it. 
Signed-off-by: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <18983.3111.770392.800486@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 4cc4ac5c791..232b00a36f7 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -1002,7 +1002,7 @@ static void record_and_restart(struct perf_counter *counter, long val, */ if (record) { addr = 0; - if (counter->attr.record_type & PERF_RECORD_ADDR) { + if (counter->attr.sample_type & PERF_SAMPLE_ADDR) { /* * The user wants a data address recorded. * If we're not doing instruction sampling, -- cgit v1.2.3 From a21ca2cac582886a3e95c8bb84ff7c52d4d15e54 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 6 Jun 2009 09:58:57 +0200 Subject: perf_counter: Separate out attr->type from attr->config Counter type is a frequently used value and we do a lot of bit juggling by encoding and decoding it from attr->config. Clean this up by creating a separate attr->type field. Also clean up the various similarly complex user-space bits all around counter attribute management. The net improvement is significant, and it will be easier to add a new major type (which is what triggered this cleanup). (This changes the ABI, all tools are adapted.) (PowerPC build-tested.) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 232b00a36f7..4786ad9a288 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -867,13 +867,13 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) if (!ppmu) return ERR_PTR(-ENXIO); - if (!perf_event_raw(&counter->attr)) { - ev = perf_event_id(&counter->attr); + if (counter->attr.type != PERF_TYPE_RAW) { + ev = counter->attr.config; if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) return ERR_PTR(-EOPNOTSUPP); ev = ppmu->generic_events[ev]; } else { - ev = perf_event_config(&counter->attr); + ev = counter->attr.config; } counter->hw.config_base = ev; counter->hw.idx = 0; -- cgit v1.2.3 From df1a132bf3d3508f863336c80a27806a2ac947e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 21:02:22 +0200 Subject: perf_counter: Introduce struct for sample data For easy extension of the sample data, put it in a structure. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 4786ad9a288..5e0bf399c43 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -1001,7 +1001,11 @@ static void record_and_restart(struct perf_counter *counter, long val, * Finally record data if requested. 
*/ if (record) { - addr = 0; + struct perf_sample_data data = { + .regs = regs, + .addr = 0, + }; + if (counter->attr.sample_type & PERF_SAMPLE_ADDR) { /* * The user wants a data address recorded. @@ -1016,9 +1020,9 @@ static void record_and_restart(struct perf_counter *counter, long val, sdsync = (ppmu->flags & PPMU_ALT_SIPR) ? POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC; if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync)) - addr = mfspr(SPRN_SDAR); + data.addr = mfspr(SPRN_SDAR); } - if (perf_counter_overflow(counter, nmi, regs, addr)) { + if (perf_counter_overflow(counter, nmi, &data)) { /* * Interrupts are coming too fast - throttle them * by setting the counter to 0, so it will be -- cgit v1.2.3 From 9e350de37ac9607012fcf9c5314a28fbddf8f43c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 21:34:59 +0200 Subject: perf_counter: Accurate period data We currently log hw.sample_period for PERF_SAMPLE_PERIOD, however this is incorrect. When we adjust the period, it will only take effect the next cycle but report it for the current cycle. So when we adjust the period for every cycle, we're always wrong. Solve this by keeping track of the last_period. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 5e0bf399c43..4990ce2e5f0 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -767,6 +767,7 @@ static void power_pmu_unthrottle(struct perf_counter *counter) perf_disable(); power_pmu_read(counter); left = counter->hw.sample_period; + counter->hw.last_period = left; val = 0; if (left < 0x80000000L) val = 0x80000000L - left; @@ -937,7 +938,8 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) counter->hw.config = events[n]; counter->hw.counter_base = cflags[n]; - atomic64_set(&counter->hw.period_left, counter->hw.sample_period); + counter->hw.last_period = counter->hw.sample_period; + atomic64_set(&counter->hw.period_left, counter->hw.last_period); /* * See if we need to reserve the PMU. @@ -1002,8 +1004,9 @@ static void record_and_restart(struct perf_counter *counter, long val, */ if (record) { struct perf_sample_data data = { - .regs = regs, - .addr = 0, + .regs = regs, + .addr = 0, + .period = counter->hw.last_period, }; if (counter->attr.sample_type & PERF_SAMPLE_ADDR) { -- cgit v1.2.3 From 4da52960fd1ae3ddd14901bc88b608cbeaa4b9a6 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 11 Jun 2009 14:54:01 +1000 Subject: perf_counters: powerpc: Add support for POWER7 processors This adds the back-end for the PMU on POWER7 processors. POWER7 has 4 fully-programmable counters and two fixed-function counters (which do respect the freeze conditions, can generate interrupts, and are writable, unlike PMC5/6 on POWER5+/6). 
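For reference, the raw event codes this back-end decodes pack their fields as follows (see the PM_* defines in the new file); a sketch of the unpacking performed in power7_compute_mmcr():

        pmc     = (event >> PM_PMC_SH) & PM_PMC_MSK;    /* bits 16-19: PMC number, 0 = any */
        unit    = (event >> PM_UNIT_SH) & PM_UNIT_MSK;  /* bits 12-15: TTM unit select */
        combine = (event >> PM_COMBINE_SH) & PM_COMBINE_MSK;    /* bit 11: combined event */
        l2sel   = (event >> PM_L2SEL_SH) & PM_L2SEL_MSK;        /* bits 8-10: L2 event select */
        psel    = event & PM_PMCSEL_MSK;                /* bits 0-7: PMCSEL value */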
Signed-off-by: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <18992.36329.189378.17992@drongo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/Makefile | 3 +- arch/powerpc/kernel/perf_counter.c | 4 + arch/powerpc/kernel/power7-pmu.c | 316 +++++++++++++++++++++++++++++++++++++ 3 files changed, 322 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/kernel/power7-pmu.c (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 9ba1bb731fc..a2c683403c2 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -95,7 +95,8 @@ obj64-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o power4-pmu.o ppc970-pmu.o \ - power5-pmu.o power5+-pmu.o power6-pmu.o + power5-pmu.o power5+-pmu.o power6-pmu.o \ + power7-pmu.o obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 4990ce2e5f0..5d12e68aac1 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -1181,6 +1181,7 @@ extern struct power_pmu ppc970_pmu; extern struct power_pmu power5_pmu; extern struct power_pmu power5p_pmu; extern struct power_pmu power6_pmu; +extern struct power_pmu power7_pmu; static int init_perf_counters(void) { @@ -1207,6 +1208,9 @@ static int init_perf_counters(void) case 0x3e: ppmu = &power6_pmu; break; + case 0x3f: + ppmu = &power7_pmu; + break; } /* diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c new file mode 100644 index 00000000000..dfac48d8ff4 --- /dev/null +++ b/arch/powerpc/kernel/power7-pmu.c @@ -0,0 +1,316 @@ +/* + * Performance counter support for POWER7 processors. + * + * Copyright 2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include <linux/kernel.h> +#include <linux/perf_counter.h> +#include <asm/reg.h> + +/* + * Bits in event code for POWER7 + */ +#define PM_PMC_SH 16 /* PMC number (1-based) for direct events */ +#define PM_PMC_MSK 0xf +#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH) +#define PM_UNIT_SH 12 /* TTMMUX number and setting - unit select */ +#define PM_UNIT_MSK 0xf +#define PM_COMBINE_SH 11 /* Combined event bit */ +#define PM_COMBINE_MSK 1 +#define PM_COMBINE_MSKS 0x800 +#define PM_L2SEL_SH 8 /* L2 event select */ +#define PM_L2SEL_MSK 7 +#define PM_PMCSEL_MSK 0xff + +/* + * Bits in MMCR1 for POWER7 + */ +#define MMCR1_TTM0SEL_SH 60 +#define MMCR1_TTM1SEL_SH 56 +#define MMCR1_TTM2SEL_SH 52 +#define MMCR1_TTM3SEL_SH 48 +#define MMCR1_TTMSEL_MSK 0xf +#define MMCR1_L2SEL_SH 45 +#define MMCR1_L2SEL_MSK 7 +#define MMCR1_PMC1_COMBINE_SH 35 +#define MMCR1_PMC2_COMBINE_SH 34 +#define MMCR1_PMC3_COMBINE_SH 33 +#define MMCR1_PMC4_COMBINE_SH 32 +#define MMCR1_PMC1SEL_SH 24 +#define MMCR1_PMC2SEL_SH 16 +#define MMCR1_PMC3SEL_SH 8 +#define MMCR1_PMC4SEL_SH 0 +#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8) +#define MMCR1_PMCSEL_MSK 0xff + +/* + * Bits in MMCRA + */ + +/* + * Layout of constraint bits: + * 6666555555555544444444443333333333222222222211111111110000000000 + * 3210987654321098765432109876543210987654321098765432109876543210 + * [ ><><><><><><> + * NC P6P5P4P3P2P1 + * + * NC - number of counters + * 15: NC error 0x8000 + * 12-14: number of events needing PMC1-4 0x7000 + * + * P6 + * 11: P6 error 0x800 + * 10-11: Count of events needing PMC6 + * + * P1..P5 + * 0-9: Count of events needing PMC1..PMC5 + */ + +static int power7_get_constraint(u64 event, u64 *maskp, u64 *valp) +{ + int pmc, sh; + u64 mask = 0, value = 0; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc > 6) + return -1; + sh = (pmc - 1) * 2; + mask |= 2 << sh; + value |= 1 << sh; + if (pmc >= 5 && !(event == 0x500fa || event == 0x600f4)) + return -1; + } + if (pmc < 5) { + /* need a counter from PMC1-4 set */ + mask |= 0x8000; + value |= 0x1000; + } + *maskp = mask; + *valp = value; + return 0; +} + +#define MAX_ALT 2 /* at most 2 alternatives for any event */ + +static const unsigned int event_alternatives[][MAX_ALT] = { + { 0x200f2, 0x300f2 }, /* PM_INST_DISP */ + { 0x200f4, 0x600f4 }, /* PM_RUN_CYC */ + { 0x400fa, 0x500fa }, /* PM_RUN_INST_CMPL */ +}; + +/* + * Scan the alternatives table for a match and return the + * index into the alternatives table if found, else -1.
+ */ +static int find_alternative(u64 event) +{ + int i, j; + + for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { + if (event < event_alternatives[i][0]) + break; + for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j) + if (event == event_alternatives[i][j]) + return i; + } + return -1; +} + +static s64 find_alternative_decode(u64 event) +{ + int pmc, psel; + + /* this only handles the 4x decode events */ + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + psel = event & PM_PMCSEL_MSK; + if ((pmc == 2 || pmc == 4) && (psel & ~7) == 0x40) + return event - (1 << PM_PMC_SH) + 8; + if ((pmc == 1 || pmc == 3) && (psel & ~7) == 0x48) + return event + (1 << PM_PMC_SH) - 8; + return -1; +} + +static int power7_get_alternatives(u64 event, unsigned int flags, u64 alt[]) +{ + int i, j, nalt = 1; + s64 ae; + + alt[0] = event; + nalt = 1; + i = find_alternative(event); + if (i >= 0) { + for (j = 0; j < MAX_ALT; ++j) { + ae = event_alternatives[i][j]; + if (ae && ae != event) + alt[nalt++] = ae; + } + } else { + ae = find_alternative_decode(event); + if (ae > 0) + alt[nalt++] = ae; + } + + if (flags & PPMU_ONLY_COUNT_RUN) { + /* + * We're only counting in RUN state, + * so PM_CYC is equivalent to PM_RUN_CYC + * and PM_INST_CMPL === PM_RUN_INST_CMPL. + * This doesn't include alternatives that don't provide + * any extra flexibility in assigning PMCs. + */ + j = nalt; + for (i = 0; i < nalt; ++i) { + switch (alt[i]) { + case 0x1e: /* PM_CYC */ + alt[j++] = 0x600f4; /* PM_RUN_CYC */ + break; + case 0x600f4: /* PM_RUN_CYC */ + alt[j++] = 0x1e; + break; + case 0x2: /* PM_PPC_CMPL */ + alt[j++] = 0x500fa; /* PM_RUN_INST_CMPL */ + break; + case 0x500fa: /* PM_RUN_INST_CMPL */ + alt[j++] = 0x2; /* PM_PPC_CMPL */ + break; + } + } + nalt = j; + } + + return nalt; +} + +/* + * Returns 1 if event counts things relating to marked instructions + * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. 
+ */ +static int power7_marked_instr_event(u64 event) +{ + int pmc, psel; + int unit; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; + psel = event & PM_PMCSEL_MSK & ~1; /* trim off edge/level bit */ + if (pmc >= 5) + return 0; + + switch (psel >> 4) { + case 2: + return pmc == 2 || pmc == 4; + case 3: + if (psel == 0x3c) + return pmc == 1; + if (psel == 0x3e) + return pmc != 2; + return 1; + case 4: + case 5: + return unit == 0xd; + case 6: + if (psel == 0x64) + return pmc >= 3; + case 8: + return unit == 0xd; + } + return 0; +} + +static int power7_compute_mmcr(u64 event[], int n_ev, + unsigned int hwc[], u64 mmcr[]) +{ + u64 mmcr1 = 0; + u64 mmcra = 0; + unsigned int pmc, unit, combine, l2sel, psel; + unsigned int pmc_inuse = 0; + int i; + + /* First pass to count resource use */ + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc > 6) + return -1; + if (pmc_inuse & (1 << (pmc - 1))) + return -1; + pmc_inuse |= 1 << (pmc - 1); + } + } + + /* Second pass: assign PMCs, set all MMCR1 fields */ + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; + combine = (event[i] >> PM_COMBINE_SH) & PM_COMBINE_MSK; + l2sel = (event[i] >> PM_L2SEL_SH) & PM_L2SEL_MSK; + psel = event[i] & PM_PMCSEL_MSK; + if (!pmc) { + /* Bus event or any-PMC direct event */ + for (pmc = 0; pmc < 4; ++pmc) { + if (!(pmc_inuse & (1 << pmc))) + break; + } + if (pmc >= 4) + return -1; + pmc_inuse |= 1 << pmc; + } else { + /* Direct or decoded event */ + --pmc; + } + if (pmc <= 3) { + mmcr1 |= (u64) unit << (MMCR1_TTM0SEL_SH - 4 * pmc); + mmcr1 |= (u64) combine << (MMCR1_PMC1_COMBINE_SH - pmc); + mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc); + if (unit == 6) /* L2 events */ + mmcr1 |= (u64) l2sel << MMCR1_L2SEL_SH; + } + if (power7_marked_instr_event(event[i])) + mmcra |= MMCRA_SAMPLE_ENABLE; + hwc[i] = pmc; + } + + /* Return MMCRx values */ + mmcr[0] = 0; + if (pmc_inuse & 1) + mmcr[0] = MMCR0_PMC1CE; + if (pmc_inuse & 0x3e) + mmcr[0] |= MMCR0_PMCjCE; + mmcr[1] = mmcr1; + mmcr[2] = mmcra; + return 0; +} + +static void power7_disable_pmc(unsigned int pmc, u64 mmcr[]) +{ + if (pmc <= 3) + mmcr[1] &= ~(0xffULL << MMCR1_PMCSEL_SH(pmc)); +} + +static int power7_generic_events[] = { + [PERF_COUNT_CPU_CYCLES] = 0x1e, + [PERF_COUNT_INSTRUCTIONS] = 2, + [PERF_COUNT_CACHE_REFERENCES] = 0xc880, /* LD_REF_L1_LSU */ + [PERF_COUNT_CACHE_MISSES] = 0x400f0, /* LD_MISS_L1 */ + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x10068, /* BRU_FIN */ + [PERF_COUNT_BRANCH_MISSES] = 0x400f6, /* BR_MPRED */ +}; + +struct power_pmu power7_pmu = { + .n_counter = 6, + .max_alternatives = MAX_ALT + 1, + .add_fields = 0x1555ull, + .test_adder = 0x3000ull, + .compute_mmcr = power7_compute_mmcr, + .get_constraint = power7_get_constraint, + .get_alternatives = power7_get_alternatives, + .disable_pmc = power7_disable_pmc, + .n_generic = ARRAY_SIZE(power7_generic_events), + .generic_events = power7_generic_events, +}; -- cgit v1.2.3 From 106b506c3a8b74daa5751e83ed3e46438fcf9a52 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 11 Jun 2009 14:55:42 +1000 Subject: perf_counter: powerpc: Implement generalized cache events for POWER processors This adds tables of event codes for the generalized cache events for all the currently supported powerpc processors: POWER{4,5,5+,6,7} and PPC970*, plus powerpc-specific code to use these tables when a generalized cache event is requested. 
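As an illustration of the encoding (a hypothetical user-side sketch; the kernel-side unpacking is in the new hw_perf_cache_event() below), an L1 data-cache read miss is requested by packing the cache type, operation and result into one byte each of attr.config:

        attr.type   = PERF_TYPE_HW_CACHE;
        attr.config = PERF_COUNT_HW_CACHE_L1D |
                      (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                      (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);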
Signed-off-by: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <18992.36430.933526.742969@drongo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/perf_counter.h | 3 +++ arch/powerpc/kernel/perf_counter.c | 42 ++++++++++++++++++++++++++++-- arch/powerpc/kernel/power4-pmu.c | 41 +++++++++++++++++++++++++++++ arch/powerpc/kernel/power5+-pmu.c | 45 ++++++++++++++++++++++++++++++-- arch/powerpc/kernel/power5-pmu.c | 41 +++++++++++++++++++++++++++++ arch/powerpc/kernel/power6-pmu.c | 46 +++++++++++++++++++++++++++++++-- arch/powerpc/kernel/power7-pmu.c | 41 +++++++++++++++++++++++++++++ arch/powerpc/kernel/ppc970-pmu.c | 41 +++++++++++++++++++++++++++++ 8 files changed, 294 insertions(+), 6 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h index 1c60f0ca792..cc7c887705b 100644 --- a/arch/powerpc/include/asm/perf_counter.h +++ b/arch/powerpc/include/asm/perf_counter.h @@ -33,6 +33,9 @@ struct power_pmu { u32 flags; int n_generic; int *generic_events; + int (*cache_events)[PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; }; extern struct power_pmu *ppmu; diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 5d12e68aac1..bb202388170 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -856,6 +856,36 @@ static void hw_perf_counter_destroy(struct perf_counter *counter) } } +/* + * Translate a generic cache event config to a raw event code. + */ +static int hw_perf_cache_event(u64 config, u64 *eventp) +{ + unsigned long type, op, result; + int ev; + + if (!ppmu->cache_events) + return -EINVAL; + + /* unpack config */ + type = config & 0xff; + op = (config >> 8) & 0xff; + result = (config >> 16) & 0xff; + + if (type >= PERF_COUNT_HW_CACHE_MAX || + op >= PERF_COUNT_HW_CACHE_OP_MAX || + result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + ev = (*ppmu->cache_events)[type][op][result]; + if (ev == 0) + return -EOPNOTSUPP; + if (ev == -1) + return -EINVAL; + *eventp = ev; + return 0; +} + const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { u64 ev; @@ -868,13 +898,21 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) if (!ppmu) return ERR_PTR(-ENXIO); - if (counter->attr.type != PERF_TYPE_RAW) { + switch (counter->attr.type) { + case PERF_TYPE_HARDWARE: ev = counter->attr.config; if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) return ERR_PTR(-EOPNOTSUPP); ev = ppmu->generic_events[ev]; - } else { + break; + case PERF_TYPE_HW_CACHE: + err = hw_perf_cache_event(counter->attr.config, &ev); + if (err) + return ERR_PTR(err); + break; + case PERF_TYPE_RAW: ev = counter->attr.config; + break; } counter->hw.config_base = ev; counter->hw.idx = 0; diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c index 836fa118eb1..0e94b685722 100644 --- a/arch/powerpc/kernel/power4-pmu.c +++ b/arch/powerpc/kernel/power4-pmu.c @@ -543,6 +543,46 @@ static int p4_generic_events[] = { [PERF_COUNT_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */ }; +#define C(x) PERF_COUNT_HW_CACHE_##x + +/* + * Table of generalized cache-related events. + * 0 means not supported, -1 means nonsensical, other values + * are event codes. 
+ */ +static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { + [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0x8c10, 0x3c10 }, + [C(OP_WRITE)] = { 0x7c10, 0xc13 }, + [C(OP_PREFETCH)] = { 0xc35, 0 }, + }, + [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0, 0 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { 0, 0 }, + }, + [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0, 0 }, + [C(OP_WRITE)] = { 0, 0 }, + [C(OP_PREFETCH)] = { 0xc34, 0 }, + }, + [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0, 0x904 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, + [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0, 0x900 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, + [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0x330, 0x331 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, +}; + struct power_pmu power4_pmu = { .n_counter = 8, .max_alternatives = 5, @@ -554,4 +594,5 @@ struct power_pmu power4_pmu = { .disable_pmc = p4_disable_pmc, .n_generic = ARRAY_SIZE(p4_generic_events), .generic_events = p4_generic_events, + .cache_events = &power4_cache_events, }; diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index 8471e3c2e46..bbf2cbb0738 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -614,6 +614,46 @@ static int power5p_generic_events[] = { [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ }; +#define C(x) PERF_COUNT_HW_CACHE_##x + +/* + * Table of generalized cache-related events. + * 0 means not supported, -1 means nonsensical, other values + * are event codes. + */ +static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { + [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0x1c10a8, 0x3c1088 }, + [C(OP_WRITE)] = { 0x2c10a8, 0xc10c3 }, + [C(OP_PREFETCH)] = { 0xc70e7, -1 }, + }, + [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0, 0 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { 0, 0 }, + }, + [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0, 0 }, + [C(OP_WRITE)] = { 0, 0 }, + [C(OP_PREFETCH)] = { 0xc50c3, 0 }, + }, + [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0xc20e4, 0x800c4 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, + [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0, 0x800c0 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, + [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0x230e4, 0x230e5 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, +}; + struct power_pmu power5p_pmu = { .n_counter = 6, .max_alternatives = MAX_ALT, @@ -623,8 +663,9 @@ struct power_pmu power5p_pmu = { .get_constraint = power5p_get_constraint, .get_alternatives = power5p_get_alternatives, .disable_pmc = power5p_disable_pmc, + .limited_pmc_event = power5p_limited_pmc_event, + .flags = PPMU_LIMITED_PMC5_6, .n_generic = ARRAY_SIZE(power5p_generic_events), .generic_events = power5p_generic_events, - .flags = PPMU_LIMITED_PMC5_6, - .limited_pmc_event = power5p_limited_pmc_event, + .cache_events = &power5p_cache_events, }; diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c index 1b44c5fca18..670cf10b91e 100644 --- a/arch/powerpc/kernel/power5-pmu.c +++ b/arch/powerpc/kernel/power5-pmu.c @@ -556,6 +556,46 @@ static int power5_generic_events[] = { 
[PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ }; +#define C(x) PERF_COUNT_HW_CACHE_##x + +/* + * Table of generalized cache-related events. + * 0 means not supported, -1 means nonsensical, other values + * are event codes. + */ +static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { + [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0x4c1090, 0x3c1088 }, + [C(OP_WRITE)] = { 0x3c1090, 0xc10c3 }, + [C(OP_PREFETCH)] = { 0xc70e7, 0 }, + }, + [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0, 0 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { 0, 0 }, + }, + [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0, 0x3c309b }, + [C(OP_WRITE)] = { 0, 0 }, + [C(OP_PREFETCH)] = { 0xc50c3, 0 }, + }, + [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0x2c4090, 0x800c4 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, + [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0, 0x800c0 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, + [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0x230e4, 0x230e5 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, +}; + struct power_pmu power5_pmu = { .n_counter = 6, .max_alternatives = MAX_ALT, @@ -567,4 +607,5 @@ struct power_pmu power5_pmu = { .disable_pmc = power5_disable_pmc, .n_generic = ARRAY_SIZE(power5_generic_events), .generic_events = power5_generic_events, + .cache_events = &power5_cache_events, }; diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index cd4fbe06c35..4da70786609 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -474,6 +474,47 @@ static int power6_generic_events[] = { [PERF_COUNT_BRANCH_MISSES] = 0x400052, /* BR_MPRED */ }; +#define C(x) PERF_COUNT_HW_CACHE_##x + +/* + * Table of generalized cache-related events. + * 0 means not supported, -1 means nonsensical, other values + * are event codes. + * The "DTLB" and "ITLB" events relate to the DERAT and IERAT. 
+ */
+static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x80082,	0x80080	},
+		[C(OP_WRITE)] = {	0x80086,	0x80088	},
+		[C(OP_PREFETCH)] = {	0x810a4,	0	},
+	},
+	[C(L1I)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x100056 },
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	0x4008c,	0	},
+	},
+	[C(L2)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x150730,	0x250532 },
+		[C(OP_WRITE)] = {	0x250432,	0x150432 },
+		[C(OP_PREFETCH)] = {	0x810a6,	0	},
+	},
+	[C(DTLB)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x20000e },
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(ITLB)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x420ce	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(BPU)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x430e6,	0x400052 },
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+};
+
 struct power_pmu power6_pmu = {
 	.n_counter = 6,
 	.max_alternatives = MAX_ALT,
@@ -483,8 +524,9 @@ struct power_pmu power6_pmu = {
 	.get_constraint = p6_get_constraint,
 	.get_alternatives = p6_get_alternatives,
 	.disable_pmc = p6_disable_pmc,
+	.limited_pmc_event = p6_limited_pmc_event,
+	.flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR,
 	.n_generic = ARRAY_SIZE(power6_generic_events),
 	.generic_events = power6_generic_events,
-	.flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR,
-	.limited_pmc_event = p6_limited_pmc_event,
+	.cache_events = &power6_cache_events,
 };
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c
index dfac48d8ff4..060e0deb399 100644
--- a/arch/powerpc/kernel/power7-pmu.c
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -302,6 +302,46 @@ static int power7_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x400f6,	/* BR_MPRED */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x400f0,	0xc880	},
+		[C(OP_WRITE)] = {	0,		0x300f0	},
+		[C(OP_PREFETCH)] = {	0xd8b8,		0	},
+	},
+	[C(L1I)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x200fc	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	0x408a,		0	},
+	},
+	[C(L2)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x6080,		0x6084	},
+		[C(OP_WRITE)] = {	0x6082,		0x6086	},
+		[C(OP_PREFETCH)] = {	0,		0	},
+	},
+	[C(DTLB)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x300fc	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(ITLB)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x400fc	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(BPU)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x10068,	0x400f6	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+};
+
 struct power_pmu power7_pmu = {
 	.n_counter = 6,
 	.max_alternatives = MAX_ALT + 1,
@@ -313,4 +353,5 @@ struct power_pmu power7_pmu = {
 	.disable_pmc = power7_disable_pmc,
 	.n_generic = ARRAY_SIZE(power7_generic_events),
 	.generic_events = power7_generic_events,
+	.cache_events = &power7_cache_events,
 };
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index eed47c4523f..336adf1736a 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -427,6 +427,46 @@ static int ppc970_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x327,	/* PM_GRP_BR_MPRED */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x8810,		0x3810	},
+		[C(OP_WRITE)] = {	0x7810,		0x813	},
+		[C(OP_PREFETCH)] = {	0x731,		0	},
+	},
+	[C(L1I)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	0,		0	},
+	},
+	[C(L2)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0	},
+		[C(OP_WRITE)] = {	0,		0	},
+		[C(OP_PREFETCH)] = {	0x733,		0	},
+	},
+	[C(DTLB)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x704	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(ITLB)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x700	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(BPU)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x431,		0x327	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+};
+
 struct power_pmu ppc970_pmu = {
 	.n_counter = 8,
 	.max_alternatives = 2,
@@ -438,4 +478,5 @@ struct power_pmu ppc970_pmu = {
 	.disable_pmc = p970_disable_pmc,
 	.n_generic = ARRAY_SIZE(ppc970_generic_events),
 	.generic_events = ppc970_generic_events,
+	.cache_events = &ppc970_cache_events,
 };
--
cgit v1.2.3


From f4dbfa8f3131a84257223393905f7efad0ca5996 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 11 Jun 2009 14:06:28 +0200
Subject: perf_counter: Standardize event names

Pure renames only, to PERF_COUNT_HW_* and PERF_COUNT_SW_*.
Signed-off-by: Peter Zijlstra
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 arch/powerpc/kernel/power4-pmu.c  | 12 ++++++------
 arch/powerpc/kernel/power5+-pmu.c | 12 ++++++------
 arch/powerpc/kernel/power5-pmu.c  | 12 ++++++------
 arch/powerpc/kernel/power6-pmu.c  | 12 ++++++------
 arch/powerpc/kernel/ppc970-pmu.c  | 12 ++++++------
 arch/powerpc/mm/fault.c           |  6 +++---
 6 files changed, 33 insertions(+), 33 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 0e94b685722..73956f084b2 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -535,12 +535,12 @@ static void p4_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int p4_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 7,
-	[PERF_COUNT_INSTRUCTIONS] = 0x1001,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x8c10,		/* PM_LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x3c10,		/* PM_LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x330,	/* PM_BR_ISSUED */
-	[PERF_COUNT_BRANCH_MISSES] = 0x331,		/* PM_BR_MPRED_CR */
+	[PERF_COUNT_HW_CPU_CYCLES] = 7,
+	[PERF_COUNT_HW_INSTRUCTIONS] = 0x1001,
+	[PERF_COUNT_HW_CACHE_REFERENCES] = 0x8c10,	/* PM_LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES] = 0x3c10,		/* PM_LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x330,	/* PM_BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES] = 0x331,		/* PM_BR_MPRED_CR */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index bbf2cbb0738..5f8b7741e97 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -606,12 +606,12 @@ static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int power5p_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 0xf,
-	[PERF_COUNT_INSTRUCTIONS] = 0x100009,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x1c10a8,	/* LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x3c1088,		/* LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4,	/* BR_ISSUED */
-	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
+	[PERF_COUNT_HW_CPU_CYCLES] = 0xf,
+	[PERF_COUNT_HW_INSTRUCTIONS] = 0x100009,
+	[PERF_COUNT_HW_CACHE_REFERENCES] = 0x1c10a8,	/* LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES] = 0x3c1088,	/* LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x230e4,	/* BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES] = 0x230e5,	/* BR_MPRED_CR */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 670cf10b91e..d54723ab627 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -548,12 +548,12 @@ static void power5_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int power5_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 0xf,
-	[PERF_COUNT_INSTRUCTIONS] = 0x100009,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x4c1090,	/* LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x3c1088,		/* LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4,	/* BR_ISSUED */
-	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
+	[PERF_COUNT_HW_CPU_CYCLES] = 0xf,
+	[PERF_COUNT_HW_INSTRUCTIONS] = 0x100009,
+	[PERF_COUNT_HW_CACHE_REFERENCES] = 0x4c1090,	/* LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES] = 0x3c1088,	/* LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x230e4,	/* BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES] = 0x230e5,	/* BR_MPRED_CR */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index 4da70786609..0cd406ee765 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -466,12 +466,12 @@ static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int power6_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 0x1e,
-	[PERF_COUNT_INSTRUCTIONS] = 2,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x280030,	/* LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x30000c,		/* LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0,	/* BR_PRED */
-	[PERF_COUNT_BRANCH_MISSES] = 0x400052,		/* BR_MPRED */
+	[PERF_COUNT_HW_CPU_CYCLES] = 0x1e,
+	[PERF_COUNT_HW_INSTRUCTIONS] = 2,
+	[PERF_COUNT_HW_CACHE_REFERENCES] = 0x280030,	/* LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES] = 0x30000c,	/* LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x410a0,	/* BR_PRED */
+	[PERF_COUNT_HW_BRANCH_MISSES] = 0x400052,	/* BR_MPRED */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index 336adf1736a..46a20640942 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -419,12 +419,12 @@ static void p970_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int ppc970_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 7,
-	[PERF_COUNT_INSTRUCTIONS] = 1,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x8810,		/* PM_LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x3810,		/* PM_LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431,	/* PM_BR_ISSUED */
-	[PERF_COUNT_BRANCH_MISSES] = 0x327,		/* PM_GRP_BR_MPRED */
+	[PERF_COUNT_HW_CPU_CYCLES] = 7,
+	[PERF_COUNT_HW_INSTRUCTIONS] = 1,
+	[PERF_COUNT_HW_CACHE_REFERENCES] = 0x8810,	/* PM_LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES] = 0x3810,		/* PM_LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x431,	/* PM_BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES] = 0x327,		/* PM_GRP_BR_MPRED */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index ac0e112031b..5beffc8f481 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		die("Weird page fault", regs, SIGSEGV);
 	}
 
-	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
+	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
@@ -312,7 +312,7 @@ good_area:
 	}
 	if (ret & VM_FAULT_MAJOR) {
 		current->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
 				     regs, address);
 #ifdef CONFIG_PPC_SMLPAR
 	if (firmware_has_feature(FW_FEATURE_CMO)) {
@@ -323,7 +323,7 @@ good_area:
 #endif
 	} else {
 		current->min_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
 				     regs, address);
 	}
 	up_read(&mm->mmap_sem);
--
cgit v1.2.3


From 8be6e8f3c3a13900169f1141870562d0c723b010 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 11 Jun 2009 14:19:11 +0200
Subject: perf_counter: Rename L2 to LL cache

The top (fastest) and last level (biggest) caches are the most
interesting ones, performance-wise.
Signed-off-by: Peter Zijlstra
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
LKML-Reference:
[ Fixed the Nehalem LL table to LLC Reference/Miss events ]
Signed-off-by: Ingo Molnar
---
 arch/powerpc/kernel/power4-pmu.c  | 2 +-
 arch/powerpc/kernel/power5+-pmu.c | 2 +-
 arch/powerpc/kernel/power5-pmu.c  | 2 +-
 arch/powerpc/kernel/power6-pmu.c  | 2 +-
 arch/powerpc/kernel/power7-pmu.c  | 2 +-
 arch/powerpc/kernel/ppc970-pmu.c  | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 73956f084b2..07bd308a5fa 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -561,7 +561,7 @@ static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	0,		0	},
 	},
-	[C(L2)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/*	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0,		0	},
 		[C(OP_WRITE)] = {	0,		0	},
 		[C(OP_PREFETCH)] = {	0xc34,		0	},
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index 5f8b7741e97..41e5d2d958d 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -632,7 +632,7 @@ static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	0,		0	},
 	},
-	[C(L2)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/*	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0,		0	},
 		[C(OP_WRITE)] = {	0,		0	},
 		[C(OP_PREFETCH)] = {	0xc50c3,	0	},
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index d54723ab627..05600b66221 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -574,7 +574,7 @@ static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	0,		0	},
 	},
-	[C(L2)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/*	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0,		0x3c309b },
 		[C(OP_WRITE)] = {	0,		0	},
 		[C(OP_PREFETCH)] = {	0xc50c3,	0	},
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index 0cd406ee765..46f74bebcfd 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -493,7 +493,7 @@ static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	0x4008c,	0	},
 	},
-	[C(L2)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/*	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0x150730,	0x250532 },
 		[C(OP_WRITE)] = {	0x250432,	0x150432 },
 		[C(OP_PREFETCH)] = {	0x810a6,	0	},
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c
index 060e0deb399..b3f7d1216ba 100644
--- a/arch/powerpc/kernel/power7-pmu.c
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -320,7 +320,7 @@ static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	0x408a,		0	},
 	},
-	[C(L2)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/*	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0x6080,		0x6084	},
 		[C(OP_WRITE)] = {	0x6082,		0x6086	},
 		[C(OP_PREFETCH)] = {	0,		0	},
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index 46a20640942..ba0a357a89f 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -445,7 +445,7 @@ static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	0,		0	},
 	},
-	[C(L2)] = {		/*	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/*	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0,		0	},
 		[C(OP_WRITE)] = {	0,		0	},
 		[C(OP_PREFETCH)] = {	0x733,		0	},
--
cgit v1.2.3
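
A note for readers on how the cache_events tables above are consumed.  In
the generalized cache-event part of the perf_counter ABI, userspace encodes
the event as

	config = cache_id | (op_id << 8) | (result_id << 16)

and the architecture code decodes those three byte fields to index its
table.  The helper below is a minimal illustrative sketch, not code from
these patches: the function name and error-code choices are assumptions.
Only the table layout (indexed [cache][op][result], reached through a
pointer-to-whole-table member, as the &power4_cache_events-style
initializers suggest) and the 0 / -1 semantics come from the patches
themselves.

	/*
	 * Illustrative sketch only -- hw_cache_event_to_raw() is a made-up
	 * name, not the helper these patches add.  It decodes the ABI
	 * config word, indexes the per-CPU cache_events table, and maps
	 * 0 (not supported on this CPU) and -1 (nonsensical combination)
	 * to errors; any other value is a raw event code for the PMU.
	 */
	static int hw_cache_event_to_raw(struct power_pmu *pmu, u64 config,
					 u64 *eventp)
	{
		unsigned int cache  = config & 0xff;
		unsigned int op     = (config >> 8) & 0xff;
		unsigned int result = (config >> 16) & 0xff;
		int ev;

		if (cache >= PERF_COUNT_HW_CACHE_MAX ||
		    op >= PERF_COUNT_HW_CACHE_OP_MAX ||
		    result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
			return -EINVAL;

		ev = (*pmu->cache_events)[cache][op][result];
		if (ev == -1)		/* nonsensical combination */
			return -EINVAL;
		if (ev == 0)		/* not supported on this CPU */
			return -ENOENT;

		*eventp = ev;		/* raw PMU event code */
		return 0;
	}

For example, an L1 data-cache read miss is requested as config =
PERF_COUNT_HW_CACHE_L1D | (PERF_COUNT_HW_CACHE_OP_READ << 8) |
(PERF_COUNT_HW_CACHE_RESULT_MISS << 16), which on POWER7 would resolve
through the table above to the raw event 0xc880.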