From 241771ef016b5c0c83cd7a4372a74321c973c1e6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Dec 2008 10:39:53 +0100
Subject: performance counters: x86 support

Implement performance counters for x86 Intel CPUs.

It's simplified right now: the PERFMON CPU feature is assumed,
which is available in Core2 and later Intel CPUs.

The design is flexible to be extended to more CPU types as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                               |   1 +
 arch/x86/ia32/ia32entry.S                      |   3 +-
 arch/x86/include/asm/hardirq_32.h              |   1 +
 arch/x86/include/asm/hw_irq.h                  |   2 +
 arch/x86/include/asm/intel_arch_perfmon.h      |  34 +-
 arch/x86/include/asm/irq_vectors.h             |   5 +
 arch/x86/include/asm/mach-default/entry_arch.h |   5 +
 arch/x86/include/asm/pda.h                     |   1 +
 arch/x86/include/asm/thread_info.h             |   4 +-
 arch/x86/include/asm/unistd_32.h               |   1 +
 arch/x86/include/asm/unistd_64.h               |   3 +-
 arch/x86/kernel/apic.c                         |   2 +
 arch/x86/kernel/cpu/Makefile                   |  12 +-
 arch/x86/kernel/cpu/common.c                   |   2 +
 arch/x86/kernel/cpu/perf_counter.c             | 571 +++++++++++++++++++++++++
 arch/x86/kernel/entry_64.S                     |   5 +
 arch/x86/kernel/irq.c                          |   5 +
 arch/x86/kernel/irqinit_32.c                   |   3 +
 arch/x86/kernel/irqinit_64.c                   |   5 +
 arch/x86/kernel/signal.c                       |   7 +-
 arch/x86/kernel/syscall_table_32.S             |   1 +
 21 files changed, 652 insertions(+), 21 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/perf_counter.c

(limited to 'arch')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d4d4cb7629e..f2fdc186724 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -643,6 +643,7 @@ config X86_UP_IOAPIC
 config X86_LOCAL_APIC
 	def_bool y
 	depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
+	select HAVE_PERF_COUNTERS
 
 config X86_IO_APIC
 	def_bool y
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 256b00b6189..3c14ed07dc4 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -823,7 +823,8 @@ ia32_sys_call_table:
 	.quad compat_sys_signalfd4
 	.quad sys_eventfd2
 	.quad sys_epoll_create1
-	.quad sys_dup3			/* 330 */
+	.quad sys_dup3				/* 330 */
 	.quad sys_pipe2
 	.quad sys_inotify_init1
+	.quad sys_perf_counter_open
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h
index 5ca135e72f2..b3e475dc933 100644
--- a/arch/x86/include/asm/hardirq_32.h
+++ b/arch/x86/include/asm/hardirq_32.h
@@ -9,6 +9,7 @@ typedef struct {
 	unsigned long idle_timestamp;
 	unsigned int __nmi_count;	/* arch dependent */
 	unsigned int apic_timer_irqs;	/* arch dependent */
+	unsigned int apic_perf_irqs;	/* arch dependent */
 	unsigned int irq0_irqs;
 	unsigned int irq_resched_count;
 	unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 8de644b6b95..aa93e53b85e 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -30,6 +30,8 @@
 /* Interrupt handlers registered during init_IRQ */
 extern void apic_timer_interrupt(void);
 extern void error_interrupt(void);
+extern void perf_counter_interrupt(void);
+
 extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
 extern void reschedule_interrupt(void);
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
index fa0fd068bc2..71598a9eab6 100644
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ b/arch/x86/include/asm/intel_arch_perfmon.h
@@ -1,22 +1,24 @@
 #ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
 #define _ASM_X86_INTEL_ARCH_PERFMON_H
 
-#define MSR_ARCH_PERFMON_PERFCTR0		0xc1
-#define MSR_ARCH_PERFMON_PERFCTR1		0xc2
+#define MSR_ARCH_PERFMON_PERFCTR0			      0xc1
+#define MSR_ARCH_PERFMON_PERFCTR1			      0xc2
 
-#define MSR_ARCH_PERFMON_EVENTSEL0		0x186
-#define MSR_ARCH_PERFMON_EVENTSEL1		0x187
+#define MSR_ARCH_PERFMON_EVENTSEL0			     0x186
+#define MSR_ARCH_PERFMON_EVENTSEL1			     0x187
 
-#define ARCH_PERFMON_EVENTSEL0_ENABLE	(1 << 22)
-#define ARCH_PERFMON_EVENTSEL_INT	(1 << 20)
-#define ARCH_PERFMON_EVENTSEL_OS	(1 << 17)
-#define ARCH_PERFMON_EVENTSEL_USR	(1 << 16)
+#define ARCH_PERFMON_EVENTSEL0_ENABLE			  (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_INT			  (1 << 20)
+#define ARCH_PERFMON_EVENTSEL_OS			  (1 << 17)
+#define ARCH_PERFMON_EVENTSEL_USR			  (1 << 16)
 
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL	(0x3c)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK	(0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 		 0
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
-	(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+		(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+
+#define ARCH_PERFMON_BRANCH_MISSES_RETIRED			 6
 
 union cpuid10_eax {
 	struct {
@@ -28,4 +30,12 @@ union cpuid10_eax {
 	unsigned int full;
 };
 
+#ifdef CONFIG_PERF_COUNTERS
+extern void init_hw_perf_counters(void);
+extern void perf_counters_lapic_init(int nmi);
+#else
+static inline void init_hw_perf_counters(void)		{ }
+static inline void perf_counters_lapic_init(int nmi)	{ }
+#endif
+
 #endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 0005adb0f94..b8d277f1252 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -86,6 +86,11 @@
  */
 #define LOCAL_TIMER_VECTOR	0xef
 
+/*
+ * Performance monitoring interrupt vector:
+ */
+#define LOCAL_PERF_VECTOR	0xee
+
 /*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h
index 6b1add8e31d..ad31e5d90e9 100644
--- a/arch/x86/include/asm/mach-default/entry_arch.h
+++ b/arch/x86/include/asm/mach-default/entry_arch.h
@@ -25,10 +25,15 @@ BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
  * a much simpler SMP time architecture:
  */
 #ifdef CONFIG_X86_LOCAL_APIC
+
 BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
 BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
 BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 
+#ifdef CONFIG_PERF_COUNTERS
+BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
+#endif
+
 #ifdef CONFIG_X86_MCE_P4THERMAL
 BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
 #endif
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 2fbfff88df3..90a8d9d4206 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -30,6 +30,7 @@ struct x8664_pda {
 	short isidle;
 	struct mm_struct *active_mm;
 	unsigned apic_timer_irqs;
+	unsigned apic_perf_irqs;
 	unsigned irq0_irqs;
 	unsigned irq_resched_count;
 	unsigned irq_call_count;
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e44d379faad..810bf266d13 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -80,6 +80,7 @@ struct thread_info {
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
+#define TIF_PERF_COUNTERS	11	/* notify perf counter work */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* 32bit process */
 #define TIF_FORK		18	/* ret_from_fork */
@@ -103,6 +104,7 @@ struct thread_info {
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
+#define _TIF_PERF_COUNTERS	(1 << TIF_PERF_COUNTERS)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
@@ -135,7 +137,7 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index f2bba78430a..7e47658b0a6 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,7 @@
 #define __NR_dup3		330
 #define __NR_pipe2		331
 #define __NR_inotify_init1	332
+#define __NR_perf_counter_open	333
 
 #ifdef __KERNEL__
 
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index d2e415e6666..53025feaf88 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,7 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3)
 __SYSCALL(__NR_pipe2, sys_pipe2)
 #define __NR_inotify_init1			294
 __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
-
+#define __NR_perf_counter_open		295
+__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 16f94879b52..8ab8c185867 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -31,6 +31,7 @@
 #include <linux/dmi.h>
 #include <linux/dmar.h>
 
+#include <asm/intel_arch_perfmon.h>
 #include <asm/atomic.h>
 #include <asm/smp.h>
 #include <asm/mtrr.h>
@@ -1147,6 +1148,7 @@ void __cpuinit setup_local_APIC(void)
 		apic_write(APIC_ESR, 0);
 	}
 #endif
+	perf_counters_lapic_init(0);
 
 	preempt_disable();
 
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82ec6075c05..89e53361fe2 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
 #
-# Makefile for x86-compatible CPU details and quirks
+# Makefile for x86-compatible CPU details, features and quirks
 #
 
 obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
@@ -16,11 +16,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR_64)	+= centaur_64.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
 
-obj-$(CONFIG_X86_MCE)	+= mcheck/
-obj-$(CONFIG_MTRR)	+= mtrr/
-obj-$(CONFIG_CPU_FREQ)	+= cpufreq/
+obj-$(CONFIG_PERF_COUNTERS)		+= perf_counter.o
 
-obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+obj-$(CONFIG_X86_MCE)			+= mcheck/
+obj-$(CONFIG_MTRR)			+= mtrr/
+obj-$(CONFIG_CPU_FREQ)			+= cpufreq/
+
+obj-$(CONFIG_X86_LOCAL_APIC)		+= perfctr-watchdog.o
 
 quiet_cmd_mkcapflags = MKCAP   $@
       cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b9c9ea0217a..4461011db47 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -17,6 +17,7 @@
 #include <asm/mmu_context.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
+#include <asm/intel_arch_perfmon.h>
 #include <asm/pat.h>
 #include <asm/asm.h>
 #include <asm/numa.h>
@@ -750,6 +751,7 @@ void __init identify_boot_cpu(void)
 #else
 	vgetcpu_set_mode();
 #endif
+	init_hw_perf_counters();
 }
 
 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 00000000000..82440cbed0e
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,571 @@
+/*
+ * Performance counter x86 architecture code
+ *
+ *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_counter.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+
+#include <asm/intel_arch_perfmon.h>
+#include <asm/apic.h>
+
+static bool perf_counters_initialized __read_mostly;
+
+/*
+ * Number of (generic) HW counters:
+ */
+static int nr_hw_counters __read_mostly;
+static u32 perf_counter_mask __read_mostly;
+
+/* No support for fixed function counters yet */
+
+#define MAX_HW_COUNTERS		8
+
+struct cpu_hw_counters {
+	struct perf_counter	*counters[MAX_HW_COUNTERS];
+	unsigned long		used[BITS_TO_LONGS(MAX_HW_COUNTERS)];
+	int			enable_all;
+};
+
+/*
+ * Intel PerfMon v3. Used on Core2 and later.
+ */
+static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
+
+const int intel_perfmon_event_map[] =
+{
+  [PERF_COUNT_CYCLES]			= 0x003c,
+  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_CACHE_REFERENCES]		= 0x4f2e,
+  [PERF_COUNT_CACHE_MISSES]		= 0x412e,
+  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
+};
+
+const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
+
+/*
+ * Setup the hardware configuration for a given hw_event_type
+ */
+int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+
+	if (unlikely(!perf_counters_initialized))
+		return -EINVAL;
+
+	/*
+	 * Count user events, and generate PMC IRQs:
+	 * (keep 'enabled' bit clear for now)
+	 */
+	hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT;
+
+	/*
+	 * If privileged enough, count OS events too, and allow
+	 * NMI events as well:
+	 */
+	hwc->nmi = 0;
+	if (capable(CAP_SYS_ADMIN)) {
+		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+		if (hw_event_type & PERF_COUNT_NMI)
+			hwc->nmi = 1;
+	}
+
+	hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0;
+	hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
+
+	hwc->irq_period = counter->__irq_period;
+	/*
+	 * Intel PMCs cannot be accessed sanely above 32 bit width,
+	 * so we install an artificial 1<<31 period regardless of
+	 * the generic counter period:
+	 */
+	if (!hwc->irq_period)
+		hwc->irq_period = 0x7FFFFFFF;
+
+	hwc->next_count = -((s32) hwc->irq_period);
+
+	/*
+	 * Negative event types mean raw encoded event+umask values:
+	 */
+	if (hw_event_type < 0) {
+		counter->hw_event_type = -hw_event_type;
+		counter->hw_event_type &= ~PERF_COUNT_NMI;
+	} else {
+		hw_event_type &= ~PERF_COUNT_NMI;
+		if (hw_event_type >= max_intel_perfmon_events)
+			return -EINVAL;
+		/*
+		 * The generic map:
+		 */
+		counter->hw_event_type = intel_perfmon_event_map[hw_event_type];
+	}
+	hwc->config |= counter->hw_event_type;
+	counter->wakeup_pending = 0;
+
+	return 0;
+}
+
+static void __hw_perf_enable_all(void)
+{
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
+}
+
+void hw_perf_enable_all(void)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	cpuc->enable_all = 1;
+	__hw_perf_enable_all();
+}
+
+void hw_perf_disable_all(void)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	cpuc->enable_all = 0;
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+}
+
+static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]);
+
+static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+{
+	per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count;
+
+	wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
+	wrmsr(hwc->config_base + idx, hwc->config, 0);
+}
+
+void hw_perf_counter_enable(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	struct hw_perf_counter *hwc = &counter->hw;
+	int idx = hwc->idx;
+
+	/* Try to get the previous counter again */
+	if (test_and_set_bit(idx, cpuc->used)) {
+		idx = find_first_zero_bit(cpuc->used, nr_hw_counters);
+		set_bit(idx, cpuc->used);
+		hwc->idx = idx;
+	}
+
+	perf_counters_lapic_init(hwc->nmi);
+
+	wrmsr(hwc->config_base + idx,
+	      hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+
+	cpuc->counters[idx] = counter;
+	counter->hw.config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+	__hw_perf_counter_enable(hwc, idx);
+}
+
+#ifdef CONFIG_X86_64
+static inline void atomic64_counter_set(struct perf_counter *counter, u64 val)
+{
+	atomic64_set(&counter->count, val);
+}
+
+static inline u64 atomic64_counter_read(struct perf_counter *counter)
+{
+	return atomic64_read(&counter->count);
+}
+#else
+/*
+ * Todo: add proper atomic64_t support to 32-bit x86:
+ */
+static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64)
+{
+	u32 *val32 = (void *)&val64;
+
+	atomic_set(counter->count32 + 0, *(val32 + 0));
+	atomic_set(counter->count32 + 1, *(val32 + 1));
+}
+
+static inline u64 atomic64_counter_read(struct perf_counter *counter)
+{
+	return atomic_read(counter->count32 + 0) |
+		(u64) atomic_read(counter->count32 + 1) << 32;
+}
+#endif
+
+static void __hw_perf_save_counter(struct perf_counter *counter,
+				   struct hw_perf_counter *hwc, int idx)
+{
+	s64 raw = -1;
+	s64 delta;
+	int err;
+
+	/*
+	 * Get the raw hw counter value:
+	 */
+	err = rdmsrl_safe(hwc->counter_base + idx, &raw);
+	WARN_ON_ONCE(err);
+
+	/*
+	 * Rebase it to zero (it started counting at -irq_period),
+	 * to see the delta since ->prev_count:
+	 */
+	delta = (s64)hwc->irq_period + (s64)(s32)raw;
+
+	atomic64_counter_set(counter, hwc->prev_count + delta);
+
+	/*
+	 * Adjust the ->prev_count offset - if we went beyond
+	 * irq_period of units, then we got an IRQ and the counter
+	 * was set back to -irq_period:
+	 */
+	while (delta >= (s64)hwc->irq_period) {
+		hwc->prev_count += hwc->irq_period;
+		delta -= (s64)hwc->irq_period;
+	}
+
+	/*
+	 * Calculate the next raw counter value we'll write into
+	 * the counter at the next sched-in time:
+	 */
+	delta -= (s64)hwc->irq_period;
+
+	hwc->next_count = (s32)delta;
+}
+
+void perf_counter_print_debug(void)
+{
+	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count;
+	int cpu, err, idx;
+
+	local_irq_disable();
+
+	cpu = smp_processor_id();
+
+	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &ctrl);
+	WARN_ON_ONCE(err);
+
+	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_STATUS, &status);
+	WARN_ON_ONCE(err);
+
+	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_OVF_CTRL, &overflow);
+	WARN_ON_ONCE(err);
+
+	printk(KERN_INFO "\n");
+	printk(KERN_INFO "CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
+	printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
+	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
+
+	for (idx = 0; idx < nr_hw_counters; idx++) {
+		err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl);
+		WARN_ON_ONCE(err);
+
+		err = rdmsrl_safe(MSR_ARCH_PERFMON_PERFCTR0 + idx, &pmc_count);
+		WARN_ON_ONCE(err);
+
+		next_count = per_cpu(prev_next_count[idx], cpu);
+
+		printk(KERN_INFO "CPU#%d: PMC%d ctrl:  %016llx\n",
+			cpu, idx, pmc_ctrl);
+		printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n",
+			cpu, idx, pmc_count);
+		printk(KERN_INFO "CPU#%d: PMC%d next:  %016llx\n",
+			cpu, idx, next_count);
+	}
+	local_irq_enable();
+}
+
+void hw_perf_counter_disable(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	struct hw_perf_counter *hwc = &counter->hw;
+	unsigned int idx = hwc->idx;
+
+	counter->hw.config &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+	wrmsr(hwc->config_base + idx, hwc->config, 0);
+
+	clear_bit(idx, cpuc->used);
+	cpuc->counters[idx] = NULL;
+	__hw_perf_save_counter(counter, hwc, idx);
+}
+
+void hw_perf_counter_read(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	unsigned long addr = hwc->counter_base + hwc->idx;
+	s64 offs, val = -1LL;
+	s32 val32;
+	int err;
+
+	/* Careful: NMI might modify the counter offset */
+	do {
+		offs = hwc->prev_count;
+		err = rdmsrl_safe(addr, &val);
+		WARN_ON_ONCE(err);
+	} while (offs != hwc->prev_count);
+
+	val32 = (s32) val;
+	val =  (s64)hwc->irq_period + (s64)val32;
+	atomic64_counter_set(counter, hwc->prev_count + val);
+}
+
+static void perf_store_irq_data(struct perf_counter *counter, u64 data)
+{
+	struct perf_data *irqdata = counter->irqdata;
+
+	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
+		irqdata->overrun++;
+	} else {
+		u64 *p = (u64 *) &irqdata->data[irqdata->len];
+
+		*p = data;
+		irqdata->len += sizeof(u64);
+	}
+}
+
+static void perf_save_and_restart(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	int idx = hwc->idx;
+
+	wrmsr(hwc->config_base + idx,
+	      hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+
+	if (hwc->config & ARCH_PERFMON_EVENTSEL0_ENABLE) {
+		__hw_perf_save_counter(counter, hwc, idx);
+		__hw_perf_counter_enable(hwc, idx);
+	}
+}
+
+static void
+perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
+{
+	struct perf_counter_context *ctx = leader->ctx;
+	struct perf_counter *counter;
+	int bit;
+
+	list_for_each_entry(counter, &ctx->counters, list) {
+		if (counter->record_type != PERF_RECORD_SIMPLE ||
+		    counter == leader)
+			continue;
+
+		if (counter->active) {
+			/*
+			 * When counter was not in the overflow mask, we have to
+			 * read it from hardware. We read it as well, when it
+			 * has not been read yet and clear the bit in the
+			 * status mask.
+			 */
+			bit = counter->hw.idx;
+			if (!test_bit(bit, (unsigned long *) overflown) ||
+			    test_bit(bit, (unsigned long *) status)) {
+				clear_bit(bit, (unsigned long *) status);
+				perf_save_and_restart(counter);
+			}
+		}
+		perf_store_irq_data(leader, counter->hw_event_type);
+		perf_store_irq_data(leader, atomic64_counter_read(counter));
+	}
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
+{
+	int bit, cpu = smp_processor_id();
+	struct cpu_hw_counters *cpuc;
+	u64 ack, status;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+	if (!status) {
+		ack_APIC_irq();
+		return;
+	}
+
+	/* Disable counters globally */
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+	ack_APIC_irq();
+
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+again:
+	ack = status;
+	for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) {
+		struct perf_counter *counter = cpuc->counters[bit];
+
+		clear_bit(bit, (unsigned long *) &status);
+		if (!counter)
+			continue;
+
+		perf_save_and_restart(counter);
+
+		switch (counter->record_type) {
+		case PERF_RECORD_SIMPLE:
+			continue;
+		case PERF_RECORD_IRQ:
+			perf_store_irq_data(counter, instruction_pointer(regs));
+			break;
+		case PERF_RECORD_GROUP:
+			perf_store_irq_data(counter, counter->hw_event_type);
+			perf_store_irq_data(counter,
+					    atomic64_counter_read(counter));
+			perf_handle_group(counter, &status, &ack);
+			break;
+		}
+		/*
+		 * From NMI context we cannot call into the scheduler to
+		 * do a task wakeup - but we mark these counters as
+		 * wakeup_pending and initate a wakeup callback:
+		 */
+		if (nmi) {
+			counter->wakeup_pending = 1;
+			set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
+		} else {
+			wake_up(&counter->waitq);
+		}
+	}
+
+	wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0);
+
+	/*
+	 * Repeat if there is more work to be done:
+	 */
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+	if (status)
+		goto again;
+
+	/*
+	 * Do not reenable when global enable is off:
+	 */
+	if (cpuc->enable_all)
+		__hw_perf_enable_all();
+}
+
+void smp_perf_counter_interrupt(struct pt_regs *regs)
+{
+	irq_enter();
+#ifdef CONFIG_X86_64
+	add_pda(apic_perf_irqs, 1);
+#else
+	per_cpu(irq_stat, smp_processor_id()).apic_perf_irqs++;
+#endif
+	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
+	__smp_perf_counter_interrupt(regs, 0);
+
+	irq_exit();
+}
+
+/*
+ * This handler is triggered by NMI contexts:
+ */
+void perf_counter_notify(struct pt_regs *regs)
+{
+	struct cpu_hw_counters *cpuc;
+	unsigned long flags;
+	int bit, cpu;
+
+	local_irq_save(flags);
+	cpu = smp_processor_id();
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+	for_each_bit(bit, cpuc->used, nr_hw_counters) {
+		struct perf_counter *counter = cpuc->counters[bit];
+
+		if (!counter)
+			continue;
+
+		if (counter->wakeup_pending) {
+			counter->wakeup_pending = 0;
+			wake_up(&counter->waitq);
+		}
+	}
+
+	local_irq_restore(flags);
+}
+
+void __cpuinit perf_counters_lapic_init(int nmi)
+{
+	u32 apic_val;
+
+	if (!perf_counters_initialized)
+		return;
+	/*
+	 * Enable the performance counter vector in the APIC LVT:
+	 */
+	apic_val = apic_read(APIC_LVTERR);
+
+	apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
+	if (nmi)
+		apic_write(APIC_LVTPC, APIC_DM_NMI);
+	else
+		apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
+	apic_write(APIC_LVTERR, apic_val);
+}
+
+static int __kprobes
+perf_counter_nmi_handler(struct notifier_block *self,
+			 unsigned long cmd, void *__args)
+{
+	struct die_args *args = __args;
+	struct pt_regs *regs;
+
+	if (likely(cmd != DIE_NMI_IPI))
+		return NOTIFY_DONE;
+
+	regs = args->regs;
+
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	__smp_perf_counter_interrupt(regs, 1);
+
+	return NOTIFY_STOP;
+}
+
+static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
+	.notifier_call		= perf_counter_nmi_handler
+};
+
+void __init init_hw_perf_counters(void)
+{
+	union cpuid10_eax eax;
+	unsigned int unused;
+	unsigned int ebx;
+
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+		return;
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * Branch Misses Retired Event or not.
+	 */
+	cpuid(10, &(eax.full), &ebx, &unused, &unused);
+	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
+		return;
+
+	printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
+
+	printk(KERN_INFO "... version:      %d\n", eax.split.version_id);
+	printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters);
+	nr_hw_counters = eax.split.num_counters;
+	if (nr_hw_counters > MAX_HW_COUNTERS) {
+		nr_hw_counters = MAX_HW_COUNTERS;
+		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
+			nr_hw_counters, MAX_HW_COUNTERS);
+	}
+	perf_counter_mask = (1 << nr_hw_counters) - 1;
+	perf_max_counters = nr_hw_counters;
+
+	printk(KERN_INFO "... bit_width:    %d\n", eax.split.bit_width);
+	printk(KERN_INFO "... mask_length:  %d\n", eax.split.mask_length);
+
+	perf_counters_lapic_init(0);
+	register_die_notifier(&perf_counter_nmi_notifier);
+
+	perf_counters_initialized = true;
+}
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3194636a429..fc013cfde30 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -984,6 +984,11 @@ apicinterrupt ERROR_APIC_VECTOR \
 apicinterrupt SPURIOUS_APIC_VECTOR \
 	spurious_interrupt smp_spurious_interrupt
 
+#ifdef CONFIG_PERF_COUNTERS
+apicinterrupt LOCAL_PERF_VECTOR \
+	perf_counter_interrupt smp_perf_counter_interrupt
+#endif
+
 /*
  * Exception entry points.
  */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d1d4dc52f64..d92bc71e41a 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -56,6 +56,10 @@ static int show_other_interrupts(struct seq_file *p)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
 	seq_printf(p, "  Local timer interrupts\n");
+	seq_printf(p, "CNT: ");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
+	seq_printf(p, "  Performance counter interrupts\n");
 #endif
 #ifdef CONFIG_SMP
 	seq_printf(p, "RES: ");
@@ -160,6 +164,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	sum += irq_stats(cpu)->apic_timer_irqs;
+	sum += irq_stats(cpu)->apic_perf_irqs;
 #endif
 #ifdef CONFIG_SMP
 	sum += irq_stats(cpu)->irq_resched_count;
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 607db63044a..6a33b5e3016 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -160,6 +160,9 @@ void __init native_init_IRQ(void)
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+# ifdef CONFIG_PERF_COUNTERS
+	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+# endif
 #endif
 
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 8670b3ce626..91d785c25ad 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -138,6 +138,11 @@ static void __init apic_intr_init(void)
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+	/* Performance monitoring interrupt: */
+#ifdef CONFIG_PERF_COUNTERS
+	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+#endif
 }
 
 void __init native_init_IRQ(void)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index b1cc6da6420..dee553c503d 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,7 @@
  *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
  *  2000-2002   x86-64 support by Andi Kleen
  */
-
+#include <linux/perf_counter.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
@@ -891,6 +891,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 		tracehook_notify_resume(regs);
 	}
 
+	if (thread_info_flags & _TIF_PERF_COUNTERS) {
+		clear_thread_flag(TIF_PERF_COUNTERS);
+		perf_counter_notify(regs);
+	}
+
 #ifdef CONFIG_X86_32
 	clear_thread_flag(TIF_IRET);
 #endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d44395ff34c..496726ddcea 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,3 +332,4 @@ ENTRY(sys_call_table)
 	.long sys_dup3			/* 330 */
 	.long sys_pipe2
 	.long sys_inotify_init1
+	.long sys_perf_counter_open
-- 
cgit v1.2.3


From 87b9cf4623ad4e5fc009e48c020593dffd5d3793 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 8 Dec 2008 14:20:16 +0100
Subject: x86, perfcounters: read out MSR_CORE_PERF_GLOBAL_STATUS with counters
 disabled

Impact: make perfcounter NMI and IRQ sequence more robust

Make __smp_perf_counter_interrupt() a bit more conservative: first disable
all counters, then read out the status. Most invocations are because there
are real events, so there's no performance impact.

Code flow gets a bit simpler as well this way.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 82440cbed0e..615e953208e 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -383,18 +383,16 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 	struct cpu_hw_counters *cpuc;
 	u64 ack, status;
 
-	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-	if (!status) {
-		ack_APIC_irq();
-		return;
-	}
-
 	/* Disable counters globally */
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
 	ack_APIC_irq();
 
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+	if (!status)
+		goto out;
+
 again:
 	ack = status;
 	for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) {
@@ -440,7 +438,7 @@ again:
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 	if (status)
 		goto again;
-
+out:
 	/*
 	 * Do not reenable when global enable is off:
 	 */
-- 
cgit v1.2.3


From 7e2ae34749edf19e76e594b9c4b2cdde1066afc5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 9 Dec 2008 11:40:46 +0100
Subject: perfcounters, x86: simplify disable/enable of counters

Impact: fix spurious missed counter wakeups

In the case of NMI events, close a race window that can occur if an NMI
hits counter code that temporarily disables+enables a counter, and the NMI
leaks into the disabled section.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 40 ++++++++++++++++++++++++++------------
 1 file changed, 28 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 615e953208e..7d528ffc2d2 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -136,14 +136,25 @@ void hw_perf_disable_all(void)
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
 }
 
+static inline void
+__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
+{
+	wrmsr(hwc->config_base + idx, hwc->config, 0);
+}
+
 static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]);
 
-static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx)
 {
 	per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count;
 
 	wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
-	wrmsr(hwc->config_base + idx, hwc->config, 0);
+}
+
+static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+{
+	wrmsr(hwc->config_base + idx,
+	      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
 void hw_perf_counter_enable(struct perf_counter *counter)
@@ -161,11 +172,11 @@ void hw_perf_counter_enable(struct perf_counter *counter)
 
 	perf_counters_lapic_init(hwc->nmi);
 
-	wrmsr(hwc->config_base + idx,
-	      hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+	__hw_perf_counter_disable(hwc, idx);
 
 	cpuc->counters[idx] = counter;
-	counter->hw.config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+
+	__hw_perf_counter_set_period(hwc, idx);
 	__hw_perf_counter_enable(hwc, idx);
 }
 
@@ -286,8 +297,7 @@ void hw_perf_counter_disable(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned int idx = hwc->idx;
 
-	counter->hw.config &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
-	wrmsr(hwc->config_base + idx, hwc->config, 0);
+	__hw_perf_counter_disable(hwc, idx);
 
 	clear_bit(idx, cpuc->used);
 	cpuc->counters[idx] = NULL;
@@ -328,18 +338,24 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data)
 	}
 }
 
+/*
+ * NMI-safe enable method:
+ */
 static void perf_save_and_restart(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 	int idx = hwc->idx;
+	u64 pmc_ctrl;
+	int err;
 
-	wrmsr(hwc->config_base + idx,
-	      hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+	err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl);
+	WARN_ON_ONCE(err);
 
-	if (hwc->config & ARCH_PERFMON_EVENTSEL0_ENABLE) {
-		__hw_perf_save_counter(counter, hwc, idx);
+	__hw_perf_save_counter(counter, hwc, idx);
+	__hw_perf_counter_set_period(hwc, idx);
+
+	if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
 		__hw_perf_counter_enable(hwc, idx);
-	}
 }
 
 static void
-- 
cgit v1.2.3


From 1e12567678054bc1d4c944ecfad17624b3e49345 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 9 Dec 2008 12:18:18 +0100
Subject: perfcounters, x86: clean up debug code

Impact: cleanup

Get rid of unused debug code.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 35 ++++++++++++-----------------------
 1 file changed, 12 insertions(+), 23 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 7d528ffc2d2..919ec46679b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -214,13 +214,11 @@ static void __hw_perf_save_counter(struct perf_counter *counter,
 {
 	s64 raw = -1;
 	s64 delta;
-	int err;
 
 	/*
 	 * Get the raw hw counter value:
 	 */
-	err = rdmsrl_safe(hwc->counter_base + idx, &raw);
-	WARN_ON_ONCE(err);
+	rdmsrl(hwc->counter_base + idx, raw);
 
 	/*
 	 * Rebase it to zero (it started counting at -irq_period),
@@ -252,20 +250,18 @@ static void __hw_perf_save_counter(struct perf_counter *counter,
 void perf_counter_print_debug(void)
 {
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count;
-	int cpu, err, idx;
+	int cpu, idx;
+
+	if (!nr_hw_counters)
+		return;
 
 	local_irq_disable();
 
 	cpu = smp_processor_id();
 
-	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &ctrl);
-	WARN_ON_ONCE(err);
-
-	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_STATUS, &status);
-	WARN_ON_ONCE(err);
-
-	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_OVF_CTRL, &overflow);
-	WARN_ON_ONCE(err);
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+	rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
 
 	printk(KERN_INFO "\n");
 	printk(KERN_INFO "CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
@@ -273,11 +269,8 @@ void perf_counter_print_debug(void)
 	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
 
 	for (idx = 0; idx < nr_hw_counters; idx++) {
-		err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl);
-		WARN_ON_ONCE(err);
-
-		err = rdmsrl_safe(MSR_ARCH_PERFMON_PERFCTR0 + idx, &pmc_count);
-		WARN_ON_ONCE(err);
+		rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
+		rdmsrl(MSR_ARCH_PERFMON_PERFCTR0  + idx, pmc_count);
 
 		next_count = per_cpu(prev_next_count[idx], cpu);
 
@@ -310,13 +303,11 @@ void hw_perf_counter_read(struct perf_counter *counter)
 	unsigned long addr = hwc->counter_base + hwc->idx;
 	s64 offs, val = -1LL;
 	s32 val32;
-	int err;
 
 	/* Careful: NMI might modify the counter offset */
 	do {
 		offs = hwc->prev_count;
-		err = rdmsrl_safe(addr, &val);
-		WARN_ON_ONCE(err);
+		rdmsrl(addr, val);
 	} while (offs != hwc->prev_count);
 
 	val32 = (s32) val;
@@ -346,10 +337,8 @@ static void perf_save_and_restart(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	int idx = hwc->idx;
 	u64 pmc_ctrl;
-	int err;
 
-	err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl);
-	WARN_ON_ONCE(err);
+	rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 
 	__hw_perf_save_counter(counter, hwc, idx);
 	__hw_perf_counter_set_period(hwc, idx);
-- 
cgit v1.2.3


From 43874d238d5f208854a73c3225ca2a22833eec8b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 9 Dec 2008 12:23:59 +0100
Subject: perfcounters: consolidate global-disable codepaths

Impact: cleanup

Simplify global disable handling.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 919ec46679b..6a93d1f04d9 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -33,7 +33,6 @@ static u32 perf_counter_mask __read_mostly;
 struct cpu_hw_counters {
 	struct perf_counter	*counters[MAX_HW_COUNTERS];
 	unsigned long		used[BITS_TO_LONGS(MAX_HW_COUNTERS)];
-	int			enable_all;
 };
 
 /*
@@ -115,24 +114,13 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type)
 	return 0;
 }
 
-static void __hw_perf_enable_all(void)
-{
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
-}
-
 void hw_perf_enable_all(void)
 {
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-	cpuc->enable_all = 1;
-	__hw_perf_enable_all();
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
 }
 
 void hw_perf_disable_all(void)
 {
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-	cpuc->enable_all = 0;
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
 }
 
@@ -385,8 +373,10 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
 static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 {
 	int bit, cpu = smp_processor_id();
+	u64 ack, status, saved_global;
 	struct cpu_hw_counters *cpuc;
-	u64 ack, status;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
 
 	/* Disable counters globally */
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
@@ -445,10 +435,9 @@ again:
 		goto again;
 out:
 	/*
-	 * Do not reenable when global enable is off:
+	 * Restore - do not reenable when global enable is off:
 	 */
-	if (cpuc->enable_all)
-		__hw_perf_enable_all();
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0);
 }
 
 void smp_perf_counter_interrupt(struct pt_regs *regs)
-- 
cgit v1.2.3


From 4ac13294e44664bb7edf4daf52edb71e7c6bbe84 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Dec 2008 21:43:39 +0100
Subject: perf counters: protect them against CSTATE transitions

Impact: fix rare lost events problem

There are CPUs whose performance counters misbehave on CSTATE transitions,
so provide a way to just disable/enable them around deep idle methods.

(hw_perf_enable_all() is cheap on x86.)

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 6a93d1f04d9..0a7f3bea2dc 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -12,6 +12,7 @@
 #include <linux/notifier.h>
 #include <linux/hardirq.h>
 #include <linux/kprobes.h>
+#include <linux/module.h>
 #include <linux/kdebug.h>
 #include <linux/sched.h>
 
@@ -119,10 +120,21 @@ void hw_perf_enable_all(void)
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
 }
 
-void hw_perf_disable_all(void)
+void hw_perf_restore_ctrl(u64 ctrl)
 {
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
+}
+EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl);
+
+u64 hw_perf_disable_all(void)
+{
+	u64 ctrl;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+	return ctrl;
 }
+EXPORT_SYMBOL_GPL(hw_perf_disable_all);
 
 static inline void
 __hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
-- 
cgit v1.2.3


From dfa7c899b401d7dc5d85aca416aee64ac82812f2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 8 Dec 2008 19:35:37 +0100
Subject: perf counters: expand use of counter->event

Impact: change syscall, cleanup

Make use of the new perf_counters event type.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 0a7f3bea2dc..30e7ebf7827 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -56,9 +56,10 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
 /*
  * Setup the hardware configuration for a given hw_event_type
  */
-int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type)
+int hw_perf_counter_init(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
+	u32 hw_event_type = counter->event.hw_event_type;
 
 	if (unlikely(!perf_counters_initialized))
 		return -EINVAL;
@@ -83,7 +84,7 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type)
 	hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0;
 	hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
 
-	hwc->irq_period = counter->__irq_period;
+	hwc->irq_period = counter->event.hw_event_period;
 	/*
 	 * Intel PMCs cannot be accessed sanely above 32 bit width,
 	 * so we install an artificial 1<<31 period regardless of
@@ -95,21 +96,19 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type)
 	hwc->next_count = -((s32) hwc->irq_period);
 
 	/*
-	 * Negative event types mean raw encoded event+umask values:
+	 * Raw event type provide the config in the event structure
 	 */
-	if (hw_event_type < 0) {
-		counter->hw_event_type = -hw_event_type;
-		counter->hw_event_type &= ~PERF_COUNT_NMI;
+	hw_event_type &= ~PERF_COUNT_NMI;
+	if (hw_event_type == PERF_COUNT_RAW) {
+		hwc->config |= counter->event.hw_raw_ctrl;
 	} else {
-		hw_event_type &= ~PERF_COUNT_NMI;
 		if (hw_event_type >= max_intel_perfmon_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		counter->hw_event_type = intel_perfmon_event_map[hw_event_type];
+		hwc->config |= intel_perfmon_event_map[hw_event_type];
 	}
-	hwc->config |= counter->hw_event_type;
 	counter->wakeup_pending = 0;
 
 	return 0;
@@ -373,7 +372,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
 				perf_save_and_restart(counter);
 			}
 		}
-		perf_store_irq_data(leader, counter->hw_event_type);
+		perf_store_irq_data(leader, counter->event.hw_event_type);
 		perf_store_irq_data(leader, atomic64_counter_read(counter));
 	}
 }
@@ -418,7 +417,8 @@ again:
 			perf_store_irq_data(counter, instruction_pointer(regs));
 			break;
 		case PERF_RECORD_GROUP:
-			perf_store_irq_data(counter, counter->hw_event_type);
+			perf_store_irq_data(counter,
+					    counter->event.hw_event_type);
 			perf_store_irq_data(counter,
 					    atomic64_counter_read(counter));
 			perf_handle_group(counter, &status, &ack);
-- 
cgit v1.2.3


From 9f66a3810fe0d4100972db84290f3ae4a4d77025 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 10 Dec 2008 12:33:23 +0100
Subject: perf counters: restructure the API

Impact: clean up new API

Thorough cleanup of the new perf counters API, we now get clean separation
of the various concepts:

 - introduce perf_counter_hw_event to separate out the event source details

 - move special type flags into separate attributes: PERF_COUNT_NMI,
   PERF_COUNT_RAW

 - extend the type to u64 and reserve it fully to the architecture in the
   raw type case.

And make use of all these changes in the core and x86 perfcounters code.

Also change the syscall signature to:

  asmlinkage int sys_perf_counter_open(

	struct perf_counter_hw_event	*hw_event_uptr		__user,
	pid_t				pid,
	int				cpu,
	int				group_fd);

( Note that group_fd is unused for now - it's reserved for the counter
  groups abstraction. )

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 30e7ebf7827..ef1936a871a 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -58,8 +58,8 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
  */
 int hw_perf_counter_init(struct perf_counter *counter)
 {
+	struct perf_counter_hw_event *hw_event = &counter->hw_event;
 	struct hw_perf_counter *hwc = &counter->hw;
-	u32 hw_event_type = counter->event.hw_event_type;
 
 	if (unlikely(!perf_counters_initialized))
 		return -EINVAL;
@@ -77,14 +77,14 @@ int hw_perf_counter_init(struct perf_counter *counter)
 	hwc->nmi = 0;
 	if (capable(CAP_SYS_ADMIN)) {
 		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
-		if (hw_event_type & PERF_COUNT_NMI)
+		if (hw_event->nmi)
 			hwc->nmi = 1;
 	}
 
-	hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0;
-	hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
+	hwc->config_base	= MSR_ARCH_PERFMON_EVENTSEL0;
+	hwc->counter_base	= MSR_ARCH_PERFMON_PERFCTR0;
 
-	hwc->irq_period = counter->event.hw_event_period;
+	hwc->irq_period		= hw_event->irq_period;
 	/*
 	 * Intel PMCs cannot be accessed sanely above 32 bit width,
 	 * so we install an artificial 1<<31 period regardless of
@@ -93,21 +93,20 @@ int hw_perf_counter_init(struct perf_counter *counter)
 	if (!hwc->irq_period)
 		hwc->irq_period = 0x7FFFFFFF;
 
-	hwc->next_count = -((s32) hwc->irq_period);
+	hwc->next_count	= -(s32)hwc->irq_period;
 
 	/*
 	 * Raw event type provide the config in the event structure
 	 */
-	hw_event_type &= ~PERF_COUNT_NMI;
-	if (hw_event_type == PERF_COUNT_RAW) {
-		hwc->config |= counter->event.hw_raw_ctrl;
+	if (hw_event->raw) {
+		hwc->config |= hw_event->type;
 	} else {
-		if (hw_event_type >= max_intel_perfmon_events)
+		if (hw_event->type >= max_intel_perfmon_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= intel_perfmon_event_map[hw_event_type];
+		hwc->config |= intel_perfmon_event_map[hw_event->type];
 	}
 	counter->wakeup_pending = 0;
 
@@ -354,7 +353,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
 	int bit;
 
 	list_for_each_entry(counter, &ctx->counters, list) {
-		if (counter->record_type != PERF_RECORD_SIMPLE ||
+		if (counter->hw_event.record_type != PERF_RECORD_SIMPLE ||
 		    counter == leader)
 			continue;
 
@@ -372,7 +371,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
 				perf_save_and_restart(counter);
 			}
 		}
-		perf_store_irq_data(leader, counter->event.hw_event_type);
+		perf_store_irq_data(leader, counter->hw_event.type);
 		perf_store_irq_data(leader, atomic64_counter_read(counter));
 	}
 }
@@ -410,7 +409,7 @@ again:
 
 		perf_save_and_restart(counter);
 
-		switch (counter->record_type) {
+		switch (counter->hw_event.record_type) {
 		case PERF_RECORD_SIMPLE:
 			continue;
 		case PERF_RECORD_IRQ:
@@ -418,7 +417,7 @@ again:
 			break;
 		case PERF_RECORD_GROUP:
 			perf_store_irq_data(counter,
-					    counter->event.hw_event_type);
+					    counter->hw_event.type);
 			perf_store_irq_data(counter,
 					    atomic64_counter_read(counter));
 			perf_handle_group(counter, &status, &ack);
-- 
cgit v1.2.3


From 04289bb9891882202d7e961c4c04d2376930e9f9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 08:38:42 +0100
Subject: perf counters: add support for group counters

Impact: add group counters

This patch adds the "counter groups" abstraction.

Groups of counters behave much like normal 'single' counters, with a
few semantic and behavioral extensions on top of that.

A counter group is created by creating a new counter with the open()
syscall's group-leader group_fd file descriptor parameter pointing
to another, already existing counter.

Groups of counters are scheduled in and out in one atomic group, and
they are also roundrobin-scheduled atomically.

Counters that are member of a group can also record events with an
(atomic) extended timestamp that extends to all members of the group,
if the record type is set to PERF_RECORD_GROUP.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index ef1936a871a..54b4ad0cce6 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -346,18 +346,22 @@ static void perf_save_and_restart(struct perf_counter *counter)
 }
 
 static void
-perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
+perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 {
-	struct perf_counter_context *ctx = leader->ctx;
-	struct perf_counter *counter;
+	struct perf_counter *counter, *group_leader = sibling->group_leader;
 	int bit;
 
-	list_for_each_entry(counter, &ctx->counters, list) {
-		if (counter->hw_event.record_type != PERF_RECORD_SIMPLE ||
-		    counter == leader)
-			continue;
+	/*
+	 * Store the counter's own timestamp first:
+	 */
+	perf_store_irq_data(sibling, sibling->hw_event.type);
+	perf_store_irq_data(sibling, atomic64_counter_read(sibling));
 
-		if (counter->active) {
+	/*
+	 * Then store sibling timestamps (if any):
+	 */
+	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
+		if (!counter->active) {
 			/*
 			 * When counter was not in the overflow mask, we have to
 			 * read it from hardware. We read it as well, when it
@@ -371,8 +375,8 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
 				perf_save_and_restart(counter);
 			}
 		}
-		perf_store_irq_data(leader, counter->hw_event.type);
-		perf_store_irq_data(leader, atomic64_counter_read(counter));
+		perf_store_irq_data(sibling, counter->hw_event.type);
+		perf_store_irq_data(sibling, atomic64_counter_read(counter));
 	}
 }
 
@@ -416,10 +420,6 @@ again:
 			perf_store_irq_data(counter, instruction_pointer(regs));
 			break;
 		case PERF_RECORD_GROUP:
-			perf_store_irq_data(counter,
-					    counter->hw_event.type);
-			perf_store_irq_data(counter,
-					    atomic64_counter_read(counter));
 			perf_handle_group(counter, &status, &ack);
 			break;
 		}
-- 
cgit v1.2.3


From 621a01eac89b5e2f81a4cf576568b31f40a02724 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 12:46:46 +0100
Subject: perf counters: hw driver API

Impact: restructure code, introduce hw_ops driver abstraction

Introduce this abstraction to handle counter details:

 struct hw_perf_counter_ops {
	void (*hw_perf_counter_enable)	(struct perf_counter *counter);
	void (*hw_perf_counter_disable)	(struct perf_counter *counter);
	void (*hw_perf_counter_read)	(struct perf_counter *counter);
 };

This will be useful to support assymetric hw details, and it will also
be useful to implement "software counters". (Counters that count kernel
managed sw events such as pagefaults, context-switches, wall-clock time
or task-local time.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 37 +++++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 54b4ad0cce6..718b635dece 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -56,7 +56,7 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
 /*
  * Setup the hardware configuration for a given hw_event_type
  */
-int hw_perf_counter_init(struct perf_counter *counter)
+static int __hw_perf_counter_init(struct perf_counter *counter)
 {
 	struct perf_counter_hw_event *hw_event = &counter->hw_event;
 	struct hw_perf_counter *hwc = &counter->hw;
@@ -135,7 +135,7 @@ u64 hw_perf_disable_all(void)
 EXPORT_SYMBOL_GPL(hw_perf_disable_all);
 
 static inline void
-__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
+__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
 {
 	wrmsr(hwc->config_base + idx, hwc->config, 0);
 }
@@ -149,13 +149,13 @@ static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx)
 	wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
 }
 
-static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
 {
 	wrmsr(hwc->config_base + idx,
 	      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
-void hw_perf_counter_enable(struct perf_counter *counter)
+static void x86_perf_counter_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
@@ -170,12 +170,12 @@ void hw_perf_counter_enable(struct perf_counter *counter)
 
 	perf_counters_lapic_init(hwc->nmi);
 
-	__hw_perf_counter_disable(hwc, idx);
+	__x86_perf_counter_disable(hwc, idx);
 
 	cpuc->counters[idx] = counter;
 
 	__hw_perf_counter_set_period(hwc, idx);
-	__hw_perf_counter_enable(hwc, idx);
+	__x86_perf_counter_enable(hwc, idx);
 }
 
 #ifdef CONFIG_X86_64
@@ -282,20 +282,20 @@ void perf_counter_print_debug(void)
 	local_irq_enable();
 }
 
-void hw_perf_counter_disable(struct perf_counter *counter)
+static void x86_perf_counter_disable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned int idx = hwc->idx;
 
-	__hw_perf_counter_disable(hwc, idx);
+	__x86_perf_counter_disable(hwc, idx);
 
 	clear_bit(idx, cpuc->used);
 	cpuc->counters[idx] = NULL;
 	__hw_perf_save_counter(counter, hwc, idx);
 }
 
-void hw_perf_counter_read(struct perf_counter *counter)
+static void x86_perf_counter_read(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned long addr = hwc->counter_base + hwc->idx;
@@ -342,7 +342,7 @@ static void perf_save_and_restart(struct perf_counter *counter)
 	__hw_perf_counter_set_period(hwc, idx);
 
 	if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
-		__hw_perf_counter_enable(hwc, idx);
+		__x86_perf_counter_enable(hwc, idx);
 }
 
 static void
@@ -572,3 +572,20 @@ void __init init_hw_perf_counters(void)
 
 	perf_counters_initialized = true;
 }
+
+static struct hw_perf_counter_ops x86_perf_counter_ops = {
+	.hw_perf_counter_enable		= x86_perf_counter_enable,
+	.hw_perf_counter_disable	= x86_perf_counter_disable,
+	.hw_perf_counter_read		= x86_perf_counter_read,
+};
+
+struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter)
+{
+	int err;
+
+	err = __hw_perf_counter_init(counter);
+	if (err)
+		return NULL;
+
+	return &x86_perf_counter_ops;
+}
-- 
cgit v1.2.3


From 5c92d12411dfe5f0f3d1b1c1e2f756245e6f7249 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 13:21:10 +0100
Subject: perf counters: implement PERF_COUNT_CPU_CLOCK

Impact: add new perf-counter type

The 'CPU clock' counter counts the amount of CPU clock time that is
elapsing, in nanoseconds. (regardless of how much of it the task is
spending on a CPU executing)

This counter type is a Linux kernel based abstraction, it is available
even if the hardware does not support native hardware performance counters.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 36 ++++--------------------------------
 1 file changed, 4 insertions(+), 32 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 718b635dece..43c8e9a38b4 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -178,35 +178,6 @@ static void x86_perf_counter_enable(struct perf_counter *counter)
 	__x86_perf_counter_enable(hwc, idx);
 }
 
-#ifdef CONFIG_X86_64
-static inline void atomic64_counter_set(struct perf_counter *counter, u64 val)
-{
-	atomic64_set(&counter->count, val);
-}
-
-static inline u64 atomic64_counter_read(struct perf_counter *counter)
-{
-	return atomic64_read(&counter->count);
-}
-#else
-/*
- * Todo: add proper atomic64_t support to 32-bit x86:
- */
-static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64)
-{
-	u32 *val32 = (void *)&val64;
-
-	atomic_set(counter->count32 + 0, *(val32 + 0));
-	atomic_set(counter->count32 + 1, *(val32 + 1));
-}
-
-static inline u64 atomic64_counter_read(struct perf_counter *counter)
-{
-	return atomic_read(counter->count32 + 0) |
-		(u64) atomic_read(counter->count32 + 1) << 32;
-}
-#endif
-
 static void __hw_perf_save_counter(struct perf_counter *counter,
 				   struct hw_perf_counter *hwc, int idx)
 {
@@ -309,7 +280,7 @@ static void x86_perf_counter_read(struct perf_counter *counter)
 	} while (offs != hwc->prev_count);
 
 	val32 = (s32) val;
-	val =  (s64)hwc->irq_period + (s64)val32;
+	val = (s64)hwc->irq_period + (s64)val32;
 	atomic64_counter_set(counter, hwc->prev_count + val);
 }
 
@@ -573,13 +544,14 @@ void __init init_hw_perf_counters(void)
 	perf_counters_initialized = true;
 }
 
-static struct hw_perf_counter_ops x86_perf_counter_ops = {
+static const struct hw_perf_counter_ops x86_perf_counter_ops = {
 	.hw_perf_counter_enable		= x86_perf_counter_enable,
 	.hw_perf_counter_disable	= x86_perf_counter_disable,
 	.hw_perf_counter_read		= x86_perf_counter_read,
 };
 
-struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter)
+const struct hw_perf_counter_ops *
+hw_perf_counter_init(struct perf_counter *counter)
 {
 	int err;
 
-- 
cgit v1.2.3


From 01b2838c4298c5e0d30b4993c195ac34dd9df61e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 13:45:51 +0100
Subject: perf counters: consolidate hw_perf save/restore APIs

Impact: cleanup

Rename them to better match up the usual IRQ disable/enable APIs:

 hw_perf_disable_all()  => hw_perf_save_disable()
 hw_perf_restore_ctrl() => hw_perf_restore()

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 43c8e9a38b4..3e1dbebe22b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -118,13 +118,13 @@ void hw_perf_enable_all(void)
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
 }
 
-void hw_perf_restore_ctrl(u64 ctrl)
+void hw_perf_restore(u64 ctrl)
 {
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
 }
-EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl);
+EXPORT_SYMBOL_GPL(hw_perf_restore);
 
-u64 hw_perf_disable_all(void)
+u64 hw_perf_save_disable(void)
 {
 	u64 ctrl;
 
@@ -132,7 +132,7 @@ u64 hw_perf_disable_all(void)
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
 	return ctrl;
 }
-EXPORT_SYMBOL_GPL(hw_perf_disable_all);
+EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
 static inline void
 __x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
-- 
cgit v1.2.3


From 6a930700c8b655a9e25e42fc4adc0b225ebbcefc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 15:17:03 +0100
Subject: perf counters: clean up state transitions

Impact: cleanup

Introduce a proper enum for the 3 states of a counter:

	PERF_COUNTER_STATE_OFF		= -1
	PERF_COUNTER_STATE_INACTIVE	=  0
	PERF_COUNTER_STATE_ACTIVE	=  1

and rename counter->active to counter->state and propagate the
changes everywhere.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3e1dbebe22b..4854cca7fff 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -332,7 +332,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 	 * Then store sibling timestamps (if any):
 	 */
 	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
-		if (!counter->active) {
+		if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
 			/*
 			 * When counter was not in the overflow mask, we have to
 			 * read it from hardware. We read it as well, when it
-- 
cgit v1.2.3


From 9b194e831fb2c322ed81a373e49620f34edc2778 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 20:22:35 +0100
Subject: x86: implement atomic64_t on 32-bit

Impact: new API

Implement the atomic64_t APIs on 32-bit as well. Will be used by
the performance counters code.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/atomic_32.h | 218 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 218 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index ad5b9f6ecdd..9927e01b03c 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -255,5 +255,223 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+/* An 64bit atomic type */
+
+typedef struct {
+	unsigned long long counter;
+} atomic64_t;
+
+#define ATOMIC64_INIT(val)	{ (val) }
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically reads the value of @v.
+ * Doesn't imply a read memory barrier.
+ */
+#define __atomic64_read(ptr)		((ptr)->counter)
+
+static inline unsigned long long
+cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
+{
+	asm volatile(
+
+		LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
+
+		     :		"=A" (old)
+
+		     : [ptr]	"D" (ptr),
+				"A" (old),
+				"b" (ll_low(new)),
+				"c" (ll_high(new))
+
+		     : "memory");
+
+	return old;
+}
+
+static inline unsigned long long
+atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
+		 unsigned long long new_val)
+{
+	return cmpxchg8b(&ptr->counter, old_val, new_val);
+}
+
+/**
+ * atomic64_set - set atomic64 variable
+ * @ptr:      pointer to type atomic64_t
+ * @new_val:  value to assign
+ *
+ * Atomically sets the value of @ptr to @new_val.
+ */
+static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
+{
+	unsigned long long old_val;
+
+	do {
+		old_val = atomic_read(ptr);
+	} while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+}
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @ptr:      pointer to type atomic64_t
+ *
+ * Atomically reads the value of @ptr and returns it.
+ */
+static inline unsigned long long atomic64_read(atomic64_t *ptr)
+{
+	unsigned long long curr_val;
+
+	do {
+		curr_val = __atomic64_read(ptr);
+	} while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
+
+	return curr_val;
+}
+
+/**
+ * atomic64_add_return - add and return
+ * @delta: integer value to add
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns @delta + *@ptr
+ */
+static inline unsigned long long
+atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
+{
+	unsigned long long old_val, new_val;
+
+	do {
+		old_val = atomic_read(ptr);
+		new_val = old_val + delta;
+
+	} while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+
+	return new_val;
+}
+
+static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
+{
+	return atomic64_add_return(-delta, ptr);
+}
+
+static inline long atomic64_inc_return(atomic64_t *ptr)
+{
+	return atomic64_add_return(1, ptr);
+}
+
+static inline long atomic64_dec_return(atomic64_t *ptr)
+{
+	return atomic64_sub_return(1, ptr);
+}
+
+/**
+ * atomic64_add - add integer to atomic64 variable
+ * @delta: integer value to add
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr.
+ */
+static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
+{
+	atomic64_add_return(delta, ptr);
+}
+
+/**
+ * atomic64_sub - subtract the atomic64 variable
+ * @delta: integer value to subtract
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr.
+ */
+static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
+{
+	atomic64_add(-delta, ptr);
+}
+
+/**
+ * atomic64_sub_and_test - subtract value from variable and test result
+ * @delta: integer value to subtract
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int
+atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
+{
+	unsigned long long old_val = atomic64_sub_return(delta, ptr);
+
+	return old_val == 0;
+}
+
+/**
+ * atomic64_inc - increment atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1.
+ */
+static inline void atomic64_inc(atomic64_t *ptr)
+{
+	atomic64_add(1, ptr);
+}
+
+/**
+ * atomic64_dec - decrement atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1.
+ */
+static inline void atomic64_dec(atomic64_t *ptr)
+{
+	atomic64_sub(1, ptr);
+}
+
+/**
+ * atomic64_dec_and_test - decrement and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic64_dec_and_test(atomic64_t *ptr)
+{
+	return atomic64_sub_and_test(1, ptr);
+}
+
+/**
+ * atomic64_inc_and_test - increment and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic64_inc_and_test(atomic64_t *ptr)
+{
+	return atomic64_sub_and_test(-1, ptr);
+}
+
+/**
+ * atomic64_add_negative - add and test if negative
+ * @delta: integer value to add
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline int
+atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
+{
+	long long old_val = atomic64_add_return(delta, ptr);
+
+	return old_val < 0;
+}
+
 #include <asm-generic/atomic.h>
 #endif /* _ASM_X86_ATOMIC_32_H */
-- 
cgit v1.2.3


From ee06094f8279e1312fc0a31591320cc7b6f0ab1e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 13 Dec 2008 09:00:03 +0100
Subject: perfcounters: restructure x86 counter math

Impact: restructure code

Change counter math from absolute values to clear delta logic.

We try to extract elapsed deltas from the raw hw counter - and put
that into the generic counter.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                   |   2 +-
 arch/x86/kernel/cpu/perf_counter.c | 230 ++++++++++++++++++++-----------------
 2 files changed, 125 insertions(+), 107 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f2fdc186724..fe94490bab6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -643,7 +643,7 @@ config X86_UP_IOAPIC
 config X86_LOCAL_APIC
 	def_bool y
 	depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
-	select HAVE_PERF_COUNTERS
+	select HAVE_PERF_COUNTERS if (!M386 && !M486)
 
 config X86_IO_APIC
 	def_bool y
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b903f8df72b..5afae13d8d5 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -53,6 +53,48 @@ const int intel_perfmon_event_map[] =
 
 const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
 
+/*
+ * Propagate counter elapsed time into the generic counter.
+ * Can only be executed on the CPU where the counter is active.
+ * Returns the delta events processed.
+ */
+static void
+x86_perf_counter_update(struct perf_counter *counter,
+			struct hw_perf_counter *hwc, int idx)
+{
+	u64 prev_raw_count, new_raw_count, delta;
+
+	WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE);
+	/*
+	 * Careful: an NMI might modify the previous counter value.
+	 *
+	 * Our tactic to handle this is to first atomically read and
+	 * exchange a new raw count - then add that new-prev delta
+	 * count to the generic counter atomically:
+	 */
+again:
+	prev_raw_count = atomic64_read(&hwc->prev_count);
+	rdmsrl(hwc->counter_base + idx, new_raw_count);
+
+	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+					new_raw_count) != prev_raw_count)
+		goto again;
+
+	/*
+	 * Now we have the new raw value and have updated the prev
+	 * timestamp already. We can now calculate the elapsed delta
+	 * (counter-)time and add that to the generic counter.
+	 *
+	 * Careful, not all hw sign-extends above the physical width
+	 * of the count, so we do that by clipping the delta to 32 bits:
+	 */
+	delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
+	WARN_ON_ONCE((int)delta < 0);
+
+	atomic64_add(delta, &counter->count);
+	atomic64_sub(delta, &hwc->period_left);
+}
+
 /*
  * Setup the hardware configuration for a given hw_event_type
  */
@@ -90,10 +132,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * so we install an artificial 1<<31 period regardless of
 	 * the generic counter period:
 	 */
-	if (!hwc->irq_period)
+	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
 		hwc->irq_period = 0x7FFFFFFF;
 
-	hwc->next_count	= -(s32)hwc->irq_period;
+	atomic64_set(&hwc->period_left, hwc->irq_period);
 
 	/*
 	 * Raw event type provide the config in the event structure
@@ -118,12 +160,6 @@ void hw_perf_enable_all(void)
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
 }
 
-void hw_perf_restore(u64 ctrl)
-{
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
-}
-EXPORT_SYMBOL_GPL(hw_perf_restore);
-
 u64 hw_perf_save_disable(void)
 {
 	u64 ctrl;
@@ -134,27 +170,74 @@ u64 hw_perf_save_disable(void)
 }
 EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
+void hw_perf_restore(u64 ctrl)
+{
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
+}
+EXPORT_SYMBOL_GPL(hw_perf_restore);
+
 static inline void
-__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
+__x86_perf_counter_disable(struct perf_counter *counter,
+			   struct hw_perf_counter *hwc, unsigned int idx)
 {
-	wrmsr(hwc->config_base + idx, hwc->config, 0);
+	int err;
+
+	err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
+	WARN_ON_ONCE(err);
 }
 
-static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]);
+static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]);
 
-static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx)
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the counter disabled in hw:
+ */
+static void
+__hw_perf_counter_set_period(struct perf_counter *counter,
+			     struct hw_perf_counter *hwc, int idx)
 {
-	per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count;
+	s32 left = atomic64_read(&hwc->period_left);
+	s32 period = hwc->irq_period;
+
+	WARN_ON_ONCE(period <= 0);
+
+	/*
+	 * If we are way outside a reasoable range then just skip forward:
+	 */
+	if (unlikely(left <= -period)) {
+		left = period;
+		atomic64_set(&hwc->period_left, left);
+	}
+
+	if (unlikely(left <= 0)) {
+		left += period;
+		atomic64_set(&hwc->period_left, left);
+	}
 
-	wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
+	WARN_ON_ONCE(left <= 0);
+
+	per_cpu(prev_left[idx], smp_processor_id()) = left;
+
+	/*
+	 * The hw counter starts counting from this counter offset,
+	 * mark it to be able to extra future deltas:
+	 */
+	atomic64_set(&hwc->prev_count, (u64)(s64)-left);
+
+	wrmsr(hwc->counter_base + idx, -left, 0);
 }
 
-static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+static void
+__x86_perf_counter_enable(struct perf_counter *counter,
+			  struct hw_perf_counter *hwc, int idx)
 {
 	wrmsr(hwc->config_base + idx,
 	      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
+/*
+ * Find a PMC slot for the freshly enabled / scheduled in counter:
+ */
 static void x86_perf_counter_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -170,55 +253,17 @@ static void x86_perf_counter_enable(struct perf_counter *counter)
 
 	perf_counters_lapic_init(hwc->nmi);
 
-	__x86_perf_counter_disable(hwc, idx);
+	__x86_perf_counter_disable(counter, hwc, idx);
 
 	cpuc->counters[idx] = counter;
 
-	__hw_perf_counter_set_period(hwc, idx);
-	__x86_perf_counter_enable(hwc, idx);
-}
-
-static void __hw_perf_save_counter(struct perf_counter *counter,
-				   struct hw_perf_counter *hwc, int idx)
-{
-	s64 raw = -1;
-	s64 delta;
-
-	/*
-	 * Get the raw hw counter value:
-	 */
-	rdmsrl(hwc->counter_base + idx, raw);
-
-	/*
-	 * Rebase it to zero (it started counting at -irq_period),
-	 * to see the delta since ->prev_count:
-	 */
-	delta = (s64)hwc->irq_period + (s64)(s32)raw;
-
-	atomic64_counter_set(counter, hwc->prev_count + delta);
-
-	/*
-	 * Adjust the ->prev_count offset - if we went beyond
-	 * irq_period of units, then we got an IRQ and the counter
-	 * was set back to -irq_period:
-	 */
-	while (delta >= (s64)hwc->irq_period) {
-		hwc->prev_count += hwc->irq_period;
-		delta -= (s64)hwc->irq_period;
-	}
-
-	/*
-	 * Calculate the next raw counter value we'll write into
-	 * the counter at the next sched-in time:
-	 */
-	delta -= (s64)hwc->irq_period;
-
-	hwc->next_count = (s32)delta;
+	__hw_perf_counter_set_period(counter, hwc, idx);
+	__x86_perf_counter_enable(counter, hwc, idx);
 }
 
 void perf_counter_print_debug(void)
 {
-	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count;
+	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left;
 	int cpu, idx;
 
 	if (!nr_hw_counters)
@@ -241,14 +286,14 @@ void perf_counter_print_debug(void)
 		rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 		rdmsrl(MSR_ARCH_PERFMON_PERFCTR0  + idx, pmc_count);
 
-		next_count = per_cpu(prev_next_count[idx], cpu);
+		prev_left = per_cpu(prev_left[idx], cpu);
 
 		printk(KERN_INFO "CPU#%d: PMC%d ctrl:  %016llx\n",
 			cpu, idx, pmc_ctrl);
 		printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n",
 			cpu, idx, pmc_count);
-		printk(KERN_INFO "CPU#%d: PMC%d next:  %016llx\n",
-			cpu, idx, next_count);
+		printk(KERN_INFO "CPU#%d: PMC%d left:  %016llx\n",
+			cpu, idx, prev_left);
 	}
 	local_irq_enable();
 }
@@ -259,29 +304,16 @@ static void x86_perf_counter_disable(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned int idx = hwc->idx;
 
-	__x86_perf_counter_disable(hwc, idx);
+	__x86_perf_counter_disable(counter, hwc, idx);
 
 	clear_bit(idx, cpuc->used);
 	cpuc->counters[idx] = NULL;
-	__hw_perf_save_counter(counter, hwc, idx);
-}
 
-static void x86_perf_counter_read(struct perf_counter *counter)
-{
-	struct hw_perf_counter *hwc = &counter->hw;
-	unsigned long addr = hwc->counter_base + hwc->idx;
-	s64 offs, val = -1LL;
-	s32 val32;
-
-	/* Careful: NMI might modify the counter offset */
-	do {
-		offs = hwc->prev_count;
-		rdmsrl(addr, val);
-	} while (offs != hwc->prev_count);
-
-	val32 = (s32) val;
-	val = (s64)hwc->irq_period + (s64)val32;
-	atomic64_counter_set(counter, hwc->prev_count + val);
+	/*
+	 * Drain the remaining delta count out of a counter
+	 * that we are disabling:
+	 */
+	x86_perf_counter_update(counter, hwc, idx);
 }
 
 static void perf_store_irq_data(struct perf_counter *counter, u64 data)
@@ -299,7 +331,8 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data)
 }
 
 /*
- * NMI-safe enable method:
+ * Save and restart an expired counter. Called by NMI contexts,
+ * so it has to be careful about preempting normal counter ops:
  */
 static void perf_save_and_restart(struct perf_counter *counter)
 {
@@ -309,45 +342,25 @@ static void perf_save_and_restart(struct perf_counter *counter)
 
 	rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 
-	__hw_perf_save_counter(counter, hwc, idx);
-	__hw_perf_counter_set_period(hwc, idx);
+	x86_perf_counter_update(counter, hwc, idx);
+	__hw_perf_counter_set_period(counter, hwc, idx);
 
 	if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
-		__x86_perf_counter_enable(hwc, idx);
+		__x86_perf_counter_enable(counter, hwc, idx);
 }
 
 static void
 perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 {
 	struct perf_counter *counter, *group_leader = sibling->group_leader;
-	int bit;
-
-	/*
-	 * Store the counter's own timestamp first:
-	 */
-	perf_store_irq_data(sibling, sibling->hw_event.type);
-	perf_store_irq_data(sibling, atomic64_counter_read(sibling));
 
 	/*
-	 * Then store sibling timestamps (if any):
+	 * Store sibling timestamps (if any):
 	 */
 	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
-		if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
-			/*
-			 * When counter was not in the overflow mask, we have to
-			 * read it from hardware. We read it as well, when it
-			 * has not been read yet and clear the bit in the
-			 * status mask.
-			 */
-			bit = counter->hw.idx;
-			if (!test_bit(bit, (unsigned long *) overflown) ||
-			    test_bit(bit, (unsigned long *) status)) {
-				clear_bit(bit, (unsigned long *) status);
-				perf_save_and_restart(counter);
-			}
-		}
+		x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
 		perf_store_irq_data(sibling, counter->hw_event.type);
-		perf_store_irq_data(sibling, atomic64_counter_read(counter));
+		perf_store_irq_data(sibling, atomic64_read(&counter->count));
 	}
 }
 
@@ -540,6 +553,11 @@ void __init init_hw_perf_counters(void)
 	perf_counters_initialized = true;
 }
 
+static void x86_perf_counter_read(struct perf_counter *counter)
+{
+	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
+}
+
 static const struct hw_perf_counter_ops x86_perf_counter_ops = {
 	.hw_perf_counter_enable		= x86_perf_counter_enable,
 	.hw_perf_counter_disable	= x86_perf_counter_disable,
-- 
cgit v1.2.3


From 2b9ff0db19b5e2c77000b7201525f9c3d6e8328d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 18:36:30 +0100
Subject: perfcounters: fix non-intel-perfmon CPUs

Do not write MSR_CORE_PERF_GLOBAL_CTRL on CPUs where it does not exist.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 5afae13d8d5..6d30f603b62 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -157,6 +157,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 
 void hw_perf_enable_all(void)
 {
+	if (unlikely(!perf_counters_initialized))
+		return;
+
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
 }
 
@@ -164,14 +167,21 @@ u64 hw_perf_save_disable(void)
 {
 	u64 ctrl;
 
+	if (unlikely(!perf_counters_initialized))
+		return 0;
+
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+
 	return ctrl;
 }
 EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
 void hw_perf_restore(u64 ctrl)
 {
+	if (unlikely(!perf_counters_initialized))
+		return;
+
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
 }
 EXPORT_SYMBOL_GPL(hw_perf_restore);
-- 
cgit v1.2.3


From 75f224cf7700ed6006574dc3f2efa29860727570 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 21:58:46 +0100
Subject: perfcounters: fix lapic initialization

Fix non-working NMI sampling in certain bootup scenarios.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 6d30f603b62..8a154bd7ba9 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -557,10 +557,10 @@ void __init init_hw_perf_counters(void)
 	printk(KERN_INFO "... bit_width:    %d\n", eax.split.bit_width);
 	printk(KERN_INFO "... mask_length:  %d\n", eax.split.mask_length);
 
+	perf_counters_initialized = true;
+
 	perf_counters_lapic_init(0);
 	register_die_notifier(&perf_counter_nmi_notifier);
-
-	perf_counters_initialized = true;
 }
 
 static void x86_perf_counter_read(struct perf_counter *counter)
-- 
cgit v1.2.3


From 94c46572a6d9bb497eda0a14099d9f1360d57d5d Mon Sep 17 00:00:00 2001
From: Jaswinder Singh <jaswinder@infradead.org>
Date: Fri, 19 Dec 2008 22:37:58 +0530
Subject: x86: perf_counter.c intel_perfmon_event_map and
 max_intel_perfmon_events should be static

Impact: cleanup, avoid sparse warnings, reduce kernel size a bit

Fixes these sparse warnings:
 arch/x86/kernel/cpu/perf_counter.c:44:11: warning: symbol 'intel_perfmon_event_map' was not declared. Should it be static?
 arch/x86/kernel/cpu/perf_counter.c:54:11: warning: symbol 'max_intel_perfmon_events' was not declared. Should it be static?

Signed-off-by: Jaswinder Singh <jaswinder@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 8a154bd7ba9..bdbdb56eaa3 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -41,7 +41,7 @@ struct cpu_hw_counters {
  */
 static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
 
-const int intel_perfmon_event_map[] =
+static const int intel_perfmon_event_map[] =
 {
   [PERF_COUNT_CYCLES]			= 0x003c,
   [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
@@ -51,7 +51,7 @@ const int intel_perfmon_event_map[] =
   [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
 };
 
-const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
+static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
 
 /*
  * Propagate counter elapsed time into the generic counter.
-- 
cgit v1.2.3


From 8fb9331391af95ca1f4e5c0a0da8120b13cbae01 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 23 Dec 2008 12:04:16 +0100
Subject: perfcounters: remove warnings

Impact: remove debug checks

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index bdbdb56eaa3..89fad5d4fb3 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -64,7 +64,6 @@ x86_perf_counter_update(struct perf_counter *counter,
 {
 	u64 prev_raw_count, new_raw_count, delta;
 
-	WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE);
 	/*
 	 * Careful: an NMI might modify the previous counter value.
 	 *
@@ -89,7 +88,6 @@ again:
 	 * of the count, so we do that by clipping the delta to 32 bits:
 	 */
 	delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
-	WARN_ON_ONCE((int)delta < 0);
 
 	atomic64_add(delta, &counter->count);
 	atomic64_sub(delta, &hwc->period_left);
@@ -193,7 +191,6 @@ __x86_perf_counter_disable(struct perf_counter *counter,
 	int err;
 
 	err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
-	WARN_ON_ONCE(err);
 }
 
 static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]);
@@ -209,8 +206,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter,
 	s32 left = atomic64_read(&hwc->period_left);
 	s32 period = hwc->irq_period;
 
-	WARN_ON_ONCE(period <= 0);
-
 	/*
 	 * If we are way outside a reasoable range then just skip forward:
 	 */
@@ -224,8 +219,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter,
 		atomic64_set(&hwc->period_left, left);
 	}
 
-	WARN_ON_ONCE(left <= 0);
-
 	per_cpu(prev_left[idx], smp_processor_id()) = left;
 
 	/*
-- 
cgit v1.2.3


From 5c167b8585c8d91206b395d57011ead7711e322f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 09:02:19 +0100
Subject: x86, perfcounters: rename intel_arch_perfmon.h => perf_counter.h

Impact: rename include file

We'll be providing an asm/perf_counter.h to the generic perfcounter code,
so use the already existing x86 file for this purpose and rename it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/intel_arch_perfmon.h | 41 -------------------------------
 arch/x86/include/asm/perf_counter.h       | 41 +++++++++++++++++++++++++++++++
 arch/x86/kernel/apic.c                    |  2 +-
 arch/x86/kernel/cpu/common.c              |  2 +-
 arch/x86/kernel/cpu/perf_counter.c        |  2 +-
 arch/x86/kernel/cpu/perfctr-watchdog.c    |  2 +-
 arch/x86/oprofile/op_model_ppro.c         |  2 +-
 7 files changed, 46 insertions(+), 46 deletions(-)
 delete mode 100644 arch/x86/include/asm/intel_arch_perfmon.h
 create mode 100644 arch/x86/include/asm/perf_counter.h

(limited to 'arch')

diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
deleted file mode 100644
index 71598a9eab6..00000000000
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
-#define _ASM_X86_INTEL_ARCH_PERFMON_H
-
-#define MSR_ARCH_PERFMON_PERFCTR0			      0xc1
-#define MSR_ARCH_PERFMON_PERFCTR1			      0xc2
-
-#define MSR_ARCH_PERFMON_EVENTSEL0			     0x186
-#define MSR_ARCH_PERFMON_EVENTSEL1			     0x187
-
-#define ARCH_PERFMON_EVENTSEL0_ENABLE			  (1 << 22)
-#define ARCH_PERFMON_EVENTSEL_INT			  (1 << 20)
-#define ARCH_PERFMON_EVENTSEL_OS			  (1 << 17)
-#define ARCH_PERFMON_EVENTSEL_USR			  (1 << 16)
-
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 		 0
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
-		(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
-
-#define ARCH_PERFMON_BRANCH_MISSES_RETIRED			 6
-
-union cpuid10_eax {
-	struct {
-		unsigned int version_id:8;
-		unsigned int num_counters:8;
-		unsigned int bit_width:8;
-		unsigned int mask_length:8;
-	} split;
-	unsigned int full;
-};
-
-#ifdef CONFIG_PERF_COUNTERS
-extern void init_hw_perf_counters(void);
-extern void perf_counters_lapic_init(int nmi);
-#else
-static inline void init_hw_perf_counters(void)		{ }
-static inline void perf_counters_lapic_init(int nmi)	{ }
-#endif
-
-#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
new file mode 100644
index 00000000000..9dadce1124e
--- /dev/null
+++ b/arch/x86/include/asm/perf_counter.h
@@ -0,0 +1,41 @@
+#ifndef _ASM_X86_PERF_COUNTER_H
+#define _ASM_X86_PERF_COUNTER_H
+
+#define MSR_ARCH_PERFMON_PERFCTR0			      0xc1
+#define MSR_ARCH_PERFMON_PERFCTR1			      0xc2
+
+#define MSR_ARCH_PERFMON_EVENTSEL0			     0x186
+#define MSR_ARCH_PERFMON_EVENTSEL1			     0x187
+
+#define ARCH_PERFMON_EVENTSEL0_ENABLE			  (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_INT			  (1 << 20)
+#define ARCH_PERFMON_EVENTSEL_OS			  (1 << 17)
+#define ARCH_PERFMON_EVENTSEL_USR			  (1 << 16)
+
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 		 0
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
+		(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+
+#define ARCH_PERFMON_BRANCH_MISSES_RETIRED			 6
+
+union cpuid10_eax {
+	struct {
+		unsigned int version_id:8;
+		unsigned int num_counters:8;
+		unsigned int bit_width:8;
+		unsigned int mask_length:8;
+	} split;
+	unsigned int full;
+};
+
+#ifdef CONFIG_PERF_COUNTERS
+extern void init_hw_perf_counters(void);
+extern void perf_counters_lapic_init(int nmi);
+#else
+static inline void init_hw_perf_counters(void)		{ }
+static inline void perf_counters_lapic_init(int nmi)	{ }
+#endif
+
+#endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 0579ec1cd6e..4f859acb156 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -31,7 +31,7 @@
 #include <linux/dmi.h>
 #include <linux/dmar.h>
 
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
 #include <asm/atomic.h>
 #include <asm/smp.h>
 #include <asm/mtrr.h>
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4461011db47..ad331b4d623 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -17,7 +17,7 @@
 #include <asm/mmu_context.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
 #include <asm/pat.h>
 #include <asm/asm.h>
 #include <asm/numa.h>
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 89fad5d4fb3..a4a3a09a654 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -16,7 +16,7 @@
 #include <linux/kdebug.h>
 #include <linux/sched.h>
 
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
 #include <asm/apic.h>
 
 static bool perf_counters_initialized __read_mostly;
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 9abd48b2267..d6f5b9fbde3 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -20,7 +20,7 @@
 #include <linux/kprobes.h>
 
 #include <asm/apic.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
 
 struct nmi_watchdog_ctlblk {
 	unsigned int cccr_msr;
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index e9f80c744cf..07c914555a5 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -18,7 +18,7 @@
 #include <asm/msr.h>
 #include <asm/apic.h>
 #include <asm/nmi.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
 
 #include "op_x86_model.h"
 #include "op_counter.h"
-- 
cgit v1.2.3


From eb2b861810d4ff72454c83996b891df4e0aaff9a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 09:09:13 +0100
Subject: x86, perfcounters: prepare for fixed-mode PMCs

Impact: refactor the x86 code for fixed-mode PMCs

Extend the data structures and rename the existing facilities
to allow for a 'generic' versus 'fixed' counter distinction.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_counter.h | 11 ++++++++
 arch/x86/kernel/cpu/perf_counter.c  | 53 ++++++++++++++++++-------------------
 2 files changed, 37 insertions(+), 27 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 9dadce1124e..dd5a4a559e2 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -1,6 +1,13 @@
 #ifndef _ASM_X86_PERF_COUNTER_H
 #define _ASM_X86_PERF_COUNTER_H
 
+/*
+ * Performance counter hw details:
+ */
+
+#define X86_PMC_MAX_GENERIC					8
+#define X86_PMC_MAX_FIXED					3
+
 #define MSR_ARCH_PERFMON_PERFCTR0			      0xc1
 #define MSR_ARCH_PERFMON_PERFCTR1			      0xc2
 
@@ -20,6 +27,10 @@
 
 #define ARCH_PERFMON_BRANCH_MISSES_RETIRED			 6
 
+/*
+ * Intel "Architectural Performance Monitoring" CPUID
+ * detection/enumeration details:
+ */
 union cpuid10_eax {
 	struct {
 		unsigned int version_id:8;
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a4a3a09a654..fc3af868823 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -27,13 +27,12 @@ static bool perf_counters_initialized __read_mostly;
 static int nr_hw_counters __read_mostly;
 static u32 perf_counter_mask __read_mostly;
 
-/* No support for fixed function counters yet */
-
-#define MAX_HW_COUNTERS		8
-
 struct cpu_hw_counters {
-	struct perf_counter	*counters[MAX_HW_COUNTERS];
-	unsigned long		used[BITS_TO_LONGS(MAX_HW_COUNTERS)];
+	struct perf_counter	*generic[X86_PMC_MAX_GENERIC];
+	unsigned long		used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)];
+
+	struct perf_counter	*fixed[X86_PMC_MAX_FIXED];
+	unsigned long		used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)];
 };
 
 /*
@@ -185,7 +184,7 @@ void hw_perf_restore(u64 ctrl)
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
 static inline void
-__x86_perf_counter_disable(struct perf_counter *counter,
+__pmc_generic_disable(struct perf_counter *counter,
 			   struct hw_perf_counter *hwc, unsigned int idx)
 {
 	int err;
@@ -193,7 +192,7 @@ __x86_perf_counter_disable(struct perf_counter *counter,
 	err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
 }
 
-static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]);
+static DEFINE_PER_CPU(u64, prev_left[X86_PMC_MAX_GENERIC]);
 
 /*
  * Set the next IRQ period, based on the hwc->period_left value.
@@ -231,7 +230,7 @@ __hw_perf_counter_set_period(struct perf_counter *counter,
 }
 
 static void
-__x86_perf_counter_enable(struct perf_counter *counter,
+__pmc_generic_enable(struct perf_counter *counter,
 			  struct hw_perf_counter *hwc, int idx)
 {
 	wrmsr(hwc->config_base + idx,
@@ -241,7 +240,7 @@ __x86_perf_counter_enable(struct perf_counter *counter,
 /*
  * Find a PMC slot for the freshly enabled / scheduled in counter:
  */
-static void x86_perf_counter_enable(struct perf_counter *counter)
+static void pmc_generic_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
@@ -256,12 +255,12 @@ static void x86_perf_counter_enable(struct perf_counter *counter)
 
 	perf_counters_lapic_init(hwc->nmi);
 
-	__x86_perf_counter_disable(counter, hwc, idx);
+	__pmc_generic_disable(counter, hwc, idx);
 
-	cpuc->counters[idx] = counter;
+	cpuc->generic[idx] = counter;
 
 	__hw_perf_counter_set_period(counter, hwc, idx);
-	__x86_perf_counter_enable(counter, hwc, idx);
+	__pmc_generic_enable(counter, hwc, idx);
 }
 
 void perf_counter_print_debug(void)
@@ -301,16 +300,16 @@ void perf_counter_print_debug(void)
 	local_irq_enable();
 }
 
-static void x86_perf_counter_disable(struct perf_counter *counter)
+static void pmc_generic_disable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned int idx = hwc->idx;
 
-	__x86_perf_counter_disable(counter, hwc, idx);
+	__pmc_generic_disable(counter, hwc, idx);
 
 	clear_bit(idx, cpuc->used);
-	cpuc->counters[idx] = NULL;
+	cpuc->generic[idx] = NULL;
 
 	/*
 	 * Drain the remaining delta count out of a counter
@@ -349,7 +348,7 @@ static void perf_save_and_restart(struct perf_counter *counter)
 	__hw_perf_counter_set_period(counter, hwc, idx);
 
 	if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
-		__x86_perf_counter_enable(counter, hwc, idx);
+		__pmc_generic_enable(counter, hwc, idx);
 }
 
 static void
@@ -392,7 +391,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 again:
 	ack = status;
 	for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) {
-		struct perf_counter *counter = cpuc->counters[bit];
+		struct perf_counter *counter = cpuc->generic[bit];
 
 		clear_bit(bit, (unsigned long *) &status);
 		if (!counter)
@@ -412,7 +411,7 @@ again:
 		}
 		/*
 		 * From NMI context we cannot call into the scheduler to
-		 * do a task wakeup - but we mark these counters as
+		 * do a task wakeup - but we mark these generic as
 		 * wakeup_pending and initate a wakeup callback:
 		 */
 		if (nmi) {
@@ -462,7 +461,7 @@ void perf_counter_notify(struct pt_regs *regs)
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
 	for_each_bit(bit, cpuc->used, nr_hw_counters) {
-		struct perf_counter *counter = cpuc->counters[bit];
+		struct perf_counter *counter = cpuc->generic[bit];
 
 		if (!counter)
 			continue;
@@ -539,10 +538,10 @@ void __init init_hw_perf_counters(void)
 	printk(KERN_INFO "... version:      %d\n", eax.split.version_id);
 	printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters);
 	nr_hw_counters = eax.split.num_counters;
-	if (nr_hw_counters > MAX_HW_COUNTERS) {
-		nr_hw_counters = MAX_HW_COUNTERS;
+	if (nr_hw_counters > X86_PMC_MAX_GENERIC) {
+		nr_hw_counters = X86_PMC_MAX_GENERIC;
 		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
-			nr_hw_counters, MAX_HW_COUNTERS);
+			nr_hw_counters, X86_PMC_MAX_GENERIC);
 	}
 	perf_counter_mask = (1 << nr_hw_counters) - 1;
 	perf_max_counters = nr_hw_counters;
@@ -556,15 +555,15 @@ void __init init_hw_perf_counters(void)
 	register_die_notifier(&perf_counter_nmi_notifier);
 }
 
-static void x86_perf_counter_read(struct perf_counter *counter)
+static void pmc_generic_read(struct perf_counter *counter)
 {
 	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
 }
 
 static const struct hw_perf_counter_ops x86_perf_counter_ops = {
-	.hw_perf_counter_enable		= x86_perf_counter_enable,
-	.hw_perf_counter_disable	= x86_perf_counter_disable,
-	.hw_perf_counter_read		= x86_perf_counter_read,
+	.hw_perf_counter_enable		= pmc_generic_enable,
+	.hw_perf_counter_disable	= pmc_generic_disable,
+	.hw_perf_counter_read		= pmc_generic_read,
 };
 
 const struct hw_perf_counter_ops *
-- 
cgit v1.2.3


From 703e937c83bbad79075a7846e062e447c2fee6a4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 10:51:15 +0100
Subject: perfcounters: add fixed-mode PMC enumeration

Enumerate fixed-mode PMCs based on CPUID, and feed that into the
perfcounter code.

Does not use fixed-mode PMCs yet.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_counter.h | 23 +++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_counter.c  | 23 +++++++++++++++++------
 2 files changed, 40 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index dd5a4a559e2..945a315e6d6 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -41,6 +41,29 @@ union cpuid10_eax {
 	unsigned int full;
 };
 
+union cpuid10_edx {
+	struct {
+		unsigned int num_counters_fixed:4;
+		unsigned int reserved:28;
+	} split;
+	unsigned int full;
+};
+
+
+/*
+ * Fixed-purpose performance counters:
+ */
+
+/* Instr_Retired.Any: */
+#define MSR_ARCH_PERFMON_FIXED_CTR0			0x309
+
+/* CPU_CLK_Unhalted.Core: */
+#define MSR_ARCH_PERFMON_FIXED_CTR1			0x30a
+
+/* CPU_CLK_Unhalted.Ref: */
+#define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
+
+
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
 extern void perf_counters_lapic_init(int nmi);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index fc3af868823..2fca50c4597 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -27,6 +27,8 @@ static bool perf_counters_initialized __read_mostly;
 static int nr_hw_counters __read_mostly;
 static u32 perf_counter_mask __read_mostly;
 
+static int nr_hw_counters_fixed __read_mostly;
+
 struct cpu_hw_counters {
 	struct perf_counter	*generic[X86_PMC_MAX_GENERIC];
 	unsigned long		used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)];
@@ -519,8 +521,9 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 void __init init_hw_perf_counters(void)
 {
 	union cpuid10_eax eax;
-	unsigned int unused;
 	unsigned int ebx;
+	unsigned int unused;
+	union cpuid10_edx edx;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 		return;
@@ -529,14 +532,14 @@ void __init init_hw_perf_counters(void)
 	 * Check whether the Architectural PerfMon supports
 	 * Branch Misses Retired Event or not.
 	 */
-	cpuid(10, &(eax.full), &ebx, &unused, &unused);
+	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
 	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
 		return;
 
 	printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
 
-	printk(KERN_INFO "... version:      %d\n", eax.split.version_id);
-	printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters);
+	printk(KERN_INFO "... version:         %d\n", eax.split.version_id);
+	printk(KERN_INFO "... num counters:    %d\n", eax.split.num_counters);
 	nr_hw_counters = eax.split.num_counters;
 	if (nr_hw_counters > X86_PMC_MAX_GENERIC) {
 		nr_hw_counters = X86_PMC_MAX_GENERIC;
@@ -546,8 +549,16 @@ void __init init_hw_perf_counters(void)
 	perf_counter_mask = (1 << nr_hw_counters) - 1;
 	perf_max_counters = nr_hw_counters;
 
-	printk(KERN_INFO "... bit_width:    %d\n", eax.split.bit_width);
-	printk(KERN_INFO "... mask_length:  %d\n", eax.split.mask_length);
+	printk(KERN_INFO "... bit width:       %d\n", eax.split.bit_width);
+	printk(KERN_INFO "... mask length:     %d\n", eax.split.mask_length);
+
+	nr_hw_counters_fixed = edx.split.num_counters_fixed;
+	if (nr_hw_counters_fixed > X86_PMC_MAX_FIXED) {
+		nr_hw_counters_fixed = X86_PMC_MAX_FIXED;
+		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
+			nr_hw_counters_fixed, X86_PMC_MAX_FIXED);
+	}
+	printk(KERN_INFO "... fixed counters:  %d\n", nr_hw_counters_fixed);
 
 	perf_counters_initialized = true;
 
-- 
cgit v1.2.3


From 862a1a5f346fe7e9181ea51eaae48cf2cd70f746 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 13:09:20 +0100
Subject: x86, perfcounters: refactor code for fixed-function PMCs

Impact: clean up

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_counter.h | 14 ++++++-
 arch/x86/kernel/cpu/perf_counter.c  | 73 ++++++++++++++++++++-----------------
 2 files changed, 52 insertions(+), 35 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 945a315e6d6..13745deb16c 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -8,6 +8,10 @@
 #define X86_PMC_MAX_GENERIC					8
 #define X86_PMC_MAX_FIXED					3
 
+#define X86_PMC_IDX_GENERIC				        0
+#define X86_PMC_IDX_FIXED				       32
+#define X86_PMC_IDX_MAX					       64
+
 #define MSR_ARCH_PERFMON_PERFCTR0			      0xc1
 #define MSR_ARCH_PERFMON_PERFCTR1			      0xc2
 
@@ -54,6 +58,15 @@ union cpuid10_edx {
  * Fixed-purpose performance counters:
  */
 
+/*
+ * All 3 fixed-mode PMCs are configured via this single MSR:
+ */
+#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL			0x38d
+
+/*
+ * The counts are available in three separate MSRs:
+ */
+
 /* Instr_Retired.Any: */
 #define MSR_ARCH_PERFMON_FIXED_CTR0			0x309
 
@@ -63,7 +76,6 @@ union cpuid10_edx {
 /* CPU_CLK_Unhalted.Ref: */
 #define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
 
-
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
 extern void perf_counters_lapic_init(int nmi);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 2fca50c4597..358af526640 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -24,17 +24,14 @@ static bool perf_counters_initialized __read_mostly;
 /*
  * Number of (generic) HW counters:
  */
-static int nr_hw_counters __read_mostly;
-static u32 perf_counter_mask __read_mostly;
+static int nr_counters_generic __read_mostly;
+static u64 perf_counter_mask __read_mostly;
 
-static int nr_hw_counters_fixed __read_mostly;
+static int nr_counters_fixed __read_mostly;
 
 struct cpu_hw_counters {
-	struct perf_counter	*generic[X86_PMC_MAX_GENERIC];
-	unsigned long		used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)];
-
-	struct perf_counter	*fixed[X86_PMC_MAX_FIXED];
-	unsigned long		used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)];
+	struct perf_counter	*counters[X86_PMC_IDX_MAX];
+	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 };
 
 /*
@@ -159,7 +156,7 @@ void hw_perf_enable_all(void)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask);
 }
 
 u64 hw_perf_save_disable(void)
@@ -170,7 +167,7 @@ u64 hw_perf_save_disable(void)
 		return 0;
 
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 
 	return ctrl;
 }
@@ -181,7 +178,7 @@ void hw_perf_restore(u64 ctrl)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 }
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
@@ -239,6 +236,11 @@ __pmc_generic_enable(struct perf_counter *counter,
 	      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
+static int fixed_mode_idx(struct hw_perf_counter *hwc)
+{
+	return -1;
+}
+
 /*
  * Find a PMC slot for the freshly enabled / scheduled in counter:
  */
@@ -250,7 +252,7 @@ static void pmc_generic_enable(struct perf_counter *counter)
 
 	/* Try to get the previous counter again */
 	if (test_and_set_bit(idx, cpuc->used)) {
-		idx = find_first_zero_bit(cpuc->used, nr_hw_counters);
+		idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
 		set_bit(idx, cpuc->used);
 		hwc->idx = idx;
 	}
@@ -259,7 +261,7 @@ static void pmc_generic_enable(struct perf_counter *counter)
 
 	__pmc_generic_disable(counter, hwc, idx);
 
-	cpuc->generic[idx] = counter;
+	cpuc->counters[idx] = counter;
 
 	__hw_perf_counter_set_period(counter, hwc, idx);
 	__pmc_generic_enable(counter, hwc, idx);
@@ -270,7 +272,7 @@ void perf_counter_print_debug(void)
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left;
 	int cpu, idx;
 
-	if (!nr_hw_counters)
+	if (!nr_counters_generic)
 		return;
 
 	local_irq_disable();
@@ -286,7 +288,7 @@ void perf_counter_print_debug(void)
 	printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
 	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
 
-	for (idx = 0; idx < nr_hw_counters; idx++) {
+	for (idx = 0; idx < nr_counters_generic; idx++) {
 		rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 		rdmsrl(MSR_ARCH_PERFMON_PERFCTR0  + idx, pmc_count);
 
@@ -311,7 +313,7 @@ static void pmc_generic_disable(struct perf_counter *counter)
 	__pmc_generic_disable(counter, hwc, idx);
 
 	clear_bit(idx, cpuc->used);
-	cpuc->generic[idx] = NULL;
+	cpuc->counters[idx] = NULL;
 
 	/*
 	 * Drain the remaining delta count out of a counter
@@ -381,7 +383,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
 
 	/* Disable counters globally */
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 	ack_APIC_irq();
 
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
@@ -392,8 +394,8 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 
 again:
 	ack = status;
-	for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) {
-		struct perf_counter *counter = cpuc->generic[bit];
+	for_each_bit(bit, (unsigned long *) &status, nr_counters_generic) {
+		struct perf_counter *counter = cpuc->counters[bit];
 
 		clear_bit(bit, (unsigned long *) &status);
 		if (!counter)
@@ -424,7 +426,7 @@ again:
 		}
 	}
 
-	wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
 
 	/*
 	 * Repeat if there is more work to be done:
@@ -436,7 +438,7 @@ out:
 	/*
 	 * Restore - do not reenable when global enable is off:
 	 */
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
 }
 
 void smp_perf_counter_interrupt(struct pt_regs *regs)
@@ -462,8 +464,8 @@ void perf_counter_notify(struct pt_regs *regs)
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
-	for_each_bit(bit, cpuc->used, nr_hw_counters) {
-		struct perf_counter *counter = cpuc->generic[bit];
+	for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) {
+		struct perf_counter *counter = cpuc->counters[bit];
 
 		if (!counter)
 			continue;
@@ -540,26 +542,29 @@ void __init init_hw_perf_counters(void)
 
 	printk(KERN_INFO "... version:         %d\n", eax.split.version_id);
 	printk(KERN_INFO "... num counters:    %d\n", eax.split.num_counters);
-	nr_hw_counters = eax.split.num_counters;
-	if (nr_hw_counters > X86_PMC_MAX_GENERIC) {
-		nr_hw_counters = X86_PMC_MAX_GENERIC;
+	nr_counters_generic = eax.split.num_counters;
+	if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
+		nr_counters_generic = X86_PMC_MAX_GENERIC;
 		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
-			nr_hw_counters, X86_PMC_MAX_GENERIC);
+			nr_counters_generic, X86_PMC_MAX_GENERIC);
 	}
-	perf_counter_mask = (1 << nr_hw_counters) - 1;
-	perf_max_counters = nr_hw_counters;
+	perf_counter_mask = (1 << nr_counters_generic) - 1;
+	perf_max_counters = nr_counters_generic;
 
 	printk(KERN_INFO "... bit width:       %d\n", eax.split.bit_width);
 	printk(KERN_INFO "... mask length:     %d\n", eax.split.mask_length);
 
-	nr_hw_counters_fixed = edx.split.num_counters_fixed;
-	if (nr_hw_counters_fixed > X86_PMC_MAX_FIXED) {
-		nr_hw_counters_fixed = X86_PMC_MAX_FIXED;
+	nr_counters_fixed = edx.split.num_counters_fixed;
+	if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
+		nr_counters_fixed = X86_PMC_MAX_FIXED;
 		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
-			nr_hw_counters_fixed, X86_PMC_MAX_FIXED);
+			nr_counters_fixed, X86_PMC_MAX_FIXED);
 	}
-	printk(KERN_INFO "... fixed counters:  %d\n", nr_hw_counters_fixed);
+	printk(KERN_INFO "... fixed counters:  %d\n", nr_counters_fixed);
+
+	perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
+	printk(KERN_INFO "... counter mask:    %016Lx\n", perf_counter_mask);
 	perf_counters_initialized = true;
 
 	perf_counters_lapic_init(0);
-- 
cgit v1.2.3


From 7671581f1666ef4b54a1c1e598c51ac44c060a9b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 14:20:28 +0100
Subject: perfcounters: hw ops rename

Impact: rename field names

Shorten them.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 358af526640..b6755712142 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -577,9 +577,9 @@ static void pmc_generic_read(struct perf_counter *counter)
 }
 
 static const struct hw_perf_counter_ops x86_perf_counter_ops = {
-	.hw_perf_counter_enable		= pmc_generic_enable,
-	.hw_perf_counter_disable	= pmc_generic_disable,
-	.hw_perf_counter_read		= pmc_generic_read,
+	.enable		= pmc_generic_enable,
+	.disable	= pmc_generic_disable,
+	.read		= pmc_generic_read,
 };
 
 const struct hw_perf_counter_ops *
-- 
cgit v1.2.3


From 95cdd2e7851cce79ab839cb0b3cbe68d7911d0f1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 21 Dec 2008 13:50:42 +0100
Subject: perfcounters: enable lowlevel pmc code to schedule counters

Allow lowlevel ->enable() op to return an error if a counter can not be
added. This can be used to handle counter constraints.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b6755712142..74090a393a7 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -244,7 +244,7 @@ static int fixed_mode_idx(struct hw_perf_counter *hwc)
 /*
  * Find a PMC slot for the freshly enabled / scheduled in counter:
  */
-static void pmc_generic_enable(struct perf_counter *counter)
+static int pmc_generic_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
@@ -253,6 +253,8 @@ static void pmc_generic_enable(struct perf_counter *counter)
 	/* Try to get the previous counter again */
 	if (test_and_set_bit(idx, cpuc->used)) {
 		idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
+		if (idx == nr_counters_generic)
+			return -EAGAIN;
 		set_bit(idx, cpuc->used);
 		hwc->idx = idx;
 	}
@@ -265,6 +267,8 @@ static void pmc_generic_enable(struct perf_counter *counter)
 
 	__hw_perf_counter_set_period(counter, hwc, idx);
 	__pmc_generic_enable(counter, hwc, idx);
+
+	return 0;
 }
 
 void perf_counter_print_debug(void)
-- 
cgit v1.2.3


From 0dff86aa7b9ec65a6d07167b7afb050b5fc98ddc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 23 Dec 2008 12:28:12 +0100
Subject: x86, perfcounters: print out the ->used bitmask

Impact: extend debug printouts

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 74090a393a7..f3359c2b391 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -255,6 +255,7 @@ static int pmc_generic_enable(struct perf_counter *counter)
 		idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
 		if (idx == nr_counters_generic)
 			return -EAGAIN;
+
 		set_bit(idx, cpuc->used);
 		hwc->idx = idx;
 	}
@@ -274,6 +275,7 @@ static int pmc_generic_enable(struct perf_counter *counter)
 void perf_counter_print_debug(void)
 {
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left;
+	struct cpu_hw_counters *cpuc;
 	int cpu, idx;
 
 	if (!nr_counters_generic)
@@ -282,6 +284,7 @@ void perf_counter_print_debug(void)
 	local_irq_disable();
 
 	cpu = smp_processor_id();
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
@@ -291,6 +294,7 @@ void perf_counter_print_debug(void)
 	printk(KERN_INFO "CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
 	printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
 	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
+	printk(KERN_INFO "CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
 		rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
-- 
cgit v1.2.3


From f650a672359819454c3d8d4135ecd1558cde0b24 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 23 Dec 2008 12:17:29 +0100
Subject: perfcounters: add PERF_COUNT_BUS_CYCLES

Generalize "bus cycles" hw events - and map them to CPU_CLK_Unhalted.Ref
on x86. (which is a good enough approximation)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index f3359c2b391..86b2fdd344a 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -41,12 +41,13 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
 
 static const int intel_perfmon_event_map[] =
 {
-  [PERF_COUNT_CYCLES]			= 0x003c,
+  [PERF_COUNT_CPU_CYCLES]		= 0x003c,
   [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
   [PERF_COUNT_CACHE_REFERENCES]		= 0x4f2e,
   [PERF_COUNT_CACHE_MISSES]		= 0x412e,
   [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
   [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
+  [PERF_COUNT_BUS_CYCLES]		= 0x013c,
 };
 
 static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
-- 
cgit v1.2.3


From 2f18d1e8d07ae67dd0afce875287756d4bd31a46 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 22 Dec 2008 11:10:42 +0100
Subject: x86, perfcounters: add support for fixed-function pmcs

Impact: extend performance counter support on x86 Intel CPUs

Modern Intel CPUs have 3 "fixed-function" performance counters, which
count these hardware events:

    Instr_Retired.Any
    CPU_CLK_Unhalted.Core
    CPU_CLK_Unhalted.Ref

Add support for them to the performance counters subsystem.

Their use is transparent to user-space: the counter scheduler is
extended to automatically recognize the cases where a fixed-function
PMC can be utilized instead of a generic PMC. In such cases the
generic PMC is kept available for more counters.

The above fixed-function events map to these generic counter hw events:

        PERF_COUNT_INSTRUCTIONS
        PERF_COUNT_CPU_CYCLES
        PERF_COUNT_BUS_CYCLES

(The 'bus' cycles are in reality often CPU-ish cycles, just with a fixed
 frequency.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_counter.h |   8 ++
 arch/x86/kernel/cpu/perf_counter.c  | 149 ++++++++++++++++++++++++++++++------
 2 files changed, 133 insertions(+), 24 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 13745deb16c..2e08ed73664 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -23,6 +23,11 @@
 #define ARCH_PERFMON_EVENTSEL_OS			  (1 << 17)
 #define ARCH_PERFMON_EVENTSEL_USR			  (1 << 16)
 
+/*
+ * Includes eventsel and unit mask as well:
+ */
+#define ARCH_PERFMON_EVENT_MASK				    0xffff
+
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8)
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 		 0
@@ -69,12 +74,15 @@ union cpuid10_edx {
 
 /* Instr_Retired.Any: */
 #define MSR_ARCH_PERFMON_FIXED_CTR0			0x309
+#define X86_PMC_IDX_FIXED_INSTRUCTIONS			(X86_PMC_IDX_FIXED + 0)
 
 /* CPU_CLK_Unhalted.Core: */
 #define MSR_ARCH_PERFMON_FIXED_CTR1			0x30a
+#define X86_PMC_IDX_FIXED_CPU_CYCLES			(X86_PMC_IDX_FIXED + 1)
 
 /* CPU_CLK_Unhalted.Ref: */
 #define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
+#define X86_PMC_IDX_FIXED_BUS_CYCLES			(X86_PMC_IDX_FIXED + 2)
 
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 86b2fdd344a..da46eca1254 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -26,6 +26,7 @@ static bool perf_counters_initialized __read_mostly;
  */
 static int nr_counters_generic __read_mostly;
 static u64 perf_counter_mask __read_mostly;
+static u64 counter_value_mask __read_mostly;
 
 static int nr_counters_fixed __read_mostly;
 
@@ -120,9 +121,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 			hwc->nmi = 1;
 	}
 
-	hwc->config_base	= MSR_ARCH_PERFMON_EVENTSEL0;
-	hwc->counter_base	= MSR_ARCH_PERFMON_PERFCTR0;
-
 	hwc->irq_period		= hw_event->irq_period;
 	/*
 	 * Intel PMCs cannot be accessed sanely above 32 bit width,
@@ -183,16 +181,34 @@ void hw_perf_restore(u64 ctrl)
 }
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
+static inline void
+__pmc_fixed_disable(struct perf_counter *counter,
+		    struct hw_perf_counter *hwc, unsigned int __idx)
+{
+	int idx = __idx - X86_PMC_IDX_FIXED;
+	u64 ctrl_val, mask;
+	int err;
+
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	err = checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
 static inline void
 __pmc_generic_disable(struct perf_counter *counter,
 			   struct hw_perf_counter *hwc, unsigned int idx)
 {
 	int err;
 
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
+		return __pmc_fixed_disable(counter, hwc, idx);
+
 	err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
 }
 
-static DEFINE_PER_CPU(u64, prev_left[X86_PMC_MAX_GENERIC]);
+static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
 
 /*
  * Set the next IRQ period, based on the hwc->period_left value.
@@ -202,8 +218,9 @@ static void
 __hw_perf_counter_set_period(struct perf_counter *counter,
 			     struct hw_perf_counter *hwc, int idx)
 {
-	s32 left = atomic64_read(&hwc->period_left);
+	s64 left = atomic64_read(&hwc->period_left);
 	s32 period = hwc->irq_period;
+	int err;
 
 	/*
 	 * If we are way outside a reasoable range then just skip forward:
@@ -224,21 +241,64 @@ __hw_perf_counter_set_period(struct perf_counter *counter,
 	 * The hw counter starts counting from this counter offset,
 	 * mark it to be able to extra future deltas:
 	 */
-	atomic64_set(&hwc->prev_count, (u64)(s64)-left);
+	atomic64_set(&hwc->prev_count, (u64)-left);
 
-	wrmsr(hwc->counter_base + idx, -left, 0);
+	err = checking_wrmsrl(hwc->counter_base + idx,
+			     (u64)(-left) & counter_value_mask);
+}
+
+static inline void
+__pmc_fixed_enable(struct perf_counter *counter,
+		   struct hw_perf_counter *hwc, unsigned int __idx)
+{
+	int idx = __idx - X86_PMC_IDX_FIXED;
+	u64 ctrl_val, bits, mask;
+	int err;
+
+	/*
+	 * Enable IRQ generation (0x8) and ring-3 counting (0x2),
+	 * and enable ring-0 counting if allowed:
+	 */
+	bits = 0x8ULL | 0x2ULL;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+		bits |= 0x1;
+	bits <<= (idx * 4);
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	ctrl_val |= bits;
+	err = checking_wrmsrl(hwc->config_base, ctrl_val);
 }
 
 static void
 __pmc_generic_enable(struct perf_counter *counter,
 			  struct hw_perf_counter *hwc, int idx)
 {
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
+		return __pmc_fixed_enable(counter, hwc, idx);
+
 	wrmsr(hwc->config_base + idx,
 	      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
-static int fixed_mode_idx(struct hw_perf_counter *hwc)
+static int
+fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 {
+	unsigned int event;
+
+	if (unlikely(hwc->nmi))
+		return -1;
+
+	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
+
+	if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS]))
+		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
+	if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES]))
+		return X86_PMC_IDX_FIXED_CPU_CYCLES;
+	if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES]))
+		return X86_PMC_IDX_FIXED_BUS_CYCLES;
+
 	return -1;
 }
 
@@ -249,16 +309,39 @@ static int pmc_generic_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
-	int idx = hwc->idx;
+	int idx;
 
-	/* Try to get the previous counter again */
-	if (test_and_set_bit(idx, cpuc->used)) {
-		idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
-		if (idx == nr_counters_generic)
-			return -EAGAIN;
+	idx = fixed_mode_idx(counter, hwc);
+	if (idx >= 0) {
+		/*
+		 * Try to get the fixed counter, if that is already taken
+		 * then try to get a generic counter:
+		 */
+		if (test_and_set_bit(idx, cpuc->used))
+			goto try_generic;
 
-		set_bit(idx, cpuc->used);
+		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+		/*
+		 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
+		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
+		 */
+		hwc->counter_base =
+			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
 		hwc->idx = idx;
+	} else {
+		idx = hwc->idx;
+		/* Try to get the previous generic counter again */
+		if (test_and_set_bit(idx, cpuc->used)) {
+try_generic:
+			idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
+			if (idx == nr_counters_generic)
+				return -EAGAIN;
+
+			set_bit(idx, cpuc->used);
+			hwc->idx = idx;
+		}
+		hwc->config_base  = MSR_ARCH_PERFMON_EVENTSEL0;
+		hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
 	}
 
 	perf_counters_lapic_init(hwc->nmi);
@@ -266,6 +349,10 @@ static int pmc_generic_enable(struct perf_counter *counter)
 	__pmc_generic_disable(counter, hwc, idx);
 
 	cpuc->counters[idx] = counter;
+	/*
+	 * Make it visible before enabling the hw:
+	 */
+	smp_wmb();
 
 	__hw_perf_counter_set_period(counter, hwc, idx);
 	__pmc_generic_enable(counter, hwc, idx);
@@ -275,7 +362,7 @@ static int pmc_generic_enable(struct perf_counter *counter)
 
 void perf_counter_print_debug(void)
 {
-	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left;
+	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
 	struct cpu_hw_counters *cpuc;
 	int cpu, idx;
 
@@ -290,11 +377,13 @@ void perf_counter_print_debug(void)
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 	rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+	rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
 
 	printk(KERN_INFO "\n");
 	printk(KERN_INFO "CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
 	printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
 	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
+	printk(KERN_INFO "CPU#%d: fixed:      %016llx\n", cpu, fixed);
 	printk(KERN_INFO "CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
@@ -303,13 +392,19 @@ void perf_counter_print_debug(void)
 
 		prev_left = per_cpu(prev_left[idx], cpu);
 
-		printk(KERN_INFO "CPU#%d: PMC%d ctrl:  %016llx\n",
+		printk(KERN_INFO "CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
 			cpu, idx, pmc_ctrl);
-		printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n",
+		printk(KERN_INFO "CPU#%d:   gen-PMC%d count: %016llx\n",
 			cpu, idx, pmc_count);
-		printk(KERN_INFO "CPU#%d: PMC%d left:  %016llx\n",
+		printk(KERN_INFO "CPU#%d:   gen-PMC%d left:  %016llx\n",
 			cpu, idx, prev_left);
 	}
+	for (idx = 0; idx < nr_counters_fixed; idx++) {
+		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
+
+		printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n",
+			cpu, idx, pmc_count);
+	}
 	local_irq_enable();
 }
 
@@ -323,6 +418,11 @@ static void pmc_generic_disable(struct perf_counter *counter)
 
 	clear_bit(idx, cpuc->used);
 	cpuc->counters[idx] = NULL;
+	/*
+	 * Make sure the cleared pointer becomes visible before we
+	 * (potentially) free the counter:
+	 */
+	smp_wmb();
 
 	/*
 	 * Drain the remaining delta count out of a counter
@@ -353,14 +453,11 @@ static void perf_save_and_restart(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 	int idx = hwc->idx;
-	u64 pmc_ctrl;
-
-	rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 
 	x86_perf_counter_update(counter, hwc, idx);
 	__hw_perf_counter_set_period(counter, hwc, idx);
 
-	if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
+	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		__pmc_generic_enable(counter, hwc, idx);
 }
 
@@ -373,6 +470,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 	 * Store sibling timestamps (if any):
 	 */
 	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
+
 		x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
 		perf_store_irq_data(sibling, counter->hw_event.type);
 		perf_store_irq_data(sibling, atomic64_read(&counter->count));
@@ -403,7 +501,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 
 again:
 	ack = status;
-	for_each_bit(bit, (unsigned long *) &status, nr_counters_generic) {
+	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
 		struct perf_counter *counter = cpuc->counters[bit];
 
 		clear_bit(bit, (unsigned long *) &status);
@@ -561,6 +659,9 @@ void __init init_hw_perf_counters(void)
 	perf_max_counters = nr_counters_generic;
 
 	printk(KERN_INFO "... bit width:       %d\n", eax.split.bit_width);
+	counter_value_mask = (1ULL << eax.split.bit_width) - 1;
+	printk(KERN_INFO "... value mask:      %016Lx\n", counter_value_mask);
+
 	printk(KERN_INFO "... mask length:     %d\n", eax.split.mask_length);
 
 	nr_counters_fixed = edx.split.num_counters_fixed;
-- 
cgit v1.2.3


From 2b583d8bc8d7105b58d7481a4a0ceb718dac49c6 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@infradead.org>
Date: Sat, 27 Dec 2008 19:15:43 +0530
Subject: x86: perf_counter remove unwanted hw_perf_enable_all

Impact: clean, reduce kernel size a bit, avoid sparse warnings

Fixes sparse warnings:

 arch/x86/kernel/cpu/perf_counter.c:153:6: warning: symbol 'hw_perf_enable_all' was not declared. Should it be static?
 arch/x86/kernel/cpu/perf_counter.c:279:3: warning: returning void-valued expression
 arch/x86/kernel/cpu/perf_counter.c:206:3: warning: returning void-valued expression
 arch/x86/kernel/cpu/perf_counter.c:206:3: warning: returning void-valued expression

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index da46eca1254..9376771f757 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -150,14 +150,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	return 0;
 }
 
-void hw_perf_enable_all(void)
-{
-	if (unlikely(!perf_counters_initialized))
-		return;
-
-	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask);
-}
-
 u64 hw_perf_save_disable(void)
 {
 	u64 ctrl;
@@ -200,12 +192,10 @@ static inline void
 __pmc_generic_disable(struct perf_counter *counter,
 			   struct hw_perf_counter *hwc, unsigned int idx)
 {
-	int err;
-
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
-		return __pmc_fixed_disable(counter, hwc, idx);
-
-	err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
+		__pmc_fixed_disable(counter, hwc, idx);
+	else
+		wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
 }
 
 static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
@@ -276,10 +266,10 @@ __pmc_generic_enable(struct perf_counter *counter,
 			  struct hw_perf_counter *hwc, int idx)
 {
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
-		return __pmc_fixed_enable(counter, hwc, idx);
-
-	wrmsr(hwc->config_base + idx,
-	      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+		__pmc_fixed_enable(counter, hwc, idx);
+	else
+		wrmsr(hwc->config_base + idx,
+		      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
 static int
-- 
cgit v1.2.3


From d662ed26734473d4cb5f3d78cebfec8f9126e97c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 17:01:53 +1100
Subject: powerpc/perf_counter: Add perf_counter system call on powerpc

... with an empty/dummy asm/perf_counter.h so it builds.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/perf_counter.h | 10 ++++++++++
 arch/powerpc/include/asm/systbl.h       |  1 +
 arch/powerpc/include/asm/unistd.h       |  3 ++-
 arch/powerpc/platforms/Kconfig.cputype  |  1 +
 4 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/include/asm/perf_counter.h

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
new file mode 100644
index 00000000000..59530ae1d53
--- /dev/null
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -0,0 +1,10 @@
+/*
+ * Performance counter support - PowerPC-specific definitions.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index 803def23665..da300c4d288 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -322,3 +322,4 @@ SYSCALL_SPU(epoll_create1)
 SYSCALL_SPU(dup3)
 SYSCALL_SPU(pipe2)
 SYSCALL(inotify_init1)
+SYSCALL(perf_counter_open)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index e07d0c76ed7..7cef5afe89d 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -341,10 +341,11 @@
 #define __NR_dup3		316
 #define __NR_pipe2		317
 #define __NR_inotify_init1	318
+#define __NR_perf_counter_open	319
 
 #ifdef __KERNEL__
 
-#define __NR_syscalls		319
+#define __NR_syscalls		320
 
 #define __NR__exit __NR_exit
 #define NR_syscalls	__NR_syscalls
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 3d0c776f888..94dd1fb9a00 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -1,6 +1,7 @@
 config PPC64
 	bool "64-bit kernel"
 	default n
+	select HAVE_PERF_COUNTERS
 	help
 	  This option selects whether a 32-bit or a 64-bit kernel
 	  will be built.
-- 
cgit v1.2.3


From 93a6d3ce6962044fe9badf528fed46b455d58292 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 16:52:19 +1100
Subject: powerpc: Provide a way to defer perf counter work until interrupts
 are enabled

Because 64-bit powerpc uses lazy (soft) interrupt disabling, it is
possible for a performance monitor exception to come in when the
kernel thinks interrupts are disabled (i.e. when they are
soft-disabled but hard-enabled).  In such a situation the performance
monitor exception handler might have some processing to do (such as
process wakeups) which can't be done in what is effectively an NMI
handler.

This provides a way to defer that work until interrupts get enabled,
either in raw_local_irq_restore() or by returning from an interrupt
handler to code that had interrupts enabled.  We have a per-processor
flag that indicates that there is work pending to do when interrupts
subsequently get re-enabled.  This flag is checked in the interrupt
return path and in raw_local_irq_restore(), and if it is set,
perf_counter_do_pending() is called to do the pending work.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/hw_irq.h | 31 +++++++++++++++++++++++++++++++
 arch/powerpc/include/asm/paca.h   |  1 +
 arch/powerpc/kernel/asm-offsets.c |  1 +
 arch/powerpc/kernel/entry_64.S    |  9 +++++++++
 arch/powerpc/kernel/irq.c         | 10 ++++++++++
 5 files changed, 52 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index f75a5fc64d2..e10f151c3db 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -131,5 +131,36 @@ static inline int irqs_disabled_flags(unsigned long flags)
  */
 struct hw_interrupt_type;
 
+#ifdef CONFIG_PERF_COUNTERS
+static inline unsigned long get_perf_counter_pending(void)
+{
+	unsigned long x;
+
+	asm volatile("lbz %0,%1(13)"
+		: "=r" (x)
+		: "i" (offsetof(struct paca_struct, perf_counter_pending)));
+	return x;
+}
+
+static inline void set_perf_counter_pending(int x)
+{
+	asm volatile("stb %0,%1(13)" : :
+		"r" (x),
+		"i" (offsetof(struct paca_struct, perf_counter_pending)));
+}
+
+extern void perf_counter_do_pending(void);
+
+#else
+
+static inline unsigned long get_perf_counter_pending(void)
+{
+	return 0;
+}
+
+static inline void set_perf_counter_pending(int x) {}
+static inline void perf_counter_do_pending(void) {}
+#endif /* CONFIG_PERF_COUNTERS */
+
 #endif	/* __KERNEL__ */
 #endif	/* _ASM_POWERPC_HW_IRQ_H */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 082b3aedf14..6ef05572301 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -99,6 +99,7 @@ struct paca_struct {
 	u8 soft_enabled;		/* irq soft-enable flag */
 	u8 hard_enabled;		/* set if irqs are enabled in MSR */
 	u8 io_sync;			/* writel() needs spin_unlock sync */
+	u8 perf_counter_pending;	/* PM interrupt while soft-disabled */
 
 	/* Stuff for accurate time accounting */
 	u64 user_time;			/* accumulated usermode TB ticks */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 661d07d2146..cea46290011 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -127,6 +127,7 @@ int main(void)
 	DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
 	DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
 	DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
+	DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending));
 	DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
 	DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
 	DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 383ed6eb008..f30b4e553c5 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -526,6 +526,15 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
 2:
 	TRACE_AND_RESTORE_IRQ(r5);
 
+#ifdef CONFIG_PERF_COUNTERS
+	/* check paca->perf_counter_pending if we're enabling ints */
+	lbz	r3,PACAPERFPEND(r13)
+	and.	r3,r3,r5
+	beq	27f
+	bl	.perf_counter_do_pending
+27:
+#endif /* CONFIG_PERF_COUNTERS */
+
 	/* extract EE bit and use it to restore paca->hard_enabled */
 	ld	r3,_MSR(r1)
 	rldicl	r4,r3,49,63		/* r0 = (r3 >> 15) & 1 */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index ac222d0ab12..4efb886ea43 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -104,6 +104,13 @@ static inline notrace void set_soft_enabled(unsigned long enable)
 	: : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
 }
 
+#ifdef CONFIG_PERF_COUNTERS
+notrace void __weak perf_counter_do_pending(void)
+{
+	set_perf_counter_pending(0);
+}
+#endif
+
 notrace void raw_local_irq_restore(unsigned long en)
 {
 	/*
@@ -135,6 +142,9 @@ notrace void raw_local_irq_restore(unsigned long en)
 			iseries_handle_interrupts();
 	}
 
+	if (get_perf_counter_pending())
+		perf_counter_do_pending();
+
 	/*
 	 * if (get_paca()->hard_enabled) return;
 	 * But again we need to take care that gcc gets hard_enabled directly
-- 
cgit v1.2.3


From 4574910e5087085a1f330ff8373cee4503f5c77c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 20:21:55 +1100
Subject: powerpc/perf_counter: Add generic support for POWER-family PMU
 hardware

This provides the architecture-specific functions needed to access
PMU hardware on the 64-bit PowerPC processors.  It has been designed
for the IBM POWER family (POWER 4/4+/5/5+/6 and PPC970) but will
hopefully also suit other 64-bit PowerPC machines (although probably
not Cell given how different it is in this area).  This doesn't
include back-ends for any specific processors.

This implements a system which allows back-ends to express the
constraints that their hardware has on what events can be counted
simultaneously.  The constraints are expressed as a 64-bit mask +
64-bit value for each event, and the encoding is capable of
expressing the constraints arising from having a set of multiplexers
feeding an event bus, with some events being available through
multiple multiplexer settings, such as we get on POWER4 and PPC970.
Furthermore, the back-end can supply alternative event codes for
each event, and the constraint checking code will try all possible
combinations of alternative event codes to try to find a combination
that will fit.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/perf_counter.h |  62 +++
 arch/powerpc/kernel/Makefile            |   1 +
 arch/powerpc/kernel/perf_counter.c      | 754 ++++++++++++++++++++++++++++++++
 3 files changed, 817 insertions(+)
 create mode 100644 arch/powerpc/kernel/perf_counter.c

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index 59530ae1d53..9d7ff6d7fb5 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -8,3 +8,65 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
+#include <linux/types.h>
+
+#define MAX_HWCOUNTERS		8
+#define MAX_EVENT_ALTERNATIVES	8
+
+/*
+ * This struct provides the constants and functions needed to
+ * describe the PMU on a particular POWER-family CPU.
+ */
+struct power_pmu {
+	int	n_counter;
+	int	max_alternatives;
+	u64	add_fields;
+	u64	test_adder;
+	int	(*compute_mmcr)(unsigned int events[], int n_ev,
+				unsigned int hwc[], u64 mmcr[]);
+	int	(*get_constraint)(unsigned int event, u64 *mskp, u64 *valp);
+	int	(*get_alternatives)(unsigned int event, unsigned int alt[]);
+	void	(*disable_pmc)(unsigned int pmc, u64 mmcr[]);
+	int	n_generic;
+	int	*generic_events;
+};
+
+extern struct power_pmu *ppmu;
+
+/*
+ * The power_pmu.get_constraint function returns a 64-bit value and
+ * a 64-bit mask that express the constraints between this event and
+ * other events.
+ *
+ * The value and mask are divided up into (non-overlapping) bitfields
+ * of three different types:
+ *
+ * Select field: this expresses the constraint that some set of bits
+ * in MMCR* needs to be set to a specific value for this event.  For a
+ * select field, the mask contains 1s in every bit of the field, and
+ * the value contains a unique value for each possible setting of the
+ * MMCR* bits.  The constraint checking code will ensure that two events
+ * that set the same field in their masks have the same value in their
+ * value dwords.
+ *
+ * Add field: this expresses the constraint that there can be at most
+ * N events in a particular class.  A field of k bits can be used for
+ * N <= 2^(k-1) - 1.  The mask has the most significant bit of the field
+ * set (and the other bits 0), and the value has only the least significant
+ * bit of the field set.  In addition, the 'add_fields' and 'test_adder'
+ * in the struct power_pmu for this processor come into play.  The
+ * add_fields value contains 1 in the LSB of the field, and the
+ * test_adder contains 2^(k-1) - 1 - N in the field.
+ *
+ * NAND field: this expresses the constraint that you may not have events
+ * in all of a set of classes.  (For example, on PPC970, you can't select
+ * events from the FPU, ISU and IDU simultaneously, although any two are
+ * possible.)  For N classes, the field is N+1 bits wide, and each class
+ * is assigned one bit from the least-significant N bits.  The mask has
+ * only the most-significant bit set, and the value has only the bit
+ * for the event's class set.  The test_adder has the least significant
+ * bit set in the field.
+ *
+ * If an event is not subject to the constraint expressed by a particular
+ * field, then it will have 0 in both the mask and value for that field.
+ */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 1308a86e907..fde190bbb2b 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,6 +94,7 @@ obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
+obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
new file mode 100644
index 00000000000..c7d4c2966a5
--- /dev/null
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -0,0 +1,754 @@
+/*
+ * Performance counter support - powerpc architecture code
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_counter.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/reg.h>
+#include <asm/pmc.h>
+
+struct cpu_hw_counters {
+	int n_counters;
+	int n_percpu;
+	int disabled;
+	int n_added;
+	struct perf_counter *counter[MAX_HWCOUNTERS];
+	unsigned int events[MAX_HWCOUNTERS];
+	u64 mmcr[3];
+};
+DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
+
+struct power_pmu *ppmu;
+
+void perf_counter_print_debug(void)
+{
+}
+
+/*
+ * Return 1 for a software counter, 0 for a hardware counter
+ */
+static inline int is_software_counter(struct perf_counter *counter)
+{
+	return !counter->hw_event.raw && counter->hw_event.type < 0;
+}
+
+/*
+ * Read one performance monitor counter (PMC).
+ */
+static unsigned long read_pmc(int idx)
+{
+	unsigned long val;
+
+	switch (idx) {
+	case 1:
+		val = mfspr(SPRN_PMC1);
+		break;
+	case 2:
+		val = mfspr(SPRN_PMC2);
+		break;
+	case 3:
+		val = mfspr(SPRN_PMC3);
+		break;
+	case 4:
+		val = mfspr(SPRN_PMC4);
+		break;
+	case 5:
+		val = mfspr(SPRN_PMC5);
+		break;
+	case 6:
+		val = mfspr(SPRN_PMC6);
+		break;
+	case 7:
+		val = mfspr(SPRN_PMC7);
+		break;
+	case 8:
+		val = mfspr(SPRN_PMC8);
+		break;
+	default:
+		printk(KERN_ERR "oops trying to read PMC%d\n", idx);
+		val = 0;
+	}
+	return val;
+}
+
+/*
+ * Write one PMC.
+ */
+static void write_pmc(int idx, unsigned long val)
+{
+	switch (idx) {
+	case 1:
+		mtspr(SPRN_PMC1, val);
+		break;
+	case 2:
+		mtspr(SPRN_PMC2, val);
+		break;
+	case 3:
+		mtspr(SPRN_PMC3, val);
+		break;
+	case 4:
+		mtspr(SPRN_PMC4, val);
+		break;
+	case 5:
+		mtspr(SPRN_PMC5, val);
+		break;
+	case 6:
+		mtspr(SPRN_PMC6, val);
+		break;
+	case 7:
+		mtspr(SPRN_PMC7, val);
+		break;
+	case 8:
+		mtspr(SPRN_PMC8, val);
+		break;
+	default:
+		printk(KERN_ERR "oops trying to write PMC%d\n", idx);
+	}
+}
+
+/*
+ * Check if a set of events can all go on the PMU at once.
+ * If they can't, this will look at alternative codes for the events
+ * and see if any combination of alternative codes is feasible.
+ * The feasible set is returned in event[].
+ */
+static int power_check_constraints(unsigned int event[], int n_ev)
+{
+	u64 mask, value, nv;
+	unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+	u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+	u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+	u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
+	int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
+	int i, j;
+	u64 addf = ppmu->add_fields;
+	u64 tadd = ppmu->test_adder;
+
+	if (n_ev > ppmu->n_counter)
+		return -1;
+
+	/* First see if the events will go on as-is */
+	for (i = 0; i < n_ev; ++i) {
+		alternatives[i][0] = event[i];
+		if (ppmu->get_constraint(event[i], &amasks[i][0],
+					 &avalues[i][0]))
+			return -1;
+		choice[i] = 0;
+	}
+	value = mask = 0;
+	for (i = 0; i < n_ev; ++i) {
+		nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf);
+		if ((((nv + tadd) ^ value) & mask) != 0 ||
+		    (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0)
+			break;
+		value = nv;
+		mask |= amasks[i][0];
+	}
+	if (i == n_ev)
+		return 0;	/* all OK */
+
+	/* doesn't work, gather alternatives... */
+	if (!ppmu->get_alternatives)
+		return -1;
+	for (i = 0; i < n_ev; ++i) {
+		n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]);
+		for (j = 1; j < n_alt[i]; ++j)
+			ppmu->get_constraint(alternatives[i][j],
+					     &amasks[i][j], &avalues[i][j]);
+	}
+
+	/* enumerate all possibilities and see if any will work */
+	i = 0;
+	j = -1;
+	value = mask = nv = 0;
+	while (i < n_ev) {
+		if (j >= 0) {
+			/* we're backtracking, restore context */
+			value = svalues[i];
+			mask = smasks[i];
+			j = choice[i];
+		}
+		/*
+		 * See if any alternative k for event i,
+		 * where k > j, will satisfy the constraints.
+		 */
+		while (++j < n_alt[i]) {
+			nv = (value | avalues[i][j]) +
+				(value & avalues[i][j] & addf);
+			if ((((nv + tadd) ^ value) & mask) == 0 &&
+			    (((nv + tadd) ^ avalues[i][j])
+			     & amasks[i][j]) == 0)
+				break;
+		}
+		if (j >= n_alt[i]) {
+			/*
+			 * No feasible alternative, backtrack
+			 * to event i-1 and continue enumerating its
+			 * alternatives from where we got up to.
+			 */
+			if (--i < 0)
+				return -1;
+		} else {
+			/*
+			 * Found a feasible alternative for event i,
+			 * remember where we got up to with this event,
+			 * go on to the next event, and start with
+			 * the first alternative for it.
+			 */
+			choice[i] = j;
+			svalues[i] = value;
+			smasks[i] = mask;
+			value = nv;
+			mask |= amasks[i][j];
+			++i;
+			j = -1;
+		}
+	}
+
+	/* OK, we have a feasible combination, tell the caller the solution */
+	for (i = 0; i < n_ev; ++i)
+		event[i] = alternatives[i][choice[i]];
+	return 0;
+}
+
+static void power_perf_read(struct perf_counter *counter)
+{
+	long val, delta, prev;
+
+	if (!counter->hw.idx)
+		return;
+	/*
+	 * Performance monitor interrupts come even when interrupts
+	 * are soft-disabled, as long as interrupts are hard-enabled.
+	 * Therefore we treat them like NMIs.
+	 */
+	do {
+		prev = atomic64_read(&counter->hw.prev_count);
+		barrier();
+		val = read_pmc(counter->hw.idx);
+	} while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev);
+
+	/* The counters are only 32 bits wide */
+	delta = (val - prev) & 0xfffffffful;
+	atomic64_add(delta, &counter->count);
+	atomic64_sub(delta, &counter->hw.period_left);
+}
+
+/*
+ * Disable all counters to prevent PMU interrupts and to allow
+ * counters to be added or removed.
+ */
+u64 hw_perf_save_disable(void)
+{
+	struct cpu_hw_counters *cpuhw;
+	unsigned long ret;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+
+	ret = cpuhw->disabled;
+	if (!ret) {
+		cpuhw->disabled = 1;
+		cpuhw->n_added = 0;
+
+		/*
+		 * Set the 'freeze counters' bit.
+		 * The barrier is to make sure the mtspr has been
+		 * executed and the PMU has frozen the counters
+		 * before we return.
+		 */
+		mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
+		mb();
+	}
+	local_irq_restore(flags);
+	return ret;
+}
+
+/*
+ * Re-enable all counters if disable == 0.
+ * If we were previously disabled and counters were added, then
+ * put the new config on the PMU.
+ */
+void hw_perf_restore(u64 disable)
+{
+	struct perf_counter *counter;
+	struct cpu_hw_counters *cpuhw;
+	unsigned long flags;
+	long i;
+	unsigned long val;
+	s64 left;
+	unsigned int hwc_index[MAX_HWCOUNTERS];
+
+	if (disable)
+		return;
+	local_irq_save(flags);
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+	cpuhw->disabled = 0;
+
+	/*
+	 * If we didn't change anything, or only removed counters,
+	 * no need to recalculate MMCR* settings and reset the PMCs.
+	 * Just reenable the PMU with the current MMCR* settings
+	 * (possibly updated for removal of counters).
+	 */
+	if (!cpuhw->n_added) {
+		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
+		mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+		goto out;
+	}
+
+	/*
+	 * Compute MMCR* values for the new set of counters
+	 */
+	if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
+			       cpuhw->mmcr)) {
+		/* shouldn't ever get here */
+		printk(KERN_ERR "oops compute_mmcr failed\n");
+		goto out;
+	}
+
+	/*
+	 * Write the new configuration to MMCR* with the freeze
+	 * bit set and set the hardware counters to their initial values.
+	 * Then unfreeze the counters.
+	 */
+	mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
+	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
+				| MMCR0_FC);
+
+	/*
+	 * Read off any pre-existing counters that need to move
+	 * to another PMC.
+	 */
+	for (i = 0; i < cpuhw->n_counters; ++i) {
+		counter = cpuhw->counter[i];
+		if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
+			power_perf_read(counter);
+			write_pmc(counter->hw.idx, 0);
+			counter->hw.idx = 0;
+		}
+	}
+
+	/*
+	 * Initialize the PMCs for all the new and moved counters.
+	 */
+	for (i = 0; i < cpuhw->n_counters; ++i) {
+		counter = cpuhw->counter[i];
+		if (counter->hw.idx)
+			continue;
+		val = 0;
+		if (counter->hw_event.irq_period) {
+			left = atomic64_read(&counter->hw.period_left);
+			if (left < 0x80000000L)
+				val = 0x80000000L - left;
+		}
+		atomic64_set(&counter->hw.prev_count, val);
+		counter->hw.idx = hwc_index[i] + 1;
+		write_pmc(counter->hw.idx, val);
+	}
+	mb();
+	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
+	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+
+ out:
+	local_irq_restore(flags);
+}
+
+static int collect_events(struct perf_counter *group, int max_count,
+			  struct perf_counter *ctrs[], unsigned int *events)
+{
+	int n = 0;
+	struct perf_counter *counter;
+
+	if (!is_software_counter(group)) {
+		if (n >= max_count)
+			return -1;
+		ctrs[n] = group;
+		events[n++] = group->hw.config;
+	}
+	list_for_each_entry(counter, &group->sibling_list, list_entry) {
+		if (!is_software_counter(counter) &&
+		    counter->state != PERF_COUNTER_STATE_OFF) {
+			if (n >= max_count)
+				return -1;
+			ctrs[n] = counter;
+			events[n++] = counter->hw.config;
+		}
+	}
+	return n;
+}
+
+static void counter_sched_in(struct perf_counter *counter, int cpu)
+{
+	counter->state = PERF_COUNTER_STATE_ACTIVE;
+	counter->oncpu = cpu;
+	if (is_software_counter(counter))
+		counter->hw_ops->enable(counter);
+}
+
+/*
+ * Called to enable a whole group of counters.
+ * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
+ * Assumes the caller has disabled interrupts and has
+ * frozen the PMU with hw_perf_save_disable.
+ */
+int hw_perf_group_sched_in(struct perf_counter *group_leader,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_counter_context *ctx, int cpu)
+{
+	struct cpu_hw_counters *cpuhw;
+	long i, n, n0;
+	struct perf_counter *sub;
+
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+	n0 = cpuhw->n_counters;
+	n = collect_events(group_leader, ppmu->n_counter - n0,
+			   &cpuhw->counter[n0], &cpuhw->events[n0]);
+	if (n < 0)
+		return -EAGAIN;
+	if (power_check_constraints(cpuhw->events, n + n0))
+		return -EAGAIN;
+	cpuhw->n_counters = n0 + n;
+	cpuhw->n_added += n;
+
+	/*
+	 * OK, this group can go on; update counter states etc.,
+	 * and enable any software counters
+	 */
+	for (i = n0; i < n0 + n; ++i)
+		cpuhw->counter[i]->hw.config = cpuhw->events[i];
+	n = 1;
+	counter_sched_in(group_leader, cpu);
+	list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
+		if (sub->state != PERF_COUNTER_STATE_OFF) {
+			counter_sched_in(sub, cpu);
+			++n;
+		}
+	}
+	cpuctx->active_oncpu += n;
+	ctx->nr_active += n;
+
+	return 1;
+}
+
+/*
+ * Add a counter to the PMU.
+ * If all counters are not already frozen, then we disable and
+ * re-enable the PMU in order to get hw_perf_restore to do the
+ * actual work of reconfiguring the PMU.
+ */
+static int power_perf_enable(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuhw;
+	unsigned long flags;
+	u64 pmudis;
+	int n0;
+	int ret = -EAGAIN;
+
+	local_irq_save(flags);
+	pmudis = hw_perf_save_disable();
+
+	/*
+	 * Add the counter to the list (if there is room)
+	 * and check whether the total set is still feasible.
+	 */
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+	n0 = cpuhw->n_counters;
+	if (n0 >= ppmu->n_counter)
+		goto out;
+	cpuhw->counter[n0] = counter;
+	cpuhw->events[n0] = counter->hw.config;
+	if (power_check_constraints(cpuhw->events, n0 + 1))
+		goto out;
+
+	counter->hw.config = cpuhw->events[n0];
+	++cpuhw->n_counters;
+	++cpuhw->n_added;
+
+	ret = 0;
+ out:
+	hw_perf_restore(pmudis);
+	local_irq_restore(flags);
+	return ret;
+}
+
+/*
+ * Remove a counter from the PMU.
+ */
+static void power_perf_disable(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuhw;
+	long i;
+	u64 pmudis;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	pmudis = hw_perf_save_disable();
+
+	power_perf_read(counter);
+
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+	for (i = 0; i < cpuhw->n_counters; ++i) {
+		if (counter == cpuhw->counter[i]) {
+			while (++i < cpuhw->n_counters)
+				cpuhw->counter[i-1] = cpuhw->counter[i];
+			--cpuhw->n_counters;
+			ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
+			write_pmc(counter->hw.idx, 0);
+			counter->hw.idx = 0;
+			break;
+		}
+	}
+	if (cpuhw->n_counters == 0) {
+		/* disable exceptions if no counters are running */
+		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
+	}
+
+	hw_perf_restore(pmudis);
+	local_irq_restore(flags);
+}
+
+struct hw_perf_counter_ops power_perf_ops = {
+	.enable = power_perf_enable,
+	.disable = power_perf_disable,
+	.read = power_perf_read
+};
+
+const struct hw_perf_counter_ops *
+hw_perf_counter_init(struct perf_counter *counter)
+{
+	unsigned long ev;
+	struct perf_counter *ctrs[MAX_HWCOUNTERS];
+	unsigned int events[MAX_HWCOUNTERS];
+	int n;
+
+	if (!ppmu)
+		return NULL;
+	if ((s64)counter->hw_event.irq_period < 0)
+		return NULL;
+	ev = counter->hw_event.type;
+	if (!counter->hw_event.raw) {
+		if (ev >= ppmu->n_generic ||
+		    ppmu->generic_events[ev] == 0)
+			return NULL;
+		ev = ppmu->generic_events[ev];
+	}
+	counter->hw.config_base = ev;
+	counter->hw.idx = 0;
+
+	/*
+	 * If this is in a group, check if it can go on with all the
+	 * other hardware counters in the group.  We assume the counter
+	 * hasn't been linked into its leader's sibling list at this point.
+	 */
+	n = 0;
+	if (counter->group_leader != counter) {
+		n = collect_events(counter->group_leader, ppmu->n_counter - 1,
+				   ctrs, events);
+		if (n < 0)
+			return NULL;
+	}
+	events[n++] = ev;
+	if (power_check_constraints(events, n))
+		return NULL;
+
+	counter->hw.config = events[n - 1];
+	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
+	return &power_perf_ops;
+}
+
+/*
+ * Handle wakeups.
+ */
+void perf_counter_do_pending(void)
+{
+	int i;
+	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
+	struct perf_counter *counter;
+
+	set_perf_counter_pending(0);
+	for (i = 0; i < cpuhw->n_counters; ++i) {
+		counter = cpuhw->counter[i];
+		if (counter && counter->wakeup_pending) {
+			counter->wakeup_pending = 0;
+			wake_up(&counter->waitq);
+		}
+	}
+}
+
+/*
+ * Record data for an irq counter.
+ * This function was lifted from the x86 code; maybe it should
+ * go in the core?
+ */
+static void perf_store_irq_data(struct perf_counter *counter, u64 data)
+{
+	struct perf_data *irqdata = counter->irqdata;
+
+	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
+		irqdata->overrun++;
+	} else {
+		u64 *p = (u64 *) &irqdata->data[irqdata->len];
+
+		*p = data;
+		irqdata->len += sizeof(u64);
+	}
+}
+
+/*
+ * Record all the values of the counters in a group
+ */
+static void perf_handle_group(struct perf_counter *counter)
+{
+	struct perf_counter *leader, *sub;
+
+	leader = counter->group_leader;
+	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+		if (sub != counter)
+			sub->hw_ops->read(sub);
+		perf_store_irq_data(counter, sub->hw_event.type);
+		perf_store_irq_data(counter, atomic64_read(&sub->count));
+	}
+}
+
+/*
+ * A counter has overflowed; update its count and record
+ * things if requested.  Note that interrupts are hard-disabled
+ * here so there is no possibility of being interrupted.
+ */
+static void record_and_restart(struct perf_counter *counter, long val,
+			       struct pt_regs *regs)
+{
+	s64 prev, delta, left;
+	int record = 0;
+
+	/* we don't have to worry about interrupts here */
+	prev = atomic64_read(&counter->hw.prev_count);
+	delta = (val - prev) & 0xfffffffful;
+	atomic64_add(delta, &counter->count);
+
+	/*
+	 * See if the total period for this counter has expired,
+	 * and update for the next period.
+	 */
+	val = 0;
+	left = atomic64_read(&counter->hw.period_left) - delta;
+	if (counter->hw_event.irq_period) {
+		if (left <= 0) {
+			left += counter->hw_event.irq_period;
+			if (left <= 0)
+				left = counter->hw_event.irq_period;
+			record = 1;
+		}
+		if (left < 0x80000000L)
+			val = 0x80000000L - left;
+	}
+	write_pmc(counter->hw.idx, val);
+	atomic64_set(&counter->hw.prev_count, val);
+	atomic64_set(&counter->hw.period_left, left);
+
+	/*
+	 * Finally record data if requested.
+	 */
+	if (record) {
+		switch (counter->hw_event.record_type) {
+		case PERF_RECORD_SIMPLE:
+			break;
+		case PERF_RECORD_IRQ:
+			perf_store_irq_data(counter, instruction_pointer(regs));
+			counter->wakeup_pending = 1;
+			break;
+		case PERF_RECORD_GROUP:
+			perf_handle_group(counter);
+			counter->wakeup_pending = 1;
+			break;
+		}
+	}
+}
+
+/*
+ * Performance monitor interrupt stuff
+ */
+static void perf_counter_interrupt(struct pt_regs *regs)
+{
+	int i;
+	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
+	struct perf_counter *counter;
+	long val;
+	int need_wakeup = 0, found = 0;
+
+	for (i = 0; i < cpuhw->n_counters; ++i) {
+		counter = cpuhw->counter[i];
+		val = read_pmc(counter->hw.idx);
+		if ((int)val < 0) {
+			/* counter has overflowed */
+			found = 1;
+			record_and_restart(counter, val, regs);
+			if (counter->wakeup_pending)
+				need_wakeup = 1;
+		}
+	}
+
+	/*
+	 * In case we didn't find and reset the counter that caused
+	 * the interrupt, scan all counters and reset any that are
+	 * negative, to avoid getting continual interrupts.
+	 * Any that we processed in the previous loop will not be negative.
+	 */
+	if (!found) {
+		for (i = 0; i < ppmu->n_counter; ++i) {
+			val = read_pmc(i + 1);
+			if ((int)val < 0)
+				write_pmc(i + 1, 0);
+		}
+	}
+
+	/*
+	 * Reset MMCR0 to its normal value.  This will set PMXE and
+	 * clear FC (freeze counters) and PMAO (perf mon alert occurred)
+	 * and thus allow interrupts to occur again.
+	 * XXX might want to use MSR.PM to keep the counters frozen until
+	 * we get back out of this interrupt.
+	 */
+	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+
+	/*
+	 * If we need a wakeup, check whether interrupts were soft-enabled
+	 * when we took the interrupt.  If they were, we can wake stuff up
+	 * immediately; otherwise we'll have to set a flag and do the
+	 * wakeup when interrupts get soft-enabled.
+	 */
+	if (need_wakeup) {
+		if (regs->softe) {
+			irq_enter();
+			perf_counter_do_pending();
+			irq_exit();
+		} else {
+			set_perf_counter_pending(1);
+		}
+	}
+}
+
+static int init_perf_counters(void)
+{
+	if (reserve_pmc_hardware(perf_counter_interrupt)) {
+		printk(KERN_ERR "Couldn't init performance monitor subsystem\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+arch_initcall(init_perf_counters);
-- 
cgit v1.2.3


From 16b067993dee3dfde61b20027e0b168dc06201ee Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Sat, 10 Jan 2009 16:34:07 +1100
Subject: powerpc/perf_counter: Add support for PPC970 family

This adds the back-end for the PMU on the PPC970 family.

The PPC970 allows events from the ISU to be selected in two different
ways.  Rather than use alternative event codes to express this, we
instead use a single encoding for ISU events and express the
resulting constraint (that you can't select events from all three
of FPU/IFU/VPU, ISU and IDU/STS at the same time, since they all come
in through only 2 multiplexers) using a NAND constraint field, and
work out which multiplexer is used for ISU events at compute_mmcr
time.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/Makefile       |   2 +-
 arch/powerpc/kernel/perf_counter.c |  13 ++
 arch/powerpc/kernel/ppc970-pmu.c   | 375 +++++++++++++++++++++++++++++++++++++
 3 files changed, 389 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/kernel/ppc970-pmu.c

(limited to 'arch')

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index fde190bbb2b..45798f6fb13 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,7 +94,7 @@ obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o
+obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index c7d4c2966a5..5561ecb02a4 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -741,13 +741,26 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	}
 }
 
+extern struct power_pmu ppc970_pmu;
+
 static int init_perf_counters(void)
 {
+	unsigned long pvr;
+
 	if (reserve_pmc_hardware(perf_counter_interrupt)) {
 		printk(KERN_ERR "Couldn't init performance monitor subsystem\n");
 		return -EBUSY;
 	}
 
+	/* XXX should get this from cputable */
+	pvr = mfspr(SPRN_PVR);
+	switch (PVR_VER(pvr)) {
+	case PV_970:
+	case PV_970FX:
+	case PV_970MP:
+		ppmu = &ppc970_pmu;
+		break;
+	}
 	return 0;
 }
 
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
new file mode 100644
index 00000000000..c3256580be1
--- /dev/null
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -0,0 +1,375 @@
+/*
+ * Performance counter support for PPC970-family processors.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/string.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for PPC970
+ */
+#define PM_PMC_SH	12	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_UNIT_SH	8	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_BYTE_SH	4	/* Byte number of event bus to use */
+#define PM_BYTE_MSK	3
+#define PM_PMCSEL_MSK	0xf
+
+/* Values in PM_UNIT field */
+#define PM_NONE		0
+#define PM_FPU		1
+#define PM_VPU		2
+#define PM_ISU		3
+#define PM_IFU		4
+#define PM_IDU		5
+#define PM_STS		6
+#define PM_LSU0		7
+#define PM_LSU1U	8
+#define PM_LSU1L	9
+#define PM_LASTUNIT	9
+
+/*
+ * Bits in MMCR0 for PPC970
+ */
+#define MMCR0_PMC1SEL_SH	8
+#define MMCR0_PMC2SEL_SH	1
+#define MMCR_PMCSEL_MSK		0x1f
+
+/*
+ * Bits in MMCR1 for PPC970
+ */
+#define MMCR1_TTM0SEL_SH	62
+#define MMCR1_TTM1SEL_SH	59
+#define MMCR1_TTM3SEL_SH	53
+#define MMCR1_TTMSEL_MSK	3
+#define MMCR1_TD_CP_DBG0SEL_SH	50
+#define MMCR1_TD_CP_DBG1SEL_SH	48
+#define MMCR1_TD_CP_DBG2SEL_SH	46
+#define MMCR1_TD_CP_DBG3SEL_SH	44
+#define MMCR1_PMC1_ADDER_SEL_SH	39
+#define MMCR1_PMC2_ADDER_SEL_SH	38
+#define MMCR1_PMC6_ADDER_SEL_SH	37
+#define MMCR1_PMC5_ADDER_SEL_SH	36
+#define MMCR1_PMC8_ADDER_SEL_SH	35
+#define MMCR1_PMC7_ADDER_SEL_SH	34
+#define MMCR1_PMC3_ADDER_SEL_SH	33
+#define MMCR1_PMC4_ADDER_SEL_SH	32
+#define MMCR1_PMC3SEL_SH	27
+#define MMCR1_PMC4SEL_SH	22
+#define MMCR1_PMC5SEL_SH	17
+#define MMCR1_PMC6SEL_SH	12
+#define MMCR1_PMC7SEL_SH	7
+#define MMCR1_PMC8SEL_SH	2
+
+static short mmcr1_adder_bits[8] = {
+	MMCR1_PMC1_ADDER_SEL_SH,
+	MMCR1_PMC2_ADDER_SEL_SH,
+	MMCR1_PMC3_ADDER_SEL_SH,
+	MMCR1_PMC4_ADDER_SEL_SH,
+	MMCR1_PMC5_ADDER_SEL_SH,
+	MMCR1_PMC6_ADDER_SEL_SH,
+	MMCR1_PMC7_ADDER_SEL_SH,
+	MMCR1_PMC8_ADDER_SEL_SH
+};
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *                 <><>[  >[  >[  ><  ><  ><  ><  ><><><><><><><><>
+ *                 T0T1 UC  PS1 PS2 B0  B1  B2  B3 P1P2P3P4P5P6P7P8
+ *
+ * T0 - TTM0 constraint
+ *     46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000
+ *
+ * T1 - TTM1 constraint
+ *     44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000
+ *
+ * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS
+ *     43: UC3 error 0x0800_0000_0000
+ *     42: FPU|IFU|VPU events needed 0x0400_0000_0000
+ *     41: ISU events needed 0x0200_0000_0000
+ *     40: IDU|STS events needed 0x0100_0000_0000
+ *
+ * PS1
+ *     39: PS1 error 0x0080_0000_0000
+ *     36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
+ *
+ * PS2
+ *     35: PS2 error 0x0008_0000_0000
+ *     32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
+ *
+ * B0
+ *     28-31: Byte 0 event source 0xf000_0000
+ *	      Encoding as for the event code
+ *
+ * B1, B2, B3
+ *     24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
+ *
+ * P1
+ *     15: P1 error 0x8000
+ *     14-15: Count of events needing PMC1
+ *
+ * P2..P8
+ *     0-13: Count of events needing PMC2..PMC8
+ */
+
+/* Masks and values for using events from the various units */
+static u64 unit_cons[PM_LASTUNIT+1][2] = {
+	[PM_FPU] =   { 0xc80000000000ull, 0x040000000000ull },
+	[PM_VPU] =   { 0xc80000000000ull, 0xc40000000000ull },
+	[PM_ISU] =   { 0x080000000000ull, 0x020000000000ull },
+	[PM_IFU] =   { 0xc80000000000ull, 0x840000000000ull },
+	[PM_IDU] =   { 0x380000000000ull, 0x010000000000ull },
+	[PM_STS] =   { 0x380000000000ull, 0x310000000000ull },
+};
+
+static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, unit, sh;
+	u64 mask = 0, value = 0;
+	int grp = -1;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 8)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+		grp = ((pmc - 1) >> 1) & 1;
+	}
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	if (unit) {
+		if (unit > PM_LASTUNIT)
+			return -1;
+		mask |= unit_cons[unit][0];
+		value |= unit_cons[unit][1];
+		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+		/*
+		 * Bus events on bytes 0 and 2 can be counted
+		 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
+		 */
+		if (!pmc)
+			grp = byte & 1;
+		/* Set byte lane select field */
+		mask  |= 0xfULL << (28 - 4 * byte);
+		value |= (u64)unit << (28 - 4 * byte);
+	}
+	if (grp == 0) {
+		/* increment PMC1/2/5/6 field */
+		mask  |= 0x8000000000ull;
+		value |= 0x1000000000ull;
+	} else if (grp == 1) {
+		/* increment PMC3/4/7/8 field */
+		mask  |= 0x800000000ull;
+		value |= 0x100000000ull;
+	}
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+static int p970_get_alternatives(unsigned int event, unsigned int alt[])
+{
+	alt[0] = event;
+
+	/* 2 alternatives for LSU empty */
+	if (event == 0x2002 || event == 0x3002) {
+		alt[1] = event ^ 0x1000;
+		return 2;
+	}
+		
+	return 1;
+}
+
+static int p970_compute_mmcr(unsigned int event[], int n_ev,
+			     unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
+	unsigned int pmc, unit, byte, psel;
+	unsigned int ttm, grp;
+	unsigned int pmc_inuse = 0;
+	unsigned int pmc_grp_use[2];
+	unsigned char busbyte[4];
+	unsigned char unituse[16];
+	unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 };
+	unsigned char ttmuse[2];
+	unsigned char pmcsel[8];
+	int i;
+
+	if (n_ev > 8)
+		return -1;
+
+	/* First pass to count resource use */
+	pmc_grp_use[0] = pmc_grp_use[1] = 0;
+	memset(busbyte, 0, sizeof(busbyte));
+	memset(unituse, 0, sizeof(unituse));
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+			/* count 1/2/5/6 vs 3/4/7/8 use */
+			++pmc_grp_use[((pmc - 1) >> 1) & 1];
+		}
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		if (unit) {
+			if (unit > PM_LASTUNIT)
+				return -1;
+			if (!pmc)
+				++pmc_grp_use[byte & 1];
+			if (busbyte[byte] && busbyte[byte] != unit)
+				return -1;
+			busbyte[byte] = unit;
+			unituse[unit] = 1;
+		}
+	}
+	if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
+		return -1;
+
+	/*
+	 * Assign resources and set multiplexer selects.
+	 *
+	 * PM_ISU can go either on TTM0 or TTM1, but that's the only
+	 * choice we have to deal with.
+	 */
+	if (unituse[PM_ISU] &
+	    (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU]))
+		unitmap[PM_ISU] = 2 | 4;	/* move ISU to TTM1 */
+	/* Set TTM[01]SEL fields. */
+	ttmuse[0] = ttmuse[1] = 0;
+	for (i = PM_FPU; i <= PM_STS; ++i) {
+		if (!unituse[i])
+			continue;
+		ttm = unitmap[i];
+		++ttmuse[(ttm >> 2) & 1];
+		mmcr1 |= (u64)(ttm & ~4) << MMCR1_TTM1SEL_SH;
+	}
+	/* Check only one unit per TTMx */
+	if (ttmuse[0] > 1 || ttmuse[1] > 1)
+		return -1;
+
+	/* Set byte lane select fields and TTM3SEL. */
+	for (byte = 0; byte < 4; ++byte) {
+		unit = busbyte[byte];
+		if (!unit)
+			continue;
+		if (unit <= PM_STS)
+			ttm = (unitmap[unit] >> 2) & 1;
+		else if (unit == PM_LSU0)
+			ttm = 2;
+		else {
+			ttm = 3;
+			if (unit == PM_LSU1L && byte >= 2)
+				mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+		}
+		mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+	}
+
+	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+	memset(pmcsel, 0x8, sizeof(pmcsel));	/* 8 means don't count */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		if (!pmc) {
+			/* Bus event or any-PMC direct event */
+			if (unit)
+				psel |= 0x10 | ((byte & 2) << 2);
+			else
+				psel |= 8;
+			for (pmc = 0; pmc < 8; ++pmc) {
+				if (pmc_inuse & (1 << pmc))
+					continue;
+				grp = (pmc >> 1) & 1;
+				if (unit) {
+					if (grp == (byte & 1))
+						break;
+				} else if (pmc_grp_use[grp] < 4) {
+					++pmc_grp_use[grp];
+					break;
+				}
+			}
+			pmc_inuse |= 1 << pmc;
+		} else {
+			/* Direct event */
+			--pmc;
+			if (psel == 0 && (byte & 2))
+				/* add events on higher-numbered bus */
+				mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
+		}
+		pmcsel[pmc] = psel;
+		hwc[i] = pmc;
+	}
+	for (pmc = 0; pmc < 2; ++pmc)
+		mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc);
+	for (; pmc < 8; ++pmc)
+		mmcr1 |= (u64)pmcsel[pmc] << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
+	if (pmc_inuse & 1)
+		mmcr0 |= MMCR0_PMC1CE;
+	if (pmc_inuse & 0xfe)
+		mmcr0 |= MMCR0_PMCjCE;
+
+	mmcra |= 0x2000;	/* mark only one IOP per PPC instruction */
+
+	/* Return MMCRx values */
+	mmcr[0] = mmcr0;
+	mmcr[1] = mmcr1;
+	mmcr[2] = mmcra;
+	return 0;
+}
+
+static void p970_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	int shift, i;
+
+	if (pmc <= 1) {
+		shift = MMCR0_PMC1SEL_SH - 7 * pmc;
+		i = 0;
+	} else {
+		shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2);
+		i = 1;
+	}
+	/*
+	 * Setting the PMCxSEL field to 0x08 disables PMC x.
+	 */
+	mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift);
+}
+
+static int ppc970_generic_events[] = {
+	[PERF_COUNT_CPU_CYCLES] = 7,
+	[PERF_COUNT_INSTRUCTIONS] = 1,
+	[PERF_COUNT_CACHE_REFERENCES] = 0x8810,		/* PM_LD_REF_L1 */
+	[PERF_COUNT_CACHE_MISSES] = 0x3810,		/* PM_LD_MISS_L1 */
+	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431,	/* PM_BR_ISSUED */
+	[PERF_COUNT_BRANCH_MISSES] = 0x327,		/* PM_GRP_BR_MPRED */
+};
+
+struct power_pmu ppc970_pmu = {
+	.n_counter = 8,
+	.max_alternatives = 2,
+	.add_fields = 0x001100005555ull,
+	.test_adder = 0x013300000000ull,
+	.compute_mmcr = p970_compute_mmcr,
+	.get_constraint = p970_get_constraint,
+	.get_alternatives = p970_get_alternatives,
+	.disable_pmc = p970_disable_pmc,
+	.n_generic = ARRAY_SIZE(ppc970_generic_events),
+	.generic_events = ppc970_generic_events,
+};
-- 
cgit v1.2.3


From f78628374a13bc150db77c6e02d4f2c0a7f932ef Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 21:05:35 +1100
Subject: powerpc/perf_counter: Add support for POWER6

This adds the back-end for the PMU on the POWER6 processor.
Fortunately, the event selection hardware is somewhat simpler on
POWER6 than on other POWER family processors, so the constraints
fit into only 32 bits.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/Makefile       |   2 +-
 arch/powerpc/kernel/perf_counter.c |   4 +
 arch/powerpc/kernel/power6-pmu.c   | 283 +++++++++++++++++++++++++++++++++++++
 3 files changed, 288 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/kernel/power6-pmu.c

(limited to 'arch')

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 45798f6fb13..0ebf4d04d4b 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,7 +94,7 @@ obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o
+obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o power6-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 5561ecb02a4..df3fe057dee 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -742,6 +742,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 }
 
 extern struct power_pmu ppc970_pmu;
+extern struct power_pmu power6_pmu;
 
 static int init_perf_counters(void)
 {
@@ -760,6 +761,9 @@ static int init_perf_counters(void)
 	case PV_970MP:
 		ppmu = &ppc970_pmu;
 		break;
+	case 0x3e:
+		ppmu = &power6_pmu;
+		break;
 	}
 	return 0;
 }
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
new file mode 100644
index 00000000000..b1f61f3c97b
--- /dev/null
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -0,0 +1,283 @@
+/*
+ * Performance counter support for POWER6 processors.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER6
+ */
+#define PM_PMC_SH	20	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0x7
+#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH	16	/* Unit event comes (TTMxSEL encoding) */
+#define PM_UNIT_MSK	0xf
+#define PM_UNIT_MSKS	(PM_UNIT_MSK << PM_UNIT_SH)
+#define PM_LLAV		0x8000	/* Load lookahead match value */
+#define PM_LLA		0x4000	/* Load lookahead match enable */
+#define PM_BYTE_SH	12	/* Byte of event bus to use */
+#define PM_BYTE_MSK	3
+#define PM_SUBUNIT_SH	8	/* Subunit event comes from (NEST_SEL enc.) */
+#define PM_SUBUNIT_MSK	7
+#define PM_SUBUNIT_MSKS	(PM_SUBUNIT_MSK << PM_SUBUNIT_SH)
+#define PM_PMCSEL_MSK	0xff	/* PMCxSEL value */
+#define PM_BUSEVENT_MSK	0xf3700
+
+/*
+ * Bits in MMCR1 for POWER6
+ */
+#define MMCR1_TTM0SEL_SH	60
+#define MMCR1_TTMSEL_SH(n)	(MMCR1_TTM0SEL_SH - (n) * 4)
+#define MMCR1_TTMSEL_MSK	0xf
+#define MMCR1_TTMSEL(m, n)	(((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK)
+#define MMCR1_NESTSEL_SH	45
+#define MMCR1_NESTSEL_MSK	0x7
+#define MMCR1_NESTSEL(m)	(((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK)
+#define MMCR1_PMC1_LLA		((u64)1 << 44)
+#define MMCR1_PMC1_LLA_VALUE	((u64)1 << 39)
+#define MMCR1_PMC1_ADDR_SEL	((u64)1 << 35)
+#define MMCR1_PMC1SEL_SH	24
+#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK	0xff
+
+/*
+ * Assign PMC numbers and compute MMCR1 value for a set of events
+ */
+static int p6_compute_mmcr(unsigned int event[], int n_ev,
+			   unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr1 = 0;
+	int i;
+	unsigned int pmc, ev, b, u, s, psel;
+	unsigned int ttmset = 0;
+	unsigned int pmc_inuse = 0;
+
+	if (n_ev > 4)
+		return -1;
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;	/* collision! */
+			pmc_inuse |= 1 << (pmc - 1);
+		}
+	}
+	for (i = 0; i < n_ev; ++i) {
+		ev = event[i];
+		pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			--pmc;
+		} else {
+			/* can go on any PMC; find a free one */
+			for (pmc = 0; pmc < 4; ++pmc)
+				if (!(pmc_inuse & (1 << pmc)))
+					break;
+			pmc_inuse |= 1 << pmc;
+		}
+		hwc[i] = pmc;
+		psel = ev & PM_PMCSEL_MSK;
+		if (ev & PM_BUSEVENT_MSK) {
+			/* this event uses the event bus */
+			b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK;
+			u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK;
+			/* check for conflict on this byte of event bus */
+			if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u)
+				return -1;
+			mmcr1 |= (u64)u << MMCR1_TTMSEL_SH(b);
+			ttmset |= 1 << b;
+			if (u == 5) {
+				/* Nest events have a further mux */
+				s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
+				if ((ttmset & 0x10) &&
+				    MMCR1_NESTSEL(mmcr1) != s)
+					return -1;
+				ttmset |= 0x10;
+				mmcr1 |= (u64)s << MMCR1_NESTSEL_SH;
+			}
+			if (0x30 <= psel && psel <= 0x3d) {
+				/* these need the PMCx_ADDR_SEL bits */
+				if (b >= 2)
+					mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc;
+			}
+			/* bus select values are different for PMC3/4 */
+			if (pmc >= 2 && (psel & 0x90) == 0x80)
+				psel ^= 0x20;
+		}
+		if (ev & PM_LLA) {
+			mmcr1 |= MMCR1_PMC1_LLA >> pmc;
+			if (ev & PM_LLAV)
+				mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc;
+		}
+		mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
+	}
+	mmcr[0] = 0;
+	if (pmc_inuse & 1)
+		mmcr[0] = MMCR0_PMC1CE;
+	if (pmc_inuse & 0xe)
+		mmcr[0] |= MMCR0_PMCjCE;
+	mmcr[1] = mmcr1;
+	mmcr[2] = 0;
+	return 0;
+}
+
+/*
+ * Layout of constraint bits:
+ *
+ *	0-1	add field: number of uses of PMC1 (max 1)
+ *	2-3, 4-5, 6-7: ditto for PMC2, 3, 4
+ *	8-10	select field: nest (subunit) event selector
+ *	16-19	select field: unit on byte 0 of event bus
+ *	20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
+ */
+static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, sh;
+	unsigned int mask = 0, value = 0;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 4)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+	}
+	if (event & PM_BUSEVENT_MSK) {
+		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+		sh = byte * 4;
+		mask |= PM_UNIT_MSKS << sh;
+		value |= (event & PM_UNIT_MSKS) << sh;
+		if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) {
+			mask |= PM_SUBUNIT_MSKS;
+			value |= event & PM_SUBUNIT_MSKS;
+		}
+	}
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+#define MAX_ALT	4	/* at most 4 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+	{ 0x0130e8, 0x2000f6, 0x3000fc },	/* PM_PTEG_RELOAD_VALID */
+	{ 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */
+	{ 0x080088, 0x200054, 0x3000f0 },	/* PM_ST_MISS_L1 */
+	{ 0x10000a, 0x2000f4 },			/* PM_RUN_CYC */
+	{ 0x10000b, 0x2000f5 },			/* PM_RUN_COUNT */
+	{ 0x10000e, 0x400010 },			/* PM_PURR */
+	{ 0x100010, 0x4000f8 },			/* PM_FLUSH */
+	{ 0x10001a, 0x200010 },			/* PM_MRK_INST_DISP */
+	{ 0x100026, 0x3000f8 },			/* PM_TB_BIT_TRANS */
+	{ 0x100054, 0x2000f0 },			/* PM_ST_FIN */
+	{ 0x100056, 0x2000fc },			/* PM_L1_ICACHE_MISS */
+	{ 0x1000f0, 0x40000a },			/* PM_INST_IMC_MATCH_CMPL */
+	{ 0x1000f8, 0x200008 },			/* PM_GCT_EMPTY_CYC */
+	{ 0x1000fc, 0x400006 },			/* PM_LSU_DERAT_MISS_CYC */
+	{ 0x20000e, 0x400007 },			/* PM_LSU_DERAT_MISS */
+	{ 0x200012, 0x300012 },			/* PM_INST_DISP */
+	{ 0x2000f2, 0x3000f2 },			/* PM_INST_DISP */
+	{ 0x2000f8, 0x300010 },			/* PM_EXT_INT */
+	{ 0x2000fe, 0x300056 },			/* PM_DATA_FROM_L2MISS */
+	{ 0x2d0030, 0x30001a },			/* PM_MRK_FPU_FIN */
+	{ 0x30000a, 0x400018 },			/* PM_MRK_INST_FIN */
+	{ 0x3000f6, 0x40000e },			/* PM_L1_DCACHE_RELOAD_VALID */
+	{ 0x3000fe, 0x400056 },			/* PM_DATA_FROM_L3MISS */
+};
+
+/*
+ * This could be made more efficient with a binary search on
+ * a presorted list, if necessary
+ */
+static int find_alternatives_list(unsigned int event)
+{
+	int i, j;
+	unsigned int alt;
+
+	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+		if (event < event_alternatives[i][0])
+			return -1;
+		for (j = 0; j < MAX_ALT; ++j) {
+			alt = event_alternatives[i][j];
+			if (!alt || event < alt)
+				break;
+			if (event == alt)
+				return i;
+		}
+	}
+	return -1;
+}
+
+static int p6_get_alternatives(unsigned int event, unsigned int alt[])
+{
+	int i, j;
+	unsigned int aevent, psel, pmc;
+	unsigned int nalt = 1;
+
+	alt[0] = event;
+
+	/* check the alternatives table */
+	i = find_alternatives_list(event);
+	if (i >= 0) {
+		/* copy out alternatives from list */
+		for (j = 0; j < MAX_ALT; ++j) {
+			aevent = event_alternatives[i][j];
+			if (!aevent)
+				break;
+			if (aevent != event)
+				alt[nalt++] = aevent;
+		}
+
+	} else {
+		/* Check for alternative ways of computing sum events */
+		/* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */
+		psel = event & (PM_PMCSEL_MSK & ~1);	/* ignore edge bit */
+		pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc && (psel == 0x32 || psel == 0x34))
+			alt[nalt++] = ((event ^ 0x6) & ~PM_PMC_MSKS) |
+				((5 - pmc) << PM_PMC_SH);
+
+		/* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */
+		if (pmc && (psel == 0x38 || psel == 0x3a))
+			alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) |
+				((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH);
+	}
+
+	return nalt;
+}
+
+static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	/* Set PMCxSEL to 0 to disable PMCx */
+	mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power6_generic_events[] = {
+	[PERF_COUNT_CPU_CYCLES] = 0x1e,
+	[PERF_COUNT_INSTRUCTIONS] = 2,
+	[PERF_COUNT_CACHE_REFERENCES] = 0x280030,	/* LD_REF_L1 */
+	[PERF_COUNT_CACHE_MISSES] = 0x30000c,		/* LD_MISS_L1 */
+	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0,	/* BR_PRED */ 
+	[PERF_COUNT_BRANCH_MISSES] = 0x400052,		/* BR_MPRED */
+};
+
+struct power_pmu power6_pmu = {
+	.n_counter = 4,
+	.max_alternatives = MAX_ALT,
+	.add_fields = 0x55,
+	.test_adder = 0,
+	.compute_mmcr = p6_compute_mmcr,
+	.get_constraint = p6_get_constraint,
+	.get_alternatives = p6_get_alternatives,
+	.disable_pmc = p6_disable_pmc,
+	.n_generic = ARRAY_SIZE(power6_generic_events),
+	.generic_events = power6_generic_events,
+};
-- 
cgit v1.2.3


From 01d0287f068de2934109ba9b989d8807526cccc2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 14 Jan 2009 13:44:19 +1100
Subject: powerpc/perf_counter: Make sure PMU gets enabled properly

This makes sure that we call the platform-specific ppc_md.enable_pmcs
function on each CPU before we try to use the PMU on that CPU.  If the
CPU goes off-line and then on-line, we need to do the enable_pmcs call
again, so we use the hw_perf_counter_setup hook to ensure that.  It gets
called as each CPU comes online, but it isn't called on the CPU that is
coming up, so this adds the CPU number as an argument to it (there were
no non-empty instances of hw_perf_counter_setup before).

This also arranges to set the pmcregs_in_use field of the lppaca (data
structure shared with the hypervisor) on each CPU when we are using the
PMU and clear it when we are not.  This allows the hypervisor to optimize
partition switches by not saving/restoring the PMU registers when we
aren't using the PMU.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/perf_counter.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index df3fe057dee..85ad25923c2 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -15,6 +15,7 @@
 #include <linux/hardirq.h>
 #include <asm/reg.h>
 #include <asm/pmc.h>
+#include <asm/machdep.h>
 
 struct cpu_hw_counters {
 	int n_counters;
@@ -24,6 +25,7 @@ struct cpu_hw_counters {
 	struct perf_counter *counter[MAX_HWCOUNTERS];
 	unsigned int events[MAX_HWCOUNTERS];
 	u64 mmcr[3];
+	u8 pmcs_enabled;
 };
 DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
 
@@ -261,6 +263,15 @@ u64 hw_perf_save_disable(void)
 		cpuhw->disabled = 1;
 		cpuhw->n_added = 0;
 
+		/*
+		 * Check if we ever enabled the PMU on this cpu.
+		 */
+		if (!cpuhw->pmcs_enabled) {
+			if (ppc_md.enable_pmcs)
+				ppc_md.enable_pmcs();
+			cpuhw->pmcs_enabled = 1;
+		}
+
 		/*
 		 * Set the 'freeze counters' bit.
 		 * The barrier is to make sure the mtspr has been
@@ -305,6 +316,8 @@ void hw_perf_restore(u64 disable)
 		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
 		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
 		mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+		if (cpuhw->n_counters == 0)
+			get_lppaca()->pmcregs_in_use = 0;
 		goto out;
 	}
 
@@ -323,6 +336,7 @@ void hw_perf_restore(u64 disable)
 	 * bit set and set the hardware counters to their initial values.
 	 * Then unfreeze the counters.
 	 */
+	get_lppaca()->pmcregs_in_use = 1;
 	mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
 	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
 	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
@@ -741,6 +755,14 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	}
 }
 
+void hw_perf_counter_setup(int cpu)
+{
+	struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
+
+	memset(cpuhw, 0, sizeof(*cpuhw));
+	cpuhw->mmcr[0] = MMCR0_FC;
+}
+
 extern struct power_pmu ppc970_pmu;
 extern struct power_pmu power6_pmu;
 
-- 
cgit v1.2.3


From 3b6f9e5cb21964b7ce12bf81076f830885563ec8 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 14 Jan 2009 21:00:30 +1100
Subject: perf_counter: Add support for pinned and exclusive counter groups

Impact: New perf_counter features

A pinned counter group is one that the user wants to have on the CPU
whenever possible, i.e. whenever the associated task is running, for
a per-task group, or always for a per-cpu group.  If the system
cannot satisfy that, it puts the group into an error state where
it is not scheduled any more and reads from it return EOF (i.e. 0
bytes read).  The group can be released from error state and made
readable again using prctl(PR_TASK_PERF_COUNTERS_ENABLE).  When we
have finer-grained enable/disable controls on counters we'll be able
to reset the error state on individual groups.

An exclusive group is one that the user wants to be the only group
using the CPU performance monitor hardware whenever it is on.  The
counter group scheduler will not schedule an exclusive group if there
are already other groups on the CPU and will not schedule other groups
onto the CPU if there is an exclusive group scheduled (that statement
does not apply to groups containing only software counters, which can
always go on and which do not prevent an exclusive group from going on).
With an exclusive group, we will be able to let users program PMU
registers at a low level without the concern that those settings will
perturb other measurements.

Along the way this reorganizes things a little:
- is_software_counter() is moved to perf_counter.h.
- cpuctx->active_oncpu now records the number of hardware counters on
  the CPU, i.e. it now excludes software counters.  Nothing was reading
  cpuctx->active_oncpu before, so this change is harmless.
- A new cpuctx->exclusive field records whether we currently have an
  exclusive group on the CPU.
- counter_sched_out moves higher up in perf_counter.c and gets called
  from __perf_counter_remove_from_context and __perf_counter_exit_task,
  where we used to have essentially the same code.
- __perf_counter_sched_in now goes through the counter list twice, doing
  the pinned counters in the first loop and the non-pinned counters in
  the second loop, in order to give the pinned counters the best chance
  to be scheduled in.

Note that only a group leader can be exclusive or pinned, and that
attribute applies to the whole group.  This avoids some awkwardness in
some corner cases (e.g. where a group leader is closed and the other
group members get added to the context list).  If we want to relax that
restriction later, we can, and it is easier to relax a restriction than
to apply a new one.

This doesn't yet handle the case where a pinned counter is inherited
and goes into error state in the child - the error state is not
propagated up to the parent when the child exits, and arguably it
should.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/perf_counter.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 85ad25923c2..5b0211348c7 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -35,14 +35,6 @@ void perf_counter_print_debug(void)
 {
 }
 
-/*
- * Return 1 for a software counter, 0 for a hardware counter
- */
-static inline int is_software_counter(struct perf_counter *counter)
-{
-	return !counter->hw_event.raw && counter->hw_event.type < 0;
-}
-
 /*
  * Read one performance monitor counter (PMC).
  */
@@ -443,6 +435,7 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
 	 */
 	for (i = n0; i < n0 + n; ++i)
 		cpuhw->counter[i]->hw.config = cpuhw->events[i];
+	cpuctx->active_oncpu += n;
 	n = 1;
 	counter_sched_in(group_leader, cpu);
 	list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
@@ -451,7 +444,6 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
 			++n;
 		}
 	}
-	cpuctx->active_oncpu += n;
 	ctx->nr_active += n;
 
 	return 1;
-- 
cgit v1.2.3


From 1b023a96d9b44f50f4d8ff28c15f5b80e354760f Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 23 Jan 2009 10:13:01 +0100
Subject: perfcounters: throttle on too high IRQ rates

Starting kerneltop with only -c 100 seems to be a bad idea, it can
easily lock the system due to perfcounter IRQ overload.

So add throttling: if a new IRQ arrives in a shorter than
PERFMON_MIN_PERIOD_NS time, turn off perfcounters and untrottle them
from the next timer tick.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic.c             |  2 ++
 arch/x86/kernel/cpu/perf_counter.c | 38 ++++++++++++++++++++++++++++++++------
 2 files changed, 34 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 7b434e5b14c..849c23009bf 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -781,6 +781,8 @@ static void local_apic_timer_interrupt(void)
 	inc_irq_stat(apic_timer_irqs);
 
 	evt->event_handler(evt);
+
+	perf_counter_unthrottle();
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 9376771f757..1a040b179b5 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -33,6 +33,9 @@ static int nr_counters_fixed __read_mostly;
 struct cpu_hw_counters {
 	struct perf_counter	*counters[X86_PMC_IDX_MAX];
 	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	u64			last_interrupt;
+	u64			global_enable;
+	int			throttled;
 };
 
 /*
@@ -474,16 +477,19 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 {
 	int bit, cpu = smp_processor_id();
-	u64 ack, status, saved_global;
-	struct cpu_hw_counters *cpuc;
+	u64 ack, status, now;
+	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
 
-	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
 
 	/* Disable counters globally */
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 	ack_APIC_irq();
 
-	cpuc = &per_cpu(cpu_hw_counters, cpu);
+	now = sched_clock();
+	if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS)
+		cpuc->throttled = 1;
+	cpuc->last_interrupt = now;
 
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 	if (!status)
@@ -533,9 +539,29 @@ again:
 		goto again;
 out:
 	/*
-	 * Restore - do not reenable when global enable is off:
+	 * Restore - do not reenable when global enable is off or throttled:
 	 */
-	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
+	if (!cpuc->throttled)
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
+}
+
+void perf_counter_unthrottle(void)
+{
+	struct cpu_hw_counters *cpuc;
+
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+		return;
+
+	if (unlikely(!perf_counters_initialized))
+		return;
+
+	cpuc = &per_cpu(cpu_hw_counters, smp_processor_id());
+	if (cpuc->throttled) {
+		if (printk_ratelimit())
+			printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n");
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
+		cpuc->throttled = 0;
+	}
 }
 
 void smp_perf_counter_interrupt(struct pt_regs *regs)
-- 
cgit v1.2.3


From 4b39fd96855254a244f71245b41a91cdecb87d63 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 23 Jan 2009 14:36:16 +0100
Subject: perfcounters: ratelimit performance counter interrupts

Ratelimit performance counter interrupts to 100KHz per CPU.

This replaces the irq-delta-time based method.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 1a040b179b5..a56d4cf92f3 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -33,9 +33,8 @@ static int nr_counters_fixed __read_mostly;
 struct cpu_hw_counters {
 	struct perf_counter	*counters[X86_PMC_IDX_MAX];
 	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-	u64			last_interrupt;
+	unsigned long		interrupts;
 	u64			global_enable;
-	int			throttled;
 };
 
 /*
@@ -470,6 +469,11 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 	}
 }
 
+/*
+ * Maximum interrupt frequency of 100KHz per CPU
+ */
+#define PERFMON_MAX_INTERRUPTS 100000/HZ
+
 /*
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
@@ -477,7 +481,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 {
 	int bit, cpu = smp_processor_id();
-	u64 ack, status, now;
+	u64 ack, status;
 	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
 
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
@@ -486,11 +490,6 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 	ack_APIC_irq();
 
-	now = sched_clock();
-	if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS)
-		cpuc->throttled = 1;
-	cpuc->last_interrupt = now;
-
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 	if (!status)
 		goto out;
@@ -541,13 +540,14 @@ out:
 	/*
 	 * Restore - do not reenable when global enable is off or throttled:
 	 */
-	if (!cpuc->throttled)
+	if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
 		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
 }
 
 void perf_counter_unthrottle(void)
 {
 	struct cpu_hw_counters *cpuc;
+	u64 global_enable;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 		return;
@@ -556,12 +556,15 @@ void perf_counter_unthrottle(void)
 		return;
 
 	cpuc = &per_cpu(cpu_hw_counters, smp_processor_id());
-	if (cpuc->throttled) {
+	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
 		if (printk_ratelimit())
-			printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n");
+			printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
 		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
-		cpuc->throttled = 0;
 	}
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable);
+	if (unlikely(cpuc->global_enable && !global_enable))
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
+	cpuc->interrupts = 0;
 }
 
 void smp_perf_counter_interrupt(struct pt_regs *regs)
-- 
cgit v1.2.3


From 3415dd9146c574bffe8f012c096bfc2bc62b9508 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 23 Jan 2009 14:16:53 +0100
Subject: perfcounters fix section mismatch warning in
 perf_counter.c::perf_counters_lapic_init()

Fix:

WARNING: arch/x86/kernel/built-in.o(.text+0xdd0f): Section mismatch in reference from the function pmc_generic_enable() to the function .cpuinit.text:perf_counters_lapic_init()
The function pmc_generic_enable() references
the function __cpuinit perf_counters_lapic_init().
This is often because pmc_generic_enable lacks a __cpuinit
annotation or the annotation of perf_counters_lapic_init is wrong.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a56d4cf92f3..46c436cdd73 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -605,7 +605,7 @@ void perf_counter_notify(struct pt_regs *regs)
 	local_irq_restore(flags);
 }
 
-void __cpuinit perf_counters_lapic_init(int nmi)
+void perf_counters_lapic_init(int nmi)
 {
 	u32 apic_val;
 
-- 
cgit v1.2.3


From bb3f0b59ad005d2d2ecbbe9bd048eab6d1ecbd31 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sun, 25 Jan 2009 02:38:09 -0800
Subject: x86: make irqinit_32.c more like irqinit_64.c, v2

Impact: cleanup

1. add smp_intr_init and apic_intr_init for 32bit, the same as 64bit
2. move the apic_intr_init() call before set gate with interrupt[i]
3. for 64bit, if ia32_emulation is not used, will make per_cpu to use 0x80 vector.

[ v2: should use !test_bit() instead of test_bit() with 32bit ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irqinit_32.c | 56 ++++++++++++++++++++++++++------------------
 arch/x86/kernel/irqinit_64.c |  7 +++---
 arch/x86/kernel/traps.c      | 15 +++++-------
 3 files changed, 43 insertions(+), 35 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index c56496f8c6f..ddf3eb72f86 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -120,28 +120,8 @@ int vector_used_by_percpu_irq(unsigned int vector)
 	return 0;
 }
 
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
+static void __init smp_intr_init(void)
 {
-	int i;
-
-	/* all the set up before the call gates are initialised */
-	pre_intr_init_hook();
-
-	/*
-	 * Cover the whole vector space, no vector can escape
-	 * us. (some of these will be overridden and become
-	 * 'special' SMP interrupts)
-	 */
-	for (i =  FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
-		/* SYSCALL_VECTOR was reserved in trap_init. */
-		if (i != SYSCALL_VECTOR)
-			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
-	}
-
-
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
 	/*
 	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
@@ -170,8 +150,13 @@ void __init native_init_IRQ(void)
 	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
 	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
 #endif
+}
 
+static void __init apic_intr_init(void)
+{
 #ifdef CONFIG_X86_LOCAL_APIC
+	smp_intr_init();
+
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
@@ -181,12 +166,37 @@ void __init native_init_IRQ(void)
 # ifdef CONFIG_PERF_COUNTERS
 	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
 # endif
-#endif
 
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
+# ifdef CONFIG_X86_MCE_P4THERMAL
 	/* thermal monitor LVT interrupt */
 	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+# endif
 #endif
+}
+
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
+{
+	int i;
+
+	/* all the set up before the call gates are initialised */
+	pre_intr_init_hook();
+
+	apic_intr_init();
+
+	/*
+	 * Cover the whole vector space, no vector can escape
+	 * us. (some of these will be overridden and become
+	 * 'special' SMP interrupts)
+	 */
+	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+		int vector = FIRST_EXTERNAL_VECTOR + i;
+		/* SYSCALL_VECTOR was reserved in trap_init. */
+		if (!test_bit(vector, used_vectors))
+			set_intr_gate(vector, interrupt[i]);
+	}
 
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 6a71bfc51e5..16e1fc68750 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -162,6 +162,9 @@ void __init native_init_IRQ(void)
 	int i;
 
 	init_ISA_irqs();
+
+	apic_intr_init();
+
 	/*
 	 * Cover the whole vector space, no vector can escape
 	 * us. (some of these will be overridden and become
@@ -169,12 +172,10 @@ void __init native_init_IRQ(void)
 	 */
 	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
 		int vector = FIRST_EXTERNAL_VECTOR + i;
-		if (vector != IA32_SYSCALL_VECTOR)
+		if (!test_bit(vector, used_vectors))
 			set_intr_gate(vector, interrupt[i]);
 	}
 
-	apic_intr_init();
-
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
 }
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ed5aee5f3fc..d36a502d87a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -979,8 +979,13 @@ void __init trap_init(void)
 #endif
 	set_intr_gate(19, &simd_coprocessor_error);
 
+	/* Reserve all the builtin and the syscall vector: */
+	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
+		set_bit(i, used_vectors);
+
 #ifdef CONFIG_IA32_EMULATION
 	set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
+	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
 #endif
 
 #ifdef CONFIG_X86_32
@@ -997,17 +1002,9 @@ void __init trap_init(void)
 	}
 
 	set_system_trap_gate(SYSCALL_VECTOR, &system_call);
-#endif
-
-	/* Reserve all the builtin and the syscall vector: */
-	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
-		set_bit(i, used_vectors);
-
-#ifdef CONFIG_X86_64
-	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
-#else
 	set_bit(SYSCALL_VECTOR, used_vectors);
 #endif
+
 	/*
 	 * Should be a barrier for any external CPU state:
 	 */
-- 
cgit v1.2.3


From 15081c61362618a0c81cc8d04e45e7427bc1ed71 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sun, 1 Feb 2009 22:07:39 +0530
Subject: x86: irqinit_32.c fix compilation warning

Fix:

  arch/x86/kernel/irqinit_32.c:124: warning: 'smp_intr_init' defined but not used

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irqinit_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index ddf3eb72f86..520e6c1c5d2 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -154,9 +154,9 @@ static void __init smp_intr_init(void)
 
 static void __init apic_intr_init(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
 	smp_intr_init();
 
+#ifdef CONFIG_X86_LOCAL_APIC
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
-- 
cgit v1.2.3


From 5b75af0a02fcf3b8899f38ff6f22164c5d8e2fdd Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 4 Feb 2009 17:11:34 +0100
Subject: perfcounters: fix "perf counters kill oprofile" bug

With oprofile as a module, and unloaded by profiling script,
both oprofile and kerneltop work fine.. unless you leave kerneltop
running when you start profiling, then you may see badness.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 4 +++-
 arch/x86/oprofile/nmi_int.c        | 7 ++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 46c436cdd73..8bb213323fe 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -643,7 +643,9 @@ perf_counter_nmi_handler(struct notifier_block *self,
 }
 
 static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
-	.notifier_call		= perf_counter_nmi_handler
+	.notifier_call		= perf_counter_nmi_handler,
+	.next			= NULL,
+	.priority		= 1
 };
 
 void __init init_hw_perf_counters(void)
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 202864ad49a..c638685136e 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self,
 
 	switch (val) {
 	case DIE_NMI:
-		if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)))
-			ret = NOTIFY_STOP;
+	case DIE_NMI_IPI:
+		model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
+		ret = NOTIFY_STOP;
 		break;
 	default:
 		break;
@@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy)
 static struct notifier_block profile_exceptions_nb = {
 	.notifier_call = profile_exceptions_notify,
 	.next = NULL,
-	.priority = 0
+	.priority = 2
 };
 
 static int nmi_setup(void)
-- 
cgit v1.2.3


From 82aa9a1829199233f9bdaf26e2ee271114f4701e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 5 Feb 2009 15:23:08 +0100
Subject: perfcounters: fix "perf counters kills oprofile" bug, v2

Impact: fix kernel crash

Both oprofile and perfcounters register an NMI die handler, but only one
can handle the NMI.  Conveniently, oprofile unregisters it's notifier
when not actively in use, so setting it's notifier priority higher than
perfcounter's allows oprofile to borrow the NMI for the duration of it's
run.  Tested/works both as module and built-in.

While testing, I found that if kerneltop was generating NMIs at very
high frequency, the kernel may panic when oprofile registered it's
handler.  This turned out to be because oprofile registers it's handler
before reset_value has been allocated, so if an NMI comes in while it's
still setting up, kabOom.  Rather than try more invasive changes, I
followed the lead of other places in op_model_ppro.c, and simply
returned in that highly unlikely event.  (debug warnings attached)

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/oprofile/op_model_ppro.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 07c914555a5..85eb6268374 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -126,6 +126,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
 	u64 val;
 	int i;
 
+	/*
+	 * This can happen if perf counters are in use when
+	 * we steal the die notifier NMI.
+	 */
+	if (unlikely(!reset_value))
+		goto out;
+
 	for (i = 0 ; i < num_counters; ++i) {
 		if (!reset_value[i])
 			continue;
@@ -136,6 +143,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
 		}
 	}
 
+out:
 	/* Only P6 based Pentium M need to re-unmask the apic vector but it
 	 * doesn't hurt other P6 variant */
 	apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
-- 
cgit v1.2.3


From d278c48435625cb6b7edcf6a547620768b175709 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Mon, 9 Feb 2009 07:38:50 +0100
Subject: perf_counters: account NMI interrupts

I noticed that kerneltop interrupts were accounted as NMI, but not their
perf counter origin.

Account NMI performance counter interrupts.

Signed-off-by: Mike Galbraith  <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

 arch/x86/kernel/cpu/perf_counter.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
---
 arch/x86/kernel/cpu/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 8bb213323fe..9901e46998d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -495,6 +495,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 		goto out;
 
 again:
+	inc_irq_stat(apic_perf_irqs);
 	ack = status;
 	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
 		struct perf_counter *counter = cpuc->counters[bit];
@@ -570,7 +571,6 @@ void perf_counter_unthrottle(void)
 void smp_perf_counter_interrupt(struct pt_regs *regs)
 {
 	irq_enter();
-	inc_irq_stat(apic_perf_irqs);
 	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
 	__smp_perf_counter_interrupt(regs, 0);
 
-- 
cgit v1.2.3


From 0475f9ea8e2cc030298908949e0d5da9f2fc2cfe Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 11 Feb 2009 14:35:35 +1100
Subject: perf_counters: allow users to count user, kernel and/or hypervisor
 events

Impact: new perf_counter feature

This extends the perf_counter_hw_event struct with bits that specify
that events in user, kernel and/or hypervisor mode should not be
counted (i.e. should be excluded), and adds code to program the PMU
mode selection bits accordingly on x86 and powerpc.

For software counters, we don't currently have the infrastructure to
distinguish which mode an event occurs in, so we currently fail the
counter initialization if the setting of the hw_event.exclude_* bits
would require us to distinguish.  Context switches and CPU migrations
are currently considered to occur in kernel mode.

On x86, this changes the previous policy that only root can count
kernel events.  Now non-root users can count kernel events or exclude
them.  Non-root users still can't use NMI events, though.  On x86 we
don't appear to have any way to control whether hypervisor events are
counted or not, so hw_event.exclude_hv is ignored.

On powerpc, the selection of whether to count events in user, kernel
and/or hypervisor mode is PMU-wide, not per-counter, so this adds a
check that the hw_event.exclude_* settings are the same as other events
on the PMU.  Counters being added to a group have to have the same
settings as the other hardware counters in the group.  Counters and
groups can only be enabled in hw_perf_group_sched_in or power_perf_enable
if they have the same settings as any other counters already on the
PMU.  If we are not running on a hypervisor, the exclude_hv setting
is ignored (by forcing it to 0) since we can't ever get any
hypervisor events.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/perf_counter.c | 68 ++++++++++++++++++++++++++++++++++++--
 arch/x86/kernel/cpu/perf_counter.c | 31 ++++++++++-------
 2 files changed, 84 insertions(+), 15 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 5b0211348c7..bd6ba85beb5 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -16,6 +16,7 @@
 #include <asm/reg.h>
 #include <asm/pmc.h>
 #include <asm/machdep.h>
+#include <asm/firmware.h>
 
 struct cpu_hw_counters {
 	int n_counters;
@@ -214,6 +215,36 @@ static int power_check_constraints(unsigned int event[], int n_ev)
 	return 0;
 }
 
+/*
+ * Check if newly-added counters have consistent settings for
+ * exclude_{user,kernel,hv} with each other and any previously
+ * added counters.
+ */
+static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new)
+{
+	int eu, ek, eh;
+	int i, n;
+	struct perf_counter *counter;
+
+	n = n_prev + n_new;
+	if (n <= 1)
+		return 0;
+
+	eu = ctrs[0]->hw_event.exclude_user;
+	ek = ctrs[0]->hw_event.exclude_kernel;
+	eh = ctrs[0]->hw_event.exclude_hv;
+	if (n_prev == 0)
+		n_prev = 1;
+	for (i = n_prev; i < n; ++i) {
+		counter = ctrs[i];
+		if (counter->hw_event.exclude_user != eu ||
+		    counter->hw_event.exclude_kernel != ek ||
+		    counter->hw_event.exclude_hv != eh)
+			return -EAGAIN;
+	}
+	return 0;
+}
+
 static void power_perf_read(struct perf_counter *counter)
 {
 	long val, delta, prev;
@@ -323,6 +354,20 @@ void hw_perf_restore(u64 disable)
 		goto out;
 	}
 
+	/*
+	 * Add in MMCR0 freeze bits corresponding to the
+	 * hw_event.exclude_* bits for the first counter.
+	 * We have already checked that all counters have the
+	 * same values for these bits as the first counter.
+	 */
+	counter = cpuhw->counter[0];
+	if (counter->hw_event.exclude_user)
+		cpuhw->mmcr[0] |= MMCR0_FCP;
+	if (counter->hw_event.exclude_kernel)
+		cpuhw->mmcr[0] |= MMCR0_FCS;
+	if (counter->hw_event.exclude_hv)
+		cpuhw->mmcr[0] |= MMCR0_FCHV;
+
 	/*
 	 * Write the new configuration to MMCR* with the freeze
 	 * bit set and set the hardware counters to their initial values.
@@ -424,6 +469,8 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
 			   &cpuhw->counter[n0], &cpuhw->events[n0]);
 	if (n < 0)
 		return -EAGAIN;
+	if (check_excludes(cpuhw->counter, n0, n))
+		return -EAGAIN;
 	if (power_check_constraints(cpuhw->events, n + n0))
 		return -EAGAIN;
 	cpuhw->n_counters = n0 + n;
@@ -476,6 +523,8 @@ static int power_perf_enable(struct perf_counter *counter)
 		goto out;
 	cpuhw->counter[n0] = counter;
 	cpuhw->events[n0] = counter->hw.config;
+	if (check_excludes(cpuhw->counter, n0, 1))
+		goto out;
 	if (power_check_constraints(cpuhw->events, n0 + 1))
 		goto out;
 
@@ -554,6 +603,17 @@ hw_perf_counter_init(struct perf_counter *counter)
 	counter->hw.config_base = ev;
 	counter->hw.idx = 0;
 
+	/*
+	 * If we are not running on a hypervisor, force the
+	 * exclude_hv bit to 0 so that we don't care what
+	 * the user set it to.  This also means that we don't
+	 * set the MMCR0_FCHV bit, which unconditionally freezes
+	 * the counters on the PPC970 variants used in Apple G5
+	 * machines (since MSR.HV is always 1 on those machines).
+	 */
+	if (!firmware_has_feature(FW_FEATURE_LPAR))
+		counter->hw_event.exclude_hv = 0;
+	
 	/*
 	 * If this is in a group, check if it can go on with all the
 	 * other hardware counters in the group.  We assume the counter
@@ -566,11 +626,13 @@ hw_perf_counter_init(struct perf_counter *counter)
 		if (n < 0)
 			return NULL;
 	}
-	events[n++] = ev;
-	if (power_check_constraints(events, n))
+	events[n] = ev;
+	if (check_excludes(ctrs, n, 1))
+		return NULL;
+	if (power_check_constraints(events, n + 1))
 		return NULL;
 
-	counter->hw.config = events[n - 1];
+	counter->hw.config = events[n];
 	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
 	return &power_perf_ops;
 }
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 9901e46998d..383d4c6423a 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -107,21 +107,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		return -EINVAL;
 
 	/*
-	 * Count user events, and generate PMC IRQs:
+	 * Generate PMC IRQs:
 	 * (keep 'enabled' bit clear for now)
 	 */
-	hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT;
+	hwc->config = ARCH_PERFMON_EVENTSEL_INT;
 
 	/*
-	 * If privileged enough, count OS events too, and allow
-	 * NMI events as well:
+	 * Count user and OS events unless requested not to.
 	 */
-	hwc->nmi = 0;
-	if (capable(CAP_SYS_ADMIN)) {
+	if (!hw_event->exclude_user)
+		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
+	if (!hw_event->exclude_kernel)
 		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
-		if (hw_event->nmi)
-			hwc->nmi = 1;
-	}
+
+	/*
+	 * If privileged enough, allow NMI events:
+	 */
+	hwc->nmi = 0;
+	if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
+		hwc->nmi = 1;
 
 	hwc->irq_period		= hw_event->irq_period;
 	/*
@@ -248,10 +252,13 @@ __pmc_fixed_enable(struct perf_counter *counter,
 	int err;
 
 	/*
-	 * Enable IRQ generation (0x8) and ring-3 counting (0x2),
-	 * and enable ring-0 counting if allowed:
+	 * Enable IRQ generation (0x8),
+	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+	 * if requested:
 	 */
-	bits = 0x8ULL | 0x2ULL;
+	bits = 0x8ULL;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+		bits |= 0x2;
 	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
 		bits |= 0x1;
 	bits <<= (idx * 4);
-- 
cgit v1.2.3


From 73ca2f8380311115723c7afe811f3ed1f0ba945e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 16 Feb 2009 01:08:17 +0100
Subject: perfcounters: remove duplicate definition of LOCAL_PERF_VECTOR

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/irq_vectors.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index b66b518ff00..b07278c55e9 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -111,11 +111,6 @@
  */
 #define LOCAL_PERF_VECTOR		0xee
 
-/*
- * Performance monitoring interrupt vector:
- */
-#define LOCAL_PERF_VECTOR	0xee
-
 /*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
-- 
cgit v1.2.3


From d095cd46dac104e4d2a4967c7c19b55a12f78240 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 23 Feb 2009 23:01:28 +1100
Subject: perfcounters/powerpc: Make exclude_kernel bit work on Apple G5
 processors

Currently, setting hw_event.exclude_kernel does nothing on the PPC970
variants used in Apple G5 machines, because they have the HV (hypervisor)
bit in the MSR forced to 1, so as far as the PMU is concerned, the
kernel runs in hypervisor mode.  Thus we have to use the MMCR0_FCHV
(freeze counters in hypervisor mode) bit rather than the MMCR0_FCS
(freeze counters in supervisor mode) bit.

This checks the MSR.HV bit at startup, and if it is set, we set the
freeze_counters_kernel variable to MMCR0_FCHV (it was initialized to
MMCR0_FCS).  We then use that whenever we need to exclude kernel events.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/perf_counter.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index bd6ba85beb5..6e27913ec0d 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -32,6 +32,15 @@ DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
 
 struct power_pmu *ppmu;
 
+/*
+ * Normally, to ignore kernel events we set the FCS (freeze counters
+ * in supervisor mode) bit in MMCR0, but if the kernel runs with the
+ * hypervisor bit set in the MSR, or if we are running on a processor
+ * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
+ * then we need to use the FCHV bit to ignore kernel events.
+ */
+static unsigned int freeze_counters_kernel = MMCR0_FCS;
+
 void perf_counter_print_debug(void)
 {
 }
@@ -364,7 +373,7 @@ void hw_perf_restore(u64 disable)
 	if (counter->hw_event.exclude_user)
 		cpuhw->mmcr[0] |= MMCR0_FCP;
 	if (counter->hw_event.exclude_kernel)
-		cpuhw->mmcr[0] |= MMCR0_FCS;
+		cpuhw->mmcr[0] |= freeze_counters_kernel;
 	if (counter->hw_event.exclude_hv)
 		cpuhw->mmcr[0] |= MMCR0_FCHV;
 
@@ -606,10 +615,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 	/*
 	 * If we are not running on a hypervisor, force the
 	 * exclude_hv bit to 0 so that we don't care what
-	 * the user set it to.  This also means that we don't
-	 * set the MMCR0_FCHV bit, which unconditionally freezes
-	 * the counters on the PPC970 variants used in Apple G5
-	 * machines (since MSR.HV is always 1 on those machines).
+	 * the user set it to.
 	 */
 	if (!firmware_has_feature(FW_FEATURE_LPAR))
 		counter->hw_event.exclude_hv = 0;
@@ -841,6 +847,13 @@ static int init_perf_counters(void)
 		ppmu = &power6_pmu;
 		break;
 	}
+
+	/*
+	 * Use FCHV to ignore kernel events if MSR.HV is set.
+	 */
+	if (mfmsr() & MSR_HV)
+		freeze_counters_kernel = MMCR0_FCHV;
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 742bd95ba96e19b3f7196c3a0834ebc17c8ba006 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 24 Feb 2009 11:33:56 +1100
Subject: perfcounters/powerpc: Add support for POWER5 processors

This adds the back-end for the PMU on the POWER5 processor.  This knows
how to use the fixed-function PMC5 and PMC6 (instructions completed and
run cycles).  Unlike POWER6, PMC5/6 obey the freeze conditions and can
generate interrupts, so their use doesn't impose any extra restrictions.

POWER5+ is different and is not supported by this patch.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/Makefile       |   3 +-
 arch/powerpc/kernel/perf_counter.c |   4 +
 arch/powerpc/kernel/power5-pmu.c   | 475 +++++++++++++++++++++++++++++++++++++
 3 files changed, 481 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/kernel/power5-pmu.c

(limited to 'arch')

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 7c941ec3b23..b4c6f466164 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,7 +94,8 @@ obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o power6-pmu.o
+obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o power5-pmu.o \
+				   power6-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 6e27913ec0d..112332d07fc 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -824,6 +824,7 @@ void hw_perf_counter_setup(int cpu)
 }
 
 extern struct power_pmu ppc970_pmu;
+extern struct power_pmu power5_pmu;
 extern struct power_pmu power6_pmu;
 
 static int init_perf_counters(void)
@@ -843,6 +844,9 @@ static int init_perf_counters(void)
 	case PV_970MP:
 		ppmu = &ppc970_pmu;
 		break;
+	case PV_POWER5:
+		ppmu = &power5_pmu;
+		break;
 	case 0x3e:
 		ppmu = &power6_pmu;
 		break;
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
new file mode 100644
index 00000000000..379ed1087cc
--- /dev/null
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -0,0 +1,475 @@
+/*
+ * Performance counter support for POWER5 (not POWER5++) processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER5 (not POWER5++)
+ */
+#define PM_PMC_SH	20	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH	16	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_BYTE_SH	12	/* Byte number of event bus to use */
+#define PM_BYTE_MSK	7
+#define PM_GRS_SH	8	/* Storage subsystem mux select */
+#define PM_GRS_MSK	7
+#define PM_BUSEVENT_MSK	0x80	/* Set if event uses event bus */
+#define PM_PMCSEL_MSK	0x7f
+
+/* Values in PM_UNIT field */
+#define PM_FPU		0
+#define PM_ISU0		1
+#define PM_IFU		2
+#define PM_ISU1		3
+#define PM_IDU		4
+#define PM_ISU0_ALT	6
+#define PM_GRS		7
+#define PM_LSU0		8
+#define PM_LSU1		0xc
+#define PM_LASTUNIT	0xc
+
+/*
+ * Bits in MMCR1 for POWER5
+ */
+#define MMCR1_TTM0SEL_SH	62
+#define MMCR1_TTM1SEL_SH	60
+#define MMCR1_TTM2SEL_SH	58
+#define MMCR1_TTM3SEL_SH	56
+#define MMCR1_TTMSEL_MSK	3
+#define MMCR1_TD_CP_DBG0SEL_SH	54
+#define MMCR1_TD_CP_DBG1SEL_SH	52
+#define MMCR1_TD_CP_DBG2SEL_SH	50
+#define MMCR1_TD_CP_DBG3SEL_SH	48
+#define MMCR1_GRS_L2SEL_SH	46
+#define MMCR1_GRS_L2SEL_MSK	3
+#define MMCR1_GRS_L3SEL_SH	44
+#define MMCR1_GRS_L3SEL_MSK	3
+#define MMCR1_GRS_MCSEL_SH	41
+#define MMCR1_GRS_MCSEL_MSK	7
+#define MMCR1_GRS_FABSEL_SH	39
+#define MMCR1_GRS_FABSEL_MSK	3
+#define MMCR1_PMC1_ADDER_SEL_SH	35
+#define MMCR1_PMC2_ADDER_SEL_SH	34
+#define MMCR1_PMC3_ADDER_SEL_SH	33
+#define MMCR1_PMC4_ADDER_SEL_SH	32
+#define MMCR1_PMC1SEL_SH	25
+#define MMCR1_PMC2SEL_SH	17
+#define MMCR1_PMC3SEL_SH	9
+#define MMCR1_PMC4SEL_SH	1
+#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK	0x7f
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *         <><>[  ><><>< ><> [  >[ >[ ><  ><  ><  ><  ><><><><><><>
+ *         T0T1 NC G0G1G2 G3  UC PS1PS2 B0  B1  B2  B3 P6P5P4P3P2P1
+ *
+ * T0 - TTM0 constraint
+ *     54-55: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0xc0_0000_0000_0000
+ *
+ * T1 - TTM1 constraint
+ *     52-53: TTM1SEL value (0=IDU, 3=GRS) 0x30_0000_0000_0000
+ *
+ * NC - number of counters
+ *     51: NC error 0x0008_0000_0000_0000
+ *     48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
+ *
+ * G0..G3 - GRS mux constraints
+ *     46-47: GRS_L2SEL value
+ *     44-45: GRS_L3SEL value
+ *     41-44: GRS_MCSEL value
+ *     39-40: GRS_FABSEL value
+ *	Note that these match up with their bit positions in MMCR1
+ *
+ * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
+ *     37: UC3 error 0x20_0000_0000
+ *     36: FPU|IFU|ISU1 events needed 0x10_0000_0000
+ *     35: ISU0 events needed 0x08_0000_0000
+ *     34: IDU|GRS events needed 0x04_0000_0000
+ *
+ * PS1
+ *     33: PS1 error 0x2_0000_0000
+ *     31-32: count of events needing PMC1/2 0x1_8000_0000
+ *
+ * PS2
+ *     30: PS2 error 0x4000_0000
+ *     28-29: count of events needing PMC3/4 0x3000_0000
+ *
+ * B0
+ *     24-27: Byte 0 event source 0x0f00_0000
+ *	      Encoding as for the event code
+ *
+ * B1, B2, B3
+ *     20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
+ *
+ * P1..P6
+ *     0-11: Count of events needing PMC1..PMC6
+ */
+
+static const int grsel_shift[8] = {
+	MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
+	MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
+	MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
+};
+
+/* Masks and values for using events from the various units */
+static u64 unit_cons[PM_LASTUNIT+1][2] = {
+	[PM_FPU] =   { 0xc0002000000000ull, 0x00001000000000ull },
+	[PM_ISU0] =  { 0x00002000000000ull, 0x00000800000000ull },
+	[PM_ISU1] =  { 0xc0002000000000ull, 0xc0001000000000ull },
+	[PM_IFU] =   { 0xc0002000000000ull, 0x80001000000000ull },
+	[PM_IDU] =   { 0x30002000000000ull, 0x00000400000000ull },
+	[PM_GRS] =   { 0x30002000000000ull, 0x30000400000000ull },
+};
+
+static int power5_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, unit, sh;
+	int bit, fmask;
+	u64 mask = 0, value = 0;
+	int grp = -1;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 6)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+		if (pmc <= 4)
+			grp = (pmc - 1) >> 1;
+		else if (event != 0x500009 && event != 0x600005)
+			return -1;
+	}
+	if (event & PM_BUSEVENT_MSK) {
+		unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+		if (unit > PM_LASTUNIT)
+			return -1;
+		if (unit == PM_ISU0_ALT)
+			unit = PM_ISU0;
+		mask |= unit_cons[unit][0];
+		value |= unit_cons[unit][1];
+		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+		if (byte >= 4) {
+			if (unit != PM_LSU1)
+				return -1;
+			/* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
+			++unit;
+			byte &= 3;
+		}
+		if (unit == PM_GRS) {
+			bit = event & 7;
+			fmask = (bit == 6)? 7: 3;
+			sh = grsel_shift[bit];
+			mask |= (u64)fmask << sh;
+			value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
+		}
+		/*
+		 * Bus events on bytes 0 and 2 can be counted
+		 * on PMC1/2; bytes 1 and 3 on PMC3/4.
+		 */
+		if (!pmc)
+			grp = byte & 1;
+		/* Set byte lane select field */
+		mask  |= 0xfULL << (24 - 4 * byte);
+		value |= (u64)unit << (24 - 4 * byte);
+	}
+	if (grp == 0) {
+		/* increment PMC1/2 field */
+		mask  |= 0x200000000ull;
+		value |= 0x080000000ull;
+	} else if (grp == 1) {
+		/* increment PMC3/4 field */
+		mask  |= 0x40000000ull;
+		value |= 0x10000000ull;
+	}
+	if (pmc < 5) {
+		/* need a counter from PMC1-4 set */
+		mask  |= 0x8000000000000ull;
+		value |= 0x1000000000000ull;
+	}
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+#define MAX_ALT	3	/* at most 3 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+	{ 0x120e4,  0x400002 },			/* PM_GRP_DISP_REJECT */
+	{ 0x410c7,  0x441084 },			/* PM_THRD_L2MISS_BOTH_CYC */
+	{ 0x100005, 0x600005 },			/* PM_RUN_CYC */
+	{ 0x100009, 0x200009, 0x500009 },	/* PM_INST_CMPL */
+	{ 0x300009, 0x400009 },			/* PM_INST_DISP */
+};
+
+/*
+ * Scan the alternatives table for a match and return the
+ * index into the alternatives table if found, else -1.
+ */
+static int find_alternative(unsigned int event)
+{
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+		if (event < event_alternatives[i][0])
+			break;
+		for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
+			if (event == event_alternatives[i][j])
+				return i;
+	}
+	return -1;
+}
+
+static const unsigned char bytedecode_alternatives[4][4] = {
+	/* PMC 1 */	{ 0x21, 0x23, 0x25, 0x27 },
+	/* PMC 2 */	{ 0x07, 0x17, 0x0e, 0x1e },
+	/* PMC 3 */	{ 0x20, 0x22, 0x24, 0x26 },
+	/* PMC 4 */	{ 0x07, 0x17, 0x0e, 0x1e }
+};
+
+/*
+ * Some direct events for decodes of event bus byte 3 have alternative
+ * PMCSEL values on other counters.  This returns the alternative
+ * event code for those that do, or -1 otherwise.
+ */
+static int find_alternative_bdecode(unsigned int event)
+{
+	int pmc, altpmc, pp, j;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc == 0 || pmc > 4)
+		return -1;
+	altpmc = 5 - pmc;	/* 1 <-> 4, 2 <-> 3 */
+	pp = event & PM_PMCSEL_MSK;
+	for (j = 0; j < 4; ++j) {
+		if (bytedecode_alternatives[pmc - 1][j] == pp) {
+			return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
+				(altpmc << PM_PMC_SH) |
+				bytedecode_alternatives[altpmc - 1][j];
+		}
+	}
+	return -1;
+}
+
+static int power5_get_alternatives(unsigned int event, unsigned int alt[])
+{
+	int i, j, ae, nalt = 1;
+
+	alt[0] = event;
+	nalt = 1;
+	i = find_alternative(event);
+	if (i >= 0) {
+		for (j = 0; j < MAX_ALT; ++j) {
+			ae = event_alternatives[i][j];
+			if (ae && ae != event)
+				alt[nalt++] = ae;
+		}
+	} else {
+		ae = find_alternative_bdecode(event);
+		if (ae > 0)
+			alt[nalt++] = ae;
+	}
+	return nalt;
+}
+
+static int power5_compute_mmcr(unsigned int event[], int n_ev,
+			       unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr1 = 0;
+	unsigned int pmc, unit, byte, psel;
+	unsigned int ttm, grp;
+	int i, isbus, bit, grsel;
+	unsigned int pmc_inuse = 0;
+	unsigned int pmc_grp_use[2];
+	unsigned char busbyte[4];
+	unsigned char unituse[16];
+	int ttmuse;
+
+	if (n_ev > 6)
+		return -1;
+
+	/* First pass to count resource use */
+	pmc_grp_use[0] = pmc_grp_use[1] = 0;
+	memset(busbyte, 0, sizeof(busbyte));
+	memset(unituse, 0, sizeof(unituse));
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc > 6)
+				return -1;
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+			/* count 1/2 vs 3/4 use */
+			if (pmc <= 4)
+				++pmc_grp_use[(pmc - 1) >> 1];
+		}
+		if (event[i] & PM_BUSEVENT_MSK) {
+			unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+			byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+			if (unit > PM_LASTUNIT)
+				return -1;
+			if (unit == PM_ISU0_ALT)
+				unit = PM_ISU0;
+			if (byte >= 4) {
+				if (unit != PM_LSU1)
+					return -1;
+				++unit;
+				byte &= 3;
+			}
+			if (!pmc)
+				++pmc_grp_use[byte & 1];
+			if (busbyte[byte] && busbyte[byte] != unit)
+				return -1;
+			busbyte[byte] = unit;
+			unituse[unit] = 1;
+		}
+	}
+	if (pmc_grp_use[0] > 2 || pmc_grp_use[1] > 2)
+		return -1;
+
+	/*
+	 * Assign resources and set multiplexer selects.
+	 *
+	 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
+	 * choice we have to deal with.
+	 */
+	if (unituse[PM_ISU0] &
+	    (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
+		unituse[PM_ISU0_ALT] = 1;	/* move ISU to TTM1 */
+		unituse[PM_ISU0] = 0;
+	}
+	/* Set TTM[01]SEL fields. */
+	ttmuse = 0;
+	for (i = PM_FPU; i <= PM_ISU1; ++i) {
+		if (!unituse[i])
+			continue;
+		if (ttmuse++)
+			return -1;
+		mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
+	}
+	ttmuse = 0;
+	for (; i <= PM_GRS; ++i) {
+		if (!unituse[i])
+			continue;
+		if (ttmuse++)
+			return -1;
+		mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
+	}
+	if (ttmuse > 1)
+		return -1;
+
+	/* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
+	for (byte = 0; byte < 4; ++byte) {
+		unit = busbyte[byte];
+		if (!unit)
+			continue;
+		if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
+			/* get ISU0 through TTM1 rather than TTM0 */
+			unit = PM_ISU0_ALT;
+		} else if (unit == PM_LSU1 + 1) {
+			/* select lower word of LSU1 for this byte */
+			mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+		}
+		ttm = unit >> 2;
+		mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+	}
+
+	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		isbus = event[i] & PM_BUSEVENT_MSK;
+		if (!pmc) {
+			/* Bus event or any-PMC direct event */
+			for (pmc = 0; pmc < 4; ++pmc) {
+				if (pmc_inuse & (1 << pmc))
+					continue;
+				grp = (pmc >> 1) & 1;
+				if (isbus) {
+					if (grp == (byte & 1))
+						break;
+				} else if (pmc_grp_use[grp] < 2) {
+					++pmc_grp_use[grp];
+					break;
+				}
+			}
+			pmc_inuse |= 1 << pmc;
+		} else if (pmc <= 4) {
+			/* Direct event */
+			--pmc;
+			if ((psel == 8 || psel == 0x10) && isbus && (byte & 2))
+				/* add events on higher-numbered bus */
+				mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
+		} else {
+			/* Instructions or run cycles on PMC5/6 */
+			--pmc;
+		}
+		if (isbus && unit == PM_GRS) {
+			bit = psel & 7;
+			grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
+			mmcr1 |= (u64)grsel << grsel_shift[bit];
+		}
+		if (pmc <= 3)
+			mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
+		hwc[i] = pmc;
+	}
+
+	/* Return MMCRx values */
+	mmcr[0] = 0;
+	if (pmc_inuse & 1)
+		mmcr[0] = MMCR0_PMC1CE;
+	if (pmc_inuse & 0x3e)
+		mmcr[0] |= MMCR0_PMCjCE;
+	mmcr[1] = mmcr1;
+	mmcr[2] = 0;
+	return 0;
+}
+
+static void power5_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	if (pmc <= 3)
+		mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power5_generic_events[] = {
+	[PERF_COUNT_CPU_CYCLES] = 0xf,
+	[PERF_COUNT_INSTRUCTIONS] = 0x100009,
+	[PERF_COUNT_CACHE_REFERENCES] = 0x4c1090,	/* LD_REF_L1 */
+	[PERF_COUNT_CACHE_MISSES] = 0x3c1088,		/* LD_MISS_L1 */
+	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4,	/* BR_ISSUED */ 
+	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
+};
+
+struct power_pmu power5_pmu = {
+	.n_counter = 6,
+	.max_alternatives = MAX_ALT,
+	.add_fields = 0x7000090000555ull,
+	.test_adder = 0x3000490000000ull,
+	.compute_mmcr = power5_compute_mmcr,
+	.get_constraint = power5_get_constraint,
+	.get_alternatives = power5_get_alternatives,
+	.disable_pmc = power5_disable_pmc,
+	.n_generic = ARRAY_SIZE(power5_generic_events),
+	.generic_events = power5_generic_events,
+};
-- 
cgit v1.2.3


From f3dfd2656deb81a0addee4f4ceff66b50a387388 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 26 Feb 2009 22:43:46 +1100
Subject: perfcounters: fix a few minor cleanliness issues

This fixes three issues noticed by Arnd Bergmann:

- Add #ifdef __KERNEL__ and move some things around in perf_counter.h
  to make sure only the bits that userspace needs are exported to
  userspace.

- Use __u64, __s64, __u32 types in the structs exported to userspace
  rather than u64, s64, u32.

- Make the sys_perf_counter_open syscall available to the SPUs on
  Cell platforms.

And one issue that I noticed in looking at the code again:

- Wrap the perf_counter_open syscall with SYSCALL_DEFINE4 so we get
  the proper handling of int arguments on ppc64 (and some other 64-bit
  architectures).

Reported-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/systbl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index 4c8095f6bec..d312eec8abb 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -322,4 +322,4 @@ SYSCALL_SPU(epoll_create1)
 SYSCALL_SPU(dup3)
 SYSCALL_SPU(pipe2)
 SYSCALL(inotify_init1)
-SYSCALL(perf_counter_open)
+SYSCALL_SPU(perf_counter_open)
-- 
cgit v1.2.3


From b56a3802dc6df29aa27d2c12edf420258091ad66 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Fri, 27 Feb 2009 18:09:09 +0530
Subject: x86: prepare perf_counter to add more cpus

Introduced  struct pmc_x86_ops to add more cpus.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 106 +++++++++++++++++++++++++++----------
 1 file changed, 78 insertions(+), 28 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 383d4c6423a..a3c88529bb7 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -3,6 +3,7 @@
  *
  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
+ *  Copyright(C) 2009 Jaswinder Singh Rajput
  *
  *  For licencing details see kernel-base/COPYING
  */
@@ -38,10 +39,24 @@ struct cpu_hw_counters {
 };
 
 /*
- * Intel PerfMon v3. Used on Core2 and later.
+ * struct pmc_x86_ops - performance counter x86 ops
  */
+struct pmc_x86_ops {
+	u64 (*save_disable_all)		(void);
+	void (*restore_all)		(u64 ctrl);
+	unsigned eventsel;
+	unsigned perfctr;
+	int (*event_map)		(int event);
+	int max_events;
+};
+
+static struct pmc_x86_ops *pmc_ops;
+
 static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
 
+/*
+ * Intel PerfMon v3. Used on Core2 and later.
+ */
 static const int intel_perfmon_event_map[] =
 {
   [PERF_COUNT_CPU_CYCLES]		= 0x003c,
@@ -53,7 +68,10 @@ static const int intel_perfmon_event_map[] =
   [PERF_COUNT_BUS_CYCLES]		= 0x013c,
 };
 
-static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
+static int pmc_intel_event_map(int event)
+{
+	return intel_perfmon_event_map[event];
+}
 
 /*
  * Propagate counter elapsed time into the generic counter.
@@ -144,38 +162,48 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (hw_event->raw) {
 		hwc->config |= hw_event->type;
 	} else {
-		if (hw_event->type >= max_intel_perfmon_events)
+		if (hw_event->type >= pmc_ops->max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= intel_perfmon_event_map[hw_event->type];
+		hwc->config |= pmc_ops->event_map(hw_event->type);
 	}
 	counter->wakeup_pending = 0;
 
 	return 0;
 }
 
-u64 hw_perf_save_disable(void)
+static u64 pmc_intel_save_disable_all(void)
 {
 	u64 ctrl;
 
-	if (unlikely(!perf_counters_initialized))
-		return 0;
-
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 
 	return ctrl;
 }
+
+u64 hw_perf_save_disable(void)
+{
+	if (unlikely(!perf_counters_initialized))
+		return 0;
+
+	return pmc_ops->save_disable_all();
+}
 EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
+static void pmc_intel_restore_all(u64 ctrl)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+}
+
 void hw_perf_restore(u64 ctrl)
 {
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+	pmc_ops->restore_all(ctrl);
 }
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
@@ -291,11 +319,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 
 	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
 
-	if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS]))
+	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS)))
 		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
-	if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES]))
+	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES)))
 		return X86_PMC_IDX_FIXED_CPU_CYCLES;
-	if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES]))
+	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES)))
 		return X86_PMC_IDX_FIXED_BUS_CYCLES;
 
 	return -1;
@@ -339,8 +367,8 @@ try_generic:
 			set_bit(idx, cpuc->used);
 			hwc->idx = idx;
 		}
-		hwc->config_base  = MSR_ARCH_PERFMON_EVENTSEL0;
-		hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
+		hwc->config_base  = pmc_ops->eventsel;
+		hwc->counter_base = pmc_ops->perfctr;
 	}
 
 	perf_counters_lapic_init(hwc->nmi);
@@ -386,8 +414,8 @@ void perf_counter_print_debug(void)
 	printk(KERN_INFO "CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-		rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
-		rdmsrl(MSR_ARCH_PERFMON_PERFCTR0  + idx, pmc_count);
+		rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl);
+		rdmsrl(pmc_ops->perfctr  + idx, pmc_count);
 
 		prev_left = per_cpu(prev_left[idx], cpu);
 
@@ -655,29 +683,56 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 	.priority		= 1
 };
 
-void __init init_hw_perf_counters(void)
+static struct pmc_x86_ops pmc_intel_ops = {
+	.save_disable_all	= pmc_intel_save_disable_all,
+	.restore_all		= pmc_intel_restore_all,
+	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
+	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
+	.event_map		= pmc_intel_event_map,
+	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
+};
+
+static struct pmc_x86_ops *pmc_intel_init(void)
 {
 	union cpuid10_eax eax;
 	unsigned int ebx;
 	unsigned int unused;
 	union cpuid10_edx edx;
 
-	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-		return;
-
 	/*
 	 * Check whether the Architectural PerfMon supports
 	 * Branch Misses Retired Event or not.
 	 */
 	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
 	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
-		return;
+		return NULL;
 
 	printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
-
 	printk(KERN_INFO "... version:         %d\n", eax.split.version_id);
-	printk(KERN_INFO "... num counters:    %d\n", eax.split.num_counters);
+	printk(KERN_INFO "... bit width:       %d\n", eax.split.bit_width);
+	printk(KERN_INFO "... mask length:     %d\n", eax.split.mask_length);
+
 	nr_counters_generic = eax.split.num_counters;
+	nr_counters_fixed = edx.split.num_counters_fixed;
+	counter_value_mask = (1ULL << eax.split.bit_width) - 1;
+
+	return &pmc_intel_ops;
+}
+
+void __init init_hw_perf_counters(void)
+{
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+		return;
+
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_INTEL:
+		pmc_ops = pmc_intel_init();
+		break;
+	}
+	if (!pmc_ops)
+		return;
+
+	printk(KERN_INFO "... num counters:    %d\n", nr_counters_generic);
 	if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
 		nr_counters_generic = X86_PMC_MAX_GENERIC;
 		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
@@ -686,13 +741,8 @@ void __init init_hw_perf_counters(void)
 	perf_counter_mask = (1 << nr_counters_generic) - 1;
 	perf_max_counters = nr_counters_generic;
 
-	printk(KERN_INFO "... bit width:       %d\n", eax.split.bit_width);
-	counter_value_mask = (1ULL << eax.split.bit_width) - 1;
 	printk(KERN_INFO "... value mask:      %016Lx\n", counter_value_mask);
 
-	printk(KERN_INFO "... mask length:     %d\n", eax.split.mask_length);
-
-	nr_counters_fixed = edx.split.num_counters_fixed;
 	if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
 		nr_counters_fixed = X86_PMC_MAX_FIXED;
 		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
-- 
cgit v1.2.3


From f87ad35d37fa543925210550f7db20a54c83ed70 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Fri, 27 Feb 2009 20:15:14 +0530
Subject: x86: AMD Support for perf_counter

Supported basic performance counter for AMD K7 and later:

$ perfstat -e 0,1,2,3,4,5,-1,-2,-3,-4,-5 ls > /dev/null

 Performance counter stats for 'ls':

      12.298610  task clock ticks     (msecs)

        3298477  CPU cycles           (events)
        1406354  instructions         (events)
         749035  cache references     (events)
          16939  cache misses         (events)
         100589  branches             (events)
          11159  branch misses        (events)
       7.627540  cpu clock ticks      (msecs)
      12.298610  task clock ticks     (msecs)
            500  pagefaults           (events)
              6  context switches     (events)
              3  CPU migrations       (events)

 Wall-clock time elapsed:     8.672290 msecs

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/amd.c          |  4 ++
 arch/x86/kernel/cpu/perf_counter.c | 83 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 85 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 25423a5b80e..edcde52bd17 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -368,6 +368,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	if (c->x86 >= 6)
 		set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
 
+	/* Enable Performance counter for K7 and later */
+	if (c->x86 > 6 && c->x86 <= 0x11)
+		set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+
 	if (!c->x86_model_id[0]) {
 		switch (c->x86) {
 		case 0xf:
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a3c88529bb7..266618aa1a0 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -73,6 +73,24 @@ static int pmc_intel_event_map(int event)
 	return intel_perfmon_event_map[event];
 }
 
+/*
+ * AMD Performance Monitor K7 and later.
+ */
+static const int amd_perfmon_event_map[] =
+{
+  [PERF_COUNT_CPU_CYCLES]		= 0x0076,
+  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_CACHE_REFERENCES]		= 0x0080,
+  [PERF_COUNT_CACHE_MISSES]		= 0x0081,
+  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
+};
+
+static int pmc_amd_event_map(int event)
+{
+	return amd_perfmon_event_map[event];
+}
+
 /*
  * Propagate counter elapsed time into the generic counter.
  * Can only be executed on the CPU where the counter is active.
@@ -151,8 +169,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * so we install an artificial 1<<31 period regardless of
 	 * the generic counter period:
 	 */
-	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
-		hwc->irq_period = 0x7FFFFFFF;
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+		if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
+			hwc->irq_period = 0x7FFFFFFF;
 
 	atomic64_set(&hwc->period_left, hwc->irq_period);
 
@@ -184,6 +203,22 @@ static u64 pmc_intel_save_disable_all(void)
 	return ctrl;
 }
 
+static u64 pmc_amd_save_disable_all(void)
+{
+	int idx;
+	u64 val, ctrl = 0;
+
+	for (idx = 0; idx < nr_counters_generic; idx++) {
+		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
+			ctrl |= (1 << idx);
+		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+	}
+
+	return ctrl;
+}
+
 u64 hw_perf_save_disable(void)
 {
 	if (unlikely(!perf_counters_initialized))
@@ -198,6 +233,20 @@ static void pmc_intel_restore_all(u64 ctrl)
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 }
 
+static void pmc_amd_restore_all(u64 ctrl)
+{
+	u64 val;
+	int idx;
+
+	for (idx = 0; idx < nr_counters_generic; idx++) {
+		if (ctrl & (1 << idx)) {
+			rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+			val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+			wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+		}
+	}
+}
+
 void hw_perf_restore(u64 ctrl)
 {
 	if (unlikely(!perf_counters_initialized))
@@ -314,6 +363,9 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 {
 	unsigned int event;
 
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		return -1;
+
 	if (unlikely(hwc->nmi))
 		return -1;
 
@@ -401,6 +453,7 @@ void perf_counter_print_debug(void)
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 	rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
@@ -411,6 +464,7 @@ void perf_counter_print_debug(void)
 	printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
 	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
 	printk(KERN_INFO "CPU#%d: fixed:      %016llx\n", cpu, fixed);
+	}
 	printk(KERN_INFO "CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
@@ -588,6 +642,9 @@ void perf_counter_unthrottle(void)
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 		return;
 
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		return;
+
 	if (unlikely(!perf_counters_initialized))
 		return;
 
@@ -692,6 +749,15 @@ static struct pmc_x86_ops pmc_intel_ops = {
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 };
 
+static struct pmc_x86_ops pmc_amd_ops = {
+	.save_disable_all	= pmc_amd_save_disable_all,
+	.restore_all		= pmc_amd_restore_all,
+	.eventsel		= MSR_K7_EVNTSEL0,
+	.perfctr		= MSR_K7_PERFCTR0,
+	.event_map		= pmc_amd_event_map,
+	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
+};
+
 static struct pmc_x86_ops *pmc_intel_init(void)
 {
 	union cpuid10_eax eax;
@@ -719,6 +785,16 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 	return &pmc_intel_ops;
 }
 
+static struct pmc_x86_ops *pmc_amd_init(void)
+{
+	nr_counters_generic = 4;
+	nr_counters_fixed = 0;
+
+	printk(KERN_INFO "AMD Performance Monitoring support detected.\n");
+
+	return &pmc_amd_ops;
+}
+
 void __init init_hw_perf_counters(void)
 {
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
@@ -728,6 +804,9 @@ void __init init_hw_perf_counters(void)
 	case X86_VENDOR_INTEL:
 		pmc_ops = pmc_intel_init();
 		break;
+	case X86_VENDOR_AMD:
+		pmc_ops = pmc_amd_init();
+		break;
 	}
 	if (!pmc_ops)
 		return;
-- 
cgit v1.2.3


From 169e41eb7f5464c077a7e0e129f025759d04cc54 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sat, 28 Feb 2009 18:37:49 +0530
Subject: x86: decent declarations in perf_counter.c

Impact: cleanup

making decent declrations for struct pmc_x86_ops and
fix checkpatch error:
 ERROR: Macros with complex values should be enclosed in parenthesis

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 266618aa1a0..a1f3646a3e8 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -42,12 +42,12 @@ struct cpu_hw_counters {
  * struct pmc_x86_ops - performance counter x86 ops
  */
 struct pmc_x86_ops {
-	u64 (*save_disable_all)		(void);
-	void (*restore_all)		(u64 ctrl);
-	unsigned eventsel;
-	unsigned perfctr;
-	int (*event_map)		(int event);
-	int max_events;
+	u64		(*save_disable_all)(void);
+	void		(*restore_all)(u64 ctrl);
+	unsigned	eventsel;
+	unsigned	perfctr;
+	int		(*event_map)(int event);
+	int		max_events;
 };
 
 static struct pmc_x86_ops *pmc_ops;
@@ -561,7 +561,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 /*
  * Maximum interrupt frequency of 100KHz per CPU
  */
-#define PERFMON_MAX_INTERRUPTS 100000/HZ
+#define PERFMON_MAX_INTERRUPTS (100000/HZ)
 
 /*
  * This handler is triggered by the local APIC, so the APIC IRQ handling
-- 
cgit v1.2.3


From a1ef58f442542d8b3e3b963339fbc522c36e827c Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sat, 28 Feb 2009 18:45:39 +0530
Subject: x86: use pr_info in perf_counter.c

Impact: cleanup

using pr_info in perf_counter.c fixes various 80 characters warnings and
also indenting for conditional statement

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 48 +++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a1f3646a3e8..3b65f19a668 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -454,18 +454,18 @@ void perf_counter_print_debug(void)
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
-	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
-	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-	rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
-	rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
-
-	printk(KERN_INFO "\n");
-	printk(KERN_INFO "CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
-	printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
-	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
-	printk(KERN_INFO "CPU#%d: fixed:      %016llx\n", cpu, fixed);
+		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+
+		pr_info("\n");
+		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
+		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
+		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
+		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
 	}
-	printk(KERN_INFO "CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
+	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
 		rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl);
@@ -473,17 +473,17 @@ void perf_counter_print_debug(void)
 
 		prev_left = per_cpu(prev_left[idx], cpu);
 
-		printk(KERN_INFO "CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
+		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
 			cpu, idx, pmc_ctrl);
-		printk(KERN_INFO "CPU#%d:   gen-PMC%d count: %016llx\n",
+		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
 			cpu, idx, pmc_count);
-		printk(KERN_INFO "CPU#%d:   gen-PMC%d left:  %016llx\n",
+		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
 			cpu, idx, prev_left);
 	}
 	for (idx = 0; idx < nr_counters_fixed; idx++) {
 		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
 
-		printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n",
+		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
 			cpu, idx, pmc_count);
 	}
 	local_irq_enable();
@@ -773,10 +773,10 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
 		return NULL;
 
-	printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
-	printk(KERN_INFO "... version:         %d\n", eax.split.version_id);
-	printk(KERN_INFO "... bit width:       %d\n", eax.split.bit_width);
-	printk(KERN_INFO "... mask length:     %d\n", eax.split.mask_length);
+	pr_info("Intel Performance Monitoring support detected.\n");
+	pr_info("... version:         %d\n", eax.split.version_id);
+	pr_info("... bit width:       %d\n", eax.split.bit_width);
+	pr_info("... mask length:     %d\n", eax.split.mask_length);
 
 	nr_counters_generic = eax.split.num_counters;
 	nr_counters_fixed = edx.split.num_counters_fixed;
@@ -790,7 +790,7 @@ static struct pmc_x86_ops *pmc_amd_init(void)
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
 
-	printk(KERN_INFO "AMD Performance Monitoring support detected.\n");
+	pr_info("AMD Performance Monitoring support detected.\n");
 
 	return &pmc_amd_ops;
 }
@@ -811,7 +811,7 @@ void __init init_hw_perf_counters(void)
 	if (!pmc_ops)
 		return;
 
-	printk(KERN_INFO "... num counters:    %d\n", nr_counters_generic);
+	pr_info("... num counters:    %d\n", nr_counters_generic);
 	if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
 		nr_counters_generic = X86_PMC_MAX_GENERIC;
 		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
@@ -820,18 +820,18 @@ void __init init_hw_perf_counters(void)
 	perf_counter_mask = (1 << nr_counters_generic) - 1;
 	perf_max_counters = nr_counters_generic;
 
-	printk(KERN_INFO "... value mask:      %016Lx\n", counter_value_mask);
+	pr_info("... value mask:      %016Lx\n", counter_value_mask);
 
 	if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
 		nr_counters_fixed = X86_PMC_MAX_FIXED;
 		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
 			nr_counters_fixed, X86_PMC_MAX_FIXED);
 	}
-	printk(KERN_INFO "... fixed counters:  %d\n", nr_counters_fixed);
+	pr_info("... fixed counters:  %d\n", nr_counters_fixed);
 
 	perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
-	printk(KERN_INFO "... counter mask:    %016Lx\n", perf_counter_mask);
+	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
 	perf_counters_initialized = true;
 
 	perf_counters_lapic_init(0);
-- 
cgit v1.2.3


From b0f3f28e0f14eb335f67bfaae33ce8b8d74fd58b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 5 Mar 2009 18:08:27 +0100
Subject: perfcounters: IRQ and NMI support on AMD CPUs

The below completes the K7+ performance counter support:

 - IRQ support
 - NMI support

KernelTop output works now as well.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jaswinder Singh Rajput <jaswinder@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1236273633.5187.286.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 272 +++++++++++++++++++++++++++++++------
 1 file changed, 228 insertions(+), 44 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3b65f19a668..6ebe9abf6ae 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -28,6 +28,7 @@ static bool perf_counters_initialized __read_mostly;
 static int nr_counters_generic __read_mostly;
 static u64 perf_counter_mask __read_mostly;
 static u64 counter_value_mask __read_mostly;
+static int counter_value_bits __read_mostly;
 
 static int nr_counters_fixed __read_mostly;
 
@@ -35,7 +36,9 @@ struct cpu_hw_counters {
 	struct perf_counter	*counters[X86_PMC_IDX_MAX];
 	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		interrupts;
-	u64			global_enable;
+	u64			throttle_ctrl;
+	u64			active_mask;
+	int			enabled;
 };
 
 /*
@@ -43,21 +46,28 @@ struct cpu_hw_counters {
  */
 struct pmc_x86_ops {
 	u64		(*save_disable_all)(void);
-	void		(*restore_all)(u64 ctrl);
+	void		(*restore_all)(u64);
+	u64		(*get_status)(u64);
+	void		(*ack_status)(u64);
+	void		(*enable)(int, u64);
+	void		(*disable)(int, u64);
 	unsigned	eventsel;
 	unsigned	perfctr;
-	int		(*event_map)(int event);
+	u64		(*event_map)(int);
+	u64		(*raw_event)(u64);
 	int		max_events;
 };
 
 static struct pmc_x86_ops *pmc_ops;
 
-static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
+static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
+	.enabled = 1,
+};
 
 /*
  * Intel PerfMon v3. Used on Core2 and later.
  */
-static const int intel_perfmon_event_map[] =
+static const u64 intel_perfmon_event_map[] =
 {
   [PERF_COUNT_CPU_CYCLES]		= 0x003c,
   [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
@@ -68,15 +78,29 @@ static const int intel_perfmon_event_map[] =
   [PERF_COUNT_BUS_CYCLES]		= 0x013c,
 };
 
-static int pmc_intel_event_map(int event)
+static u64 pmc_intel_event_map(int event)
 {
 	return intel_perfmon_event_map[event];
 }
 
+static u64 pmc_intel_raw_event(u64 event)
+{
+#define CORE_EVNTSEL_EVENT_MASK		0x000000FF
+#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00
+#define CORE_EVNTSEL_COUNTER_MASK	0xFF000000
+
+#define CORE_EVNTSEL_MASK 		\
+	(CORE_EVNTSEL_EVENT_MASK |	\
+	 CORE_EVNTSEL_UNIT_MASK  |	\
+	 CORE_EVNTSEL_COUNTER_MASK)
+
+	return event & CORE_EVNTSEL_MASK;
+}
+
 /*
  * AMD Performance Monitor K7 and later.
  */
-static const int amd_perfmon_event_map[] =
+static const u64 amd_perfmon_event_map[] =
 {
   [PERF_COUNT_CPU_CYCLES]		= 0x0076,
   [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
@@ -86,11 +110,25 @@ static const int amd_perfmon_event_map[] =
   [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
 };
 
-static int pmc_amd_event_map(int event)
+static u64 pmc_amd_event_map(int event)
 {
 	return amd_perfmon_event_map[event];
 }
 
+static u64 pmc_amd_raw_event(u64 event)
+{
+#define K7_EVNTSEL_EVENT_MASK	0x7000000FF
+#define K7_EVNTSEL_UNIT_MASK	0x00000FF00
+#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000
+
+#define K7_EVNTSEL_MASK			\
+	(K7_EVNTSEL_EVENT_MASK |	\
+	 K7_EVNTSEL_UNIT_MASK  |	\
+	 K7_EVNTSEL_COUNTER_MASK)
+
+	return event & K7_EVNTSEL_MASK;
+}
+
 /*
  * Propagate counter elapsed time into the generic counter.
  * Can only be executed on the CPU where the counter is active.
@@ -179,7 +217,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * Raw event type provide the config in the event structure
 	 */
 	if (hw_event->raw) {
-		hwc->config |= hw_event->type;
+		hwc->config |= pmc_ops->raw_event(hw_event->type);
 	} else {
 		if (hw_event->type >= pmc_ops->max_events)
 			return -EINVAL;
@@ -205,18 +243,24 @@ static u64 pmc_intel_save_disable_all(void)
 
 static u64 pmc_amd_save_disable_all(void)
 {
-	int idx;
-	u64 val, ctrl = 0;
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	int enabled, idx;
+
+	enabled = cpuc->enabled;
+	cpuc->enabled = 0;
+	barrier();
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
+		u64 val;
+
 		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
-		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
-			ctrl |= (1 << idx);
-		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
-		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) {
+			val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+			wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+		}
 	}
 
-	return ctrl;
+	return enabled;
 }
 
 u64 hw_perf_save_disable(void)
@@ -226,6 +270,9 @@ u64 hw_perf_save_disable(void)
 
 	return pmc_ops->save_disable_all();
 }
+/*
+ * Exported because of ACPI idle
+ */
 EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
 static void pmc_intel_restore_all(u64 ctrl)
@@ -235,11 +282,18 @@ static void pmc_intel_restore_all(u64 ctrl)
 
 static void pmc_amd_restore_all(u64 ctrl)
 {
-	u64 val;
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	int idx;
 
+	cpuc->enabled = ctrl;
+	barrier();
+	if (!ctrl)
+		return;
+
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-		if (ctrl & (1 << idx)) {
+		if (test_bit(idx, (unsigned long *)&cpuc->active_mask)) {
+			u64 val;
+
 			rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
 			val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
 			wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
@@ -254,8 +308,112 @@ void hw_perf_restore(u64 ctrl)
 
 	pmc_ops->restore_all(ctrl);
 }
+/*
+ * Exported because of ACPI idle
+ */
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
+static u64 pmc_intel_get_status(u64 mask)
+{
+	u64 status;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+	return status;
+}
+
+static u64 pmc_amd_get_status(u64 mask)
+{
+	u64 status = 0;
+	int idx;
+
+	for (idx = 0; idx < nr_counters_generic; idx++) {
+		s64 val;
+
+		if (!(mask & (1 << idx)))
+			continue;
+
+		rdmsrl(MSR_K7_PERFCTR0 + idx, val);
+		val <<= (64 - counter_value_bits);
+		if (val >= 0)
+			status |= (1 << idx);
+	}
+
+	return status;
+}
+
+static u64 hw_perf_get_status(u64 mask)
+{
+	if (unlikely(!perf_counters_initialized))
+		return 0;
+
+	return pmc_ops->get_status(mask);
+}
+
+static void pmc_intel_ack_status(u64 ack)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static void pmc_amd_ack_status(u64 ack)
+{
+}
+
+static void hw_perf_ack_status(u64 ack)
+{
+	if (unlikely(!perf_counters_initialized))
+		return;
+
+	pmc_ops->ack_status(ack);
+}
+
+static void pmc_intel_enable(int idx, u64 config)
+{
+	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
+			config | ARCH_PERFMON_EVENTSEL0_ENABLE);
+}
+
+static void pmc_amd_enable(int idx, u64 config)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	set_bit(idx, (unsigned long *)&cpuc->active_mask);
+	if (cpuc->enabled)
+		config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+
+	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
+}
+
+static void hw_perf_enable(int idx, u64 config)
+{
+	if (unlikely(!perf_counters_initialized))
+		return;
+
+	pmc_ops->enable(idx, config);
+}
+
+static void pmc_intel_disable(int idx, u64 config)
+{
+	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config);
+}
+
+static void pmc_amd_disable(int idx, u64 config)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	clear_bit(idx, (unsigned long *)&cpuc->active_mask);
+	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
+
+}
+
+static void hw_perf_disable(int idx, u64 config)
+{
+	if (unlikely(!perf_counters_initialized))
+		return;
+
+	pmc_ops->disable(idx, config);
+}
+
 static inline void
 __pmc_fixed_disable(struct perf_counter *counter,
 		    struct hw_perf_counter *hwc, unsigned int __idx)
@@ -278,7 +436,7 @@ __pmc_generic_disable(struct perf_counter *counter,
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
 		__pmc_fixed_disable(counter, hwc, idx);
 	else
-		wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
+		hw_perf_disable(idx, hwc->config);
 }
 
 static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
@@ -354,8 +512,7 @@ __pmc_generic_enable(struct perf_counter *counter,
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
 		__pmc_fixed_enable(counter, hwc, idx);
 	else
-		wrmsr(hwc->config_base + idx,
-		      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+		hw_perf_enable(idx, hwc->config);
 }
 
 static int
@@ -567,22 +724,20 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
  */
-static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
+static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 {
 	int bit, cpu = smp_processor_id();
 	u64 ack, status;
 	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
+	int ret = 0;
 
-	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
-
-	/* Disable counters globally */
-	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-	ack_APIC_irq();
+	cpuc->throttle_ctrl = hw_perf_save_disable();
 
-	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+	status = hw_perf_get_status(cpuc->throttle_ctrl);
 	if (!status)
 		goto out;
 
+	ret = 1;
 again:
 	inc_irq_stat(apic_perf_irqs);
 	ack = status;
@@ -618,12 +773,12 @@ again:
 		}
 	}
 
-	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+	hw_perf_ack_status(ack);
 
 	/*
 	 * Repeat if there is more work to be done:
 	 */
-	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+	status = hw_perf_get_status(cpuc->throttle_ctrl);
 	if (status)
 		goto again;
 out:
@@ -631,32 +786,27 @@ out:
 	 * Restore - do not reenable when global enable is off or throttled:
 	 */
 	if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
-		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
+		hw_perf_restore(cpuc->throttle_ctrl);
+
+	return ret;
 }
 
 void perf_counter_unthrottle(void)
 {
 	struct cpu_hw_counters *cpuc;
-	u64 global_enable;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 		return;
 
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
-		return;
-
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	cpuc = &per_cpu(cpu_hw_counters, smp_processor_id());
+	cpuc = &__get_cpu_var(cpu_hw_counters);
 	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
 		if (printk_ratelimit())
 			printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
-		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
+		hw_perf_restore(cpuc->throttle_ctrl);
 	}
-	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable);
-	if (unlikely(cpuc->global_enable && !global_enable))
-		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
 	cpuc->interrupts = 0;
 }
 
@@ -664,8 +814,8 @@ void smp_perf_counter_interrupt(struct pt_regs *regs)
 {
 	irq_enter();
 	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
+	ack_APIC_irq();
 	__smp_perf_counter_interrupt(regs, 0);
-
 	irq_exit();
 }
 
@@ -722,16 +872,23 @@ perf_counter_nmi_handler(struct notifier_block *self,
 {
 	struct die_args *args = __args;
 	struct pt_regs *regs;
+	int ret;
+
+	switch (cmd) {
+	case DIE_NMI:
+	case DIE_NMI_IPI:
+		break;
 
-	if (likely(cmd != DIE_NMI_IPI))
+	default:
 		return NOTIFY_DONE;
+	}
 
 	regs = args->regs;
 
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	__smp_perf_counter_interrupt(regs, 1);
+	ret = __smp_perf_counter_interrupt(regs, 1);
 
-	return NOTIFY_STOP;
+	return ret ? NOTIFY_STOP : NOTIFY_OK;
 }
 
 static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
@@ -743,18 +900,28 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 static struct pmc_x86_ops pmc_intel_ops = {
 	.save_disable_all	= pmc_intel_save_disable_all,
 	.restore_all		= pmc_intel_restore_all,
+	.get_status		= pmc_intel_get_status,
+	.ack_status		= pmc_intel_ack_status,
+	.enable			= pmc_intel_enable,
+	.disable		= pmc_intel_disable,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= pmc_intel_event_map,
+	.raw_event		= pmc_intel_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 };
 
 static struct pmc_x86_ops pmc_amd_ops = {
 	.save_disable_all	= pmc_amd_save_disable_all,
 	.restore_all		= pmc_amd_restore_all,
+	.get_status		= pmc_amd_get_status,
+	.ack_status		= pmc_amd_ack_status,
+	.enable			= pmc_amd_enable,
+	.disable		= pmc_amd_disable,
 	.eventsel		= MSR_K7_EVNTSEL0,
 	.perfctr		= MSR_K7_PERFCTR0,
 	.event_map		= pmc_amd_event_map,
+	.raw_event		= pmc_amd_raw_event,
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
 };
 
@@ -787,8 +954,25 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 
 static struct pmc_x86_ops *pmc_amd_init(void)
 {
+	u64 old;
+	int bits;
+
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
+	counter_value_mask = ~0ULL;
+
+	rdmsrl(MSR_K7_PERFCTR0, old);
+	wrmsrl(MSR_K7_PERFCTR0, counter_value_mask);
+	/*
+	 * read the truncated mask
+	 */
+	rdmsrl(MSR_K7_PERFCTR0, counter_value_mask);
+	wrmsrl(MSR_K7_PERFCTR0, old);
+
+	bits = 32 + fls(counter_value_mask >> 32);
+	if (bits == 32)
+		bits = fls((u32)counter_value_mask);
+	counter_value_bits = bits;
 
 	pr_info("AMD Performance Monitoring support detected.\n");
 
-- 
cgit v1.2.3


From b5e8acf66ff5db707c7e08df49fdf6b415878442 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 5 Mar 2009 20:34:21 +0100
Subject: perfcounters: IRQ and NMI support on AMD CPUs, fix

The BKGD suggests that counter width on AMD CPUs is 48 for all
existing models (it certainly is for mine).

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 6ebe9abf6ae..f5853718d4d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -959,20 +959,8 @@ static struct pmc_x86_ops *pmc_amd_init(void)
 
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
-	counter_value_mask = ~0ULL;
-
-	rdmsrl(MSR_K7_PERFCTR0, old);
-	wrmsrl(MSR_K7_PERFCTR0, counter_value_mask);
-	/*
-	 * read the truncated mask
-	 */
-	rdmsrl(MSR_K7_PERFCTR0, counter_value_mask);
-	wrmsrl(MSR_K7_PERFCTR0, old);
-
-	bits = 32 + fls(counter_value_mask >> 32);
-	if (bits == 32)
-		bits = fls((u32)counter_value_mask);
-	counter_value_bits = bits;
+	counter_value_mask = 0x0000FFFFFFFFFFFFULL;
+	counter_value_bits = 48;
 
 	pr_info("AMD Performance Monitoring support detected.\n");
 
-- 
cgit v1.2.3


From 86028598de16538f02519141756ccf4accfc29a6 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 5 Mar 2009 14:05:57 +1100
Subject: perfcounters/powerpc: fix oops with multiple counters in a group

Impact: fix oops-causing bug

This fixes a bug in the powerpc hw_perf_counter_init where the code
didn't initialize ctrs[n] before passing the ctrs array to check_excludes,
leading to possible oopses and other incorrect behaviour.  This fixes it
by initializing ctrs[n] correctly.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/perf_counter.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 112332d07fc..4fec112386f 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -633,6 +633,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 			return NULL;
 	}
 	events[n] = ev;
+	ctrs[n] = counter;
 	if (check_excludes(ctrs, n, 1))
 		return NULL;
 	if (power_check_constraints(events, n + 1))
-- 
cgit v1.2.3


From aabbaa6036fd847c583f585c6bae82b5a033e6c7 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 6 Mar 2009 16:27:10 +1100
Subject: perfcounters/powerpc: add support for POWER5+ processors

Impact: more hardware support

This adds the back-end for the PMU on the POWER5+ processors (i.e. GS,
including GS DD3 aka POWER5++).  This doesn't use the fixed-function
PMC5 and PMC6 since they don't respect the freeze conditions and don't
generate interrupts, as on POWER6.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/Makefile       |   4 +-
 arch/powerpc/kernel/perf_counter.c |   4 +
 arch/powerpc/kernel/power5+-pmu.c  | 452 +++++++++++++++++++++++++++++++++++++
 3 files changed, 458 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/kernel/power5+-pmu.c

(limited to 'arch')

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index b4c6f466164..49851e0d8fd 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,8 +94,8 @@ obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o power5-pmu.o \
-				   power6-pmu.o
+obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o \
+				   power5-pmu.o power5+-pmu.o power6-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 4fec112386f..162f3981fa2 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -826,6 +826,7 @@ void hw_perf_counter_setup(int cpu)
 
 extern struct power_pmu ppc970_pmu;
 extern struct power_pmu power5_pmu;
+extern struct power_pmu power5p_pmu;
 extern struct power_pmu power6_pmu;
 
 static int init_perf_counters(void)
@@ -848,6 +849,9 @@ static int init_perf_counters(void)
 	case PV_POWER5:
 		ppmu = &power5_pmu;
 		break;
+	case PV_POWER5p:
+		ppmu = &power5p_pmu;
+		break;
 	case 0x3e:
 		ppmu = &power6_pmu;
 		break;
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
new file mode 100644
index 00000000000..cec21ea65b0
--- /dev/null
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -0,0 +1,452 @@
+/*
+ * Performance counter support for POWER5 (not POWER5++) processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3)
+ */
+#define PM_PMC_SH	20	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH	16	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_BYTE_SH	12	/* Byte number of event bus to use */
+#define PM_BYTE_MSK	7
+#define PM_GRS_SH	8	/* Storage subsystem mux select */
+#define PM_GRS_MSK	7
+#define PM_BUSEVENT_MSK	0x80	/* Set if event uses event bus */
+#define PM_PMCSEL_MSK	0x7f
+
+/* Values in PM_UNIT field */
+#define PM_FPU		0
+#define PM_ISU0		1
+#define PM_IFU		2
+#define PM_ISU1		3
+#define PM_IDU		4
+#define PM_ISU0_ALT	6
+#define PM_GRS		7
+#define PM_LSU0		8
+#define PM_LSU1		0xc
+#define PM_LASTUNIT	0xc
+
+/*
+ * Bits in MMCR1 for POWER5+
+ */
+#define MMCR1_TTM0SEL_SH	62
+#define MMCR1_TTM1SEL_SH	60
+#define MMCR1_TTM2SEL_SH	58
+#define MMCR1_TTM3SEL_SH	56
+#define MMCR1_TTMSEL_MSK	3
+#define MMCR1_TD_CP_DBG0SEL_SH	54
+#define MMCR1_TD_CP_DBG1SEL_SH	52
+#define MMCR1_TD_CP_DBG2SEL_SH	50
+#define MMCR1_TD_CP_DBG3SEL_SH	48
+#define MMCR1_GRS_L2SEL_SH	46
+#define MMCR1_GRS_L2SEL_MSK	3
+#define MMCR1_GRS_L3SEL_SH	44
+#define MMCR1_GRS_L3SEL_MSK	3
+#define MMCR1_GRS_MCSEL_SH	41
+#define MMCR1_GRS_MCSEL_MSK	7
+#define MMCR1_GRS_FABSEL_SH	39
+#define MMCR1_GRS_FABSEL_MSK	3
+#define MMCR1_PMC1_ADDER_SEL_SH	35
+#define MMCR1_PMC2_ADDER_SEL_SH	34
+#define MMCR1_PMC3_ADDER_SEL_SH	33
+#define MMCR1_PMC4_ADDER_SEL_SH	32
+#define MMCR1_PMC1SEL_SH	25
+#define MMCR1_PMC2SEL_SH	17
+#define MMCR1_PMC3SEL_SH	9
+#define MMCR1_PMC4SEL_SH	1
+#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK	0x7f
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *             [  ><><>< ><> <><>[  >      <  ><  ><  ><  ><><><><>
+ *             NC  G0G1G2 G3 T0T1 UC        B0  B1  B2  B3 P4P3P2P1
+ *
+ * NC - number of counters
+ *     51: NC error 0x0008_0000_0000_0000
+ *     48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
+ *
+ * G0..G3 - GRS mux constraints
+ *     46-47: GRS_L2SEL value
+ *     44-45: GRS_L3SEL value
+ *     41-44: GRS_MCSEL value
+ *     39-40: GRS_FABSEL value
+ *	Note that these match up with their bit positions in MMCR1
+ *
+ * T0 - TTM0 constraint
+ *     36-37: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0x30_0000_0000
+ *
+ * T1 - TTM1 constraint
+ *     34-35: TTM1SEL value (0=IDU, 3=GRS) 0x0c_0000_0000
+ *
+ * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
+ *     33: UC3 error 0x02_0000_0000
+ *     32: FPU|IFU|ISU1 events needed 0x01_0000_0000
+ *     31: ISU0 events needed 0x01_8000_0000
+ *     30: IDU|GRS events needed 0x00_4000_0000
+ *
+ * B0
+ *     20-23: Byte 0 event source 0x00f0_0000
+ *	      Encoding as for the event code
+ *
+ * B1, B2, B3
+ *     16-19, 12-15, 8-11: Byte 1, 2, 3 event sources
+ *
+ * P4
+ *     7: P1 error 0x80
+ *     6-7: Count of events needing PMC4
+ *
+ * P1..P3
+ *     0-6: Count of events needing PMC1..PMC3
+ */
+
+static const int grsel_shift[8] = {
+	MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
+	MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
+	MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
+};
+
+/* Masks and values for using events from the various units */
+static u64 unit_cons[PM_LASTUNIT+1][2] = {
+	[PM_FPU] =   { 0x3200000000ull, 0x0100000000ull },
+	[PM_ISU0] =  { 0x0200000000ull, 0x0080000000ull },
+	[PM_ISU1] =  { 0x3200000000ull, 0x3100000000ull },
+	[PM_IFU] =   { 0x3200000000ull, 0x2100000000ull },
+	[PM_IDU] =   { 0x0e00000000ull, 0x0040000000ull },
+	[PM_GRS] =   { 0x0e00000000ull, 0x0c40000000ull },
+};
+
+static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, unit, sh;
+	int bit, fmask;
+	u64 mask = 0, value = 0;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 4)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+	}
+	if (event & PM_BUSEVENT_MSK) {
+		unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+		if (unit > PM_LASTUNIT)
+			return -1;
+		if (unit == PM_ISU0_ALT)
+			unit = PM_ISU0;
+		mask |= unit_cons[unit][0];
+		value |= unit_cons[unit][1];
+		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+		if (byte >= 4) {
+			if (unit != PM_LSU1)
+				return -1;
+			/* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
+			++unit;
+			byte &= 3;
+		}
+		if (unit == PM_GRS) {
+			bit = event & 7;
+			fmask = (bit == 6)? 7: 3;
+			sh = grsel_shift[bit];
+			mask |= (u64)fmask << sh;
+			value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
+		}
+		/* Set byte lane select field */
+		mask  |= 0xfULL << (20 - 4 * byte);
+		value |= (u64)unit << (20 - 4 * byte);
+	}
+	mask  |= 0x8000000000000ull;
+	value |= 0x1000000000000ull;
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+#define MAX_ALT	3	/* at most 3 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+	{ 0x100c0,  0x40001f },			/* PM_GCT_FULL_CYC */
+	{ 0x120e4,  0x400002 },			/* PM_GRP_DISP_REJECT */
+	{ 0x230e2,  0x323087 },			/* PM_BR_PRED_CR */
+	{ 0x230e3,  0x223087, 0x3230a0 },	/* PM_BR_PRED_TA */
+	{ 0x410c7,  0x441084 },			/* PM_THRD_L2MISS_BOTH_CYC */
+	{ 0x800c4,  0xc20e0 },			/* PM_DTLB_MISS */
+	{ 0xc50c6,  0xc60e0 },			/* PM_MRK_DTLB_MISS */
+	{ 0x100009, 0x200009 },			/* PM_INST_CMPL */
+	{ 0x200015, 0x300015 },			/* PM_LSU_LMQ_SRQ_EMPTY_CYC */
+	{ 0x300009, 0x400009 },			/* PM_INST_DISP */
+};
+
+/*
+ * Scan the alternatives table for a match and return the
+ * index into the alternatives table if found, else -1.
+ */
+static int find_alternative(unsigned int event)
+{
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+		if (event < event_alternatives[i][0])
+			break;
+		for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
+			if (event == event_alternatives[i][j])
+				return i;
+	}
+	return -1;
+}
+
+static const unsigned char bytedecode_alternatives[4][4] = {
+	/* PMC 1 */	{ 0x21, 0x23, 0x25, 0x27 },
+	/* PMC 2 */	{ 0x07, 0x17, 0x0e, 0x1e },
+	/* PMC 3 */	{ 0x20, 0x22, 0x24, 0x26 },
+	/* PMC 4 */	{ 0x07, 0x17, 0x0e, 0x1e }
+};
+
+/*
+ * Some direct events for decodes of event bus byte 3 have alternative
+ * PMCSEL values on other counters.  This returns the alternative
+ * event code for those that do, or -1 otherwise.  This also handles
+ * alternative PCMSEL values for add events.
+ */
+static int find_alternative_bdecode(unsigned int event)
+{
+	int pmc, altpmc, pp, j;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc == 0 || pmc > 4)
+		return -1;
+	altpmc = 5 - pmc;	/* 1 <-> 4, 2 <-> 3 */
+	pp = event & PM_PMCSEL_MSK;
+	for (j = 0; j < 4; ++j) {
+		if (bytedecode_alternatives[pmc - 1][j] == pp) {
+			return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
+				(altpmc << PM_PMC_SH) |
+				bytedecode_alternatives[altpmc - 1][j];
+		}
+	}
+
+	/* new decode alternatives for power5+ */
+	if (pmc == 1 && (pp == 0x0d || pp == 0x0e))
+		return event + (2 << PM_PMC_SH) + (0x2e - 0x0d);
+	if (pmc == 3 && (pp == 0x2e || pp == 0x2f))
+		return event - (2 << PM_PMC_SH) - (0x2e - 0x0d);
+
+	/* alternative add event encodings */
+	if (pp == 0x10 || pp == 0x28)
+		return ((event ^ (0x10 ^ 0x28)) & ~PM_PMC_MSKS) |
+			(altpmc << PM_PMC_SH);
+
+	return -1;
+}
+
+static int power5p_get_alternatives(unsigned int event, unsigned int alt[])
+{
+	int i, j, ae, nalt = 1;
+
+	alt[0] = event;
+	nalt = 1;
+	i = find_alternative(event);
+	if (i >= 0) {
+		for (j = 0; j < MAX_ALT; ++j) {
+			ae = event_alternatives[i][j];
+			if (ae && ae != event)
+				alt[nalt++] = ae;
+		}
+	} else {
+		ae = find_alternative_bdecode(event);
+		if (ae > 0)
+			alt[nalt++] = ae;
+	}
+	return nalt;
+}
+
+static int power5p_compute_mmcr(unsigned int event[], int n_ev,
+				unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr1 = 0;
+	unsigned int pmc, unit, byte, psel;
+	unsigned int ttm;
+	int i, isbus, bit, grsel;
+	unsigned int pmc_inuse = 0;
+	unsigned char busbyte[4];
+	unsigned char unituse[16];
+	int ttmuse;
+
+	if (n_ev > 4)
+		return -1;
+
+	/* First pass to count resource use */
+	memset(busbyte, 0, sizeof(busbyte));
+	memset(unituse, 0, sizeof(unituse));
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc > 4)
+				return -1;
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+		}
+		if (event[i] & PM_BUSEVENT_MSK) {
+			unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+			byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+			if (unit > PM_LASTUNIT)
+				return -1;
+			if (unit == PM_ISU0_ALT)
+				unit = PM_ISU0;
+			if (byte >= 4) {
+				if (unit != PM_LSU1)
+					return -1;
+				++unit;
+				byte &= 3;
+			}
+			if (busbyte[byte] && busbyte[byte] != unit)
+				return -1;
+			busbyte[byte] = unit;
+			unituse[unit] = 1;
+		}
+	}
+
+	/*
+	 * Assign resources and set multiplexer selects.
+	 *
+	 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
+	 * choice we have to deal with.
+	 */
+	if (unituse[PM_ISU0] &
+	    (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
+		unituse[PM_ISU0_ALT] = 1;	/* move ISU to TTM1 */
+		unituse[PM_ISU0] = 0;
+	}
+	/* Set TTM[01]SEL fields. */
+	ttmuse = 0;
+	for (i = PM_FPU; i <= PM_ISU1; ++i) {
+		if (!unituse[i])
+			continue;
+		if (ttmuse++)
+			return -1;
+		mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
+	}
+	ttmuse = 0;
+	for (; i <= PM_GRS; ++i) {
+		if (!unituse[i])
+			continue;
+		if (ttmuse++)
+			return -1;
+		mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
+	}
+	if (ttmuse > 1)
+		return -1;
+
+	/* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
+	for (byte = 0; byte < 4; ++byte) {
+		unit = busbyte[byte];
+		if (!unit)
+			continue;
+		if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
+			/* get ISU0 through TTM1 rather than TTM0 */
+			unit = PM_ISU0_ALT;
+		} else if (unit == PM_LSU1 + 1) {
+			/* select lower word of LSU1 for this byte */
+			mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+		}
+		ttm = unit >> 2;
+		mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+	}
+
+	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		isbus = event[i] & PM_BUSEVENT_MSK;
+		if (!pmc) {
+			/* Bus event or any-PMC direct event */
+			for (pmc = 0; pmc < 4; ++pmc) {
+				if (!(pmc_inuse & (1 << pmc)))
+					break;
+			}
+			if (pmc >= 4)
+				return -1;
+			pmc_inuse |= 1 << pmc;
+		} else {
+			/* Direct event */
+			--pmc;
+			if (isbus && (byte & 2) &&
+			    (psel == 8 || psel == 0x10 || psel == 0x28))
+				/* add events on higher-numbered bus */
+				mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
+		}
+		if (isbus && unit == PM_GRS) {
+			bit = psel & 7;
+			grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
+			mmcr1 |= (u64)grsel << grsel_shift[bit];
+		}
+		if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1))
+			/* select alternate byte lane */
+			psel |= 0x10;
+		if (pmc <= 3)
+			mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
+		hwc[i] = pmc;
+	}
+
+	/* Return MMCRx values */
+	mmcr[0] = 0;
+	if (pmc_inuse & 1)
+		mmcr[0] = MMCR0_PMC1CE;
+	if (pmc_inuse & 0x3e)
+		mmcr[0] |= MMCR0_PMCjCE;
+	mmcr[1] = mmcr1;
+	mmcr[2] = 0;
+	return 0;
+}
+
+static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	if (pmc <= 3)
+		mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power5p_generic_events[] = {
+	[PERF_COUNT_CPU_CYCLES] = 0xf,
+	[PERF_COUNT_INSTRUCTIONS] = 0x100009,
+	[PERF_COUNT_CACHE_REFERENCES] = 0x1c10a8,	/* LD_REF_L1 */
+	[PERF_COUNT_CACHE_MISSES] = 0x3c1088,		/* LD_MISS_L1 */
+	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4,	/* BR_ISSUED */ 
+	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
+};
+
+struct power_pmu power5p_pmu = {
+	.n_counter = 4,
+	.max_alternatives = MAX_ALT,
+	.add_fields = 0x7000000000055ull,
+	.test_adder = 0x3000040000000ull,
+	.compute_mmcr = power5p_compute_mmcr,
+	.get_constraint = power5p_get_constraint,
+	.get_alternatives = power5p_get_alternatives,
+	.disable_pmc = power5p_disable_pmc,
+	.n_generic = ARRAY_SIZE(power5p_generic_events),
+	.generic_events = power5p_generic_events,
+};
-- 
cgit v1.2.3


From 880860e392d92c457e8116cdee39ec4d109174ee Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 6 Mar 2009 16:30:52 +1100
Subject: perfcounters/powerpc: add support for POWER4 processors

Impact: more hardware support

This adds the back-end for the PMU on the POWER4 and POWER4+ processors
(GP and GQ).  This is quite similar to the PPC970, with 8 PMCs, but has
fewer events than the PPC970.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/Makefile       |   2 +-
 arch/powerpc/kernel/perf_counter.c |   5 +
 arch/powerpc/kernel/power4-pmu.c   | 557 +++++++++++++++++++++++++++++++++++++
 3 files changed, 563 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/kernel/power4-pmu.c

(limited to 'arch')

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 49851e0d8fd..8e5e2c74971 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,7 +94,7 @@ obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o \
+obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o power4-pmu.o ppc970-pmu.o \
 				   power5-pmu.o power5+-pmu.o power6-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 162f3981fa2..0e33d27cd46 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -824,6 +824,7 @@ void hw_perf_counter_setup(int cpu)
 	cpuhw->mmcr[0] = MMCR0_FC;
 }
 
+extern struct power_pmu power4_pmu;
 extern struct power_pmu ppc970_pmu;
 extern struct power_pmu power5_pmu;
 extern struct power_pmu power5p_pmu;
@@ -841,6 +842,10 @@ static int init_perf_counters(void)
 	/* XXX should get this from cputable */
 	pvr = mfspr(SPRN_PVR);
 	switch (PVR_VER(pvr)) {
+	case PV_POWER4:
+	case PV_POWER4p:
+		ppmu = &power4_pmu;
+		break;
 	case PV_970:
 	case PV_970FX:
 	case PV_970MP:
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
new file mode 100644
index 00000000000..1407b19ab61
--- /dev/null
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -0,0 +1,557 @@
+/*
+ * Performance counter support for POWER4 (GP) and POWER4+ (GQ) processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER4
+ */
+#define PM_PMC_SH	12	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_UNIT_SH	8	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_LOWER_SH	6
+#define PM_LOWER_MSK	1
+#define PM_LOWER_MSKS	0x40
+#define PM_BYTE_SH	4	/* Byte number of event bus to use */
+#define PM_BYTE_MSK	3
+#define PM_PMCSEL_MSK	7
+
+/*
+ * Unit code values
+ */
+#define PM_FPU		1
+#define PM_ISU1		2
+#define PM_IFU		3
+#define PM_IDU0		4
+#define PM_ISU1_ALT	6
+#define PM_ISU2		7
+#define PM_IFU_ALT	8
+#define PM_LSU0		9
+#define PM_LSU1		0xc
+#define PM_GPS		0xf
+
+/*
+ * Bits in MMCR0 for POWER4
+ */
+#define MMCR0_PMC1SEL_SH	8
+#define MMCR0_PMC2SEL_SH	1
+#define MMCR_PMCSEL_MSK		0x1f
+
+/*
+ * Bits in MMCR1 for POWER4
+ */
+#define MMCR1_TTM0SEL_SH	62
+#define MMCR1_TTC0SEL_SH	61
+#define MMCR1_TTM1SEL_SH	59
+#define MMCR1_TTC1SEL_SH	58
+#define MMCR1_TTM2SEL_SH	56
+#define MMCR1_TTC2SEL_SH	55
+#define MMCR1_TTM3SEL_SH	53
+#define MMCR1_TTC3SEL_SH	52
+#define MMCR1_TTMSEL_MSK	3
+#define MMCR1_TD_CP_DBG0SEL_SH	50
+#define MMCR1_TD_CP_DBG1SEL_SH	48
+#define MMCR1_TD_CP_DBG2SEL_SH	46
+#define MMCR1_TD_CP_DBG3SEL_SH	44
+#define MMCR1_DEBUG0SEL_SH	43
+#define MMCR1_DEBUG1SEL_SH	42
+#define MMCR1_DEBUG2SEL_SH	41
+#define MMCR1_DEBUG3SEL_SH	40
+#define MMCR1_PMC1_ADDER_SEL_SH	39
+#define MMCR1_PMC2_ADDER_SEL_SH	38
+#define MMCR1_PMC6_ADDER_SEL_SH	37
+#define MMCR1_PMC5_ADDER_SEL_SH	36
+#define MMCR1_PMC8_ADDER_SEL_SH	35
+#define MMCR1_PMC7_ADDER_SEL_SH	34
+#define MMCR1_PMC3_ADDER_SEL_SH	33
+#define MMCR1_PMC4_ADDER_SEL_SH	32
+#define MMCR1_PMC3SEL_SH	27
+#define MMCR1_PMC4SEL_SH	22
+#define MMCR1_PMC5SEL_SH	17
+#define MMCR1_PMC6SEL_SH	12
+#define MMCR1_PMC7SEL_SH	7
+#define MMCR1_PMC8SEL_SH	2	/* note bit 0 is in MMCRA for GP */
+
+static short mmcr1_adder_bits[8] = {
+	MMCR1_PMC1_ADDER_SEL_SH,
+	MMCR1_PMC2_ADDER_SEL_SH,
+	MMCR1_PMC3_ADDER_SEL_SH,
+	MMCR1_PMC4_ADDER_SEL_SH,
+	MMCR1_PMC5_ADDER_SEL_SH,
+	MMCR1_PMC6_ADDER_SEL_SH,
+	MMCR1_PMC7_ADDER_SEL_SH,
+	MMCR1_PMC8_ADDER_SEL_SH
+};
+
+/*
+ * Bits in MMCRA
+ */
+#define MMCRA_PMC8SEL0_SH	17	/* PMC8SEL bit 0 for GP */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *        |[  >[  >[   >|||[  >[  ><  ><  ><  ><  ><><><><><><><><>
+ *        | UC1 UC2 UC3 ||| PS1 PS2 B0  B1  B2  B3 P1P2P3P4P5P6P7P8
+ * 	  \SMPL	        ||\TTC3SEL
+ * 		        |\TTC_IFU_SEL
+ * 		        \TTM2SEL0
+ *
+ * SMPL - SAMPLE_ENABLE constraint
+ *     56: SAMPLE_ENABLE value 0x0100_0000_0000_0000
+ *
+ * UC1 - unit constraint 1: can't have all three of FPU/ISU1/IDU0|ISU2
+ *     55: UC1 error 0x0080_0000_0000_0000
+ *     54: FPU events needed 0x0040_0000_0000_0000
+ *     53: ISU1 events needed 0x0020_0000_0000_0000
+ *     52: IDU0|ISU2 events needed 0x0010_0000_0000_0000
+ *
+ * UC2 - unit constraint 2: can't have all three of FPU/IFU/LSU0
+ *     51: UC2 error 0x0008_0000_0000_0000
+ *     50: FPU events needed 0x0004_0000_0000_0000
+ *     49: IFU events needed 0x0002_0000_0000_0000
+ *     48: LSU0 events needed 0x0001_0000_0000_0000
+ *
+ * UC3 - unit constraint 3: can't have all four of LSU0/IFU/IDU0|ISU2/ISU1
+ *     47: UC3 error 0x8000_0000_0000
+ *     46: LSU0 events needed 0x4000_0000_0000
+ *     45: IFU events needed 0x2000_0000_0000
+ *     44: IDU0|ISU2 events needed 0x1000_0000_0000
+ *     43: ISU1 events needed 0x0800_0000_0000
+ *
+ * TTM2SEL0
+ *     42: 0 = IDU0 events needed
+ *     	   1 = ISU2 events needed 0x0400_0000_0000
+ *
+ * TTC_IFU_SEL
+ *     41: 0 = IFU.U events needed
+ *     	   1 = IFU.L events needed 0x0200_0000_0000
+ *
+ * TTC3SEL
+ *     40: 0 = LSU1.U events needed
+ *     	   1 = LSU1.L events needed 0x0100_0000_0000
+ *
+ * PS1
+ *     39: PS1 error 0x0080_0000_0000
+ *     36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
+ *
+ * PS2
+ *     35: PS2 error 0x0008_0000_0000
+ *     32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
+ *
+ * B0
+ *     28-31: Byte 0 event source 0xf000_0000
+ *     	   1 = FPU
+ * 	   2 = ISU1
+ * 	   3 = IFU
+ * 	   4 = IDU0
+ * 	   7 = ISU2
+ * 	   9 = LSU0
+ * 	   c = LSU1
+ * 	   f = GPS
+ *
+ * B1, B2, B3
+ *     24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
+ *
+ * P8
+ *     15: P8 error 0x8000
+ *     14-15: Count of events needing PMC8
+ *
+ * P1..P7
+ *     0-13: Count of events needing PMC1..PMC7
+ *
+ * Note: this doesn't allow events using IFU.U to be combined with events
+ * using IFU.L, though that is feasible (using TTM0 and TTM2).  However
+ * there are no listed events for IFU.L (they are debug events not
+ * verified for performance monitoring) so this shouldn't cause a
+ * problem.
+ */
+
+static struct unitinfo {
+	u64	value, mask;
+	int	unit;
+	int	lowerbit;
+} p4_unitinfo[16] = {
+	[PM_FPU]  = { 0x44000000000000ull, 0x88000000000000ull, PM_FPU, 0 },
+	[PM_ISU1] = { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
+	[PM_ISU1_ALT] =
+		    { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
+	[PM_IFU]  = { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
+	[PM_IFU_ALT] =
+		    { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
+	[PM_IDU0] = { 0x10100000000000ull, 0x80840000000000ull, PM_IDU0, 1 },
+	[PM_ISU2] = { 0x10140000000000ull, 0x80840000000000ull, PM_ISU2, 0 },
+	[PM_LSU0] = { 0x01400000000000ull, 0x08800000000000ull, PM_LSU0, 0 },
+	[PM_LSU1] = { 0x00000000000000ull, 0x00010000000000ull, PM_LSU1, 40 },
+	[PM_GPS]  = { 0x00000000000000ull, 0x00000000000000ull, PM_GPS, 0 }
+};
+
+static unsigned char direct_marked_event[8] = {
+	(1<<2) | (1<<3),	/* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
+	(1<<3) | (1<<5),	/* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
+	(1<<3),			/* PMC3: PM_MRK_ST_CMPL_INT */
+	(1<<4) | (1<<5),	/* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
+	(1<<4) | (1<<5),	/* PMC5: PM_MRK_GRP_TIMEO */
+	(1<<3) | (1<<4) | (1<<5),
+		/* PMC6: PM_MRK_ST_GPS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
+	(1<<4) | (1<<5),	/* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
+	(1<<4),			/* PMC8: PM_MRK_LSU_FIN */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int p4_marked_instr_event(unsigned int event)
+{
+	int pmc, psel, unit, byte, bit;
+	unsigned int mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if (pmc) {
+		if (direct_marked_event[pmc - 1] & (1 << psel))
+			return 1;
+		if (psel == 0)		/* add events */
+			bit = (pmc <= 4)? pmc - 1: 8 - pmc;
+		else if (psel == 6)	/* decode events */
+			bit = 4;
+		else
+			return 0;
+	} else
+		bit = psel;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	mask = 0;
+	switch (unit) {
+	case PM_LSU1:
+		if (event & PM_LOWER_MSKS)
+			mask = 1 << 28;		/* byte 7 bit 4 */
+		else
+			mask = 6 << 24;		/* byte 3 bits 1 and 2 */
+		break;
+	case PM_LSU0:
+		/* byte 3, bit 3; byte 2 bits 0,2,3,4,5; byte 1 */
+		mask = 0x083dff00;
+	}
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
+static int p4_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, unit, lower, sh;
+	u64 mask = 0, value = 0;
+	int grp = -1;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 8)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+		grp = ((pmc - 1) >> 1) & 1;
+	}
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	if (unit) {
+		lower = (event >> PM_LOWER_SH) & PM_LOWER_MSK;
+
+		/*
+		 * Bus events on bytes 0 and 2 can be counted
+		 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
+		 */
+		if (!pmc)
+			grp = byte & 1;
+
+		if (!p4_unitinfo[unit].unit)
+			return -1;
+		mask  |= p4_unitinfo[unit].mask;
+		value |= p4_unitinfo[unit].value;
+		sh = p4_unitinfo[unit].lowerbit;
+		if (sh > 1)
+			value |= (u64)lower << sh;
+		else if (lower != sh)
+			return -1;
+		unit = p4_unitinfo[unit].unit;
+
+		/* Set byte lane select field */
+		mask  |= 0xfULL << (28 - 4 * byte);
+		value |= (u64)unit << (28 - 4 * byte);
+	}
+	if (grp == 0) {
+		/* increment PMC1/2/5/6 field */
+		mask  |= 0x8000000000ull;
+		value |= 0x1000000000ull;
+	} else {
+		/* increment PMC3/4/7/8 field */
+		mask  |= 0x800000000ull;
+		value |= 0x100000000ull;
+	}
+
+	/* Marked instruction events need sample_enable set */
+	if (p4_marked_instr_event(event)) {
+		mask  |= 1ull << 56;
+		value |= 1ull << 56;
+	}
+
+	/* PMCSEL=6 decode events on byte 2 need sample_enable clear */
+	if (pmc && (event & PM_PMCSEL_MSK) == 6 && byte == 2)
+		mask  |= 1ull << 56;
+
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+static unsigned int ppc_inst_cmpl[] = {
+	0x1001, 0x4001, 0x6001, 0x7001, 0x8001
+};
+
+static int p4_get_alternatives(unsigned int event, unsigned int alt[])
+{
+	int i, j, na;
+
+	alt[0] = event;
+	na = 1;
+
+	/* 2 possibilities for PM_GRP_DISP_REJECT */
+	if (event == 0x8003 || event == 0x0224) {
+		alt[1] = event ^ (0x8003 ^ 0x0224);
+		return 2;
+	}
+
+	/* 2 possibilities for PM_ST_MISS_L1 */
+	if (event == 0x0c13 || event == 0x0c23) {
+		alt[1] = event ^ (0x0c13 ^ 0x0c23);
+		return 2;
+	}
+
+	/* several possibilities for PM_INST_CMPL */
+	for (i = 0; i < ARRAY_SIZE(ppc_inst_cmpl); ++i) {
+		if (event == ppc_inst_cmpl[i]) {
+			for (j = 0; j < ARRAY_SIZE(ppc_inst_cmpl); ++j)
+				if (j != i)
+					alt[na++] = ppc_inst_cmpl[j];
+			break;
+		}
+	}
+
+	return na;
+}
+
+static int p4_compute_mmcr(unsigned int event[], int n_ev,
+			   unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
+	unsigned int pmc, unit, byte, psel, lower;
+	unsigned int ttm, grp;
+	unsigned int pmc_inuse = 0;
+	unsigned int pmc_grp_use[2];
+	unsigned char busbyte[4];
+	unsigned char unituse[16];
+	unsigned int unitlower = 0;
+	int i;
+
+	if (n_ev > 8)
+		return -1;
+
+	/* First pass to count resource use */
+	pmc_grp_use[0] = pmc_grp_use[1] = 0;
+	memset(busbyte, 0, sizeof(busbyte));
+	memset(unituse, 0, sizeof(unituse));
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+			/* count 1/2/5/6 vs 3/4/7/8 use */
+			++pmc_grp_use[((pmc - 1) >> 1) & 1];
+		}
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		lower = (event[i] >> PM_LOWER_SH) & PM_LOWER_MSK;
+		if (unit) {
+			if (!pmc)
+				++pmc_grp_use[byte & 1];
+			if (unit == 6 || unit == 8)
+				/* map alt ISU1/IFU codes: 6->2, 8->3 */
+				unit = (unit >> 1) - 1;
+			if (busbyte[byte] && busbyte[byte] != unit)
+				return -1;
+			busbyte[byte] = unit;
+			lower <<= unit;
+			if (unituse[unit] && lower != (unitlower & lower))
+				return -1;
+			unituse[unit] = 1;
+			unitlower |= lower;
+		}
+	}
+	if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
+		return -1;
+
+	/*
+	 * Assign resources and set multiplexer selects.
+	 *
+	 * Units 1,2,3 are on TTM0, 4,6,7 on TTM1, 8,10 on TTM2.
+	 * Each TTMx can only select one unit, but since
+	 * units 2 and 6 are both ISU1, and 3 and 8 are both IFU,
+	 * we have some choices.
+	 */
+	if (unituse[2] & (unituse[1] | (unituse[3] & unituse[9]))) {
+		unituse[6] = 1;		/* Move 2 to 6 */
+		unituse[2] = 0;
+	}
+	if (unituse[3] & (unituse[1] | unituse[2])) {
+		unituse[8] = 1;		/* Move 3 to 8 */
+		unituse[3] = 0;
+		unitlower = (unitlower & ~8) | ((unitlower & 8) << 5);
+	}
+	/* Check only one unit per TTMx */
+	if (unituse[1] + unituse[2] + unituse[3] > 1 ||
+	    unituse[4] + unituse[6] + unituse[7] > 1 ||
+	    unituse[8] + unituse[9] > 1 ||
+	    (unituse[5] | unituse[10] | unituse[11] |
+	     unituse[13] | unituse[14]))
+		return -1;
+
+	/* Set TTMxSEL fields.  Note, units 1-3 => TTM0SEL codes 0-2 */
+	mmcr1 |= (u64)(unituse[3] * 2 + unituse[2]) << MMCR1_TTM0SEL_SH;
+	mmcr1 |= (u64)(unituse[7] * 3 + unituse[6] * 2) << MMCR1_TTM1SEL_SH;
+	mmcr1 |= (u64)unituse[9] << MMCR1_TTM2SEL_SH;
+
+	/* Set TTCxSEL fields. */
+	if (unitlower & 0xe)
+		mmcr1 |= 1ull << MMCR1_TTC0SEL_SH;
+	if (unitlower & 0xf0)
+		mmcr1 |= 1ull << MMCR1_TTC1SEL_SH;
+	if (unitlower & 0xf00)
+		mmcr1 |= 1ull << MMCR1_TTC2SEL_SH;
+	if (unitlower & 0x7000)
+		mmcr1 |= 1ull << MMCR1_TTC3SEL_SH;
+
+	/* Set byte lane select fields. */
+	for (byte = 0; byte < 4; ++byte) {
+		unit = busbyte[byte];
+		if (!unit)
+			continue;
+		if (unit == 0xf) {
+			/* special case for GPS */
+			mmcr1 |= 1ull << (MMCR1_DEBUG0SEL_SH - byte);
+		} else {
+			if (!unituse[unit])
+				ttm = unit - 1;		/* 2->1, 3->2 */
+			else
+				ttm = unit >> 2;
+			mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2*byte);
+		}
+	}
+
+	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		if (!pmc) {
+			/* Bus event or 00xxx direct event (off or cycles) */
+			if (unit)
+				psel |= 0x10 | ((byte & 2) << 2);
+			for (pmc = 0; pmc < 8; ++pmc) {
+				if (pmc_inuse & (1 << pmc))
+					continue;
+				grp = (pmc >> 1) & 1;
+				if (unit) {
+					if (grp == (byte & 1))
+						break;
+				} else if (pmc_grp_use[grp] < 4) {
+					++pmc_grp_use[grp];
+					break;
+				}
+			}
+			pmc_inuse |= 1 << pmc;
+		} else {
+			/* Direct event */
+			--pmc;
+			if (psel == 0 && (byte & 2))
+				/* add events on higher-numbered bus */
+				mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
+			else if (psel == 6 && byte == 3)
+				/* seem to need to set sample_enable here */
+				mmcra |= MMCRA_SAMPLE_ENABLE;
+			psel |= 8;
+		}
+		if (pmc <= 1)
+			mmcr0 |= psel << (MMCR0_PMC1SEL_SH - 7 * pmc);
+		else
+			mmcr1 |= psel << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
+		if (pmc == 7)	/* PMC8 */
+			mmcra |= (psel & 1) << MMCRA_PMC8SEL0_SH;
+		hwc[i] = pmc;
+		if (p4_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
+	}
+
+	if (pmc_inuse & 1)
+		mmcr0 |= MMCR0_PMC1CE;
+	if (pmc_inuse & 0xfe)
+		mmcr0 |= MMCR0_PMCjCE;
+
+	mmcra |= 0x2000;	/* mark only one IOP per PPC instruction */
+
+	/* Return MMCRx values */
+	mmcr[0] = mmcr0;
+	mmcr[1] = mmcr1;
+	mmcr[2] = mmcra;
+	return 0;
+}
+
+static void p4_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	/*
+	 * Setting the PMCxSEL field to 0 disables PMC x.
+	 * (Note that pmc is 0-based here, not 1-based.)
+	 */
+	if (pmc <= 1) {
+		mmcr[0] &= ~(0x1fUL << (MMCR0_PMC1SEL_SH - 7 * pmc));
+	} else {
+		mmcr[1] &= ~(0x1fUL << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)));
+		if (pmc == 7)
+			mmcr[2] &= ~(1UL << MMCRA_PMC8SEL0_SH);
+	}
+}
+
+static int p4_generic_events[] = {
+	[PERF_COUNT_CPU_CYCLES] = 7,
+	[PERF_COUNT_INSTRUCTIONS] = 0x1001,
+	[PERF_COUNT_CACHE_REFERENCES] = 0x8c10,		/* PM_LD_REF_L1 */
+	[PERF_COUNT_CACHE_MISSES] = 0x3c10,		/* PM_LD_MISS_L1 */
+	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x330,	/* PM_BR_ISSUED */
+	[PERF_COUNT_BRANCH_MISSES] = 0x331,		/* PM_BR_MPRED_CR */
+};
+
+struct power_pmu power4_pmu = {
+	.n_counter = 8,
+	.max_alternatives = 5,
+	.add_fields = 0x0000001100005555ull,
+	.test_adder = 0x0011083300000000ull,
+	.compute_mmcr = p4_compute_mmcr,
+	.get_constraint = p4_get_constraint,
+	.get_alternatives = p4_get_alternatives,
+	.disable_pmc = p4_disable_pmc,
+	.n_generic = ARRAY_SIZE(p4_generic_events),
+	.generic_events = p4_generic_events,
+};
-- 
cgit v1.2.3


From 184fe4ab1f2e4dfa45584889bb3820031648386b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sun, 8 Mar 2009 11:34:19 +0100
Subject: x86: perf_counter cleanup

Use and actual unsigned long bitmap instead of casting our way around.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jaswinder Singh Rajput <jaswinder@kernel.org>
LKML-Reference: <1236508459.22914.3645.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index f5853718d4d..1df421042b2 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -37,7 +37,7 @@ struct cpu_hw_counters {
 	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		interrupts;
 	u64			throttle_ctrl;
-	u64			active_mask;
+	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	int			enabled;
 };
 
@@ -291,7 +291,7 @@ static void pmc_amd_restore_all(u64 ctrl)
 		return;
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-		if (test_bit(idx, (unsigned long *)&cpuc->active_mask)) {
+		if (test_bit(idx, cpuc->active_mask)) {
 			u64 val;
 
 			rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
@@ -377,7 +377,7 @@ static void pmc_amd_enable(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
-	set_bit(idx, (unsigned long *)&cpuc->active_mask);
+	set_bit(idx, cpuc->active_mask);
 	if (cpuc->enabled)
 		config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
 
@@ -401,7 +401,7 @@ static void pmc_amd_disable(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
-	clear_bit(idx, (unsigned long *)&cpuc->active_mask);
+	clear_bit(idx, cpuc->active_mask);
 	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
 
 }
-- 
cgit v1.2.3


From e255357764f92afcafafbd4879b222b8c752065a Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sun, 8 Mar 2009 17:09:49 +0530
Subject: x86: perf_counter cleanup

Remove unused variables and duplicate header file.

Signed-off-by: Jaswinder Singh Rajput <jaswinder@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 1df421042b2..155bc3c239b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -17,7 +17,6 @@
 #include <linux/kdebug.h>
 #include <linux/sched.h>
 
-#include <asm/perf_counter.h>
 #include <asm/apic.h>
 
 static bool perf_counters_initialized __read_mostly;
@@ -954,9 +953,6 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 
 static struct pmc_x86_ops *pmc_amd_init(void)
 {
-	u64 old;
-	int bits;
-
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
 	counter_value_mask = 0x0000FFFFFFFFFFFFULL;
-- 
cgit v1.2.3


From 595258aaeac4cc6e187b98b1bf29bb176febe763 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:28 +0100
Subject: perf_counter: x86: fix 32-bit irq_period assumption

No need to assume the irq_period is 32bit.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 155bc3c239b..1cedc3468ce 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -449,7 +449,7 @@ __hw_perf_counter_set_period(struct perf_counter *counter,
 			     struct hw_perf_counter *hwc, int idx)
 {
 	s64 left = atomic64_read(&hwc->period_left);
-	s32 period = hwc->irq_period;
+	s64 period = hwc->irq_period;
 	int err;
 
 	/*
-- 
cgit v1.2.3


From 60b3df9c1e24a18aabb412da9905208c5f04ebea Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:30 +0100
Subject: perf_counter: add comment to barrier

We need to ensure the enabled=0 write happens before we
start disabling the actual counters, so that a pcm_amd_enable()
will not enable one underneath us.

I think the race is impossible anyway, we always balance the
ops within any one context and perform enable() with IRQs disabled.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 1cedc3468ce..a2e3b76bfdc 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -247,6 +247,10 @@ static u64 pmc_amd_save_disable_all(void)
 
 	enabled = cpuc->enabled;
 	cpuc->enabled = 0;
+	/*
+	 * ensure we write the disable before we start disabling the
+	 * counters proper, so that pcm_amd_enable() does the right thing.
+	 */
 	barrier();
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-- 
cgit v1.2.3


From 82bae4f8c2fd64a2bb1e2e72c508853ed2b4a299 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:31 +0100
Subject: perf_counter: x86: use ULL postfix for 64bit constants

Fix a build warning on 32bit machines by explicitly marking the
constants as 64-bit.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a2e3b76bfdc..22dab06c08a 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -84,9 +84,9 @@ static u64 pmc_intel_event_map(int event)
 
 static u64 pmc_intel_raw_event(u64 event)
 {
-#define CORE_EVNTSEL_EVENT_MASK		0x000000FF
-#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00
-#define CORE_EVNTSEL_COUNTER_MASK	0xFF000000
+#define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
+#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
+#define CORE_EVNTSEL_COUNTER_MASK	0xFF000000ULL
 
 #define CORE_EVNTSEL_MASK 		\
 	(CORE_EVNTSEL_EVENT_MASK |	\
@@ -116,9 +116,9 @@ static u64 pmc_amd_event_map(int event)
 
 static u64 pmc_amd_raw_event(u64 event)
 {
-#define K7_EVNTSEL_EVENT_MASK	0x7000000FF
-#define K7_EVNTSEL_UNIT_MASK	0x00000FF00
-#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000
+#define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL
+#define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
+#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000ULL
 
 #define K7_EVNTSEL_MASK			\
 	(K7_EVNTSEL_EVENT_MASK |	\
-- 
cgit v1.2.3


From 7dd1fcc258b65da718f01e4684a7b9244501a9fb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:33 +0100
Subject: perf_counter: provide pagefault software events

We use the generic software counter infrastructure to provide
page fault events.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/mm/fault.c | 3 +++
 arch/x86/mm/fault.c     | 3 +++
 2 files changed, 6 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 76993941cac..eda5b0ca4af 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -29,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
+#include <linux/perf_counter.h>
 
 #include <asm/firmware.h>
 #include <asm/page.h>
@@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		die("Weird page fault", regs, SIGSEGV);
 	}
 
+	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs);
+
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
 	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a03b7279efa..c8725752b6c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -27,6 +27,7 @@
 #include <linux/tty.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
+#include <linux/perf_counter.h>
 
 #include <asm-generic/sections.h>
 
@@ -1044,6 +1045,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(regs, error_code, address);
 
+	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs);
+
 	/*
 	 * If we're in an interrupt, have no user context or are running
 	 * in an atomic region then we must not take the fault:
-- 
cgit v1.2.3


From ac17dc8e58f3069ea895cfff963adf98ff3cf6b2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:34 +0100
Subject: perf_counter: provide major/minor page fault software events

Provide separate sw counters for major and minor page faults.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/mm/fault.c | 5 ++++-
 arch/x86/mm/fault.c     | 7 +++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index eda5b0ca4af..17bbf6f91fb 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -312,6 +312,7 @@ good_area:
 	}
 	if (ret & VM_FAULT_MAJOR) {
 		current->maj_flt++;
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs);
 #ifdef CONFIG_PPC_SMLPAR
 		if (firmware_has_feature(FW_FEATURE_CMO)) {
 			preempt_disable();
@@ -319,8 +320,10 @@ good_area:
 			preempt_enable();
 		}
 #endif
-	} else
+	} else {
 		current->min_flt++;
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs);
+	}
 	up_read(&mm->mmap_sem);
 	return 0;
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c8725752b6c..f2d3324d921 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1140,10 +1140,13 @@ good_area:
 		return;
 	}
 
-	if (fault & VM_FAULT_MAJOR)
+	if (fault & VM_FAULT_MAJOR) {
 		tsk->maj_flt++;
-	else
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs);
+	} else {
 		tsk->min_flt++;
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs);
+	}
 
 	check_v8086_mode(regs, address, tsk);
 
-- 
cgit v1.2.3


From 7bb497bd885eedd0f56dfe3cc1b5ff20710d33b9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 18 Mar 2009 08:59:21 +0100
Subject: perf_counter: fix crash on perfmon v1 systems

Impact: fix boot crash on Intel Perfmon Version 1 systems

Intel Perfmon v1 does not support the global MSRs, nor does
it offer the generalized MSR ranges. So support v2 and later
CPUs only.

Also mark pmc_ops as read-mostly - to avoid false cacheline
sharing.

Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 22dab06c08a..6cba9d47b71 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -57,12 +57,14 @@ struct pmc_x86_ops {
 	int		max_events;
 };
 
-static struct pmc_x86_ops *pmc_ops;
+static struct pmc_x86_ops *pmc_ops __read_mostly;
 
 static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
 	.enabled = 1,
 };
 
+static __read_mostly int intel_perfmon_version;
+
 /*
  * Intel PerfMon v3. Used on Core2 and later.
  */
@@ -613,7 +615,7 @@ void perf_counter_print_debug(void)
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+	if (intel_perfmon_version >= 2) {
 		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
@@ -930,10 +932,10 @@ static struct pmc_x86_ops pmc_amd_ops = {
 
 static struct pmc_x86_ops *pmc_intel_init(void)
 {
+	union cpuid10_edx edx;
 	union cpuid10_eax eax;
-	unsigned int ebx;
 	unsigned int unused;
-	union cpuid10_edx edx;
+	unsigned int ebx;
 
 	/*
 	 * Check whether the Architectural PerfMon supports
@@ -943,8 +945,12 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
 		return NULL;
 
+	intel_perfmon_version = eax.split.version_id;
+	if (intel_perfmon_version < 2)
+		return NULL;
+
 	pr_info("Intel Performance Monitoring support detected.\n");
-	pr_info("... version:         %d\n", eax.split.version_id);
+	pr_info("... version:         %d\n", intel_perfmon_version);
 	pr_info("... bit width:       %d\n", eax.split.bit_width);
 	pr_info("... mask length:     %d\n", eax.split.mask_length);
 
-- 
cgit v1.2.3


From b6c5a71da1477d261bc36254fe1f20d32b57598d Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 16 Mar 2009 21:00:00 +1100
Subject: perf_counter: abstract wakeup flag setting in core to fix powerpc
 build

Impact: build fix for powerpc

Commit bd753921015e7905 ("perf_counter: software counter event
infrastructure") introduced a use of TIF_PERF_COUNTERS into the core
perfcounter code.  This breaks the build on powerpc because we use
a flag in a per-cpu area to signal wakeups on powerpc rather than
a thread_info flag, because the thread_info flags have to be
manipulated with atomic operations and are thus slower than per-cpu
flags.

This fixes the by changing the core to use an abstracted
set_perf_counter_pending() function, which is defined on x86 to set
the TIF_PERF_COUNTERS flag and on powerpc to set the per-cpu flag
(paca->perf_counter_pending).  It changes the previous powerpc
definition of set_perf_counter_pending to not take an argument and
adds a clear_perf_counter_pending, so as to simplify the definition
on x86.

On x86, set_perf_counter_pending() is defined as a macro.  Defining
it as a static inline in arch/x86/include/asm/perf_counters.h causes
compile failures because <asm/perf_counters.h> gets included early in
<linux/sched.h>, and the definitions of set_tsk_thread_flag etc. are
therefore not available in <asm/perf_counters.h>.  (On powerpc this
problem is avoided by defining set_perf_counter_pending etc. in
<asm/hw_irq.h>.)

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/hw_irq.h   | 14 +++++++++++---
 arch/powerpc/kernel/irq.c           | 11 +++--------
 arch/powerpc/kernel/perf_counter.c  |  3 +--
 arch/x86/include/asm/perf_counter.h |  3 +++
 4 files changed, 18 insertions(+), 13 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index b43076ff92c..cb32d571c9c 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -142,10 +142,17 @@ static inline unsigned long get_perf_counter_pending(void)
 	return x;
 }
 
-static inline void set_perf_counter_pending(int x)
+static inline void set_perf_counter_pending(void)
 {
 	asm volatile("stb %0,%1(13)" : :
-		"r" (x),
+		"r" (1),
+		"i" (offsetof(struct paca_struct, perf_counter_pending)));
+}
+
+static inline void clear_perf_counter_pending(void)
+{
+	asm volatile("stb %0,%1(13)" : :
+		"r" (0),
 		"i" (offsetof(struct paca_struct, perf_counter_pending)));
 }
 
@@ -158,7 +165,8 @@ static inline unsigned long get_perf_counter_pending(void)
 	return 0;
 }
 
-static inline void set_perf_counter_pending(int x) {}
+static inline void set_perf_counter_pending(void) {}
+static inline void clear_perf_counter_pending(void) {}
 static inline void perf_counter_do_pending(void) {}
 #endif /* CONFIG_PERF_COUNTERS */
 
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 0d2e37c5773..469e9635ff0 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -104,13 +104,6 @@ static inline notrace void set_soft_enabled(unsigned long enable)
 	: : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
 }
 
-#ifdef CONFIG_PERF_COUNTERS
-notrace void __weak perf_counter_do_pending(void)
-{
-	set_perf_counter_pending(0);
-}
-#endif
-
 notrace void raw_local_irq_restore(unsigned long en)
 {
 	/*
@@ -142,8 +135,10 @@ notrace void raw_local_irq_restore(unsigned long en)
 			iseries_handle_interrupts();
 	}
 
-	if (get_perf_counter_pending())
+	if (get_perf_counter_pending()) {
+		clear_perf_counter_pending();
 		perf_counter_do_pending();
+	}
 
 	/*
 	 * if (get_paca()->hard_enabled) return;
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 0e33d27cd46..5008762e8bf 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -653,7 +653,6 @@ void perf_counter_do_pending(void)
 	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
 	struct perf_counter *counter;
 
-	set_perf_counter_pending(0);
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
 		if (counter && counter->wakeup_pending) {
@@ -811,7 +810,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 			perf_counter_do_pending();
 			irq_exit();
 		} else {
-			set_perf_counter_pending(1);
+			set_perf_counter_pending();
 		}
 	}
 }
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 2e08ed73664..1662043b340 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -84,6 +84,9 @@ union cpuid10_edx {
 #define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
 #define X86_PMC_IDX_FIXED_BUS_CYCLES			(X86_PMC_IDX_FIXED + 2)
 
+#define set_perf_counter_pending()	\
+		set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
+
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
 extern void perf_counters_lapic_init(int nmi);
-- 
cgit v1.2.3


From b8e83514b64577b48bfb794fe85fcde40a9343ca Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:18 +0100
Subject: perf_counter: revamp syscall input ABI

Impact: modify ABI

The hardware/software classification in hw_event->type became a little
strained due to the addition of tracepoint tracing.

Instead split up the field and provide a type field to explicitly specify
the counter type, while using the event_id field to specify which event to
use.

Raw counters still work as before, only the raw config now goes into
raw_event.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Orig-LKML-Reference: <20090319194233.836807573@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c |  4 ++--
 arch/x86/kernel/cpu/perf_counter.c | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 5008762e8bf..26f69dc7130 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -602,7 +602,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 		return NULL;
 	if ((s64)counter->hw_event.irq_period < 0)
 		return NULL;
-	ev = counter->hw_event.type;
+	ev = counter->hw_event.event_id;
 	if (!counter->hw_event.raw) {
 		if (ev >= ppmu->n_generic ||
 		    ppmu->generic_events[ev] == 0)
@@ -692,7 +692,7 @@ static void perf_handle_group(struct perf_counter *counter)
 	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
 		if (sub != counter)
 			sub->hw_ops->read(sub);
-		perf_store_irq_data(counter, sub->hw_event.type);
+		perf_store_irq_data(counter, sub->hw_event.event_config);
 		perf_store_irq_data(counter, atomic64_read(&sub->count));
 	}
 }
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 6cba9d47b71..d844ae41d5a 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	/*
 	 * Raw event type provide the config in the event structure
 	 */
-	if (hw_event->raw) {
-		hwc->config |= pmc_ops->raw_event(hw_event->type);
+	if (hw_event->raw_type) {
+		hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id);
 	} else {
-		if (hw_event->type >= pmc_ops->max_events)
+		if (hw_event->event_id >= pmc_ops->max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= pmc_ops->event_map(hw_event->type);
+		hwc->config |= pmc_ops->event_map(hw_event->event_id);
 	}
 	counter->wakeup_pending = 0;
 
@@ -715,7 +715,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
 
 		x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
-		perf_store_irq_data(sibling, counter->hw_event.type);
+		perf_store_irq_data(sibling, counter->hw_event.event_config);
 		perf_store_irq_data(sibling, atomic64_read(&counter->count));
 	}
 }
-- 
cgit v1.2.3


From 0322cd6ec504b0bf08ca7b2c3d7f43bda37d79c9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:19 +0100
Subject: perf_counter: unify irq output code

Impact: cleanup

Having 3 slightly different copies of the same code around does nobody
any good. First step in revamping the output format.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Orig-LKML-Reference: <20090319194233.929962222@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 51 ++----------------------------------
 arch/x86/kernel/cpu/perf_counter.c | 53 +-------------------------------------
 2 files changed, 3 insertions(+), 101 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 26f69dc7130..88b72eb4af1 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -662,41 +662,6 @@ void perf_counter_do_pending(void)
 	}
 }
 
-/*
- * Record data for an irq counter.
- * This function was lifted from the x86 code; maybe it should
- * go in the core?
- */
-static void perf_store_irq_data(struct perf_counter *counter, u64 data)
-{
-	struct perf_data *irqdata = counter->irqdata;
-
-	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
-		irqdata->overrun++;
-	} else {
-		u64 *p = (u64 *) &irqdata->data[irqdata->len];
-
-		*p = data;
-		irqdata->len += sizeof(u64);
-	}
-}
-
-/*
- * Record all the values of the counters in a group
- */
-static void perf_handle_group(struct perf_counter *counter)
-{
-	struct perf_counter *leader, *sub;
-
-	leader = counter->group_leader;
-	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
-		if (sub != counter)
-			sub->hw_ops->read(sub);
-		perf_store_irq_data(counter, sub->hw_event.event_config);
-		perf_store_irq_data(counter, atomic64_read(&sub->count));
-	}
-}
-
 /*
  * A counter has overflowed; update its count and record
  * things if requested.  Note that interrupts are hard-disabled
@@ -736,20 +701,8 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	/*
 	 * Finally record data if requested.
 	 */
-	if (record) {
-		switch (counter->hw_event.record_type) {
-		case PERF_RECORD_SIMPLE:
-			break;
-		case PERF_RECORD_IRQ:
-			perf_store_irq_data(counter, instruction_pointer(regs));
-			counter->wakeup_pending = 1;
-			break;
-		case PERF_RECORD_GROUP:
-			perf_handle_group(counter);
-			counter->wakeup_pending = 1;
-			break;
-		}
-	}
+	if (record)
+		perf_counter_output(counter, 1, regs);
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d844ae41d5a..902282d68b0 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -674,20 +674,6 @@ static void pmc_generic_disable(struct perf_counter *counter)
 	x86_perf_counter_update(counter, hwc, idx);
 }
 
-static void perf_store_irq_data(struct perf_counter *counter, u64 data)
-{
-	struct perf_data *irqdata = counter->irqdata;
-
-	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
-		irqdata->overrun++;
-	} else {
-		u64 *p = (u64 *) &irqdata->data[irqdata->len];
-
-		*p = data;
-		irqdata->len += sizeof(u64);
-	}
-}
-
 /*
  * Save and restart an expired counter. Called by NMI contexts,
  * so it has to be careful about preempting normal counter ops:
@@ -704,22 +690,6 @@ static void perf_save_and_restart(struct perf_counter *counter)
 		__pmc_generic_enable(counter, hwc, idx);
 }
 
-static void
-perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
-{
-	struct perf_counter *counter, *group_leader = sibling->group_leader;
-
-	/*
-	 * Store sibling timestamps (if any):
-	 */
-	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
-
-		x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
-		perf_store_irq_data(sibling, counter->hw_event.event_config);
-		perf_store_irq_data(sibling, atomic64_read(&counter->count));
-	}
-}
-
 /*
  * Maximum interrupt frequency of 100KHz per CPU
  */
@@ -754,28 +724,7 @@ again:
 			continue;
 
 		perf_save_and_restart(counter);
-
-		switch (counter->hw_event.record_type) {
-		case PERF_RECORD_SIMPLE:
-			continue;
-		case PERF_RECORD_IRQ:
-			perf_store_irq_data(counter, instruction_pointer(regs));
-			break;
-		case PERF_RECORD_GROUP:
-			perf_handle_group(counter, &status, &ack);
-			break;
-		}
-		/*
-		 * From NMI context we cannot call into the scheduler to
-		 * do a task wakeup - but we mark these generic as
-		 * wakeup_pending and initate a wakeup callback:
-		 */
-		if (nmi) {
-			counter->wakeup_pending = 1;
-			set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
-		} else {
-			wake_up(&counter->waitq);
-		}
+		perf_counter_output(counter, nmi, regs);
 	}
 
 	hw_perf_ack_status(ack);
-- 
cgit v1.2.3


From db4fb5acf20295063d1d5105e67724eb51440207 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 19 Mar 2009 20:26:20 +0100
Subject: perf_counter: powerpc: clean up perc_counter_interrupt

Impact: cleanup

This updates the powerpc perf_counter_interrupt following on from the
"perf_counter: unify irq output code" patch.  Since we now use the
generic perf_counter_output code, which sets the perf_counter_pending
flag directly, we no longer need the need_wakeup variable.

This removes need_wakeup and makes perf_counter_interrupt use
get_perf_counter_pending() instead.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Steven Rostedt <rostedt@goodmis.org>
Orig-LKML-Reference: <20090319194234.024464535@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 88b72eb4af1..830ca9c4494 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -723,8 +723,6 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 			/* counter has overflowed */
 			found = 1;
 			record_and_restart(counter, val, regs);
-			if (counter->wakeup_pending)
-				need_wakeup = 1;
 		}
 	}
 
@@ -754,17 +752,14 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	/*
 	 * If we need a wakeup, check whether interrupts were soft-enabled
 	 * when we took the interrupt.  If they were, we can wake stuff up
-	 * immediately; otherwise we'll have to set a flag and do the
-	 * wakeup when interrupts get soft-enabled.
+	 * immediately; otherwise we'll have do the wakeup when interrupts
+	 * get soft-enabled.
 	 */
-	if (need_wakeup) {
-		if (regs->softe) {
-			irq_enter();
-			perf_counter_do_pending();
-			irq_exit();
-		} else {
-			set_perf_counter_pending();
-		}
+	if (get_perf_counter_pending() && regs->softe) {
+		irq_enter();
+		clear_perf_counter_pending();
+		perf_counter_do_pending();
+		irq_exit();
 	}
 }
 
-- 
cgit v1.2.3


From 9aaa131a279834dff75c290c91f0058f62d72d46 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Sat, 21 Mar 2009 15:31:47 +1100
Subject: perf_counter: fix type/event_id layout on big-endian systems

Impact: build fix for powerpc

Commit db3a944aca35ae61 ("perf_counter: revamp syscall input ABI")
expanded the hw_event.type field into a union of structs containing
bitfields.  In particular it introduced a type field and a raw_type
field, with the intention that the 1-bit raw_type field should
overlay the most-significant bit of the 8-bit type field, and in fact
perf_counter_alloc() now assumes that (or at least, assumes that
raw_type doesn't overlay any of the bits that are 1 in the values of
PERF_TYPE_{HARDWARE,SOFTWARE,TRACEPOINT}).

Unfortunately this is not true on big-endian systems such as PowerPC,
where bitfields are laid out from left to right, i.e. from most
significant bit to least significant.  This means that setting
hw_event.type = PERF_TYPE_SOFTWARE will set hw_event.raw_type to 1.

This fixes it by making the layout depend on whether or not
__BIG_ENDIAN_BITFIELD is defined.  It's a bit ugly, but that's what
we get for using bitfields in a user/kernel ABI.

Also, that commit didn't fix up some places in arch/powerpc/kernel/
perf_counter.c where hw_event.raw and hw_event.event_id were used.
This fixes them too.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/perf_counter.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 830ca9c4494..6413d9c0313 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -602,12 +602,13 @@ hw_perf_counter_init(struct perf_counter *counter)
 		return NULL;
 	if ((s64)counter->hw_event.irq_period < 0)
 		return NULL;
-	ev = counter->hw_event.event_id;
-	if (!counter->hw_event.raw) {
-		if (ev >= ppmu->n_generic ||
-		    ppmu->generic_events[ev] == 0)
+	if (!counter->hw_event.raw_type) {
+		ev = counter->hw_event.event_id;
+		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
 			return NULL;
 		ev = ppmu->generic_events[ev];
+	} else {
+		ev = counter->hw_event.raw_event_id;
 	}
 	counter->hw.config_base = ev;
 	counter->hw.idx = 0;
-- 
cgit v1.2.3


From f4a2deb4860497f4332cf6a1acddab3dd628ddf0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 23 Mar 2009 18:22:06 +0100
Subject: perf_counter: remove the event config bitfields

Since the bitfields turned into a bit of a mess, remove them and rely on
good old masks.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090323172417.059499915@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 6 +++---
 arch/x86/kernel/cpu/perf_counter.c | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 6413d9c0313..d05651584d4 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -602,13 +602,13 @@ hw_perf_counter_init(struct perf_counter *counter)
 		return NULL;
 	if ((s64)counter->hw_event.irq_period < 0)
 		return NULL;
-	if (!counter->hw_event.raw_type) {
-		ev = counter->hw_event.event_id;
+	if (!perf_event_raw(&counter->hw_event)) {
+		ev = perf_event_id(&counter->hw_event);
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
 			return NULL;
 		ev = ppmu->generic_events[ev];
 	} else {
-		ev = counter->hw_event.raw_event_id;
+		ev = perf_event_config(&counter->hw_event);
 	}
 	counter->hw.config_base = ev;
 	counter->hw.idx = 0;
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 902282d68b0..3f95b0cdc55 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	/*
 	 * Raw event type provide the config in the event structure
 	 */
-	if (hw_event->raw_type) {
-		hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id);
+	if (perf_event_raw(hw_event)) {
+		hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event));
 	} else {
-		if (hw_event->event_id >= pmc_ops->max_events)
+		if (perf_event_id(hw_event) >= pmc_ops->max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= pmc_ops->event_map(hw_event->event_id);
+		hwc->config |= pmc_ops->event_map(perf_event_id(hw_event));
 	}
 	counter->wakeup_pending = 0;
 
-- 
cgit v1.2.3


From 37d81828385f8ff823caaaf1a83e72d065b6cfa1 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 23 Mar 2009 18:22:08 +0100
Subject: perf_counter: add an mmap method to allow userspace to read hardware
 counters

Impact: new feature giving performance improvement

This adds the ability for userspace to do an mmap on a hardware counter
fd and get access to a read-only page that contains the information
needed to translate a hardware counter value to the full 64-bit
counter value that would be returned by a read on the fd.  This is
useful on architectures that allow user programs to read the hardware
counters, such as PowerPC.

The mmap will only succeed if the counter is a hardware counter
monitoring the current process.

On my quad 2.5GHz PowerPC 970MP machine, userspace can read a counter
and translate it to the full 64-bit value in about 30ns using the
mmapped page, compared to about 830ns for the read syscall on the
counter, so this does give a significant performance improvement.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <20090323172417.297057964@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index d05651584d4..e4349281b07 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -417,6 +417,8 @@ void hw_perf_restore(u64 disable)
 		atomic64_set(&counter->hw.prev_count, val);
 		counter->hw.idx = hwc_index[i] + 1;
 		write_pmc(counter->hw.idx, val);
+		if (counter->user_page)
+			perf_counter_update_userpage(counter);
 	}
 	mb();
 	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
@@ -572,6 +574,8 @@ static void power_perf_disable(struct perf_counter *counter)
 			ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
 			write_pmc(counter->hw.idx, 0);
 			counter->hw.idx = 0;
+			if (counter->user_page)
+				perf_counter_update_userpage(counter);
 			break;
 		}
 	}
@@ -698,6 +702,8 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	write_pmc(counter->hw.idx, val);
 	atomic64_set(&counter->hw.prev_count, val);
 	atomic64_set(&counter->hw.period_left, left);
+	if (counter->user_page)
+		perf_counter_update_userpage(counter);
 
 	/*
 	 * Finally record data if requested.
-- 
cgit v1.2.3


From 7b732a75047738e4f85438ed2f9cd34bf5f2a19a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 23 Mar 2009 18:22:10 +0100
Subject: perf_counter: new output ABI - part 1

Impact: Rework the perfcounter output ABI

use sys_read() only for instant data and provide mmap() output for all
async overflow data.

The first mmap() determines the size of the output buffer. The mmap()
size must be a PAGE_SIZE multiple of 1+pages, where pages must be a
power of 2 or 0. Further mmap()s of the same fd must have the same
size. Once all maps are gone, you can again mmap() with a new size.

In case of 0 extra pages there is no data output and the first page
only contains meta data.

When there are data pages, a poll() event will be generated for each
full page of data. Furthermore, the output is circular. This means
that although 1 page is a valid configuration, its useless, since
we'll start overwriting it the instant we report a full page.

Future work will focus on the output format (currently maintained)
where we'll likey want each entry denoted by a header which includes a
type and length.

Further future work will allow to splice() the fd, also containing the
async overflow data -- splice() would be mutually exclusive with
mmap() of the data.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090323172417.470536358@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index e4349281b07..d48596ab655 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -417,8 +417,7 @@ void hw_perf_restore(u64 disable)
 		atomic64_set(&counter->hw.prev_count, val);
 		counter->hw.idx = hwc_index[i] + 1;
 		write_pmc(counter->hw.idx, val);
-		if (counter->user_page)
-			perf_counter_update_userpage(counter);
+		perf_counter_update_userpage(counter);
 	}
 	mb();
 	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
@@ -574,8 +573,7 @@ static void power_perf_disable(struct perf_counter *counter)
 			ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
 			write_pmc(counter->hw.idx, 0);
 			counter->hw.idx = 0;
-			if (counter->user_page)
-				perf_counter_update_userpage(counter);
+			perf_counter_update_userpage(counter);
 			break;
 		}
 	}
@@ -702,8 +700,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	write_pmc(counter->hw.idx, val);
 	atomic64_set(&counter->hw.prev_count, val);
 	atomic64_set(&counter->hw.period_left, left);
-	if (counter->user_page)
-		perf_counter_update_userpage(counter);
+	perf_counter_update_userpage(counter);
 
 	/*
 	 * Finally record data if requested.
-- 
cgit v1.2.3


From 53cfbf593758916aac41db728f029986a62f1254 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 25 Mar 2009 22:46:58 +1100
Subject: perf_counter: record time running and time enabled for each counter

Impact: new functionality

Currently, if there are more counters enabled than can fit on the CPU,
the kernel will multiplex the counters on to the hardware using
round-robin scheduling.  That isn't too bad for sampling counters, but
for counting counters it means that the value read from a counter
represents some unknown fraction of the true count of events that
occurred while the counter was enabled.

This remedies the situation by keeping track of how long each counter
is enabled for, and how long it is actually on the cpu and counting
events.  These times are recorded in nanoseconds using the task clock
for per-task counters and the cpu clock for per-cpu counters.

These values can be supplied to userspace on a read from the counter.
Userspace requests that they be supplied after the counter value by
setting the PERF_FORMAT_TOTAL_TIME_ENABLED and/or
PERF_FORMAT_TOTAL_TIME_RUNNING bits in the hw_event.read_format field
when creating the counter.  (There is no way to change the read format
after the counter is created, though it would be possible to add some
way to do that.)

Using this information it is possible for userspace to scale the count
it reads from the counter to get an estimate of the true count:

true_count_estimate = count * total_time_enabled / total_time_running

This also lets userspace detect the situation where the counter never
got to go on the cpu: total_time_running == 0.

This functionality has been requested by the PAPI developers, and will
be generally needed for interpreting the count values from counting
counters correctly.

In the implementation, this keeps 5 time values (in nanoseconds) for
each counter: total_time_enabled and total_time_running are used when
the counter is in state OFF or ERROR and for reporting back to
userspace.  When the counter is in state INACTIVE or ACTIVE, it is the
tstamp_enabled, tstamp_running and tstamp_stopped values that are
relevant, and total_time_enabled and total_time_running are determined
from them.  (tstamp_stopped is only used in INACTIVE state.)  The
reason for doing it like this is that it means that only counters
being enabled or disabled at sched-in and sched-out time need to be
updated.  There are no new loops that iterate over all counters to
update total_time_enabled or total_time_running.

This also keeps separate child_total_time_running and
child_total_time_enabled fields that get added in when reporting the
totals to userspace.  They are separate fields so that they can be
atomic.  We don't want to use atomics for total_time_running,
total_time_enabled etc., because then we would have to use atomic
sequences to update them, which are slower than regular arithmetic and
memory accesses.

It is possible to measure total_time_running by adding a task_clock
counter to each group of counters, and total_time_enabled can be
measured approximately with a top-level task_clock counter (though
inaccuracies will creep in if you need to disable and enable groups
since it is not possible in general to disable/enable the top-level
task_clock counter simultaneously with another group).  However, that
adds extra overhead - I measured around 15% increase in the context
switch latency reported by lat_ctx (from lmbench) when a task_clock
counter was added to each of 2 groups, and around 25% increase when a
task_clock counter was added to each of 4 groups.  (In both cases a
top-level task-clock counter was also added.)

In contrast, the code added in this commit gives better information
with no overhead that I could measure (in fact in some cases I
measured lower times with this code, but the differences were all less
than one standard deviation).

[ v2: address review comments by Andrew Morton. ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Orig-LKML-Reference: <18890.6578.728637.139402@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index d48596ab655..df007fe0cc0 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -455,6 +455,8 @@ static void counter_sched_in(struct perf_counter *counter, int cpu)
 {
 	counter->state = PERF_COUNTER_STATE_ACTIVE;
 	counter->oncpu = cpu;
+	counter->tstamp_running += counter->ctx->time_now -
+		counter->tstamp_stopped;
 	if (is_software_counter(counter))
 		counter->hw_ops->enable(counter);
 }
-- 
cgit v1.2.3


From 925d519ab82b6dd7aca9420d809ee83819c08db2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:02 +0200
Subject: perf_counter: unify and fix delayed counter wakeup

While going over the wakeup code I noticed delayed wakeups only work
for hardware counters but basically all software counters rely on
them.

This patch unifies and generalizes the delayed wakeup to fix this
issue.

Since we're dealing with NMI context bits here, use a cmpxchg() based
single link list implementation to track counters that have pending
wakeups.

[ This should really be generic code for delayed wakeups, but since we
  cannot use cmpxchg()/xchg() in generic code, I've let it live in the
  perf_counter code. -- Eric Dumazet could use it to aggregate the
  network wakeups. ]

Furthermore, the x86 method of using TIF flags was flawed in that its
quite possible to end up setting the bit on the idle task, loosing the
wakeup.

The powerpc method uses per-cpu storage and does appear to be
sufficient.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171023.153932974@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/include/asm/hw_irq.h   |  4 ++--
 arch/powerpc/kernel/irq.c           |  2 +-
 arch/powerpc/kernel/perf_counter.c  | 22 ++--------------------
 arch/x86/include/asm/perf_counter.h |  5 +++--
 arch/x86/include/asm/thread_info.h  |  4 +---
 arch/x86/kernel/cpu/perf_counter.c  | 29 -----------------------------
 arch/x86/kernel/signal.c            |  6 ------
 7 files changed, 9 insertions(+), 63 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index cb32d571c9c..20a44d0c9fd 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -132,7 +132,7 @@ static inline int irqs_disabled_flags(unsigned long flags)
 struct irq_chip;
 
 #ifdef CONFIG_PERF_COUNTERS
-static inline unsigned long get_perf_counter_pending(void)
+static inline unsigned long test_perf_counter_pending(void)
 {
 	unsigned long x;
 
@@ -160,7 +160,7 @@ extern void perf_counter_do_pending(void);
 
 #else
 
-static inline unsigned long get_perf_counter_pending(void)
+static inline unsigned long test_perf_counter_pending(void)
 {
 	return 0;
 }
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 469e9635ff0..2cd471f92fe 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -135,7 +135,7 @@ notrace void raw_local_irq_restore(unsigned long en)
 			iseries_handle_interrupts();
 	}
 
-	if (get_perf_counter_pending()) {
+	if (test_perf_counter_pending()) {
 		clear_perf_counter_pending();
 		perf_counter_do_pending();
 	}
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index df007fe0cc0..cde720fc495 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -649,24 +649,6 @@ hw_perf_counter_init(struct perf_counter *counter)
 	return &power_perf_ops;
 }
 
-/*
- * Handle wakeups.
- */
-void perf_counter_do_pending(void)
-{
-	int i;
-	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
-	struct perf_counter *counter;
-
-	for (i = 0; i < cpuhw->n_counters; ++i) {
-		counter = cpuhw->counter[i];
-		if (counter && counter->wakeup_pending) {
-			counter->wakeup_pending = 0;
-			wake_up(&counter->waitq);
-		}
-	}
-}
-
 /*
  * A counter has overflowed; update its count and record
  * things if requested.  Note that interrupts are hard-disabled
@@ -720,7 +702,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
 	struct perf_counter *counter;
 	long val;
-	int need_wakeup = 0, found = 0;
+	int found = 0;
 
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
@@ -761,7 +743,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	 * immediately; otherwise we'll have do the wakeup when interrupts
 	 * get soft-enabled.
 	 */
-	if (get_perf_counter_pending() && regs->softe) {
+	if (test_perf_counter_pending() && regs->softe) {
 		irq_enter();
 		clear_perf_counter_pending();
 		perf_counter_do_pending();
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 1662043b340..e2b0e66b235 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -84,8 +84,9 @@ union cpuid10_edx {
 #define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
 #define X86_PMC_IDX_FIXED_BUS_CYCLES			(X86_PMC_IDX_FIXED + 2)
 
-#define set_perf_counter_pending()	\
-		set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
+#define set_perf_counter_pending()	do { } while (0)
+#define clear_perf_counter_pending()	do { } while (0)
+#define test_perf_counter_pending()	(0)
 
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 3ffd5d2a367..8820a73ae09 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,7 +83,6 @@ struct thread_info {
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
-#define TIF_PERF_COUNTERS	11	/* notify perf counter work */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* 32bit process */
 #define TIF_FORK		18	/* ret_from_fork */
@@ -107,7 +106,6 @@ struct thread_info {
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
-#define _TIF_PERF_COUNTERS	(1 << TIF_PERF_COUNTERS)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
@@ -141,7 +139,7 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME)
+	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3f95b0cdc55..7aab177fb56 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -227,7 +227,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		 */
 		hwc->config |= pmc_ops->event_map(perf_event_id(hw_event));
 	}
-	counter->wakeup_pending = 0;
 
 	return 0;
 }
@@ -773,34 +772,6 @@ void smp_perf_counter_interrupt(struct pt_regs *regs)
 	irq_exit();
 }
 
-/*
- * This handler is triggered by NMI contexts:
- */
-void perf_counter_notify(struct pt_regs *regs)
-{
-	struct cpu_hw_counters *cpuc;
-	unsigned long flags;
-	int bit, cpu;
-
-	local_irq_save(flags);
-	cpu = smp_processor_id();
-	cpuc = &per_cpu(cpu_hw_counters, cpu);
-
-	for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) {
-		struct perf_counter *counter = cpuc->counters[bit];
-
-		if (!counter)
-			continue;
-
-		if (counter->wakeup_pending) {
-			counter->wakeup_pending = 0;
-			wake_up(&counter->waitq);
-		}
-	}
-
-	local_irq_restore(flags);
-}
-
 void perf_counters_lapic_init(int nmi)
 {
 	u32 apic_val;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 611615a92c9..0a813b17b17 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
  *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
  *  2000-2002   x86-64 support by Andi Kleen
  */
-#include <linux/perf_counter.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
@@ -872,11 +871,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 		tracehook_notify_resume(regs);
 	}
 
-	if (thread_info_flags & _TIF_PERF_COUNTERS) {
-		clear_thread_flag(TIF_PERF_COUNTERS);
-		perf_counter_notify(regs);
-	}
-
 #ifdef CONFIG_X86_32
 	clear_thread_flag(TIF_IRET);
 #endif /* CONFIG_X86_32 */
-- 
cgit v1.2.3


From 7595d63b3a9ce65d14c4fbd0e7de448a343d7215 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 30 Mar 2009 19:07:07 +0200
Subject: perf_counter: powerpc: only reserve PMU hardware when we need it

Impact: cooperate with oprofile

At present, on PowerPC, if you have perf_counters compiled in, oprofile
doesn't work.  There is code to allow the PMU to be shared between
competing subsystems, such as perf_counters and oprofile, but currently
the perf_counter subsystem reserves the PMU for itself at boot time,
and never releases it.

This makes perf_counter play nicely with oprofile.  Now we keep a count
of how many perf_counter instances are counting hardware events, and
reserve the PMU when that count becomes non-zero, and release the PMU
when that count becomes zero.  This means that it is possible to have
perf_counters compiled in and still use oprofile, as long as there are
no hardware perf_counters active.  This also means that if oprofile is
active, sys_perf_counter_open will fail if the hw_event specifies a
hardware event.

To avoid races with other tasks creating and destroying perf_counters,
we use a mutex.  We use atomic_inc_not_zero and atomic_add_unless to
avoid having to take the mutex unless there is a possibility of the
count going between 0 and 1.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <20090330171023.627912475@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 47 ++++++++++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index cde720fc495..560dd1e7b52 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -41,6 +41,8 @@ struct power_pmu *ppmu;
  */
 static unsigned int freeze_counters_kernel = MMCR0_FCS;
 
+static void perf_counter_interrupt(struct pt_regs *regs);
+
 void perf_counter_print_debug(void)
 {
 }
@@ -594,6 +596,24 @@ struct hw_perf_counter_ops power_perf_ops = {
 	.read = power_perf_read
 };
 
+/* Number of perf_counters counting hardware events */
+static atomic_t num_counters;
+/* Used to avoid races in calling reserve/release_pmc_hardware */
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+/*
+ * Release the PMU if this is the last perf_counter.
+ */
+static void hw_perf_counter_destroy(struct perf_counter *counter)
+{
+	if (!atomic_add_unless(&num_counters, -1, 1)) {
+		mutex_lock(&pmc_reserve_mutex);
+		if (atomic_dec_return(&num_counters) == 0)
+			release_pmc_hardware();
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+}
+
 const struct hw_perf_counter_ops *
 hw_perf_counter_init(struct perf_counter *counter)
 {
@@ -601,6 +621,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 	struct perf_counter *ctrs[MAX_HWCOUNTERS];
 	unsigned int events[MAX_HWCOUNTERS];
 	int n;
+	int err;
 
 	if (!ppmu)
 		return NULL;
@@ -646,6 +667,27 @@ hw_perf_counter_init(struct perf_counter *counter)
 
 	counter->hw.config = events[n];
 	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
+
+	/*
+	 * See if we need to reserve the PMU.
+	 * If no counters are currently in use, then we have to take a
+	 * mutex to ensure that we don't race with another task doing
+	 * reserve_pmc_hardware or release_pmc_hardware.
+	 */
+	err = 0;
+	if (!atomic_inc_not_zero(&num_counters)) {
+		mutex_lock(&pmc_reserve_mutex);
+		if (atomic_read(&num_counters) == 0 &&
+		    reserve_pmc_hardware(perf_counter_interrupt))
+			err = -EBUSY;
+		else
+			atomic_inc(&num_counters);
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+	counter->destroy = hw_perf_counter_destroy;
+
+	if (err)
+		return NULL;
 	return &power_perf_ops;
 }
 
@@ -769,11 +811,6 @@ static int init_perf_counters(void)
 {
 	unsigned long pvr;
 
-	if (reserve_pmc_hardware(perf_counter_interrupt)) {
-		printk(KERN_ERR "Couldn't init performance monitor subsystem\n");
-		return -EBUSY;
-	}
-
 	/* XXX should get this from cputable */
 	pvr = mfspr(SPRN_PVR);
 	switch (PVR_VER(pvr)) {
-- 
cgit v1.2.3


From d5d2bc0dd0379deddb9ede66fec90a3083eaec57 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 30 Mar 2009 19:07:08 +0200
Subject: perf_counter: make it possible for hw_perf_counter_init to return
 error codes

Impact: better error reporting

At present, if hw_perf_counter_init encounters an error, all it can do
is return NULL, which causes sys_perf_counter_open to return an EINVAL
error to userspace.  This isn't very informative for userspace; it means
that userspace can't tell the difference between "sorry, oprofile is
already using the PMU" and "we don't support this CPU" and "this CPU
doesn't support the requested generic hardware event".

This commit uses the PTR_ERR/ERR_PTR/IS_ERR set of macros to let
hw_perf_counter_init return an error code on error rather than just NULL
if it wishes.  If it does so, that error code will be returned from
sys_perf_counter_open to userspace.  If it returns NULL, an EINVAL
error will be returned to userspace, as before.

This also adapts the powerpc hw_perf_counter_init to make use of this
to return ENXIO, EINVAL, EBUSY, or EOPNOTSUPP as appropriate.  It would
be good to add extra error numbers in future to allow userspace to
distinguish the various errors that are currently reported as EINVAL,
i.e. irq_period < 0, too many events in a group, conflict between
exclude_* settings in a group, and PMU resource conflict in a group.

[ v2: fix a bug pointed out by Corey Ashford where error returns from
      hw_perf_counter_init were not handled correctly in the case of
      raw hardware events.]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <20090330171023.682428180@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 560dd1e7b52..0a4d14f279a 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -624,13 +624,13 @@ hw_perf_counter_init(struct perf_counter *counter)
 	int err;
 
 	if (!ppmu)
-		return NULL;
+		return ERR_PTR(-ENXIO);
 	if ((s64)counter->hw_event.irq_period < 0)
-		return NULL;
+		return ERR_PTR(-EINVAL);
 	if (!perf_event_raw(&counter->hw_event)) {
 		ev = perf_event_id(&counter->hw_event);
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
-			return NULL;
+			return ERR_PTR(-EOPNOTSUPP);
 		ev = ppmu->generic_events[ev];
 	} else {
 		ev = perf_event_config(&counter->hw_event);
@@ -656,14 +656,14 @@ hw_perf_counter_init(struct perf_counter *counter)
 		n = collect_events(counter->group_leader, ppmu->n_counter - 1,
 				   ctrs, events);
 		if (n < 0)
-			return NULL;
+			return ERR_PTR(-EINVAL);
 	}
 	events[n] = ev;
 	ctrs[n] = counter;
 	if (check_excludes(ctrs, n, 1))
-		return NULL;
+		return ERR_PTR(-EINVAL);
 	if (power_check_constraints(events, n + 1))
-		return NULL;
+		return ERR_PTR(-EINVAL);
 
 	counter->hw.config = events[n];
 	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
@@ -687,7 +687,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 	counter->destroy = hw_perf_counter_destroy;
 
 	if (err)
-		return NULL;
+		return ERR_PTR(err);
 	return &power_perf_ops;
 }
 
-- 
cgit v1.2.3


From 9ea98e191255ee642e64a5745014424fc63f83b0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:09 +0200
Subject: perf_counter: x86: proper error propagation for the x86
 hw_perf_counter_init()

Now that Paul cleaned up the error propagation paths, pass down the
x86 error as well.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171023.792822360@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 7aab177fb56..b8885ccd804 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -954,7 +954,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 
 	err = __hw_perf_counter_init(counter);
 	if (err)
-		return NULL;
+		return ERR_PTR(err);
 
 	return &x86_perf_counter_ops;
 }
-- 
cgit v1.2.3


From d7d59fb323833682b117b528d77eeb8ef587036a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:15 +0200
Subject: perf_counter: x86: callchain support

Provide the x86 perf_callchain() implementation.

Code based on the ftrace/sysprof code from Soeren Sandmann Pedersen.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Cc: Soeren Sandmann Pedersen <sandmann@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Orig-LKML-Reference: <20090330171024.341993293@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 154 +++++++++++++++++++++++++++++++++++++
 1 file changed, 154 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b8885ccd804..e16dfafc6d7 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -16,8 +16,10 @@
 #include <linux/module.h>
 #include <linux/kdebug.h>
 #include <linux/sched.h>
+#include <linux/uaccess.h>
 
 #include <asm/apic.h>
+#include <asm/stacktrace.h>
 
 static bool perf_counters_initialized __read_mostly;
 
@@ -958,3 +960,155 @@ hw_perf_counter_init(struct perf_counter *counter)
 
 	return &x86_perf_counter_ops;
 }
+
+/*
+ * callchain support
+ */
+
+static inline
+void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
+{
+	if (entry->nr < MAX_STACK_DEPTH)
+		entry->ip[entry->nr++] = ip;
+}
+
+static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
+static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
+
+
+static void
+backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+	/* Ignore warnings */
+}
+
+static void backtrace_warning(void *data, char *msg)
+{
+	/* Ignore warnings */
+}
+
+static int backtrace_stack(void *data, char *name)
+{
+	/* Don't bother with IRQ stacks for now */
+	return -1;
+}
+
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+	struct perf_callchain_entry *entry = data;
+
+	if (reliable)
+		callchain_store(entry, addr);
+}
+
+static const struct stacktrace_ops backtrace_ops = {
+	.warning		= backtrace_warning,
+	.warning_symbol		= backtrace_warning_symbol,
+	.stack			= backtrace_stack,
+	.address		= backtrace_address,
+};
+
+static void
+perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	unsigned long bp;
+	char *stack;
+
+	callchain_store(entry, instruction_pointer(regs));
+
+	stack = ((char *)regs + sizeof(struct pt_regs));
+#ifdef CONFIG_FRAME_POINTER
+	bp = frame_pointer(regs);
+#else
+	bp = 0;
+#endif
+
+	dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
+}
+
+
+struct stack_frame {
+	const void __user	*next_fp;
+	unsigned long		return_address;
+};
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+	int ret;
+
+	if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+		return 0;
+
+	ret = 1;
+	pagefault_disable();
+	if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+		ret = 0;
+	pagefault_enable();
+
+	return ret;
+}
+
+static void
+perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	struct stack_frame frame;
+	const void __user *fp;
+
+	regs = (struct pt_regs *)current->thread.sp0 - 1;
+	fp   = (void __user *)regs->bp;
+
+	callchain_store(entry, regs->ip);
+
+	while (entry->nr < MAX_STACK_DEPTH) {
+		frame.next_fp	     = NULL;
+		frame.return_address = 0;
+
+		if (!copy_stack_frame(fp, &frame))
+			break;
+
+		if ((unsigned long)fp < user_stack_pointer(regs))
+			break;
+
+		callchain_store(entry, frame.return_address);
+		fp = frame.next_fp;
+	}
+}
+
+static void
+perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	int is_user;
+
+	if (!regs)
+		return;
+
+	is_user = user_mode(regs);
+
+	if (!current || current->pid == 0)
+		return;
+
+	if (is_user && current->state != TASK_RUNNING)
+		return;
+
+	if (!is_user)
+		perf_callchain_kernel(regs, entry);
+
+	if (current->mm)
+		perf_callchain_user(regs, entry);
+}
+
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+	struct perf_callchain_entry *entry;
+
+	if (in_nmi())
+		entry = &__get_cpu_var(nmi_entry);
+	else
+		entry = &__get_cpu_var(irq_entry);
+
+	entry->nr = 0;
+
+	perf_do_callchain(regs, entry);
+
+	return entry;
+}
-- 
cgit v1.2.3


From 4e935e47177c3b26cf383e79849bae2a464d0160 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:16 +0200
Subject: perf_counter: pmc arbitration

Follow the example set by powerpc and try to play nice with oprofile
and the nmi watchdog.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171024.459968444@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 75 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index e16dfafc6d7..2a946a160ca 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -20,6 +20,7 @@
 
 #include <asm/apic.h>
 #include <asm/stacktrace.h>
+#include <asm/nmi.h>
 
 static bool perf_counters_initialized __read_mostly;
 
@@ -172,6 +173,65 @@ again:
 	atomic64_sub(delta, &hwc->period_left);
 }
 
+static atomic_t num_counters;
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+static bool reserve_pmc_hardware(void)
+{
+	int i;
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		disable_lapic_nmi_watchdog();
+
+	for (i = 0; i < nr_counters_generic; i++) {
+		if (!reserve_perfctr_nmi(pmc_ops->perfctr + i))
+			goto perfctr_fail;
+	}
+
+	for (i = 0; i < nr_counters_generic; i++) {
+		if (!reserve_evntsel_nmi(pmc_ops->eventsel + i))
+			goto eventsel_fail;
+	}
+
+	return true;
+
+eventsel_fail:
+	for (i--; i >= 0; i--)
+		release_evntsel_nmi(pmc_ops->eventsel + i);
+
+	i = nr_counters_generic;
+
+perfctr_fail:
+	for (i--; i >= 0; i--)
+		release_perfctr_nmi(pmc_ops->perfctr + i);
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		enable_lapic_nmi_watchdog();
+
+	return false;
+}
+
+static void release_pmc_hardware(void)
+{
+	int i;
+
+	for (i = 0; i < nr_counters_generic; i++) {
+		release_perfctr_nmi(pmc_ops->perfctr + i);
+		release_evntsel_nmi(pmc_ops->eventsel + i);
+	}
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		enable_lapic_nmi_watchdog();
+}
+
+static void hw_perf_counter_destroy(struct perf_counter *counter)
+{
+	if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) {
+		release_pmc_hardware();
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+}
+
 /*
  * Setup the hardware configuration for a given hw_event_type
  */
@@ -179,10 +239,23 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 {
 	struct perf_counter_hw_event *hw_event = &counter->hw_event;
 	struct hw_perf_counter *hwc = &counter->hw;
+	int err;
 
 	if (unlikely(!perf_counters_initialized))
 		return -EINVAL;
 
+	err = 0;
+	if (atomic_inc_not_zero(&num_counters)) {
+		mutex_lock(&pmc_reserve_mutex);
+		if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware())
+			err = -EBUSY;
+		else
+			atomic_inc(&num_counters);
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+	if (err)
+		return err;
+
 	/*
 	 * Generate PMC IRQs:
 	 * (keep 'enabled' bit clear for now)
@@ -230,6 +303,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->config |= pmc_ops->event_map(perf_event_id(hw_event));
 	}
 
+	counter->destroy = hw_perf_counter_destroy;
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 5872bdb88a35fae7d224bd6b21e5f377e854ccfc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 2 Apr 2009 11:12:03 +0200
Subject: perf_counter: add more context information

Put in counts to tell which ips belong to what context.

  -----
   | |  hv
   | --
nr | |  kernel
   | --
   | |  user
  -----

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Orig-LKML-Reference: <20090402091319.493101305@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 2a946a160ca..c74e20d593a 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1088,6 +1088,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
 	unsigned long bp;
 	char *stack;
+	int nr = entry->nr;
 
 	callchain_store(entry, instruction_pointer(regs));
 
@@ -1099,6 +1100,8 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
 #endif
 
 	dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
+
+	entry->kernel = entry->nr - nr;
 }
 
 
@@ -1128,6 +1131,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
 	struct stack_frame frame;
 	const void __user *fp;
+	int nr = entry->nr;
 
 	regs = (struct pt_regs *)current->thread.sp0 - 1;
 	fp   = (void __user *)regs->bp;
@@ -1147,6 +1151,8 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
 		callchain_store(entry, frame.return_address);
 		fp = frame.next_fp;
 	}
+
+	entry->user = entry->nr - nr;
 }
 
 static void
@@ -1182,6 +1188,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 		entry = &__get_cpu_var(irq_entry);
 
 	entry->nr = 0;
+	entry->hv = 0;
+	entry->kernel = 0;
+	entry->user = 0;
 
 	perf_do_callchain(regs, entry);
 
-- 
cgit v1.2.3


From b6276f353bf490add62dcf7db0ebd75baa3e1a37 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:03 +0200
Subject: perf_counter: x86: self-IPI for pending work

Implement set_perf_counter_pending() with a self-IPI so that it will
run ASAP in a usable context.

For now use a second IRQ vector, because the primary vector pokes
the apic in funny ways that seem to confuse things.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094517.724626696@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/entry_arch.h   |  1 +
 arch/x86/include/asm/hardirq.h      |  1 +
 arch/x86/include/asm/hw_irq.h       |  1 +
 arch/x86/include/asm/irq_vectors.h  |  5 +++++
 arch/x86/include/asm/perf_counter.h |  3 ++-
 arch/x86/kernel/cpu/perf_counter.c  | 14 ++++++++++++++
 arch/x86/kernel/entry_64.S          |  2 ++
 arch/x86/kernel/irq.c               |  5 +++++
 arch/x86/kernel/irqinit_32.c        |  1 +
 arch/x86/kernel/irqinit_64.c        |  1 +
 10 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index c2e6bedaf25..fe24d280249 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -50,6 +50,7 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 
 #ifdef CONFIG_PERF_COUNTERS
 BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
+BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
 #endif
 
 #ifdef CONFIG_X86_MCE_P4THERMAL
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 25454427cee..f5ebe2aaca4 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -14,6 +14,7 @@ typedef struct {
 #endif
 	unsigned int generic_irqs;	/* arch dependent */
 	unsigned int apic_perf_irqs;
+	unsigned int apic_pending_irqs;
 #ifdef CONFIG_SMP
 	unsigned int irq_resched_count;
 	unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index ae80f64973e..7309c0ad690 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -30,6 +30,7 @@ extern void apic_timer_interrupt(void);
 extern void generic_interrupt(void);
 extern void error_interrupt(void);
 extern void perf_counter_interrupt(void);
+extern void perf_pending_interrupt(void);
 
 extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 3cbd79bbb47..545bb811ccb 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -116,6 +116,11 @@
  */
 #define GENERIC_INTERRUPT_VECTOR	0xed
 
+/*
+ * Performance monitoring pending work vector:
+ */
+#define LOCAL_PENDING_VECTOR		0xec
+
 /*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index e2b0e66b235..d08dd52cb8f 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -84,7 +84,8 @@ union cpuid10_edx {
 #define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
 #define X86_PMC_IDX_FIXED_BUS_CYCLES			(X86_PMC_IDX_FIXED + 2)
 
-#define set_perf_counter_pending()	do { } while (0)
+extern void set_perf_counter_pending(void);
+
 #define clear_perf_counter_pending()	do { } while (0)
 #define test_perf_counter_pending()	(0)
 
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index c74e20d593a..438415866fe 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -849,6 +849,20 @@ void smp_perf_counter_interrupt(struct pt_regs *regs)
 	irq_exit();
 }
 
+void smp_perf_pending_interrupt(struct pt_regs *regs)
+{
+	irq_enter();
+	ack_APIC_irq();
+	inc_irq_stat(apic_pending_irqs);
+	perf_counter_do_pending();
+	irq_exit();
+}
+
+void set_perf_counter_pending(void)
+{
+	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+}
+
 void perf_counters_lapic_init(int nmi)
 {
 	u32 apic_val;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3f129d963a0..1d46cba56fd 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1028,6 +1028,8 @@ apicinterrupt SPURIOUS_APIC_VECTOR \
 #ifdef CONFIG_PERF_COUNTERS
 apicinterrupt LOCAL_PERF_VECTOR \
 	perf_counter_interrupt smp_perf_counter_interrupt
+apicinterrupt LOCAL_PENDING_VECTOR \
+	perf_pending_interrupt smp_perf_pending_interrupt
 #endif
 
 /*
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 9c2754302ec..d465487da58 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -67,6 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
 	seq_printf(p, "  Performance counter interrupts\n");
+	seq_printf(p, "PND: ");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
+	seq_printf(p, "  Performance pending work\n");
 #endif
 	if (generic_interrupt_extension) {
 		seq_printf(p, "PLT: ");
@@ -171,6 +175,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 	sum += irq_stats(cpu)->apic_timer_irqs;
 	sum += irq_stats(cpu)->irq_spurious_count;
 	sum += irq_stats(cpu)->apic_perf_irqs;
+	sum += irq_stats(cpu)->apic_pending_irqs;
 #endif
 	if (generic_interrupt_extension)
 		sum += irq_stats(cpu)->generic_irqs;
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 925d87cfc55..3190a6b961e 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -166,6 +166,7 @@ static void __init apic_intr_init(void)
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 # ifdef CONFIG_PERF_COUNTERS
 	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
 # endif
 
 # ifdef CONFIG_X86_MCE_P4THERMAL
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 665e2ab48ab..53ceb26f80f 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -156,6 +156,7 @@ static void __init apic_intr_init(void)
 	/* Performance monitoring interrupt: */
 #ifdef CONFIG_PERF_COUNTERS
 	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
 #endif
 }
 
-- 
cgit v1.2.3


From f6c7d5fe58b4846ee0cb4b98b6042489705eced4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:04 +0200
Subject: perf_counter: theres more to overflow than writing events

Prepare for more generic overflow handling. The new perf_counter_overflow()
method will handle the generic bits of the counter overflow, and can return
a !0 return value, in which case the counter should be (soft) disabled, so
that it won't count until it's properly disabled.

XXX: do powerpc and swcounter

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094517.812109629@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 2 +-
 arch/x86/kernel/cpu/perf_counter.c | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 0a4d14f279a..f88c35d0710 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -732,7 +732,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 * Finally record data if requested.
 	 */
 	if (record)
-		perf_counter_output(counter, 1, regs);
+		perf_counter_overflow(counter, 1, regs);
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 438415866fe..1116a41bc7b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -800,7 +800,8 @@ again:
 			continue;
 
 		perf_save_and_restart(counter);
-		perf_counter_output(counter, nmi, regs);
+		if (perf_counter_overflow(counter, nmi, regs))
+			__pmc_generic_disable(counter, &counter->hw, bit);
 	}
 
 	hw_perf_ack_status(ack);
-- 
cgit v1.2.3


From 98c2aaf8be5baf7193be37fb28bce8e7327158bc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 7 Apr 2009 11:30:17 +0200
Subject: x86, perfcounters: add atomic64_xchg()

Complete atomic64_t support on the 32-bit side by adding atomic64_xch().

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090406094518.445450972@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/atomic_32.h | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 977250ed8b8..aff9f1fcdcd 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -291,19 +291,37 @@ atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
 }
 
 /**
- * atomic64_set - set atomic64 variable
+ * atomic64_xchg - xchg atomic64 variable
  * @ptr:      pointer to type atomic64_t
  * @new_val:  value to assign
+ * @old_val:  old value that was there
  *
- * Atomically sets the value of @ptr to @new_val.
+ * Atomically xchgs the value of @ptr to @new_val and returns
+ * the old value.
  */
-static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
+
+static inline unsigned long long
+atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
 {
 	unsigned long long old_val;
 
 	do {
 		old_val = atomic_read(ptr);
 	} while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+
+	return old_val;
+}
+
+/**
+ * atomic64_set - set atomic64 variable
+ * @ptr:      pointer to type atomic64_t
+ * @new_val:  value to assign
+ *
+ * Atomically sets the value of @ptr to @new_val.
+ */
+static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
+{
+	atomic64_xchg(ptr, new_val);
 }
 
 /**
-- 
cgit v1.2.3


From dc66270b51a62b1a6888d5309229e638a305c47b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 8 Apr 2009 20:30:10 +1000
Subject: perf_counter: fix powerpc build

Commit 4af4998b ("perf_counter: rework context time") changed struct
perf_counter_context to have a 'time' field instead of a 'time_now'
field, but neglected to fix the place in the powerpc perf_counter.c
where the time_now field was accessed.  This fixes it.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18908.31922.411398.147810@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index f88c35d0710..0e5651385dd 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -457,8 +457,7 @@ static void counter_sched_in(struct perf_counter *counter, int cpu)
 {
 	counter->state = PERF_COUNTER_STATE_ACTIVE;
 	counter->oncpu = cpu;
-	counter->tstamp_running += counter->ctx->time_now -
-		counter->tstamp_stopped;
+	counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped;
 	if (is_software_counter(counter))
 		counter->hw_ops->enable(counter);
 }
-- 
cgit v1.2.3


From f708223d49ac39f5af1643985056206c98033f5b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 8 Apr 2009 20:30:18 +1000
Subject: perf_counter: powerpc: set sample enable bit for marked instruction
 events

Impact: enable access to hardware feature

POWER processors have the ability to "mark" a subset of the instructions
and provide more detailed information on what happens to the marked
instructions as they flow through the pipeline.  This marking is
enabled by the "sample enable" bit in MMCRA, and there are
synchronization requirements around setting and clearing the bit.

This adds logic to the processor-specific back-ends so that they know
which events relate to marked instructions and set the sampling enable
bit if any event that we want to put on the PMU is a marked instruction
event.  It also adds logic to the generic powerpc code to do the
necessary synchronization if that bit is set.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18908.31930.1024.228867@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c |  28 +++++++--
 arch/powerpc/kernel/power5+-pmu.c  | 103 +++++++++++++++++++++++++++++-
 arch/powerpc/kernel/power5-pmu.c   |  96 +++++++++++++++++++++++++++-
 arch/powerpc/kernel/power6-pmu.c   | 126 ++++++++++++++++++++++++++++++++++++-
 arch/powerpc/kernel/ppc970-pmu.c   |  72 ++++++++++++++++++++-
 5 files changed, 413 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 0e5651385dd..0697ade84dd 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -306,6 +306,15 @@ u64 hw_perf_save_disable(void)
 			cpuhw->pmcs_enabled = 1;
 		}
 
+		/*
+		 * Disable instruction sampling if it was enabled
+		 */
+		if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+			mtspr(SPRN_MMCRA,
+			      cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+			mb();
+		}
+
 		/*
 		 * Set the 'freeze counters' bit.
 		 * The barrier is to make sure the mtspr has been
@@ -347,12 +356,11 @@ void hw_perf_restore(u64 disable)
 	 * (possibly updated for removal of counters).
 	 */
 	if (!cpuhw->n_added) {
-		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+		mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
 		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
-		mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
 		if (cpuhw->n_counters == 0)
 			get_lppaca()->pmcregs_in_use = 0;
-		goto out;
+		goto out_enable;
 	}
 
 	/*
@@ -385,7 +393,7 @@ void hw_perf_restore(u64 disable)
 	 * Then unfreeze the counters.
 	 */
 	get_lppaca()->pmcregs_in_use = 1;
-	mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+	mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
 	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
 	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
 				| MMCR0_FC);
@@ -421,10 +429,20 @@ void hw_perf_restore(u64 disable)
 		write_pmc(counter->hw.idx, val);
 		perf_counter_update_userpage(counter);
 	}
-	mb();
 	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
+
+ out_enable:
+	mb();
 	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
 
+	/*
+	 * Enable instruction sampling if necessary
+	 */
+	if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+		mb();
+		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+	}
+
  out:
 	local_irq_restore(flags);
 }
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index cec21ea65b0..1222c8ea3c2 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -1,5 +1,5 @@
 /*
- * Performance counter support for POWER5 (not POWER5++) processors.
+ * Performance counter support for POWER5+/++ (not POWER5) processors.
  *
  * Copyright 2009 Paul Mackerras, IBM Corporation.
  *
@@ -281,10 +281,107 @@ static int power5p_get_alternatives(unsigned int event, unsigned int alt[])
 	return nalt;
 }
 
+/*
+ * Map of which direct events on which PMCs are marked instruction events.
+ * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
+ * Bit 0 is set if it is marked for all PMCs.
+ * The 0x80 bit indicates a byte decode PMCSEL value.
+ */
+static unsigned char direct_event_is_marked[0x28] = {
+	0,	/* 00 */
+	0x1f,	/* 01 PM_IOPS_CMPL */
+	0x2,	/* 02 PM_MRK_GRP_DISP */
+	0xe,	/* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
+	0,	/* 04 */
+	0x1c,	/* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
+	0x80,	/* 06 */
+	0x80,	/* 07 */
+	0, 0, 0,/* 08 - 0a */
+	0x18,	/* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
+	0,	/* 0c */
+	0x80,	/* 0d */
+	0x80,	/* 0e */
+	0,	/* 0f */
+	0,	/* 10 */
+	0x14,	/* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
+	0,	/* 12 */
+	0x10,	/* 13 PM_MRK_GRP_CMPL */
+	0x1f,	/* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
+	0x2,	/* 15 PM_MRK_GRP_ISSUED */
+	0x80,	/* 16 */
+	0x80,	/* 17 */
+	0, 0, 0, 0, 0,
+	0x80,	/* 1d */
+	0x80,	/* 1e */
+	0,	/* 1f */
+	0x80,	/* 20 */
+	0x80,	/* 21 */
+	0x80,	/* 22 */
+	0x80,	/* 23 */
+	0x80,	/* 24 */
+	0x80,	/* 25 */
+	0x80,	/* 26 */
+	0x80,	/* 27 */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power5p_marked_instr_event(unsigned int event)
+{
+	int pmc, psel;
+	int bit, byte, unit;
+	u32 mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if (pmc >= 5)
+		return 0;
+
+	bit = -1;
+	if (psel < sizeof(direct_event_is_marked)) {
+		if (direct_event_is_marked[psel] & (1 << pmc))
+			return 1;
+		if (direct_event_is_marked[psel] & 0x80)
+			bit = 4;
+		else if (psel == 0x08)
+			bit = pmc - 1;
+		else if (psel == 0x10)
+			bit = 4 - pmc;
+		else if (psel == 0x1b && (pmc == 1 || pmc == 3))
+			bit = 4;
+	} else if ((psel & 0x48) == 0x40) {
+		bit = psel & 7;
+	} else if (psel == 0x28) {
+		bit = pmc - 1;
+	} else if (pmc == 3 && (psel == 0x2e || psel == 0x2f)) {
+		bit = 4;
+	}
+
+	if (!(event & PM_BUSEVENT_MSK) || bit == -1)
+		return 0;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	if (unit == PM_LSU0) {
+		/* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
+		mask = 0x5dff00;
+	} else if (unit == PM_LSU1 && byte >= 4) {
+		byte -= 4;
+		/* byte 5 bits 6-7, byte 6 bits 0,4, byte 7 bits 0-4,6 */
+		mask = 0x5f11c000;
+	} else
+		return 0;
+
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
 static int power5p_compute_mmcr(unsigned int event[], int n_ev,
 				unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr1 = 0;
+	u64 mmcra = 0;
 	unsigned int pmc, unit, byte, psel;
 	unsigned int ttm;
 	int i, isbus, bit, grsel;
@@ -404,6 +501,8 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev,
 			grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
 			mmcr1 |= (u64)grsel << grsel_shift[bit];
 		}
+		if (power5p_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
 		if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1))
 			/* select alternate byte lane */
 			psel |= 0x10;
@@ -419,7 +518,7 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev,
 	if (pmc_inuse & 0x3e)
 		mmcr[0] |= MMCR0_PMCjCE;
 	mmcr[1] = mmcr1;
-	mmcr[2] = 0;
+	mmcr[2] = mmcra;
 	return 0;
 }
 
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 379ed1087cc..116c4bb1809 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -290,10 +290,102 @@ static int power5_get_alternatives(unsigned int event, unsigned int alt[])
 	return nalt;
 }
 
+/*
+ * Map of which direct events on which PMCs are marked instruction events.
+ * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
+ * Bit 0 is set if it is marked for all PMCs.
+ * The 0x80 bit indicates a byte decode PMCSEL value.
+ */
+static unsigned char direct_event_is_marked[0x28] = {
+	0,	/* 00 */
+	0x1f,	/* 01 PM_IOPS_CMPL */
+	0x2,	/* 02 PM_MRK_GRP_DISP */
+	0xe,	/* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
+	0,	/* 04 */
+	0x1c,	/* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
+	0x80,	/* 06 */
+	0x80,	/* 07 */
+	0, 0, 0,/* 08 - 0a */
+	0x18,	/* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
+	0,	/* 0c */
+	0x80,	/* 0d */
+	0x80,	/* 0e */
+	0,	/* 0f */
+	0,	/* 10 */
+	0x14,	/* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
+	0,	/* 12 */
+	0x10,	/* 13 PM_MRK_GRP_CMPL */
+	0x1f,	/* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
+	0x2,	/* 15 PM_MRK_GRP_ISSUED */
+	0x80,	/* 16 */
+	0x80,	/* 17 */
+	0, 0, 0, 0, 0,
+	0x80,	/* 1d */
+	0x80,	/* 1e */
+	0,	/* 1f */
+	0x80,	/* 20 */
+	0x80,	/* 21 */
+	0x80,	/* 22 */
+	0x80,	/* 23 */
+	0x80,	/* 24 */
+	0x80,	/* 25 */
+	0x80,	/* 26 */
+	0x80,	/* 27 */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power5_marked_instr_event(unsigned int event)
+{
+	int pmc, psel;
+	int bit, byte, unit;
+	u32 mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if (pmc >= 5)
+		return 0;
+
+	bit = -1;
+	if (psel < sizeof(direct_event_is_marked)) {
+		if (direct_event_is_marked[psel] & (1 << pmc))
+			return 1;
+		if (direct_event_is_marked[psel] & 0x80)
+			bit = 4;
+		else if (psel == 0x08)
+			bit = pmc - 1;
+		else if (psel == 0x10)
+			bit = 4 - pmc;
+		else if (psel == 0x1b && (pmc == 1 || pmc == 3))
+			bit = 4;
+	} else if ((psel & 0x58) == 0x40)
+		bit = psel & 7;
+
+	if (!(event & PM_BUSEVENT_MSK))
+		return 0;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	if (unit == PM_LSU0) {
+		/* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
+		mask = 0x5dff00;
+	} else if (unit == PM_LSU1 && byte >= 4) {
+		byte -= 4;
+		/* byte 4 bits 1,3,5,7, byte 5 bits 6-7, byte 7 bits 0-4,6 */
+		mask = 0x5f00c0aa;
+	} else
+		return 0;
+
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
 static int power5_compute_mmcr(unsigned int event[], int n_ev,
 			       unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr1 = 0;
+	u64 mmcra = 0;
 	unsigned int pmc, unit, byte, psel;
 	unsigned int ttm, grp;
 	int i, isbus, bit, grsel;
@@ -430,6 +522,8 @@ static int power5_compute_mmcr(unsigned int event[], int n_ev,
 			grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
 			mmcr1 |= (u64)grsel << grsel_shift[bit];
 		}
+		if (power5_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
 		if (pmc <= 3)
 			mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
 		hwc[i] = pmc;
@@ -442,7 +536,7 @@ static int power5_compute_mmcr(unsigned int event[], int n_ev,
 	if (pmc_inuse & 0x3e)
 		mmcr[0] |= MMCR0_PMCjCE;
 	mmcr[1] = mmcr1;
-	mmcr[2] = 0;
+	mmcr[2] = mmcra;
 	return 0;
 }
 
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index b1f61f3c97b..fce1fc290a1 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -48,6 +48,127 @@
 #define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
 #define MMCR1_PMCSEL_MSK	0xff
 
+/*
+ * Map of which direct events on which PMCs are marked instruction events.
+ * Indexed by PMCSEL value >> 1.
+ * Bottom 4 bits are a map of which PMCs are interesting,
+ * top 4 bits say what sort of event:
+ *   0 = direct marked event,
+ *   1 = byte decode event,
+ *   4 = add/and event (PMC1 -> bits 0 & 4),
+ *   5 = add/and event (PMC1 -> bits 1 & 5),
+ *   6 = add/and event (PMC1 -> bits 2 & 6),
+ *   7 = add/and event (PMC1 -> bits 3 & 7).
+ */
+static unsigned char direct_event_is_marked[0x60 >> 1] = {
+	0,	/* 00 */
+	0,	/* 02 */
+	0,	/* 04 */
+	0x07,	/* 06 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
+	0x04,	/* 08 PM_MRK_DFU_FIN */
+	0x06,	/* 0a PM_MRK_IFU_FIN, PM_MRK_INST_FIN */
+	0,	/* 0c */
+	0,	/* 0e */
+	0x02,	/* 10 PM_MRK_INST_DISP */
+	0x08,	/* 12 PM_MRK_LSU_DERAT_MISS */
+	0,	/* 14 */
+	0,	/* 16 */
+	0x0c,	/* 18 PM_THRESH_TIMEO, PM_MRK_INST_FIN */
+	0x0f,	/* 1a PM_MRK_INST_DISP, PM_MRK_{FXU,FPU,LSU}_FIN */
+	0x01,	/* 1c PM_MRK_INST_ISSUED */
+	0,	/* 1e */
+	0,	/* 20 */
+	0,	/* 22 */
+	0,	/* 24 */
+	0,	/* 26 */
+	0x15,	/* 28 PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L3MISS */
+	0,	/* 2a */
+	0,	/* 2c */
+	0,	/* 2e */
+	0x4f,	/* 30 */
+	0x7f,	/* 32 */
+	0x4f,	/* 34 */
+	0x5f,	/* 36 */
+	0x6f,	/* 38 */
+	0x4f,	/* 3a */
+	0,	/* 3c */
+	0x08,	/* 3e PM_MRK_INST_TIMEO */
+	0x1f,	/* 40 */
+	0x1f,	/* 42 */
+	0x1f,	/* 44 */
+	0x1f,	/* 46 */
+	0x1f,	/* 48 */
+	0x1f,	/* 4a */
+	0x1f,	/* 4c */
+	0x1f,	/* 4e */
+	0,	/* 50 */
+	0x05,	/* 52 PM_MRK_BR_TAKEN, PM_MRK_BR_MPRED */
+	0x1c,	/* 54 PM_MRK_PTEG_FROM_L3MISS, PM_MRK_PTEG_FROM_L2MISS */
+	0x02,	/* 56 PM_MRK_LD_MISS_L1 */
+	0,	/* 58 */
+	0,	/* 5a */
+	0,	/* 5c */
+	0,	/* 5e */
+};
+
+/*
+ * Masks showing for each unit which bits are marked events.
+ * These masks are in LE order, i.e. 0x00000001 is byte 0, bit 0.
+ */
+static u32 marked_bus_events[16] = {
+	0x01000000,	/* direct events set 1: byte 3 bit 0 */
+	0x00010000,	/* direct events set 2: byte 2 bit 0 */
+	0, 0, 0, 0,	/* IDU, IFU, nest: nothing */
+	0x00000088,	/* VMX set 1: byte 0 bits 3, 7 */
+	0x000000c0,	/* VMX set 2: byte 0 bits 4-7 */
+	0x04010000,	/* LSU set 1: byte 2 bit 0, byte 3 bit 2 */
+	0xff010000u,	/* LSU set 2: byte 2 bit 0, all of byte 3 */
+	0,		/* LSU set 3 */
+	0x00000010,	/* VMX set 3: byte 0 bit 4 */
+	0,		/* BFP set 1 */
+	0x00000022,	/* BFP set 2: byte 0 bits 1, 5 */
+	0, 0
+};
+	
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power6_marked_instr_event(unsigned int event)
+{
+	int pmc, psel, ptype;
+	int bit, byte, unit;
+	u32 mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = (event & PM_PMCSEL_MSK) >> 1;	/* drop edge/level bit */
+	if (pmc >= 5)
+		return 0;
+
+	bit = -1;
+	if (psel < sizeof(direct_event_is_marked)) {
+		ptype = direct_event_is_marked[psel];
+		if (pmc == 0 || !(ptype & (1 << (pmc - 1))))
+			return 0;
+		ptype >>= 4;
+		if (ptype == 0)
+			return 1;
+		if (ptype == 1)
+			bit = 0;
+		else
+			bit = ptype ^ (pmc - 1);
+	} else if ((psel & 0x48) == 0x40)
+		bit = psel & 7;
+
+	if (!(event & PM_BUSEVENT_MSK) || bit == -1)
+		return 0;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	mask = marked_bus_events[unit];
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
 /*
  * Assign PMC numbers and compute MMCR1 value for a set of events
  */
@@ -55,6 +176,7 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
 			   unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr1 = 0;
+	u64 mmcra = 0;
 	int i;
 	unsigned int pmc, ev, b, u, s, psel;
 	unsigned int ttmset = 0;
@@ -116,6 +238,8 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
 			if (ev & PM_LLAV)
 				mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc;
 		}
+		if (power6_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
 		mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
 	}
 	mmcr[0] = 0;
@@ -124,7 +248,7 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
 	if (pmc_inuse & 0xe)
 		mmcr[0] |= MMCR0_PMCjCE;
 	mmcr[1] = mmcr1;
-	mmcr[2] = 0;
+	mmcr[2] = mmcra;
 	return 0;
 }
 
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index c3256580be1..aed8ccd7c07 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -19,6 +19,8 @@
 #define PM_PMC_MSK	0xf
 #define PM_UNIT_SH	8	/* TTMMUX number and setting - unit select */
 #define PM_UNIT_MSK	0xf
+#define PM_SPCSEL_SH	6
+#define PM_SPCSEL_MSK	3
 #define PM_BYTE_SH	4	/* Byte number of event bus to use */
 #define PM_BYTE_MSK	3
 #define PM_PMCSEL_MSK	0xf
@@ -88,8 +90,11 @@ static short mmcr1_adder_bits[8] = {
  * Layout of constraint bits:
  * 6666555555555544444444443333333333222222222211111111110000000000
  * 3210987654321098765432109876543210987654321098765432109876543210
- *                 <><>[  >[  >[  ><  ><  ><  ><  ><><><><><><><><>
- *                 T0T1 UC  PS1 PS2 B0  B1  B2  B3 P1P2P3P4P5P6P7P8
+ *               <><><>[  >[  >[  ><  ><  ><  ><  ><><><><><><><><>
+ *               SPT0T1 UC  PS1 PS2 B0  B1  B2  B3 P1P2P3P4P5P6P7P8
+ *
+ * SP - SPCSEL constraint
+ *     48-49: SPCSEL value 0x3_0000_0000_0000
  *
  * T0 - TTM0 constraint
  *     46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000
@@ -126,6 +131,57 @@ static short mmcr1_adder_bits[8] = {
  *     0-13: Count of events needing PMC2..PMC8
  */
 
+static unsigned char direct_marked_event[8] = {
+	(1<<2) | (1<<3),	/* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
+	(1<<3) | (1<<5),	/* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
+	(1<<3) | (1<<5),	/* PMC3: PM_MRK_ST_CMPL_INT, PM_MRK_VMX_FIN */
+	(1<<4) | (1<<5),	/* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
+	(1<<4) | (1<<5),	/* PMC5: PM_GRP_MRK, PM_MRK_GRP_TIMEO */
+	(1<<3) | (1<<4) | (1<<5),
+		/* PMC6: PM_MRK_ST_STS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
+	(1<<4) | (1<<5),	/* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
+	(1<<4)			/* PMC8: PM_MRK_LSU_FIN */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int p970_marked_instr_event(unsigned int event)
+{
+	int pmc, psel, unit, byte, bit;
+	unsigned int mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if (pmc) {
+		if (direct_marked_event[pmc - 1] & (1 << psel))
+			return 1;
+		if (psel == 0)		/* add events */
+			bit = (pmc <= 4)? pmc - 1: 8 - pmc;
+		else if (psel == 7 || psel == 13)	/* decode events */
+			bit = 4;
+		else
+			return 0;
+	} else
+		bit = psel;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	mask = 0;
+	switch (unit) {
+	case PM_VPU:
+		mask = 0x4c;		/* byte 0 bits 2,3,6 */
+	case PM_LSU0:
+		/* byte 2 bits 0,2,3,4,6; all of byte 1 */
+		mask = 0x085dff00;
+	case PM_LSU1L:
+		mask = 0x50 << 24;	/* byte 3 bits 4,6 */
+		break;
+	}
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
 /* Masks and values for using events from the various units */
 static u64 unit_cons[PM_LASTUNIT+1][2] = {
 	[PM_FPU] =   { 0xc80000000000ull, 0x040000000000ull },
@@ -138,7 +194,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = {
 
 static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 {
-	int pmc, byte, unit, sh;
+	int pmc, byte, unit, sh, spcsel;
 	u64 mask = 0, value = 0;
 	int grp = -1;
 
@@ -177,6 +233,11 @@ static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 		mask  |= 0x800000000ull;
 		value |= 0x100000000ull;
 	}
+	spcsel = (event >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
+	if (spcsel) {
+		mask  |= 3ull << 48;
+		value |= (u64)spcsel << 48;
+	}
 	*maskp = mask;
 	*valp = value;
 	return 0;
@@ -209,6 +270,7 @@ static int p970_compute_mmcr(unsigned int event[], int n_ev,
 	unsigned char ttmuse[2];
 	unsigned char pmcsel[8];
 	int i;
+	int spcsel;
 
 	if (n_ev > 8)
 		return -1;
@@ -316,6 +378,10 @@ static int p970_compute_mmcr(unsigned int event[], int n_ev,
 		}
 		pmcsel[pmc] = psel;
 		hwc[i] = pmc;
+		spcsel = (event[i] >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
+		mmcr1 |= spcsel;
+		if (p970_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
 	}
 	for (pmc = 0; pmc < 2; ++pmc)
 		mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc);
-- 
cgit v1.2.3


From 7333a8003cdc0470e8c0ae8b949cbc44f3165ff3 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Wed, 25 Mar 2009 10:50:34 +0900
Subject: x86: smarten /proc/interrupts output for new counters

Now /proc/interrupts of tip tree has new counters:

  CNT: Performance counter interrupts

Format change of output, as like that by commit:

  commit 7a81d9a7da03d2f27840d659f97ef140d032f609
  x86: smarten /proc/interrupts output

should be applied to these new counters too.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <49C98DEA.8060208@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d465487da58..dccaaa85578 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -63,7 +63,7 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
 	seq_printf(p, "  Spurious interrupts\n");
-	seq_printf(p, "CNT: ");
+	seq_printf(p, "%*s: ", prec, "CNT");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
 	seq_printf(p, "  Performance counter interrupts\n");
-- 
cgit v1.2.3


From 78f13e9525ba777da25c4ddab89f28e9366a8b7c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Apr 2009 15:01:33 +0200
Subject: perf_counter: allow for data addresses to be recorded

Paul suggested we allow for data addresses to be recorded along with
the traditional IPs as power can provide these.

For now, only the software pagefault events provide data addresses,
but in the future power might as well for some events.

x86 doesn't seem capable of providing this atm.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090408130409.394816925@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 2 +-
 arch/powerpc/mm/fault.c            | 8 +++++---
 arch/x86/kernel/cpu/perf_counter.c | 2 +-
 arch/x86/mm/fault.c                | 8 +++++---
 4 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 0697ade84dd..c9d019f1907 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -749,7 +749,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 * Finally record data if requested.
 	 */
 	if (record)
-		perf_counter_overflow(counter, 1, regs);
+		perf_counter_overflow(counter, 1, regs, 0);
 }
 
 /*
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 17bbf6f91fb..ac0e112031b 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		die("Weird page fault", regs, SIGSEGV);
 	}
 
-	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs);
+	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
 
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
@@ -312,7 +312,8 @@ good_area:
 	}
 	if (ret & VM_FAULT_MAJOR) {
 		current->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs);
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
+				     regs, address);
 #ifdef CONFIG_PPC_SMLPAR
 		if (firmware_has_feature(FW_FEATURE_CMO)) {
 			preempt_disable();
@@ -322,7 +323,8 @@ good_area:
 #endif
 	} else {
 		current->min_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs);
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
+				     regs, address);
 	}
 	up_read(&mm->mmap_sem);
 	return 0;
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 1116a41bc7b..0fcbaab83f9 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -800,7 +800,7 @@ again:
 			continue;
 
 		perf_save_and_restart(counter);
-		if (perf_counter_overflow(counter, nmi, regs))
+		if (perf_counter_overflow(counter, nmi, regs, 0))
 			__pmc_generic_disable(counter, &counter->hw, bit);
 	}
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f2d3324d921..6f9df2babe4 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1045,7 +1045,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(regs, error_code, address);
 
-	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs);
+	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
@@ -1142,10 +1142,12 @@ good_area:
 
 	if (fault & VM_FAULT_MAJOR) {
 		tsk->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs);
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
+				     regs, address);
 	} else {
 		tsk->min_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs);
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
+				     regs, address);
 	}
 
 	check_v8086_mode(regs, address, tsk);
-- 
cgit v1.2.3


From ca8f2d7f019a8547f39ddb9ed0144932f12807f2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 9 Apr 2009 14:42:56 +1000
Subject: perf_counter: powerpc: add nmi_enter/nmi_exit calls

Impact: fix potential deadlocks on powerpc

Now that the core is using in_nmi() (added in e30e08f6, "perf_counter:
fix NMI race in task clock"), we need the powerpc perf_counter_interrupt
to call nmi_enter() and nmi_exit() in those cases where the interrupt
happens when interrupts are soft-disabled.

If interrupts were soft-enabled, we can treat it as a regular interrupt
and do irq_enter/irq_exit around the whole routine. This lets us get rid
of the test_perf_counter_pending() call at the end of
perf_counter_interrupt, thus simplifying things a little.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18909.31952.873098.336615@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index c9d019f1907..bd76d0fa2c3 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -714,7 +714,7 @@ hw_perf_counter_init(struct perf_counter *counter)
  * here so there is no possibility of being interrupted.
  */
 static void record_and_restart(struct perf_counter *counter, long val,
-			       struct pt_regs *regs)
+			       struct pt_regs *regs, int nmi)
 {
 	s64 prev, delta, left;
 	int record = 0;
@@ -749,7 +749,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 * Finally record data if requested.
 	 */
 	if (record)
-		perf_counter_overflow(counter, 1, regs, 0);
+		perf_counter_overflow(counter, nmi, regs, 0);
 }
 
 /*
@@ -762,6 +762,17 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	struct perf_counter *counter;
 	long val;
 	int found = 0;
+	int nmi;
+
+	/*
+	 * If interrupts were soft-disabled when this PMU interrupt
+	 * occurred, treat it as an NMI.
+	 */
+	nmi = !regs->softe;
+	if (nmi)
+		nmi_enter();
+	else
+		irq_enter();
 
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
@@ -769,7 +780,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 		if ((int)val < 0) {
 			/* counter has overflowed */
 			found = 1;
-			record_and_restart(counter, val, regs);
+			record_and_restart(counter, val, regs, nmi);
 		}
 	}
 
@@ -796,18 +807,10 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	 */
 	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
 
-	/*
-	 * If we need a wakeup, check whether interrupts were soft-enabled
-	 * when we took the interrupt.  If they were, we can wake stuff up
-	 * immediately; otherwise we'll have do the wakeup when interrupts
-	 * get soft-enabled.
-	 */
-	if (test_perf_counter_pending() && regs->softe) {
-		irq_enter();
-		clear_perf_counter_pending();
-		perf_counter_do_pending();
+	if (nmi)
+		nmi_exit();
+	else
 		irq_exit();
-	}
 }
 
 void hw_perf_counter_setup(int cpu)
-- 
cgit v1.2.3


From 0f3fd87ce43727d6b8573191ce89e874533b1429 Mon Sep 17 00:00:00 2001
From: Luis Henriques <henrix@sapo.pt>
Date: Mon, 13 Apr 2009 20:24:50 +0100
Subject: perf_counter: fix alignment in /proc/interrupts

Trivial fix on columns alignment in /proc/interrupts file.

Signed-off-by: Luis Henriques <henrix@sapo.pt>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090413192449.GA3920@hades.domain.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index dccaaa85578..849cfabb1fd 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -67,7 +67,7 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
 	seq_printf(p, "  Performance counter interrupts\n");
-	seq_printf(p, "PND: ");
+	seq_printf(p, "%*s: ", prec, "PND");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
 	seq_printf(p, "  Performance pending work\n");
-- 
cgit v1.2.3


From da1a776be1ac7f78bb30ececbec4c1383163b079 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:46:58 +0200
Subject: perf_counter, x86: remove X86_FEATURE_ARCH_PERFMON flag for AMD cpus

X86_FEATURE_ARCH_PERFMON is an Intel hardware feature that does not
work on AMD CPUs. The flag is now only used in Intel specific code
(especially initialization).

[ Impact: refactor code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1241002046-8832-2-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/amd.c          | 4 ----
 arch/x86/kernel/cpu/perf_counter.c | 6 +++---
 2 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index fd69c514ca2..7e4a459daa6 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -420,10 +420,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	if (c->x86 >= 6)
 		set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
 
-	/* Enable Performance counter for K7 and later */
-	if (c->x86 > 6 && c->x86 <= 0x11)
-		set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
-
 	if (!c->x86_model_id[0]) {
 		switch (c->x86) {
 		case 0xf:
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 0fcbaab83f9..7d0f81dcb52 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -949,6 +949,9 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 	unsigned int unused;
 	unsigned int ebx;
 
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+		return NULL;
+
 	/*
 	 * Check whether the Architectural PerfMon supports
 	 * Branch Misses Retired Event or not.
@@ -987,9 +990,6 @@ static struct pmc_x86_ops *pmc_amd_init(void)
 
 void __init init_hw_perf_counters(void)
 {
-	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-		return;
-
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_INTEL:
 		pmc_ops = pmc_intel_init();
-- 
cgit v1.2.3


From 4138960a9251a265002b5cf07e671a49f8495381 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:00 +0200
Subject: perf_counter, x86: add default path to cpu detection

This quits hw counter initialization immediately if no cpu is
detected.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-4-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 7d0f81dcb52..d6d6529349d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -997,6 +997,8 @@ void __init init_hw_perf_counters(void)
 	case X86_VENDOR_AMD:
 		pmc_ops = pmc_amd_init();
 		break;
+	default:
+		return;
 	}
 	if (!pmc_ops)
 		return;
-- 
cgit v1.2.3


From 4295ee62660b13ddb87d41539f49b239e6e7d56f Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:01 +0200
Subject: perf_counter, x86: rework pmc_amd_save_disable_all() and
 pmc_amd_restore_all()

MSR reads and writes are expensive. This patch adds checks to avoid
its usage where possible.

[ Impact: micro-optimization on AMD CPUs ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-5-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d6d6529349d..75a090394b6 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -334,11 +334,13 @@ static u64 pmc_amd_save_disable_all(void)
 	for (idx = 0; idx < nr_counters_generic; idx++) {
 		u64 val;
 
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
 		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
-		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) {
-			val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
-			wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
-		}
+		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
+			continue;
+		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
 	}
 
 	return enabled;
@@ -372,13 +374,15 @@ static void pmc_amd_restore_all(u64 ctrl)
 		return;
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-		if (test_bit(idx, cpuc->active_mask)) {
-			u64 val;
+		u64 val;
 
-			rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
-			val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-			wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
-		}
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
+			continue;
+		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
 	}
 }
 
-- 
cgit v1.2.3


From 527e26af3741a2168986d8b82653ffe173891324 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:02 +0200
Subject: perf_counter, x86: protect per-cpu variables with compile barriers
 only

Per-cpu variables needn't to be protected with cpu barriers
(smp_wmb()). Protection is only needed for preemption on the same cpu
(rescheduling or the nmi handler). This can be done using a compiler
barrier only.

[ Impact: micro-optimization ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-6-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 75a090394b6..ad663d5ad2d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -673,7 +673,7 @@ try_generic:
 	/*
 	 * Make it visible before enabling the hw:
 	 */
-	smp_wmb();
+	barrier();
 
 	__hw_perf_counter_set_period(counter, hwc, idx);
 	__pmc_generic_enable(counter, hwc, idx);
@@ -745,7 +745,7 @@ static void pmc_generic_disable(struct perf_counter *counter)
 	 * Make sure the cleared pointer becomes visible before we
 	 * (potentially) free the counter:
 	 */
-	smp_wmb();
+	barrier();
 
 	/*
 	 * Drain the remaining delta count out of a counter
-- 
cgit v1.2.3


From 4aeb0b4239bb3b67ed402cb9cef3e000c892cadf Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:03 +0200
Subject: perfcounters: rename struct hw_perf_counter_ops into struct pmu

This patch renames struct hw_perf_counter_ops into struct pmu. It
introduces a structure to describe a cpu specific pmu (performance
monitoring unit). It may contain ops and data. The new name of the
structure fits better, is shorter, and thus better to handle. Where it
was appropriate, names of function and variable have been changed too.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-7-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 25 ++++++++++++-------------
 arch/x86/kernel/cpu/perf_counter.c | 37 ++++++++++++++++++-------------------
 2 files changed, 30 insertions(+), 32 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index bd76d0fa2c3..d9bbe5efc64 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -256,7 +256,7 @@ static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new)
 	return 0;
 }
 
-static void power_perf_read(struct perf_counter *counter)
+static void power_pmu_read(struct perf_counter *counter)
 {
 	long val, delta, prev;
 
@@ -405,7 +405,7 @@ void hw_perf_restore(u64 disable)
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
 		if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
-			power_perf_read(counter);
+			power_pmu_read(counter);
 			write_pmc(counter->hw.idx, 0);
 			counter->hw.idx = 0;
 		}
@@ -477,7 +477,7 @@ static void counter_sched_in(struct perf_counter *counter, int cpu)
 	counter->oncpu = cpu;
 	counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped;
 	if (is_software_counter(counter))
-		counter->hw_ops->enable(counter);
+		counter->pmu->enable(counter);
 }
 
 /*
@@ -533,7 +533,7 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
  * re-enable the PMU in order to get hw_perf_restore to do the
  * actual work of reconfiguring the PMU.
  */
-static int power_perf_enable(struct perf_counter *counter)
+static int power_pmu_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuhw;
 	unsigned long flags;
@@ -573,7 +573,7 @@ static int power_perf_enable(struct perf_counter *counter)
 /*
  * Remove a counter from the PMU.
  */
-static void power_perf_disable(struct perf_counter *counter)
+static void power_pmu_disable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuhw;
 	long i;
@@ -583,7 +583,7 @@ static void power_perf_disable(struct perf_counter *counter)
 	local_irq_save(flags);
 	pmudis = hw_perf_save_disable();
 
-	power_perf_read(counter);
+	power_pmu_read(counter);
 
 	cpuhw = &__get_cpu_var(cpu_hw_counters);
 	for (i = 0; i < cpuhw->n_counters; ++i) {
@@ -607,10 +607,10 @@ static void power_perf_disable(struct perf_counter *counter)
 	local_irq_restore(flags);
 }
 
-struct hw_perf_counter_ops power_perf_ops = {
-	.enable = power_perf_enable,
-	.disable = power_perf_disable,
-	.read = power_perf_read
+struct pmu power_pmu = {
+	.enable		= power_pmu_enable,
+	.disable	= power_pmu_disable,
+	.read		= power_pmu_read,
 };
 
 /* Number of perf_counters counting hardware events */
@@ -631,8 +631,7 @@ static void hw_perf_counter_destroy(struct perf_counter *counter)
 	}
 }
 
-const struct hw_perf_counter_ops *
-hw_perf_counter_init(struct perf_counter *counter)
+const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 {
 	unsigned long ev;
 	struct perf_counter *ctrs[MAX_HWCOUNTERS];
@@ -705,7 +704,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 
 	if (err)
 		return ERR_PTR(err);
-	return &power_perf_ops;
+	return &power_pmu;
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index ad663d5ad2d..95de980c74a 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -515,8 +515,8 @@ __pmc_fixed_disable(struct perf_counter *counter,
 }
 
 static inline void
-__pmc_generic_disable(struct perf_counter *counter,
-			   struct hw_perf_counter *hwc, unsigned int idx)
+__x86_pmu_disable(struct perf_counter *counter,
+		  struct hw_perf_counter *hwc, unsigned int idx)
 {
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
 		__pmc_fixed_disable(counter, hwc, idx);
@@ -591,8 +591,8 @@ __pmc_fixed_enable(struct perf_counter *counter,
 }
 
 static void
-__pmc_generic_enable(struct perf_counter *counter,
-			  struct hw_perf_counter *hwc, int idx)
+__x86_pmu_enable(struct perf_counter *counter,
+		 struct hw_perf_counter *hwc, int idx)
 {
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
 		__pmc_fixed_enable(counter, hwc, idx);
@@ -626,7 +626,7 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 /*
  * Find a PMC slot for the freshly enabled / scheduled in counter:
  */
-static int pmc_generic_enable(struct perf_counter *counter)
+static int x86_pmu_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
@@ -667,7 +667,7 @@ try_generic:
 
 	perf_counters_lapic_init(hwc->nmi);
 
-	__pmc_generic_disable(counter, hwc, idx);
+	__x86_pmu_disable(counter, hwc, idx);
 
 	cpuc->counters[idx] = counter;
 	/*
@@ -676,7 +676,7 @@ try_generic:
 	barrier();
 
 	__hw_perf_counter_set_period(counter, hwc, idx);
-	__pmc_generic_enable(counter, hwc, idx);
+	__x86_pmu_enable(counter, hwc, idx);
 
 	return 0;
 }
@@ -731,13 +731,13 @@ void perf_counter_print_debug(void)
 	local_irq_enable();
 }
 
-static void pmc_generic_disable(struct perf_counter *counter)
+static void x86_pmu_disable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned int idx = hwc->idx;
 
-	__pmc_generic_disable(counter, hwc, idx);
+	__x86_pmu_disable(counter, hwc, idx);
 
 	clear_bit(idx, cpuc->used);
 	cpuc->counters[idx] = NULL;
@@ -767,7 +767,7 @@ static void perf_save_and_restart(struct perf_counter *counter)
 	__hw_perf_counter_set_period(counter, hwc, idx);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
-		__pmc_generic_enable(counter, hwc, idx);
+		__x86_pmu_enable(counter, hwc, idx);
 }
 
 /*
@@ -805,7 +805,7 @@ again:
 
 		perf_save_and_restart(counter);
 		if (perf_counter_overflow(counter, nmi, regs, 0))
-			__pmc_generic_disable(counter, &counter->hw, bit);
+			__x86_pmu_disable(counter, &counter->hw, bit);
 	}
 
 	hw_perf_ack_status(ack);
@@ -1034,19 +1034,18 @@ void __init init_hw_perf_counters(void)
 	register_die_notifier(&perf_counter_nmi_notifier);
 }
 
-static void pmc_generic_read(struct perf_counter *counter)
+static void x86_pmu_read(struct perf_counter *counter)
 {
 	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
 }
 
-static const struct hw_perf_counter_ops x86_perf_counter_ops = {
-	.enable		= pmc_generic_enable,
-	.disable	= pmc_generic_disable,
-	.read		= pmc_generic_read,
+static const struct pmu pmu = {
+	.enable		= x86_pmu_enable,
+	.disable	= x86_pmu_disable,
+	.read		= x86_pmu_read,
 };
 
-const struct hw_perf_counter_ops *
-hw_perf_counter_init(struct perf_counter *counter)
+const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 {
 	int err;
 
@@ -1054,7 +1053,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 	if (err)
 		return ERR_PTR(err);
 
-	return &x86_perf_counter_ops;
+	return &pmu;
 }
 
 /*
-- 
cgit v1.2.3


From 5f4ec28ffe77c840354cce1820a3436106e9e0f1 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:04 +0200
Subject: perf_counter, x86: rename struct pmc_x86_ops into struct x86_pmu

This patch renames struct pmc_x86_ops into struct x86_pmu. It
introduces a structure to describe an x86 model specific pmu
(performance monitoring unit). It may contain ops and data. The new
name of the structure fits better, is shorter, and thus better to
handle. Where it was appropriate, names of function and variable have
been changed too.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-8-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 135 +++++++++++++++++++------------------
 1 file changed, 68 insertions(+), 67 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 95de980c74a..808a1a11346 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -44,9 +44,9 @@ struct cpu_hw_counters {
 };
 
 /*
- * struct pmc_x86_ops - performance counter x86 ops
+ * struct x86_pmu - generic x86 pmu
  */
-struct pmc_x86_ops {
+struct x86_pmu {
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
 	u64		(*get_status)(u64);
@@ -60,7 +60,7 @@ struct pmc_x86_ops {
 	int		max_events;
 };
 
-static struct pmc_x86_ops *pmc_ops __read_mostly;
+static struct x86_pmu *x86_pmu __read_mostly;
 
 static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
 	.enabled = 1,
@@ -82,12 +82,12 @@ static const u64 intel_perfmon_event_map[] =
   [PERF_COUNT_BUS_CYCLES]		= 0x013c,
 };
 
-static u64 pmc_intel_event_map(int event)
+static u64 intel_pmu_event_map(int event)
 {
 	return intel_perfmon_event_map[event];
 }
 
-static u64 pmc_intel_raw_event(u64 event)
+static u64 intel_pmu_raw_event(u64 event)
 {
 #define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
 #define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
@@ -114,12 +114,12 @@ static const u64 amd_perfmon_event_map[] =
   [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
 };
 
-static u64 pmc_amd_event_map(int event)
+static u64 amd_pmu_event_map(int event)
 {
 	return amd_perfmon_event_map[event];
 }
 
-static u64 pmc_amd_raw_event(u64 event)
+static u64 amd_pmu_raw_event(u64 event)
 {
 #define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL
 #define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
@@ -184,12 +184,12 @@ static bool reserve_pmc_hardware(void)
 		disable_lapic_nmi_watchdog();
 
 	for (i = 0; i < nr_counters_generic; i++) {
-		if (!reserve_perfctr_nmi(pmc_ops->perfctr + i))
+		if (!reserve_perfctr_nmi(x86_pmu->perfctr + i))
 			goto perfctr_fail;
 	}
 
 	for (i = 0; i < nr_counters_generic; i++) {
-		if (!reserve_evntsel_nmi(pmc_ops->eventsel + i))
+		if (!reserve_evntsel_nmi(x86_pmu->eventsel + i))
 			goto eventsel_fail;
 	}
 
@@ -197,13 +197,13 @@ static bool reserve_pmc_hardware(void)
 
 eventsel_fail:
 	for (i--; i >= 0; i--)
-		release_evntsel_nmi(pmc_ops->eventsel + i);
+		release_evntsel_nmi(x86_pmu->eventsel + i);
 
 	i = nr_counters_generic;
 
 perfctr_fail:
 	for (i--; i >= 0; i--)
-		release_perfctr_nmi(pmc_ops->perfctr + i);
+		release_perfctr_nmi(x86_pmu->perfctr + i);
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		enable_lapic_nmi_watchdog();
@@ -216,8 +216,8 @@ static void release_pmc_hardware(void)
 	int i;
 
 	for (i = 0; i < nr_counters_generic; i++) {
-		release_perfctr_nmi(pmc_ops->perfctr + i);
-		release_evntsel_nmi(pmc_ops->eventsel + i);
+		release_perfctr_nmi(x86_pmu->perfctr + i);
+		release_evntsel_nmi(x86_pmu->eventsel + i);
 	}
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -293,14 +293,14 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * Raw event type provide the config in the event structure
 	 */
 	if (perf_event_raw(hw_event)) {
-		hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event));
+		hwc->config |= x86_pmu->raw_event(perf_event_config(hw_event));
 	} else {
-		if (perf_event_id(hw_event) >= pmc_ops->max_events)
+		if (perf_event_id(hw_event) >= x86_pmu->max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= pmc_ops->event_map(perf_event_id(hw_event));
+		hwc->config |= x86_pmu->event_map(perf_event_id(hw_event));
 	}
 
 	counter->destroy = hw_perf_counter_destroy;
@@ -308,7 +308,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	return 0;
 }
 
-static u64 pmc_intel_save_disable_all(void)
+static u64 intel_pmu_save_disable_all(void)
 {
 	u64 ctrl;
 
@@ -318,7 +318,7 @@ static u64 pmc_intel_save_disable_all(void)
 	return ctrl;
 }
 
-static u64 pmc_amd_save_disable_all(void)
+static u64 amd_pmu_save_disable_all(void)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	int enabled, idx;
@@ -327,7 +327,8 @@ static u64 pmc_amd_save_disable_all(void)
 	cpuc->enabled = 0;
 	/*
 	 * ensure we write the disable before we start disabling the
-	 * counters proper, so that pcm_amd_enable() does the right thing.
+	 * counters proper, so that amd_pmu_enable_counter() does the
+	 * right thing.
 	 */
 	barrier();
 
@@ -351,19 +352,19 @@ u64 hw_perf_save_disable(void)
 	if (unlikely(!perf_counters_initialized))
 		return 0;
 
-	return pmc_ops->save_disable_all();
+	return x86_pmu->save_disable_all();
 }
 /*
  * Exported because of ACPI idle
  */
 EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
-static void pmc_intel_restore_all(u64 ctrl)
+static void intel_pmu_restore_all(u64 ctrl)
 {
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 }
 
-static void pmc_amd_restore_all(u64 ctrl)
+static void amd_pmu_restore_all(u64 ctrl)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	int idx;
@@ -391,14 +392,14 @@ void hw_perf_restore(u64 ctrl)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	pmc_ops->restore_all(ctrl);
+	x86_pmu->restore_all(ctrl);
 }
 /*
  * Exported because of ACPI idle
  */
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
-static u64 pmc_intel_get_status(u64 mask)
+static u64 intel_pmu_get_status(u64 mask)
 {
 	u64 status;
 
@@ -407,7 +408,7 @@ static u64 pmc_intel_get_status(u64 mask)
 	return status;
 }
 
-static u64 pmc_amd_get_status(u64 mask)
+static u64 amd_pmu_get_status(u64 mask)
 {
 	u64 status = 0;
 	int idx;
@@ -432,15 +433,15 @@ static u64 hw_perf_get_status(u64 mask)
 	if (unlikely(!perf_counters_initialized))
 		return 0;
 
-	return pmc_ops->get_status(mask);
+	return x86_pmu->get_status(mask);
 }
 
-static void pmc_intel_ack_status(u64 ack)
+static void intel_pmu_ack_status(u64 ack)
 {
 	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
 }
 
-static void pmc_amd_ack_status(u64 ack)
+static void amd_pmu_ack_status(u64 ack)
 {
 }
 
@@ -449,16 +450,16 @@ static void hw_perf_ack_status(u64 ack)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	pmc_ops->ack_status(ack);
+	x86_pmu->ack_status(ack);
 }
 
-static void pmc_intel_enable(int idx, u64 config)
+static void intel_pmu_enable_counter(int idx, u64 config)
 {
 	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
 			config | ARCH_PERFMON_EVENTSEL0_ENABLE);
 }
 
-static void pmc_amd_enable(int idx, u64 config)
+static void amd_pmu_enable_counter(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
@@ -474,15 +475,15 @@ static void hw_perf_enable(int idx, u64 config)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	pmc_ops->enable(idx, config);
+	x86_pmu->enable(idx, config);
 }
 
-static void pmc_intel_disable(int idx, u64 config)
+static void intel_pmu_disable_counter(int idx, u64 config)
 {
 	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config);
 }
 
-static void pmc_amd_disable(int idx, u64 config)
+static void amd_pmu_disable_counter(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
@@ -496,7 +497,7 @@ static void hw_perf_disable(int idx, u64 config)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	pmc_ops->disable(idx, config);
+	x86_pmu->disable(idx, config);
 }
 
 static inline void
@@ -613,11 +614,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 
 	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
 
-	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS)))
+	if (unlikely(event == x86_pmu->event_map(PERF_COUNT_INSTRUCTIONS)))
 		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
-	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES)))
+	if (unlikely(event == x86_pmu->event_map(PERF_COUNT_CPU_CYCLES)))
 		return X86_PMC_IDX_FIXED_CPU_CYCLES;
-	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES)))
+	if (unlikely(event == x86_pmu->event_map(PERF_COUNT_BUS_CYCLES)))
 		return X86_PMC_IDX_FIXED_BUS_CYCLES;
 
 	return -1;
@@ -661,8 +662,8 @@ try_generic:
 			set_bit(idx, cpuc->used);
 			hwc->idx = idx;
 		}
-		hwc->config_base  = pmc_ops->eventsel;
-		hwc->counter_base = pmc_ops->perfctr;
+		hwc->config_base  = x86_pmu->eventsel;
+		hwc->counter_base = x86_pmu->perfctr;
 	}
 
 	perf_counters_lapic_init(hwc->nmi);
@@ -710,8 +711,8 @@ void perf_counter_print_debug(void)
 	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-		rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl);
-		rdmsrl(pmc_ops->perfctr  + idx, pmc_count);
+		rdmsrl(x86_pmu->eventsel + idx, pmc_ctrl);
+		rdmsrl(x86_pmu->perfctr  + idx, pmc_count);
 
 		prev_left = per_cpu(prev_left[idx], cpu);
 
@@ -918,35 +919,35 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 	.priority		= 1
 };
 
-static struct pmc_x86_ops pmc_intel_ops = {
-	.save_disable_all	= pmc_intel_save_disable_all,
-	.restore_all		= pmc_intel_restore_all,
-	.get_status		= pmc_intel_get_status,
-	.ack_status		= pmc_intel_ack_status,
-	.enable			= pmc_intel_enable,
-	.disable		= pmc_intel_disable,
+static struct x86_pmu intel_pmu = {
+	.save_disable_all	= intel_pmu_save_disable_all,
+	.restore_all		= intel_pmu_restore_all,
+	.get_status		= intel_pmu_get_status,
+	.ack_status		= intel_pmu_ack_status,
+	.enable			= intel_pmu_enable_counter,
+	.disable		= intel_pmu_disable_counter,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
-	.event_map		= pmc_intel_event_map,
-	.raw_event		= pmc_intel_raw_event,
+	.event_map		= intel_pmu_event_map,
+	.raw_event		= intel_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 };
 
-static struct pmc_x86_ops pmc_amd_ops = {
-	.save_disable_all	= pmc_amd_save_disable_all,
-	.restore_all		= pmc_amd_restore_all,
-	.get_status		= pmc_amd_get_status,
-	.ack_status		= pmc_amd_ack_status,
-	.enable			= pmc_amd_enable,
-	.disable		= pmc_amd_disable,
+static struct x86_pmu amd_pmu = {
+	.save_disable_all	= amd_pmu_save_disable_all,
+	.restore_all		= amd_pmu_restore_all,
+	.get_status		= amd_pmu_get_status,
+	.ack_status		= amd_pmu_ack_status,
+	.enable			= amd_pmu_enable_counter,
+	.disable		= amd_pmu_disable_counter,
 	.eventsel		= MSR_K7_EVNTSEL0,
 	.perfctr		= MSR_K7_PERFCTR0,
-	.event_map		= pmc_amd_event_map,
-	.raw_event		= pmc_amd_raw_event,
+	.event_map		= amd_pmu_event_map,
+	.raw_event		= amd_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
 };
 
-static struct pmc_x86_ops *pmc_intel_init(void)
+static struct x86_pmu *intel_pmu_init(void)
 {
 	union cpuid10_edx edx;
 	union cpuid10_eax eax;
@@ -977,10 +978,10 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 	nr_counters_fixed = edx.split.num_counters_fixed;
 	counter_value_mask = (1ULL << eax.split.bit_width) - 1;
 
-	return &pmc_intel_ops;
+	return &intel_pmu;
 }
 
-static struct pmc_x86_ops *pmc_amd_init(void)
+static struct x86_pmu *amd_pmu_init(void)
 {
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
@@ -989,22 +990,22 @@ static struct pmc_x86_ops *pmc_amd_init(void)
 
 	pr_info("AMD Performance Monitoring support detected.\n");
 
-	return &pmc_amd_ops;
+	return &amd_pmu;
 }
 
 void __init init_hw_perf_counters(void)
 {
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_INTEL:
-		pmc_ops = pmc_intel_init();
+		x86_pmu = intel_pmu_init();
 		break;
 	case X86_VENDOR_AMD:
-		pmc_ops = pmc_amd_init();
+		x86_pmu = amd_pmu_init();
 		break;
 	default:
 		return;
 	}
-	if (!pmc_ops)
+	if (!x86_pmu)
 		return;
 
 	pr_info("... num counters:    %d\n", nr_counters_generic);
-- 
cgit v1.2.3


From 39d81eab2374d71b2d9c82f66258a1a4f57ddd2e Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:05 +0200
Subject: perf_counter, x86: make interrupt handler model specific

This separates the perfcounter interrupt handler for AMD and Intel
cpus. The AMD interrupt handler implementation is a follow-on patch.

[ Impact: refactor and clean up code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-9-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 808a1a11346..9d90de0bd0b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -4,6 +4,7 @@
  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
  *  Copyright(C) 2009 Jaswinder Singh Rajput
+ *  Copyright(C) 2009 Advanced Micro Devices, Inc., Robert Richter
  *
  *  For licencing details see kernel-base/COPYING
  */
@@ -47,6 +48,7 @@ struct cpu_hw_counters {
  * struct x86_pmu - generic x86 pmu
  */
 struct x86_pmu {
+	int		(*handle_irq)(struct pt_regs *, int);
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
 	u64		(*get_status)(u64);
@@ -241,6 +243,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	int err;
 
+	/* disable temporarily */
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		return -ENOSYS;
+
 	if (unlikely(!perf_counters_initialized))
 		return -EINVAL;
 
@@ -780,7 +786,7 @@ static void perf_save_and_restart(struct perf_counter *counter)
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
  */
-static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
+static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 {
 	int bit, cpu = smp_processor_id();
 	u64 ack, status;
@@ -827,6 +833,8 @@ out:
 	return ret;
 }
 
+static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { return 0; }
+
 void perf_counter_unthrottle(void)
 {
 	struct cpu_hw_counters *cpuc;
@@ -851,7 +859,7 @@ void smp_perf_counter_interrupt(struct pt_regs *regs)
 	irq_enter();
 	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
 	ack_APIC_irq();
-	__smp_perf_counter_interrupt(regs, 0);
+	x86_pmu->handle_irq(regs, 0);
 	irq_exit();
 }
 
@@ -908,7 +916,7 @@ perf_counter_nmi_handler(struct notifier_block *self,
 	regs = args->regs;
 
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	ret = __smp_perf_counter_interrupt(regs, 1);
+	ret = x86_pmu->handle_irq(regs, 1);
 
 	return ret ? NOTIFY_STOP : NOTIFY_OK;
 }
@@ -920,6 +928,7 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 };
 
 static struct x86_pmu intel_pmu = {
+	.handle_irq		= intel_pmu_handle_irq,
 	.save_disable_all	= intel_pmu_save_disable_all,
 	.restore_all		= intel_pmu_restore_all,
 	.get_status		= intel_pmu_get_status,
@@ -934,6 +943,7 @@ static struct x86_pmu intel_pmu = {
 };
 
 static struct x86_pmu amd_pmu = {
+	.handle_irq		= amd_pmu_handle_irq,
 	.save_disable_all	= amd_pmu_save_disable_all,
 	.restore_all		= amd_pmu_restore_all,
 	.get_status		= amd_pmu_get_status,
-- 
cgit v1.2.3


From b7f8859a8ed1937e2139c17b84878f1d413fa659 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:06 +0200
Subject: perf_counter, x86: remove get_status() from struct x86_pmu

This function is Intel only and not necessary for AMD cpus.

[ Impact: simplify code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-10-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 39 +++++---------------------------------
 1 file changed, 5 insertions(+), 34 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 9d90de0bd0b..d0bb02919c6 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -51,7 +51,6 @@ struct x86_pmu {
 	int		(*handle_irq)(struct pt_regs *, int);
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
-	u64		(*get_status)(u64);
 	void		(*ack_status)(u64);
 	void		(*enable)(int, u64);
 	void		(*disable)(int, u64);
@@ -405,41 +404,15 @@ void hw_perf_restore(u64 ctrl)
  */
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
-static u64 intel_pmu_get_status(u64 mask)
+static inline u64 intel_pmu_get_status(u64 mask)
 {
 	u64 status;
 
-	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-
-	return status;
-}
-
-static u64 amd_pmu_get_status(u64 mask)
-{
-	u64 status = 0;
-	int idx;
-
-	for (idx = 0; idx < nr_counters_generic; idx++) {
-		s64 val;
-
-		if (!(mask & (1 << idx)))
-			continue;
-
-		rdmsrl(MSR_K7_PERFCTR0 + idx, val);
-		val <<= (64 - counter_value_bits);
-		if (val >= 0)
-			status |= (1 << idx);
-	}
-
-	return status;
-}
-
-static u64 hw_perf_get_status(u64 mask)
-{
 	if (unlikely(!perf_counters_initialized))
 		return 0;
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 
-	return x86_pmu->get_status(mask);
+	return status;
 }
 
 static void intel_pmu_ack_status(u64 ack)
@@ -795,7 +768,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 
 	cpuc->throttle_ctrl = hw_perf_save_disable();
 
-	status = hw_perf_get_status(cpuc->throttle_ctrl);
+	status = intel_pmu_get_status(cpuc->throttle_ctrl);
 	if (!status)
 		goto out;
 
@@ -820,7 +793,7 @@ again:
 	/*
 	 * Repeat if there is more work to be done:
 	 */
-	status = hw_perf_get_status(cpuc->throttle_ctrl);
+	status = intel_pmu_get_status(cpuc->throttle_ctrl);
 	if (status)
 		goto again;
 out:
@@ -931,7 +904,6 @@ static struct x86_pmu intel_pmu = {
 	.handle_irq		= intel_pmu_handle_irq,
 	.save_disable_all	= intel_pmu_save_disable_all,
 	.restore_all		= intel_pmu_restore_all,
-	.get_status		= intel_pmu_get_status,
 	.ack_status		= intel_pmu_ack_status,
 	.enable			= intel_pmu_enable_counter,
 	.disable		= intel_pmu_disable_counter,
@@ -946,7 +918,6 @@ static struct x86_pmu amd_pmu = {
 	.handle_irq		= amd_pmu_handle_irq,
 	.save_disable_all	= amd_pmu_save_disable_all,
 	.restore_all		= amd_pmu_restore_all,
-	.get_status		= amd_pmu_get_status,
 	.ack_status		= amd_pmu_ack_status,
 	.enable			= amd_pmu_enable_counter,
 	.disable		= amd_pmu_disable_counter,
-- 
cgit v1.2.3


From dee5d9067ca78b317538fd67930be4e09a83dbc5 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:07 +0200
Subject: perf_counter, x86: remove ack_status() from struct x86_pmu

This function is Intel only and not necessary for AMD cpus.

[ Impact: simplify code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-11-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d0bb02919c6..6bbdc16cc69 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -51,7 +51,6 @@ struct x86_pmu {
 	int		(*handle_irq)(struct pt_regs *, int);
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
-	void		(*ack_status)(u64);
 	void		(*enable)(int, u64);
 	void		(*disable)(int, u64);
 	unsigned	eventsel;
@@ -415,23 +414,11 @@ static inline u64 intel_pmu_get_status(u64 mask)
 	return status;
 }
 
-static void intel_pmu_ack_status(u64 ack)
+static inline void intel_pmu_ack_status(u64 ack)
 {
 	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
 }
 
-static void amd_pmu_ack_status(u64 ack)
-{
-}
-
-static void hw_perf_ack_status(u64 ack)
-{
-	if (unlikely(!perf_counters_initialized))
-		return;
-
-	x86_pmu->ack_status(ack);
-}
-
 static void intel_pmu_enable_counter(int idx, u64 config)
 {
 	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
@@ -788,7 +775,7 @@ again:
 			__x86_pmu_disable(counter, &counter->hw, bit);
 	}
 
-	hw_perf_ack_status(ack);
+	intel_pmu_ack_status(ack);
 
 	/*
 	 * Repeat if there is more work to be done:
@@ -904,7 +891,6 @@ static struct x86_pmu intel_pmu = {
 	.handle_irq		= intel_pmu_handle_irq,
 	.save_disable_all	= intel_pmu_save_disable_all,
 	.restore_all		= intel_pmu_restore_all,
-	.ack_status		= intel_pmu_ack_status,
 	.enable			= intel_pmu_enable_counter,
 	.disable		= intel_pmu_disable_counter,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
@@ -918,7 +904,6 @@ static struct x86_pmu amd_pmu = {
 	.handle_irq		= amd_pmu_handle_irq,
 	.save_disable_all	= amd_pmu_save_disable_all,
 	.restore_all		= amd_pmu_restore_all,
-	.ack_status		= amd_pmu_ack_status,
 	.enable			= amd_pmu_enable_counter,
 	.disable		= amd_pmu_disable_counter,
 	.eventsel		= MSR_K7_EVNTSEL0,
-- 
cgit v1.2.3


From 26816c287e13eedc67bc4ed0cd40c138314b7c7d Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:08 +0200
Subject: perf_counter, x86: rename __hw_perf_counter_set_period into
 x86_perf_counter_set_period

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-12-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 6bbdc16cc69..fa6541d781b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -498,7 +498,7 @@ static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
  * To be called with the counter disabled in hw:
  */
 static void
-__hw_perf_counter_set_period(struct perf_counter *counter,
+x86_perf_counter_set_period(struct perf_counter *counter,
 			     struct hw_perf_counter *hwc, int idx)
 {
 	s64 left = atomic64_read(&hwc->period_left);
@@ -642,7 +642,7 @@ try_generic:
 	 */
 	barrier();
 
-	__hw_perf_counter_set_period(counter, hwc, idx);
+	x86_perf_counter_set_period(counter, hwc, idx);
 	__x86_pmu_enable(counter, hwc, idx);
 
 	return 0;
@@ -731,7 +731,7 @@ static void perf_save_and_restart(struct perf_counter *counter)
 	int idx = hwc->idx;
 
 	x86_perf_counter_update(counter, hwc, idx);
-	__hw_perf_counter_set_period(counter, hwc, idx);
+	x86_perf_counter_set_period(counter, hwc, idx);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		__x86_pmu_enable(counter, hwc, idx);
-- 
cgit v1.2.3


From 55de0f2e57994b525324bf0d04d242d9358a2417 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:09 +0200
Subject: perf_counter, x86: rename intel only functions

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-13-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index fa6541d781b..5a52d73ccfa 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -725,7 +725,7 @@ static void x86_pmu_disable(struct perf_counter *counter)
  * Save and restart an expired counter. Called by NMI contexts,
  * so it has to be careful about preempting normal counter ops:
  */
-static void perf_save_and_restart(struct perf_counter *counter)
+static void intel_pmu_save_and_restart(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 	int idx = hwc->idx;
@@ -753,7 +753,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
 	int ret = 0;
 
-	cpuc->throttle_ctrl = hw_perf_save_disable();
+	cpuc->throttle_ctrl = intel_pmu_save_disable_all();
 
 	status = intel_pmu_get_status(cpuc->throttle_ctrl);
 	if (!status)
@@ -770,7 +770,7 @@ again:
 		if (!counter)
 			continue;
 
-		perf_save_and_restart(counter);
+		intel_pmu_save_and_restart(counter);
 		if (perf_counter_overflow(counter, nmi, regs, 0))
 			__x86_pmu_disable(counter, &counter->hw, bit);
 	}
@@ -788,7 +788,7 @@ out:
 	 * Restore - do not reenable when global enable is off or throttled:
 	 */
 	if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
-		hw_perf_restore(cpuc->throttle_ctrl);
+		intel_pmu_restore_all(cpuc->throttle_ctrl);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 72eae04d3a3075c26d39e1e685acfc8e8c29db64 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:10 +0200
Subject: perf_counter, x86: modify initialization of struct x86_pmu

This patch adds an error handler and changes initialization of struct
x86_pmu. No functional changes. Needed for follow-on patches.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-14-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 5a52d73ccfa..7c72a942363 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -913,7 +913,7 @@ static struct x86_pmu amd_pmu = {
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
 };
 
-static struct x86_pmu *intel_pmu_init(void)
+static int intel_pmu_init(void)
 {
 	union cpuid10_edx edx;
 	union cpuid10_eax eax;
@@ -921,7 +921,7 @@ static struct x86_pmu *intel_pmu_init(void)
 	unsigned int ebx;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-		return NULL;
+		return -ENODEV;
 
 	/*
 	 * Check whether the Architectural PerfMon supports
@@ -929,49 +929,54 @@ static struct x86_pmu *intel_pmu_init(void)
 	 */
 	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
 	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
-		return NULL;
+		return -ENODEV;
 
 	intel_perfmon_version = eax.split.version_id;
 	if (intel_perfmon_version < 2)
-		return NULL;
+		return -ENODEV;
 
 	pr_info("Intel Performance Monitoring support detected.\n");
 	pr_info("... version:         %d\n", intel_perfmon_version);
 	pr_info("... bit width:       %d\n", eax.split.bit_width);
 	pr_info("... mask length:     %d\n", eax.split.mask_length);
 
+	x86_pmu = &intel_pmu;
+
 	nr_counters_generic = eax.split.num_counters;
 	nr_counters_fixed = edx.split.num_counters_fixed;
 	counter_value_mask = (1ULL << eax.split.bit_width) - 1;
 
-	return &intel_pmu;
+	return 0;
 }
 
-static struct x86_pmu *amd_pmu_init(void)
+static int amd_pmu_init(void)
 {
+	x86_pmu = &amd_pmu;
+
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
 	counter_value_mask = 0x0000FFFFFFFFFFFFULL;
 	counter_value_bits = 48;
 
 	pr_info("AMD Performance Monitoring support detected.\n");
-
-	return &amd_pmu;
+	return 0;
 }
 
 void __init init_hw_perf_counters(void)
 {
+	int err;
+
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_INTEL:
-		x86_pmu = intel_pmu_init();
+		err = intel_pmu_init();
 		break;
 	case X86_VENDOR_AMD:
-		x86_pmu = amd_pmu_init();
+		err = amd_pmu_init();
 		break;
 	default:
 		return;
 	}
-	if (!x86_pmu)
+	if (err != 0)
 		return;
 
 	pr_info("... num counters:    %d\n", nr_counters_generic);
-- 
cgit v1.2.3


From 4a06bd8508f65ad1dd5cd2046b85694813fa36a2 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:11 +0200
Subject: perf_counter, x86: make x86_pmu data a static struct

Instead of using a pointer to reference to the x86 pmu we now have one
single data structure that is initialized at the beginning. This saves
the pointer access when using this memory.

[ Impact: micro-optimization ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-15-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 50 +++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 7c72a942363..68597d76338 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -60,7 +60,7 @@ struct x86_pmu {
 	int		max_events;
 };
 
-static struct x86_pmu *x86_pmu __read_mostly;
+static struct x86_pmu x86_pmu __read_mostly;
 
 static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
 	.enabled = 1,
@@ -184,12 +184,12 @@ static bool reserve_pmc_hardware(void)
 		disable_lapic_nmi_watchdog();
 
 	for (i = 0; i < nr_counters_generic; i++) {
-		if (!reserve_perfctr_nmi(x86_pmu->perfctr + i))
+		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
 			goto perfctr_fail;
 	}
 
 	for (i = 0; i < nr_counters_generic; i++) {
-		if (!reserve_evntsel_nmi(x86_pmu->eventsel + i))
+		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
 			goto eventsel_fail;
 	}
 
@@ -197,13 +197,13 @@ static bool reserve_pmc_hardware(void)
 
 eventsel_fail:
 	for (i--; i >= 0; i--)
-		release_evntsel_nmi(x86_pmu->eventsel + i);
+		release_evntsel_nmi(x86_pmu.eventsel + i);
 
 	i = nr_counters_generic;
 
 perfctr_fail:
 	for (i--; i >= 0; i--)
-		release_perfctr_nmi(x86_pmu->perfctr + i);
+		release_perfctr_nmi(x86_pmu.perfctr + i);
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		enable_lapic_nmi_watchdog();
@@ -216,8 +216,8 @@ static void release_pmc_hardware(void)
 	int i;
 
 	for (i = 0; i < nr_counters_generic; i++) {
-		release_perfctr_nmi(x86_pmu->perfctr + i);
-		release_evntsel_nmi(x86_pmu->eventsel + i);
+		release_perfctr_nmi(x86_pmu.perfctr + i);
+		release_evntsel_nmi(x86_pmu.eventsel + i);
 	}
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -297,14 +297,14 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * Raw event type provide the config in the event structure
 	 */
 	if (perf_event_raw(hw_event)) {
-		hwc->config |= x86_pmu->raw_event(perf_event_config(hw_event));
+		hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event));
 	} else {
-		if (perf_event_id(hw_event) >= x86_pmu->max_events)
+		if (perf_event_id(hw_event) >= x86_pmu.max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= x86_pmu->event_map(perf_event_id(hw_event));
+		hwc->config |= x86_pmu.event_map(perf_event_id(hw_event));
 	}
 
 	counter->destroy = hw_perf_counter_destroy;
@@ -356,7 +356,7 @@ u64 hw_perf_save_disable(void)
 	if (unlikely(!perf_counters_initialized))
 		return 0;
 
-	return x86_pmu->save_disable_all();
+	return x86_pmu.save_disable_all();
 }
 /*
  * Exported because of ACPI idle
@@ -396,7 +396,7 @@ void hw_perf_restore(u64 ctrl)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	x86_pmu->restore_all(ctrl);
+	x86_pmu.restore_all(ctrl);
 }
 /*
  * Exported because of ACPI idle
@@ -441,7 +441,7 @@ static void hw_perf_enable(int idx, u64 config)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	x86_pmu->enable(idx, config);
+	x86_pmu.enable(idx, config);
 }
 
 static void intel_pmu_disable_counter(int idx, u64 config)
@@ -463,7 +463,7 @@ static void hw_perf_disable(int idx, u64 config)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	x86_pmu->disable(idx, config);
+	x86_pmu.disable(idx, config);
 }
 
 static inline void
@@ -580,11 +580,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 
 	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
 
-	if (unlikely(event == x86_pmu->event_map(PERF_COUNT_INSTRUCTIONS)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS)))
 		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
-	if (unlikely(event == x86_pmu->event_map(PERF_COUNT_CPU_CYCLES)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES)))
 		return X86_PMC_IDX_FIXED_CPU_CYCLES;
-	if (unlikely(event == x86_pmu->event_map(PERF_COUNT_BUS_CYCLES)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES)))
 		return X86_PMC_IDX_FIXED_BUS_CYCLES;
 
 	return -1;
@@ -628,8 +628,8 @@ try_generic:
 			set_bit(idx, cpuc->used);
 			hwc->idx = idx;
 		}
-		hwc->config_base  = x86_pmu->eventsel;
-		hwc->counter_base = x86_pmu->perfctr;
+		hwc->config_base  = x86_pmu.eventsel;
+		hwc->counter_base = x86_pmu.perfctr;
 	}
 
 	perf_counters_lapic_init(hwc->nmi);
@@ -677,8 +677,8 @@ void perf_counter_print_debug(void)
 	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-		rdmsrl(x86_pmu->eventsel + idx, pmc_ctrl);
-		rdmsrl(x86_pmu->perfctr  + idx, pmc_count);
+		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
+		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
 
 		prev_left = per_cpu(prev_left[idx], cpu);
 
@@ -819,7 +819,7 @@ void smp_perf_counter_interrupt(struct pt_regs *regs)
 	irq_enter();
 	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
 	ack_APIC_irq();
-	x86_pmu->handle_irq(regs, 0);
+	x86_pmu.handle_irq(regs, 0);
 	irq_exit();
 }
 
@@ -876,7 +876,7 @@ perf_counter_nmi_handler(struct notifier_block *self,
 	regs = args->regs;
 
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	ret = x86_pmu->handle_irq(regs, 1);
+	ret = x86_pmu.handle_irq(regs, 1);
 
 	return ret ? NOTIFY_STOP : NOTIFY_OK;
 }
@@ -940,7 +940,7 @@ static int intel_pmu_init(void)
 	pr_info("... bit width:       %d\n", eax.split.bit_width);
 	pr_info("... mask length:     %d\n", eax.split.mask_length);
 
-	x86_pmu = &intel_pmu;
+	x86_pmu = intel_pmu;
 
 	nr_counters_generic = eax.split.num_counters;
 	nr_counters_fixed = edx.split.num_counters_fixed;
@@ -951,7 +951,7 @@ static int intel_pmu_init(void)
 
 static int amd_pmu_init(void)
 {
-	x86_pmu = &amd_pmu;
+	x86_pmu = amd_pmu;
 
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
-- 
cgit v1.2.3


From 0933e5c6a680ba8d8d786a6f7fa377b7ec0d1e49 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:12 +0200
Subject: perf_counter, x86: move counter parameters to struct x86_pmu

[ Impact: refactor and generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-16-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 80 ++++++++++++++++++--------------------
 1 file changed, 37 insertions(+), 43 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 68597d76338..75dbb1f0900 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -24,16 +24,7 @@
 #include <asm/nmi.h>
 
 static bool perf_counters_initialized __read_mostly;
-
-/*
- * Number of (generic) HW counters:
- */
-static int nr_counters_generic __read_mostly;
 static u64 perf_counter_mask __read_mostly;
-static u64 counter_value_mask __read_mostly;
-static int counter_value_bits __read_mostly;
-
-static int nr_counters_fixed __read_mostly;
 
 struct cpu_hw_counters {
 	struct perf_counter	*counters[X86_PMC_IDX_MAX];
@@ -58,6 +49,10 @@ struct x86_pmu {
 	u64		(*event_map)(int);
 	u64		(*raw_event)(u64);
 	int		max_events;
+	int		num_counters;
+	int		num_counters_fixed;
+	int		counter_bits;
+	u64		counter_mask;
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -183,12 +178,12 @@ static bool reserve_pmc_hardware(void)
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		disable_lapic_nmi_watchdog();
 
-	for (i = 0; i < nr_counters_generic; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
 			goto perfctr_fail;
 	}
 
-	for (i = 0; i < nr_counters_generic; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
 			goto eventsel_fail;
 	}
@@ -199,7 +194,7 @@ eventsel_fail:
 	for (i--; i >= 0; i--)
 		release_evntsel_nmi(x86_pmu.eventsel + i);
 
-	i = nr_counters_generic;
+	i = x86_pmu.num_counters;
 
 perfctr_fail:
 	for (i--; i >= 0; i--)
@@ -215,7 +210,7 @@ static void release_pmc_hardware(void)
 {
 	int i;
 
-	for (i = 0; i < nr_counters_generic; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		release_perfctr_nmi(x86_pmu.perfctr + i);
 		release_evntsel_nmi(x86_pmu.eventsel + i);
 	}
@@ -336,7 +331,7 @@ static u64 amd_pmu_save_disable_all(void)
 	 */
 	barrier();
 
-	for (idx = 0; idx < nr_counters_generic; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
 		if (!test_bit(idx, cpuc->active_mask))
@@ -378,7 +373,7 @@ static void amd_pmu_restore_all(u64 ctrl)
 	if (!ctrl)
 		return;
 
-	for (idx = 0; idx < nr_counters_generic; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
 		if (!test_bit(idx, cpuc->active_mask))
@@ -527,7 +522,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 	atomic64_set(&hwc->prev_count, (u64)-left);
 
 	err = checking_wrmsrl(hwc->counter_base + idx,
-			     (u64)(-left) & counter_value_mask);
+			     (u64)(-left) & x86_pmu.counter_mask);
 }
 
 static inline void
@@ -621,8 +616,9 @@ static int x86_pmu_enable(struct perf_counter *counter)
 		/* Try to get the previous generic counter again */
 		if (test_and_set_bit(idx, cpuc->used)) {
 try_generic:
-			idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
-			if (idx == nr_counters_generic)
+			idx = find_first_zero_bit(cpuc->used,
+						  x86_pmu.num_counters);
+			if (idx == x86_pmu.num_counters)
 				return -EAGAIN;
 
 			set_bit(idx, cpuc->used);
@@ -654,7 +650,7 @@ void perf_counter_print_debug(void)
 	struct cpu_hw_counters *cpuc;
 	int cpu, idx;
 
-	if (!nr_counters_generic)
+	if (!x86_pmu.num_counters)
 		return;
 
 	local_irq_disable();
@@ -676,7 +672,7 @@ void perf_counter_print_debug(void)
 	}
 	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
-	for (idx = 0; idx < nr_counters_generic; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
 		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
 
@@ -689,7 +685,7 @@ void perf_counter_print_debug(void)
 		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
 			cpu, idx, prev_left);
 	}
-	for (idx = 0; idx < nr_counters_fixed; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
 		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
 
 		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -911,6 +907,9 @@ static struct x86_pmu amd_pmu = {
 	.event_map		= amd_pmu_event_map,
 	.raw_event		= amd_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
+	.num_counters		= 4,
+	.counter_bits		= 48,
+	.counter_mask		= (1ULL << 48) - 1,
 };
 
 static int intel_pmu_init(void)
@@ -941,10 +940,10 @@ static int intel_pmu_init(void)
 	pr_info("... mask length:     %d\n", eax.split.mask_length);
 
 	x86_pmu = intel_pmu;
-
-	nr_counters_generic = eax.split.num_counters;
-	nr_counters_fixed = edx.split.num_counters_fixed;
-	counter_value_mask = (1ULL << eax.split.bit_width) - 1;
+	x86_pmu.num_counters = eax.split.num_counters;
+	x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
+	x86_pmu.counter_bits = eax.split.bit_width;
+	x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
 
 	return 0;
 }
@@ -952,12 +951,6 @@ static int intel_pmu_init(void)
 static int amd_pmu_init(void)
 {
 	x86_pmu = amd_pmu;
-
-	nr_counters_generic = 4;
-	nr_counters_fixed = 0;
-	counter_value_mask = 0x0000FFFFFFFFFFFFULL;
-	counter_value_bits = 48;
-
 	pr_info("AMD Performance Monitoring support detected.\n");
 	return 0;
 }
@@ -979,25 +972,26 @@ void __init init_hw_perf_counters(void)
 	if (err != 0)
 		return;
 
-	pr_info("... num counters:    %d\n", nr_counters_generic);
-	if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
-		nr_counters_generic = X86_PMC_MAX_GENERIC;
+	pr_info("... num counters:    %d\n", x86_pmu.num_counters);
+	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
+		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
 		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
-			nr_counters_generic, X86_PMC_MAX_GENERIC);
+		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
 	}
-	perf_counter_mask = (1 << nr_counters_generic) - 1;
-	perf_max_counters = nr_counters_generic;
+	perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
+	perf_max_counters = x86_pmu.num_counters;
 
-	pr_info("... value mask:      %016Lx\n", counter_value_mask);
+	pr_info("... value mask:      %016Lx\n", x86_pmu.counter_mask);
 
-	if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
-		nr_counters_fixed = X86_PMC_MAX_FIXED;
+	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
+		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
 		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
-			nr_counters_fixed, X86_PMC_MAX_FIXED);
+		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
 	}
-	pr_info("... fixed counters:  %d\n", nr_counters_fixed);
+	pr_info("... fixed counters:  %d\n", x86_pmu.num_counters_fixed);
 
-	perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;
+	perf_counter_mask |=
+		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
 	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
 	perf_counters_initialized = true;
-- 
cgit v1.2.3


From faa28ae018ed004a22aa4a7704e04ccdde4a941e Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:13 +0200
Subject: perf_counter, x86: make pmu version generic

This makes the use of the version variable generic. Also, some debug
messages have been generalized.

[ Impact: refactor and generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-17-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 75dbb1f0900..15d2c03e16f 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -39,6 +39,8 @@ struct cpu_hw_counters {
  * struct x86_pmu - generic x86 pmu
  */
 struct x86_pmu {
+	const char	*name;
+	int		version;
 	int		(*handle_irq)(struct pt_regs *, int);
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
@@ -61,8 +63,6 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
 	.enabled = 1,
 };
 
-static __read_mostly int intel_perfmon_version;
-
 /*
  * Intel PerfMon v3. Used on Core2 and later.
  */
@@ -658,7 +658,7 @@ void perf_counter_print_debug(void)
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
-	if (intel_perfmon_version >= 2) {
+	if (x86_pmu.version >= 2) {
 		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
@@ -884,6 +884,7 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 };
 
 static struct x86_pmu intel_pmu = {
+	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
 	.save_disable_all	= intel_pmu_save_disable_all,
 	.restore_all		= intel_pmu_restore_all,
@@ -897,6 +898,7 @@ static struct x86_pmu intel_pmu = {
 };
 
 static struct x86_pmu amd_pmu = {
+	.name			= "AMD",
 	.handle_irq		= amd_pmu_handle_irq,
 	.save_disable_all	= amd_pmu_save_disable_all,
 	.restore_all		= amd_pmu_restore_all,
@@ -918,6 +920,7 @@ static int intel_pmu_init(void)
 	union cpuid10_eax eax;
 	unsigned int unused;
 	unsigned int ebx;
+	int version;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 		return -ENODEV;
@@ -930,16 +933,12 @@ static int intel_pmu_init(void)
 	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
 		return -ENODEV;
 
-	intel_perfmon_version = eax.split.version_id;
-	if (intel_perfmon_version < 2)
+	version = eax.split.version_id;
+	if (version < 2)
 		return -ENODEV;
 
-	pr_info("Intel Performance Monitoring support detected.\n");
-	pr_info("... version:         %d\n", intel_perfmon_version);
-	pr_info("... bit width:       %d\n", eax.split.bit_width);
-	pr_info("... mask length:     %d\n", eax.split.mask_length);
-
 	x86_pmu = intel_pmu;
+	x86_pmu.version = version;
 	x86_pmu.num_counters = eax.split.num_counters;
 	x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
 	x86_pmu.counter_bits = eax.split.bit_width;
@@ -951,7 +950,6 @@ static int intel_pmu_init(void)
 static int amd_pmu_init(void)
 {
 	x86_pmu = amd_pmu;
-	pr_info("AMD Performance Monitoring support detected.\n");
 	return 0;
 }
 
@@ -972,6 +970,10 @@ void __init init_hw_perf_counters(void)
 	if (err != 0)
 		return;
 
+	pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name);
+	pr_info("... version:         %d\n", x86_pmu.version);
+	pr_info("... bit width:       %d\n", x86_pmu.counter_bits);
+
 	pr_info("... num counters:    %d\n", x86_pmu.num_counters);
 	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
 		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
-- 
cgit v1.2.3


From bb775fc2d1dcd1aa6eafde37a8289ba2d80783aa Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:14 +0200
Subject: perf_counter, x86: make x86_pmu_read() static inline

[ Impact: micro-optimization ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-18-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 15d2c03e16f..3f3ae477a7d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1002,7 +1002,7 @@ void __init init_hw_perf_counters(void)
 	register_die_notifier(&perf_counter_nmi_notifier);
 }
 
-static void x86_pmu_read(struct perf_counter *counter)
+static inline void x86_pmu_read(struct perf_counter *counter)
 {
 	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
 }
-- 
cgit v1.2.3


From 93904966934193204ad08e951f806d5631c29eb3 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:15 +0200
Subject: perf_counter, x86: rename cpuc->active_mask

This is to have a consistent naming scheme with cpuc->used.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-19-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3f3ae477a7d..9ec51a662db 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -29,9 +29,9 @@ static u64 perf_counter_mask __read_mostly;
 struct cpu_hw_counters {
 	struct perf_counter	*counters[X86_PMC_IDX_MAX];
 	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	unsigned long		active[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		interrupts;
 	u64			throttle_ctrl;
-	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	int			enabled;
 };
 
@@ -334,7 +334,7 @@ static u64 amd_pmu_save_disable_all(void)
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
-		if (!test_bit(idx, cpuc->active_mask))
+		if (!test_bit(idx, cpuc->active))
 			continue;
 		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
 		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
@@ -376,7 +376,7 @@ static void amd_pmu_restore_all(u64 ctrl)
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
-		if (!test_bit(idx, cpuc->active_mask))
+		if (!test_bit(idx, cpuc->active))
 			continue;
 		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
 		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
@@ -424,7 +424,7 @@ static void amd_pmu_enable_counter(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
-	set_bit(idx, cpuc->active_mask);
+	set_bit(idx, cpuc->active);
 	if (cpuc->enabled)
 		config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
 
@@ -448,7 +448,7 @@ static void amd_pmu_disable_counter(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
-	clear_bit(idx, cpuc->active_mask);
+	clear_bit(idx, cpuc->active);
 	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
 
 }
-- 
cgit v1.2.3


From 095342389e2ed8deed07b3076f990260ce3c7c9f Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:16 +0200
Subject: perf_counter, x86: generic use of cpuc->active

cpuc->active will now be used to indicate an enabled counter which
implies also valid pointers of cpuc->counters[]. In contrast,
cpuc->used only locks the counter, but it can be still uninitialized.

[ Impact: refactor and generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-20-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 9ec51a662db..f7fd4a35515 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -424,7 +424,6 @@ static void amd_pmu_enable_counter(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
-	set_bit(idx, cpuc->active);
 	if (cpuc->enabled)
 		config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
 
@@ -446,9 +445,6 @@ static void intel_pmu_disable_counter(int idx, u64 config)
 
 static void amd_pmu_disable_counter(int idx, u64 config)
 {
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-	clear_bit(idx, cpuc->active);
 	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
 
 }
@@ -633,10 +629,7 @@ try_generic:
 	__x86_pmu_disable(counter, hwc, idx);
 
 	cpuc->counters[idx] = counter;
-	/*
-	 * Make it visible before enabling the hw:
-	 */
-	barrier();
+	set_bit(idx, cpuc->active);
 
 	x86_perf_counter_set_period(counter, hwc, idx);
 	__x86_pmu_enable(counter, hwc, idx);
@@ -700,10 +693,13 @@ static void x86_pmu_disable(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned int idx = hwc->idx;
 
+	/*
+	 * Must be done before we disable, otherwise the nmi handler
+	 * could reenable again:
+	 */
+	clear_bit(idx, cpuc->active);
 	__x86_pmu_disable(counter, hwc, idx);
 
-	clear_bit(idx, cpuc->used);
-	cpuc->counters[idx] = NULL;
 	/*
 	 * Make sure the cleared pointer becomes visible before we
 	 * (potentially) free the counter:
@@ -715,6 +711,8 @@ static void x86_pmu_disable(struct perf_counter *counter)
 	 * that we are disabling:
 	 */
 	x86_perf_counter_update(counter, hwc, idx);
+	cpuc->counters[idx] = NULL;
+	clear_bit(idx, cpuc->used);
 }
 
 /*
@@ -763,7 +761,7 @@ again:
 		struct perf_counter *counter = cpuc->counters[bit];
 
 		clear_bit(bit, (unsigned long *) &status);
-		if (!counter)
+		if (!test_bit(bit, cpuc->active))
 			continue;
 
 		intel_pmu_save_and_restart(counter);
-- 
cgit v1.2.3


From 6f00cada07bb5da7f751929d3173494dcc5446cc Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:17 +0200
Subject: perf_counter, x86: consistent use of type int for counter index

The type of counter index is sometimes implemented as unsigned
int. This patch changes this to have a consistent usage of int.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-21-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index f7fd4a35515..d8beebeb270 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -459,7 +459,7 @@ static void hw_perf_disable(int idx, u64 config)
 
 static inline void
 __pmc_fixed_disable(struct perf_counter *counter,
-		    struct hw_perf_counter *hwc, unsigned int __idx)
+		    struct hw_perf_counter *hwc, int __idx)
 {
 	int idx = __idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, mask;
@@ -474,7 +474,7 @@ __pmc_fixed_disable(struct perf_counter *counter,
 
 static inline void
 __x86_pmu_disable(struct perf_counter *counter,
-		  struct hw_perf_counter *hwc, unsigned int idx)
+		  struct hw_perf_counter *hwc, int idx)
 {
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
 		__pmc_fixed_disable(counter, hwc, idx);
@@ -523,7 +523,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 
 static inline void
 __pmc_fixed_enable(struct perf_counter *counter,
-		   struct hw_perf_counter *hwc, unsigned int __idx)
+		   struct hw_perf_counter *hwc, int __idx)
 {
 	int idx = __idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, bits, mask;
@@ -691,7 +691,7 @@ static void x86_pmu_disable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
-	unsigned int idx = hwc->idx;
+	int idx = hwc->idx;
 
 	/*
 	 * Must be done before we disable, otherwise the nmi handler
-- 
cgit v1.2.3


From 7c90cc45f89af4dd4617f97d452740ad95b800d5 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:18 +0200
Subject: perf_counter, x86: rework counter enable functions

There is vendor specific code in generic x86 code, and there is vendor
specific code that could be generic. This patch introduces
x86_pmu_enable_counter() for x86 generic code. Fixed counter code for
Intel is moved to Intel only functions. In the end, checks and calls
via function pointers were reduced to the necessary. Also, the
internal function i/f changed.

[ Impact: refactor and generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-22-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 52 ++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 28 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d8beebeb270..ae55933ce79 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -44,7 +44,7 @@ struct x86_pmu {
 	int		(*handle_irq)(struct pt_regs *, int);
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
-	void		(*enable)(int, u64);
+	void		(*enable)(struct hw_perf_counter *, int);
 	void		(*disable)(int, u64);
 	unsigned	eventsel;
 	unsigned	perfctr;
@@ -414,28 +414,15 @@ static inline void intel_pmu_ack_status(u64 ack)
 	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
 }
 
-static void intel_pmu_enable_counter(int idx, u64 config)
+static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 {
-	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
-			config | ARCH_PERFMON_EVENTSEL0_ENABLE);
-}
-
-static void amd_pmu_enable_counter(int idx, u64 config)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-	if (cpuc->enabled)
-		config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-
-	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
-}
+	int err;
 
-static void hw_perf_enable(int idx, u64 config)
-{
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	x86_pmu.enable(idx, config);
+	err = checking_wrmsrl(hwc->config_base + idx,
+			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
 }
 
 static void intel_pmu_disable_counter(int idx, u64 config)
@@ -522,8 +509,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 }
 
 static inline void
-__pmc_fixed_enable(struct perf_counter *counter,
-		   struct hw_perf_counter *hwc, int __idx)
+intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
 {
 	int idx = __idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, bits, mask;
@@ -548,14 +534,24 @@ __pmc_fixed_enable(struct perf_counter *counter,
 	err = checking_wrmsrl(hwc->config_base, ctrl_val);
 }
 
-static void
-__x86_pmu_enable(struct perf_counter *counter,
-		 struct hw_perf_counter *hwc, int idx)
+static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 {
-	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
-		__pmc_fixed_enable(counter, hwc, idx);
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		intel_pmu_enable_fixed(hwc, idx);
+		return;
+	}
+
+	x86_pmu_enable_counter(hwc, idx);
+}
+
+static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	if (cpuc->enabled)
+		x86_pmu_enable_counter(hwc, idx);
 	else
-		hw_perf_enable(idx, hwc->config);
+		amd_pmu_disable_counter(idx, hwc->config);
 }
 
 static int
@@ -632,7 +628,7 @@ try_generic:
 	set_bit(idx, cpuc->active);
 
 	x86_perf_counter_set_period(counter, hwc, idx);
-	__x86_pmu_enable(counter, hwc, idx);
+	x86_pmu.enable(hwc, idx);
 
 	return 0;
 }
@@ -728,7 +724,7 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter)
 	x86_perf_counter_set_period(counter, hwc, idx);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
-		__x86_pmu_enable(counter, hwc, idx);
+		intel_pmu_enable_counter(hwc, idx);
 }
 
 /*
-- 
cgit v1.2.3


From d43698918bd46c71d494555fb92195fbea1fcb6c Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:19 +0200
Subject: perf_counter, x86: rework counter disable functions

As for the enable function, this patch reworks the disable functions
and introduces x86_pmu_disable_counter(). The internal function i/f in
struct x86_pmu changed too.

[ Impact: refactor and generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-23-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 48 ++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 25 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index ae55933ce79..df9012bbd21 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -45,7 +45,7 @@ struct x86_pmu {
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
 	void		(*enable)(struct hw_perf_counter *, int);
-	void		(*disable)(int, u64);
+	void		(*disable)(struct hw_perf_counter *, int);
 	unsigned	eventsel;
 	unsigned	perfctr;
 	u64		(*event_map)(int);
@@ -425,28 +425,19 @@ static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
 }
 
-static void intel_pmu_disable_counter(int idx, u64 config)
+static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
 {
-	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config);
-}
-
-static void amd_pmu_disable_counter(int idx, u64 config)
-{
-	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
-
-}
+	int err;
 
-static void hw_perf_disable(int idx, u64 config)
-{
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	x86_pmu.disable(idx, config);
+	err = checking_wrmsrl(hwc->config_base + idx,
+			      hwc->config);
 }
 
 static inline void
-__pmc_fixed_disable(struct perf_counter *counter,
-		    struct hw_perf_counter *hwc, int __idx)
+intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
 {
 	int idx = __idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, mask;
@@ -460,13 +451,20 @@ __pmc_fixed_disable(struct perf_counter *counter,
 }
 
 static inline void
-__x86_pmu_disable(struct perf_counter *counter,
-		  struct hw_perf_counter *hwc, int idx)
+intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
 {
-	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
-		__pmc_fixed_disable(counter, hwc, idx);
-	else
-		hw_perf_disable(idx, hwc->config);
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		intel_pmu_disable_fixed(hwc, idx);
+		return;
+	}
+
+	x86_pmu_disable_counter(hwc, idx);
+}
+
+static inline void
+amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+	x86_pmu_disable_counter(hwc, idx);
 }
 
 static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
@@ -551,7 +549,7 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 	if (cpuc->enabled)
 		x86_pmu_enable_counter(hwc, idx);
 	else
-		amd_pmu_disable_counter(idx, hwc->config);
+		x86_pmu_disable_counter(hwc, idx);
 }
 
 static int
@@ -622,7 +620,7 @@ try_generic:
 
 	perf_counters_lapic_init(hwc->nmi);
 
-	__x86_pmu_disable(counter, hwc, idx);
+	x86_pmu.disable(hwc, idx);
 
 	cpuc->counters[idx] = counter;
 	set_bit(idx, cpuc->active);
@@ -694,7 +692,7 @@ static void x86_pmu_disable(struct perf_counter *counter)
 	 * could reenable again:
 	 */
 	clear_bit(idx, cpuc->active);
-	__x86_pmu_disable(counter, hwc, idx);
+	x86_pmu.disable(hwc, idx);
 
 	/*
 	 * Make sure the cleared pointer becomes visible before we
@@ -762,7 +760,7 @@ again:
 
 		intel_pmu_save_and_restart(counter);
 		if (perf_counter_overflow(counter, nmi, regs, 0))
-			__x86_pmu_disable(counter, &counter->hw, bit);
+			intel_pmu_disable_counter(&counter->hw, bit);
 	}
 
 	intel_pmu_ack_status(ack);
-- 
cgit v1.2.3


From 85cf9dba92152bb4edec118b2f4f0be1ae7fdcab Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:20 +0200
Subject: perf_counter, x86: change and remove pmu initialization checks

Some functions are only called if the pmu was proper initialized. That
initalization checks can be removed. The way to check initialization
changed too. Now, the pointer to the interrupt handler is checked. If
it exists the pmu is initialized. This also removes a static variable
and uses struct x86_pmu as only data source for the check.

[ Impact: simplify code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-24-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 34 +++++++++++++---------------------
 1 file changed, 13 insertions(+), 21 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index df9012bbd21..2d3681bbb52 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -23,7 +23,6 @@
 #include <asm/stacktrace.h>
 #include <asm/nmi.h>
 
-static bool perf_counters_initialized __read_mostly;
 static u64 perf_counter_mask __read_mostly;
 
 struct cpu_hw_counters {
@@ -227,6 +226,11 @@ static void hw_perf_counter_destroy(struct perf_counter *counter)
 	}
 }
 
+static inline int x86_pmu_initialized(void)
+{
+	return x86_pmu.handle_irq != NULL;
+}
+
 /*
  * Setup the hardware configuration for a given hw_event_type
  */
@@ -240,8 +244,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
 		return -ENOSYS;
 
-	if (unlikely(!perf_counters_initialized))
-		return -EINVAL;
+	if (!x86_pmu_initialized())
+		return -ENODEV;
 
 	err = 0;
 	if (atomic_inc_not_zero(&num_counters)) {
@@ -348,9 +352,8 @@ static u64 amd_pmu_save_disable_all(void)
 
 u64 hw_perf_save_disable(void)
 {
-	if (unlikely(!perf_counters_initialized))
+	if (!x86_pmu_initialized())
 		return 0;
-
 	return x86_pmu.save_disable_all();
 }
 /*
@@ -388,9 +391,8 @@ static void amd_pmu_restore_all(u64 ctrl)
 
 void hw_perf_restore(u64 ctrl)
 {
-	if (unlikely(!perf_counters_initialized))
+	if (!x86_pmu_initialized())
 		return;
-
 	x86_pmu.restore_all(ctrl);
 }
 /*
@@ -402,8 +404,6 @@ static inline u64 intel_pmu_get_status(u64 mask)
 {
 	u64 status;
 
-	if (unlikely(!perf_counters_initialized))
-		return 0;
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 
 	return status;
@@ -417,10 +417,6 @@ static inline void intel_pmu_ack_status(u64 ack)
 static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 {
 	int err;
-
-	if (unlikely(!perf_counters_initialized))
-		return;
-
 	err = checking_wrmsrl(hwc->config_base + idx,
 			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
 }
@@ -428,10 +424,6 @@ static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
 {
 	int err;
-
-	if (unlikely(!perf_counters_initialized))
-		return;
-
 	err = checking_wrmsrl(hwc->config_base + idx,
 			      hwc->config);
 }
@@ -787,10 +779,10 @@ void perf_counter_unthrottle(void)
 {
 	struct cpu_hw_counters *cpuc;
 
-	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+	if (!x86_pmu_initialized())
 		return;
 
-	if (unlikely(!perf_counters_initialized))
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 		return;
 
 	cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -829,8 +821,9 @@ void perf_counters_lapic_init(int nmi)
 {
 	u32 apic_val;
 
-	if (!perf_counters_initialized)
+	if (!x86_pmu_initialized())
 		return;
+
 	/*
 	 * Enable the performance counter vector in the APIC LVT:
 	 */
@@ -988,7 +981,6 @@ void __init init_hw_perf_counters(void)
 		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
 	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
-	perf_counters_initialized = true;
 
 	perf_counters_lapic_init(0);
 	register_die_notifier(&perf_counter_nmi_notifier);
-- 
cgit v1.2.3


From a29aa8a7ff93e4196d558036928597e68337dd8d Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:21 +0200
Subject: perf_counter, x86: implement the interrupt handler for AMD cpus

This patch implements the interrupt handler for AMD performance
counters. In difference to the Intel pmu, there is no single status
register and also there are no fixed counters. This makes the handler
very different and it is useful to make the handler vendor
specific. To check if a counter is overflowed the upper bit of the
counter is checked. Only counters where the active bit is set are
checked.

With this patch throttling is enabled for AMD performance counters.

This patch also reenables Linux performance counters on AMD cpus.

[ Impact: re-enable perfcounters on AMD CPUs ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-25-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 45 +++++++++++++++++++++++++++++++-------
 1 file changed, 37 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 2d3681bbb52..f4d59d4cf3f 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -240,10 +240,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	int err;
 
-	/* disable temporarily */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
-		return -ENOSYS;
-
 	if (!x86_pmu_initialized())
 		return -ENODEV;
 
@@ -773,7 +769,43 @@ out:
 	return ret;
 }
 
-static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { return 0; }
+static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
+{
+	int cpu = smp_processor_id();
+	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
+	u64 val;
+	int handled = 0;
+	struct perf_counter *counter;
+	struct hw_perf_counter *hwc;
+	int idx;
+
+	++cpuc->interrupts;
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		if (!test_bit(idx, cpuc->active))
+			continue;
+		counter = cpuc->counters[idx];
+		hwc = &counter->hw;
+		x86_perf_counter_update(counter, hwc, idx);
+		val = atomic64_read(&hwc->prev_count);
+		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
+			continue;
+		/* counter overflow */
+		x86_perf_counter_set_period(counter, hwc, idx);
+		handled = 1;
+		inc_irq_stat(apic_perf_irqs);
+		if (perf_counter_overflow(counter, nmi, regs, 0))
+			amd_pmu_disable_counter(hwc, idx);
+		else if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS)
+			/*
+			 * do not reenable when throttled, but reload
+			 * the register
+			 */
+			amd_pmu_disable_counter(hwc, idx);
+		else if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+			amd_pmu_enable_counter(hwc, idx);
+	}
+	return handled;
+}
 
 void perf_counter_unthrottle(void)
 {
@@ -782,9 +814,6 @@ void perf_counter_unthrottle(void)
 	if (!x86_pmu_initialized())
 		return;
 
-	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-		return;
-
 	cpuc = &__get_cpu_var(cpu_hw_counters);
 	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
 		if (printk_ratelimit())
-- 
cgit v1.2.3


From 4b7bfd0d276da3a006d37e85d3cf900d7a14ae2a Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:22 +0200
Subject: perf_counter, x86: return raw count with x86_perf_counter_update()

To check on AMD cpus if a counter overflows, the upper bit of the raw
counter value must be checked. This value is already internally
available in x86_perf_counter_update(). Now, the value is returned so
that it can be used directly to check for overflows.

[ Impact: micro-optimization ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-26-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index f4d59d4cf3f..a8a53abd706 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -132,7 +132,7 @@ static u64 amd_pmu_raw_event(u64 event)
  * Can only be executed on the CPU where the counter is active.
  * Returns the delta events processed.
  */
-static void
+static u64
 x86_perf_counter_update(struct perf_counter *counter,
 			struct hw_perf_counter *hwc, int idx)
 {
@@ -165,6 +165,8 @@ again:
 
 	atomic64_add(delta, &counter->count);
 	atomic64_sub(delta, &hwc->period_left);
+
+	return new_raw_count;
 }
 
 static atomic_t num_counters;
@@ -785,8 +787,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 			continue;
 		counter = cpuc->counters[idx];
 		hwc = &counter->hw;
-		x86_perf_counter_update(counter, hwc, idx);
-		val = atomic64_read(&hwc->prev_count);
+		val = x86_perf_counter_update(counter, hwc, idx);
 		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
 			continue;
 		/* counter overflow */
-- 
cgit v1.2.3


From c619b8ffb1cec6a431687a35695dc6fd292a79e6 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:23 +0200
Subject: perf_counter, x86: introduce max_period variable

In x86 pmus the allowed counter period to programm differs. This
introduces a max_period value and allows the generic implementation
for all models to check the max period.

[ Impact: generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-27-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a8a53abd706..4b8715b34f8 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -54,6 +54,7 @@ struct x86_pmu {
 	int		num_counters_fixed;
 	int		counter_bits;
 	u64		counter_mask;
+	u64		max_period;
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -279,14 +280,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->nmi = 1;
 
 	hwc->irq_period		= hw_event->irq_period;
-	/*
-	 * Intel PMCs cannot be accessed sanely above 32 bit width,
-	 * so we install an artificial 1<<31 period regardless of
-	 * the generic counter period:
-	 */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-		if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
-			hwc->irq_period = 0x7FFFFFFF;
+	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period)
+		hwc->irq_period = x86_pmu.max_period;
 
 	atomic64_set(&hwc->period_left, hwc->irq_period);
 
@@ -910,6 +905,12 @@ static struct x86_pmu intel_pmu = {
 	.event_map		= intel_pmu_event_map,
 	.raw_event		= intel_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
+	/*
+	 * Intel PMCs cannot be accessed sanely above 32 bit width,
+	 * so we install an artificial 1<<31 period regardless of
+	 * the generic counter period:
+	 */
+	.max_period		= (1ULL << 31) - 1,
 };
 
 static struct x86_pmu amd_pmu = {
@@ -927,6 +928,8 @@ static struct x86_pmu amd_pmu = {
 	.num_counters		= 4,
 	.counter_bits		= 48,
 	.counter_mask		= (1ULL << 48) - 1,
+	/* use highest bit to detect overflow */
+	.max_period		= (1ULL << 47) - 1,
 };
 
 static int intel_pmu_init(void)
@@ -999,6 +1002,7 @@ void __init init_hw_perf_counters(void)
 	perf_max_counters = x86_pmu.num_counters;
 
 	pr_info("... value mask:      %016Lx\n", x86_pmu.counter_mask);
+	pr_info("... max period:      %016Lx\n", x86_pmu.max_period);
 
 	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
 		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
-- 
cgit v1.2.3


From ef7b3e09ffdcd5200aea9523f6b56d331d1c4fc0 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:24 +0200
Subject: perf_counter, x86: remove vendor check in fixed_mode_idx()

The function fixed_mode_idx() is used generically. Now it checks the
num_counters_fixed value instead of the vendor to decide if fixed
counters are present.

[ Impact: generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-28-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 4b8715b34f8..d1c8036dcbd 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -542,7 +542,7 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 {
 	unsigned int event;
 
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+	if (!x86_pmu.num_counters_fixed)
 		return -1;
 
 	if (unlikely(hwc->nmi))
-- 
cgit v1.2.3


From 19d84dab55a383d75c885b5c1a618f5ead96f2f6 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:25 +0200
Subject: perf_counter, x86: remove unused function argument in
 intel_pmu_get_status()

The mask argument is unused and thus can be removed.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-29-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d1c8036dcbd..856b0b85219 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -393,7 +393,7 @@ void hw_perf_restore(u64 ctrl)
  */
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
-static inline u64 intel_pmu_get_status(u64 mask)
+static inline u64 intel_pmu_get_status(void)
 {
 	u64 status;
 
@@ -728,7 +728,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 
 	cpuc->throttle_ctrl = intel_pmu_save_disable_all();
 
-	status = intel_pmu_get_status(cpuc->throttle_ctrl);
+	status = intel_pmu_get_status();
 	if (!status)
 		goto out;
 
@@ -753,7 +753,7 @@ again:
 	/*
 	 * Repeat if there is more work to be done:
 	 */
-	status = intel_pmu_get_status(cpuc->throttle_ctrl);
+	status = intel_pmu_get_status();
 	if (status)
 		goto again;
 out:
-- 
cgit v1.2.3


From 98144511427c192e4249ff66a3f9debc55c59411 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 29 Apr 2009 14:52:50 +0200
Subject: perf_counter: add/update copyrights

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 856b0b85219..47e563bfd4c 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1,10 +1,11 @@
 /*
  * Performance counter x86 architecture code
  *
- *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
- *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
- *  Copyright(C) 2009 Jaswinder Singh Rajput
- *  Copyright(C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
  *  For licencing details see kernel-base/COPYING
  */
-- 
cgit v1.2.3


From ab7ef2e50a557af92f4f90689f51fadadafc16b2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Apr 2009 22:38:51 +1000
Subject: perf_counter: powerpc: allow use of limited-function counters

POWER5+ and POWER6 have two hardware counters with limited functionality:
PMC5 counts instructions completed in run state and PMC6 counts cycles
in run state.  (Run state is the state when a hardware RUN bit is 1;
the idle task clears RUN while waiting for work to do and sets it when
there is work to do.)

These counters can't be written to by the kernel, can't generate
interrupts, and don't obey the freeze conditions.  That means we can
only use them for per-task counters (where we know we'll always be in
run state; we can't put a per-task counter on an idle task), and only
if we don't want interrupts and we do want to count in all processor
modes.

Obviously some counters can't go on a limited hardware counter, but there
are also situations where we can only put a counter on a limited hardware
counter - if there are already counters on that exclude some processor
modes and we want to put on a per-task cycle or instruction counter that
doesn't exclude any processor mode, it could go on if it can use a
limited hardware counter.

To keep track of these constraints, this adds a flags argument to the
processor-specific get_alternatives() functions, with three bits defined:
one to say that we can accept alternative event codes that go on limited
counters, one to say we only want alternatives on limited counters, and
one to say that this is a per-task counter and therefore events that are
gated by run state are equivalent to those that aren't (e.g. a "cycles"
event is equivalent to a "cycles in run state" event).  These flags
are computed for each counter and stored in the counter->hw.counter_base
field (slightly wonky name for what it does, but it was an existing
unused field).

Since the limited counters don't freeze when we freeze the other counters,
we need some special handling to avoid getting skew between things counted
on the limited counters and those counted on normal counters.  To minimize
this skew, if we are using any limited counters, we read PMC5 and PMC6
immediately after setting and clearing the freeze bit.  This is done in
a single asm in the new write_mmcr0() function.

The code here is specific to PMC5 and PMC6 being the limited hardware
counters.  Being more general (e.g. having a bitmap of limited hardware
counter numbers) would have meant more complex code to read the limited
counters when freezing and unfreezing the normal counters, with
conditional branches, which would have increased the skew.  Since it
isn't necessary for the code to be more general at this stage, it isn't.

This also extends the back-ends for POWER5+ and POWER6 to be able to
handle up to 6 counters rather than the 4 they previously handled.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
LKML-Reference: <18936.19035.163066.892208@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/include/asm/perf_counter.h |  13 +-
 arch/powerpc/kernel/perf_counter.c      | 297 ++++++++++++++++++++++++++++----
 arch/powerpc/kernel/power4-pmu.c        |   3 +-
 arch/powerpc/kernel/power5+-pmu.c       | 117 +++++++++++--
 arch/powerpc/kernel/power5-pmu.c        |   3 +-
 arch/powerpc/kernel/power6-pmu.c        | 119 +++++++++++--
 arch/powerpc/kernel/ppc970-pmu.c        |   3 +-
 7 files changed, 479 insertions(+), 76 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index 9d7ff6d7fb5..56d66c38143 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -12,6 +12,7 @@
 
 #define MAX_HWCOUNTERS		8
 #define MAX_EVENT_ALTERNATIVES	8
+#define MAX_LIMITED_HWCOUNTERS	2
 
 /*
  * This struct provides the constants and functions needed to
@@ -25,14 +26,24 @@ struct power_pmu {
 	int	(*compute_mmcr)(unsigned int events[], int n_ev,
 				unsigned int hwc[], u64 mmcr[]);
 	int	(*get_constraint)(unsigned int event, u64 *mskp, u64 *valp);
-	int	(*get_alternatives)(unsigned int event, unsigned int alt[]);
+	int	(*get_alternatives)(unsigned int event, unsigned int flags,
+				    unsigned int alt[]);
 	void	(*disable_pmc)(unsigned int pmc, u64 mmcr[]);
+	int	(*limited_pmc_event)(unsigned int event);
+	int	limited_pmc5_6;	/* PMC5 and PMC6 have limited function */
 	int	n_generic;
 	int	*generic_events;
 };
 
 extern struct power_pmu *ppmu;
 
+/*
+ * Values for flags to get_alternatives()
+ */
+#define PPMU_LIMITED_PMC_OK	1	/* can put this on a limited PMC */
+#define PPMU_LIMITED_PMC_REQD	2	/* have to put this on a limited PMC */
+#define PPMU_ONLY_COUNT_RUN	4	/* only counting in run state */
+
 /*
  * The power_pmu.get_constraint function returns a 64-bit value and
  * a 64-bit mask that express the constraints between this event and
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index d9bbe5efc64..15cdc8e6722 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -23,10 +23,14 @@ struct cpu_hw_counters {
 	int n_percpu;
 	int disabled;
 	int n_added;
+	int n_limited;
+	u8  pmcs_enabled;
 	struct perf_counter *counter[MAX_HWCOUNTERS];
 	unsigned int events[MAX_HWCOUNTERS];
+	unsigned int flags[MAX_HWCOUNTERS];
 	u64 mmcr[3];
-	u8 pmcs_enabled;
+	struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS];
+	u8  limited_hwidx[MAX_LIMITED_HWCOUNTERS];
 };
 DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
 
@@ -127,7 +131,8 @@ static void write_pmc(int idx, unsigned long val)
  * and see if any combination of alternative codes is feasible.
  * The feasible set is returned in event[].
  */
-static int power_check_constraints(unsigned int event[], int n_ev)
+static int power_check_constraints(unsigned int event[], unsigned int cflags[],
+				   int n_ev)
 {
 	u64 mask, value, nv;
 	unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
@@ -144,11 +149,15 @@ static int power_check_constraints(unsigned int event[], int n_ev)
 
 	/* First see if the events will go on as-is */
 	for (i = 0; i < n_ev; ++i) {
-		alternatives[i][0] = event[i];
+		if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
+		    && !ppmu->limited_pmc_event(event[i])) {
+			ppmu->get_alternatives(event[i], cflags[i],
+					       alternatives[i]);
+			event[i] = alternatives[i][0];
+		}
 		if (ppmu->get_constraint(event[i], &amasks[i][0],
 					 &avalues[i][0]))
 			return -1;
-		choice[i] = 0;
 	}
 	value = mask = 0;
 	for (i = 0; i < n_ev; ++i) {
@@ -166,7 +175,9 @@ static int power_check_constraints(unsigned int event[], int n_ev)
 	if (!ppmu->get_alternatives)
 		return -1;
 	for (i = 0; i < n_ev; ++i) {
-		n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]);
+		choice[i] = 0;
+		n_alt[i] = ppmu->get_alternatives(event[i], cflags[i],
+						  alternatives[i]);
 		for (j = 1; j < n_alt[i]; ++j)
 			ppmu->get_constraint(alternatives[i][j],
 					     &amasks[i][j], &avalues[i][j]);
@@ -231,28 +242,41 @@ static int power_check_constraints(unsigned int event[], int n_ev)
  * exclude_{user,kernel,hv} with each other and any previously
  * added counters.
  */
-static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new)
+static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[],
+			  int n_prev, int n_new)
 {
-	int eu, ek, eh;
-	int i, n;
+	int eu = 0, ek = 0, eh = 0;
+	int i, n, first;
 	struct perf_counter *counter;
 
 	n = n_prev + n_new;
 	if (n <= 1)
 		return 0;
 
-	eu = ctrs[0]->hw_event.exclude_user;
-	ek = ctrs[0]->hw_event.exclude_kernel;
-	eh = ctrs[0]->hw_event.exclude_hv;
-	if (n_prev == 0)
-		n_prev = 1;
-	for (i = n_prev; i < n; ++i) {
+	first = 1;
+	for (i = 0; i < n; ++i) {
+		if (cflags[i] & PPMU_LIMITED_PMC_OK) {
+			cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
+			continue;
+		}
 		counter = ctrs[i];
-		if (counter->hw_event.exclude_user != eu ||
-		    counter->hw_event.exclude_kernel != ek ||
-		    counter->hw_event.exclude_hv != eh)
+		if (first) {
+			eu = counter->hw_event.exclude_user;
+			ek = counter->hw_event.exclude_kernel;
+			eh = counter->hw_event.exclude_hv;
+			first = 0;
+		} else if (counter->hw_event.exclude_user != eu ||
+			   counter->hw_event.exclude_kernel != ek ||
+			   counter->hw_event.exclude_hv != eh) {
 			return -EAGAIN;
+		}
 	}
+
+	if (eu || ek || eh)
+		for (i = 0; i < n; ++i)
+			if (cflags[i] & PPMU_LIMITED_PMC_OK)
+				cflags[i] |= PPMU_LIMITED_PMC_REQD;
+
 	return 0;
 }
 
@@ -279,6 +303,85 @@ static void power_pmu_read(struct perf_counter *counter)
 	atomic64_sub(delta, &counter->hw.period_left);
 }
 
+/*
+ * On some machines, PMC5 and PMC6 can't be written, don't respect
+ * the freeze conditions, and don't generate interrupts.  This tells
+ * us if `counter' is using such a PMC.
+ */
+static int is_limited_pmc(int pmcnum)
+{
+	return ppmu->limited_pmc5_6 && (pmcnum == 5 || pmcnum == 6);
+}
+
+static void freeze_limited_counters(struct cpu_hw_counters *cpuhw,
+				    unsigned long pmc5, unsigned long pmc6)
+{
+	struct perf_counter *counter;
+	u64 val, prev, delta;
+	int i;
+
+	for (i = 0; i < cpuhw->n_limited; ++i) {
+		counter = cpuhw->limited_counter[i];
+		if (!counter->hw.idx)
+			continue;
+		val = (counter->hw.idx == 5) ? pmc5 : pmc6;
+		prev = atomic64_read(&counter->hw.prev_count);
+		counter->hw.idx = 0;
+		delta = (val - prev) & 0xfffffffful;
+		atomic64_add(delta, &counter->count);
+	}
+}
+
+static void thaw_limited_counters(struct cpu_hw_counters *cpuhw,
+				  unsigned long pmc5, unsigned long pmc6)
+{
+	struct perf_counter *counter;
+	u64 val;
+	int i;
+
+	for (i = 0; i < cpuhw->n_limited; ++i) {
+		counter = cpuhw->limited_counter[i];
+		counter->hw.idx = cpuhw->limited_hwidx[i];
+		val = (counter->hw.idx == 5) ? pmc5 : pmc6;
+		atomic64_set(&counter->hw.prev_count, val);
+		perf_counter_update_userpage(counter);
+	}
+}
+
+/*
+ * Since limited counters don't respect the freeze conditions, we
+ * have to read them immediately after freezing or unfreezing the
+ * other counters.  We try to keep the values from the limited
+ * counters as consistent as possible by keeping the delay (in
+ * cycles and instructions) between freezing/unfreezing and reading
+ * the limited counters as small and consistent as possible.
+ * Therefore, if any limited counters are in use, we read them
+ * both, and always in the same order, to minimize variability,
+ * and do it inside the same asm that writes MMCR0.
+ */
+static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
+{
+	unsigned long pmc5, pmc6;
+
+	if (!cpuhw->n_limited) {
+		mtspr(SPRN_MMCR0, mmcr0);
+		return;
+	}
+
+	/*
+	 * Write MMCR0, then read PMC5 and PMC6 immediately.
+	 */
+	asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
+		     : "=&r" (pmc5), "=&r" (pmc6)
+		     : "r" (mmcr0), "i" (SPRN_MMCR0),
+		       "i" (SPRN_PMC5), "i" (SPRN_PMC6));
+
+	if (mmcr0 & MMCR0_FC)
+		freeze_limited_counters(cpuhw, pmc5, pmc6);
+	else
+		thaw_limited_counters(cpuhw, pmc5, pmc6);
+}
+
 /*
  * Disable all counters to prevent PMU interrupts and to allow
  * counters to be added or removed.
@@ -321,7 +424,7 @@ u64 hw_perf_save_disable(void)
 		 * executed and the PMU has frozen the counters
 		 * before we return.
 		 */
-		mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
+		write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
 		mb();
 	}
 	local_irq_restore(flags);
@@ -342,6 +445,8 @@ void hw_perf_restore(u64 disable)
 	unsigned long val;
 	s64 left;
 	unsigned int hwc_index[MAX_HWCOUNTERS];
+	int n_lim;
+	int idx;
 
 	if (disable)
 		return;
@@ -414,10 +519,18 @@ void hw_perf_restore(u64 disable)
 	/*
 	 * Initialize the PMCs for all the new and moved counters.
 	 */
+	cpuhw->n_limited = n_lim = 0;
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
 		if (counter->hw.idx)
 			continue;
+		idx = hwc_index[i] + 1;
+		if (is_limited_pmc(idx)) {
+			cpuhw->limited_counter[n_lim] = counter;
+			cpuhw->limited_hwidx[n_lim] = idx;
+			++n_lim;
+			continue;
+		}
 		val = 0;
 		if (counter->hw_event.irq_period) {
 			left = atomic64_read(&counter->hw.period_left);
@@ -425,15 +538,16 @@ void hw_perf_restore(u64 disable)
 				val = 0x80000000L - left;
 		}
 		atomic64_set(&counter->hw.prev_count, val);
-		counter->hw.idx = hwc_index[i] + 1;
-		write_pmc(counter->hw.idx, val);
+		counter->hw.idx = idx;
+		write_pmc(idx, val);
 		perf_counter_update_userpage(counter);
 	}
+	cpuhw->n_limited = n_lim;
 	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
 
  out_enable:
 	mb();
-	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+	write_mmcr0(cpuhw, cpuhw->mmcr[0]);
 
 	/*
 	 * Enable instruction sampling if necessary
@@ -448,7 +562,8 @@ void hw_perf_restore(u64 disable)
 }
 
 static int collect_events(struct perf_counter *group, int max_count,
-			  struct perf_counter *ctrs[], unsigned int *events)
+			  struct perf_counter *ctrs[], unsigned int *events,
+			  unsigned int *flags)
 {
 	int n = 0;
 	struct perf_counter *counter;
@@ -457,6 +572,7 @@ static int collect_events(struct perf_counter *group, int max_count,
 		if (n >= max_count)
 			return -1;
 		ctrs[n] = group;
+		flags[n] = group->hw.counter_base;
 		events[n++] = group->hw.config;
 	}
 	list_for_each_entry(counter, &group->sibling_list, list_entry) {
@@ -465,6 +581,7 @@ static int collect_events(struct perf_counter *group, int max_count,
 			if (n >= max_count)
 				return -1;
 			ctrs[n] = counter;
+			flags[n] = counter->hw.counter_base;
 			events[n++] = counter->hw.config;
 		}
 	}
@@ -497,12 +614,14 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
 	cpuhw = &__get_cpu_var(cpu_hw_counters);
 	n0 = cpuhw->n_counters;
 	n = collect_events(group_leader, ppmu->n_counter - n0,
-			   &cpuhw->counter[n0], &cpuhw->events[n0]);
+			   &cpuhw->counter[n0], &cpuhw->events[n0],
+			   &cpuhw->flags[n0]);
 	if (n < 0)
 		return -EAGAIN;
-	if (check_excludes(cpuhw->counter, n0, n))
+	if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n))
 		return -EAGAIN;
-	if (power_check_constraints(cpuhw->events, n + n0))
+	i = power_check_constraints(cpuhw->events, cpuhw->flags, n + n0);
+	if (i < 0)
 		return -EAGAIN;
 	cpuhw->n_counters = n0 + n;
 	cpuhw->n_added += n;
@@ -554,9 +673,10 @@ static int power_pmu_enable(struct perf_counter *counter)
 		goto out;
 	cpuhw->counter[n0] = counter;
 	cpuhw->events[n0] = counter->hw.config;
-	if (check_excludes(cpuhw->counter, n0, 1))
+	cpuhw->flags[n0] = counter->hw.counter_base;
+	if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1))
 		goto out;
-	if (power_check_constraints(cpuhw->events, n0 + 1))
+	if (power_check_constraints(cpuhw->events, cpuhw->flags, n0 + 1))
 		goto out;
 
 	counter->hw.config = cpuhw->events[n0];
@@ -592,12 +712,24 @@ static void power_pmu_disable(struct perf_counter *counter)
 				cpuhw->counter[i-1] = cpuhw->counter[i];
 			--cpuhw->n_counters;
 			ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
-			write_pmc(counter->hw.idx, 0);
-			counter->hw.idx = 0;
+			if (counter->hw.idx) {
+				write_pmc(counter->hw.idx, 0);
+				counter->hw.idx = 0;
+			}
 			perf_counter_update_userpage(counter);
 			break;
 		}
 	}
+	for (i = 0; i < cpuhw->n_limited; ++i)
+		if (counter == cpuhw->limited_counter[i])
+			break;
+	if (i < cpuhw->n_limited) {
+		while (++i < cpuhw->n_limited) {
+			cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
+			cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
+		}
+		--cpuhw->n_limited;
+	}
 	if (cpuhw->n_counters == 0) {
 		/* disable exceptions if no counters are running */
 		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
@@ -613,6 +745,61 @@ struct pmu power_pmu = {
 	.read		= power_pmu_read,
 };
 
+/*
+ * Return 1 if we might be able to put counter on a limited PMC,
+ * or 0 if not.
+ * A counter can only go on a limited PMC if it counts something
+ * that a limited PMC can count, doesn't require interrupts, and
+ * doesn't exclude any processor mode.
+ */
+static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev,
+				 unsigned int flags)
+{
+	int n;
+	unsigned int alt[MAX_EVENT_ALTERNATIVES];
+
+	if (counter->hw_event.exclude_user
+	    || counter->hw_event.exclude_kernel
+	    || counter->hw_event.exclude_hv
+	    || counter->hw_event.irq_period)
+		return 0;
+
+	if (ppmu->limited_pmc_event(ev))
+		return 1;
+
+	/*
+	 * The requested event isn't on a limited PMC already;
+	 * see if any alternative code goes on a limited PMC.
+	 */
+	if (!ppmu->get_alternatives)
+		return 0;
+
+	flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
+	n = ppmu->get_alternatives(ev, flags, alt);
+	if (n)
+		return alt[0];
+
+	return 0;
+}
+
+/*
+ * Find an alternative event that goes on a normal PMC, if possible,
+ * and return the event code, or 0 if there is no such alternative.
+ * (Note: event code 0 is "don't count" on all machines.)
+ */
+static unsigned long normal_pmc_alternative(unsigned long ev,
+					    unsigned long flags)
+{
+	unsigned int alt[MAX_EVENT_ALTERNATIVES];
+	int n;
+
+	flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
+	n = ppmu->get_alternatives(ev, flags, alt);
+	if (!n)
+		return 0;
+	return alt[0];
+}
+
 /* Number of perf_counters counting hardware events */
 static atomic_t num_counters;
 /* Used to avoid races in calling reserve/release_pmc_hardware */
@@ -633,9 +820,10 @@ static void hw_perf_counter_destroy(struct perf_counter *counter)
 
 const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 {
-	unsigned long ev;
+	unsigned long ev, flags;
 	struct perf_counter *ctrs[MAX_HWCOUNTERS];
 	unsigned int events[MAX_HWCOUNTERS];
+	unsigned int cflags[MAX_HWCOUNTERS];
 	int n;
 	int err;
 
@@ -661,7 +849,36 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 	 */
 	if (!firmware_has_feature(FW_FEATURE_LPAR))
 		counter->hw_event.exclude_hv = 0;
-	
+
+	/*
+	 * If this is a per-task counter, then we can use
+	 * PM_RUN_* events interchangeably with their non RUN_*
+	 * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
+	 * XXX we should check if the task is an idle task.
+	 */
+	flags = 0;
+	if (counter->ctx->task)
+		flags |= PPMU_ONLY_COUNT_RUN;
+
+	/*
+	 * If this machine has limited counters, check whether this
+	 * event could go on a limited counter.
+	 */
+	if (ppmu->limited_pmc5_6) {
+		if (can_go_on_limited_pmc(counter, ev, flags)) {
+			flags |= PPMU_LIMITED_PMC_OK;
+		} else if (ppmu->limited_pmc_event(ev)) {
+			/*
+			 * The requested event is on a limited PMC,
+			 * but we can't use a limited PMC; see if any
+			 * alternative goes on a normal PMC.
+			 */
+			ev = normal_pmc_alternative(ev, flags);
+			if (!ev)
+				return ERR_PTR(-EINVAL);
+		}
+	}
+
 	/*
 	 * If this is in a group, check if it can go on with all the
 	 * other hardware counters in the group.  We assume the counter
@@ -670,18 +887,20 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 	n = 0;
 	if (counter->group_leader != counter) {
 		n = collect_events(counter->group_leader, ppmu->n_counter - 1,
-				   ctrs, events);
+				   ctrs, events, cflags);
 		if (n < 0)
 			return ERR_PTR(-EINVAL);
 	}
 	events[n] = ev;
 	ctrs[n] = counter;
-	if (check_excludes(ctrs, n, 1))
+	cflags[n] = flags;
+	if (check_excludes(ctrs, cflags, n, 1))
 		return ERR_PTR(-EINVAL);
-	if (power_check_constraints(events, n + 1))
+	if (power_check_constraints(events, cflags, n + 1))
 		return ERR_PTR(-EINVAL);
 
 	counter->hw.config = events[n];
+	counter->hw.counter_base = cflags[n];
 	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
 
 	/*
@@ -763,6 +982,10 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	int found = 0;
 	int nmi;
 
+	if (cpuhw->n_limited)
+		freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
+					mfspr(SPRN_PMC6));
+
 	/*
 	 * If interrupts were soft-disabled when this PMU interrupt
 	 * occurred, treat it as an NMI.
@@ -775,6 +998,8 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
+		if (is_limited_pmc(counter->hw.idx))
+			continue;
 		val = read_pmc(counter->hw.idx);
 		if ((int)val < 0) {
 			/* counter has overflowed */
@@ -791,6 +1016,8 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	 */
 	if (!found) {
 		for (i = 0; i < ppmu->n_counter; ++i) {
+			if (is_limited_pmc(i + 1))
+				continue;
 			val = read_pmc(i + 1);
 			if ((int)val < 0)
 				write_pmc(i + 1, 0);
@@ -804,7 +1031,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	 * XXX might want to use MSR.PM to keep the counters frozen until
 	 * we get back out of this interrupt.
 	 */
-	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+	write_mmcr0(cpuhw, cpuhw->mmcr[0]);
 
 	if (nmi)
 		nmi_exit();
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 1407b19ab61..744a2756958 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -320,7 +320,8 @@ static unsigned int ppc_inst_cmpl[] = {
 	0x1001, 0x4001, 0x6001, 0x7001, 0x8001
 };
 
-static int p4_get_alternatives(unsigned int event, unsigned int alt[])
+static int p4_get_alternatives(unsigned int event, unsigned int flags,
+			       unsigned int alt[])
 {
 	int i, j, na;
 
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index 1222c8ea3c2..8154eaa2404 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -78,8 +78,8 @@
  * Layout of constraint bits:
  * 6666555555555544444444443333333333222222222211111111110000000000
  * 3210987654321098765432109876543210987654321098765432109876543210
- *             [  ><><>< ><> <><>[  >      <  ><  ><  ><  ><><><><>
- *             NC  G0G1G2 G3 T0T1 UC        B0  B1  B2  B3 P4P3P2P1
+ *             [  ><><>< ><> <><>[  >  <  ><  ><  ><  ><><><><><><>
+ *             NC  G0G1G2 G3 T0T1 UC    B0  B1  B2  B3 P6P5P4P3P2P1
  *
  * NC - number of counters
  *     51: NC error 0x0008_0000_0000_0000
@@ -105,18 +105,18 @@
  *     30: IDU|GRS events needed 0x00_4000_0000
  *
  * B0
- *     20-23: Byte 0 event source 0x00f0_0000
+ *     24-27: Byte 0 event source 0x0f00_0000
  *	      Encoding as for the event code
  *
  * B1, B2, B3
- *     16-19, 12-15, 8-11: Byte 1, 2, 3 event sources
+ *     20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
  *
- * P4
- *     7: P1 error 0x80
- *     6-7: Count of events needing PMC4
+ * P6
+ *     11: P6 error 0x800
+ *     10-11: Count of events needing PMC6
  *
- * P1..P3
- *     0-6: Count of events needing PMC1..PMC3
+ * P1..P5
+ *     0-9: Count of events needing PMC1..PMC5
  */
 
 static const int grsel_shift[8] = {
@@ -143,11 +143,13 @@ static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 
 	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
 	if (pmc) {
-		if (pmc > 4)
+		if (pmc > 6)
 			return -1;
 		sh = (pmc - 1) * 2;
 		mask |= 2 << sh;
 		value |= 1 << sh;
+		if (pmc >= 5 && !(event == 0x500009 || event == 0x600005))
+			return -1;
 	}
 	if (event & PM_BUSEVENT_MSK) {
 		unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
@@ -173,16 +175,26 @@ static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 			value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
 		}
 		/* Set byte lane select field */
-		mask  |= 0xfULL << (20 - 4 * byte);
-		value |= (u64)unit << (20 - 4 * byte);
+		mask  |= 0xfULL << (24 - 4 * byte);
+		value |= (u64)unit << (24 - 4 * byte);
+	}
+	if (pmc < 5) {
+		/* need a counter from PMC1-4 set */
+		mask  |= 0x8000000000000ull;
+		value |= 0x1000000000000ull;
 	}
-	mask  |= 0x8000000000000ull;
-	value |= 0x1000000000000ull;
 	*maskp = mask;
 	*valp = value;
 	return 0;
 }
 
+static int power5p_limited_pmc_event(unsigned int event)
+{
+	int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+
+	return pmc == 5 || pmc == 6;
+}
+
 #define MAX_ALT	3	/* at most 3 alternatives for any event */
 
 static const unsigned int event_alternatives[][MAX_ALT] = {
@@ -193,6 +205,7 @@ static const unsigned int event_alternatives[][MAX_ALT] = {
 	{ 0x410c7,  0x441084 },			/* PM_THRD_L2MISS_BOTH_CYC */
 	{ 0x800c4,  0xc20e0 },			/* PM_DTLB_MISS */
 	{ 0xc50c6,  0xc60e0 },			/* PM_MRK_DTLB_MISS */
+	{ 0x100005, 0x600005 },			/* PM_RUN_CYC */
 	{ 0x100009, 0x200009 },			/* PM_INST_CMPL */
 	{ 0x200015, 0x300015 },			/* PM_LSU_LMQ_SRQ_EMPTY_CYC */
 	{ 0x300009, 0x400009 },			/* PM_INST_DISP */
@@ -260,24 +273,85 @@ static int find_alternative_bdecode(unsigned int event)
 	return -1;
 }
 
-static int power5p_get_alternatives(unsigned int event, unsigned int alt[])
+static int power5p_get_alternatives(unsigned int event, unsigned int flags,
+				    unsigned int alt[])
 {
 	int i, j, ae, nalt = 1;
+	int nlim;
 
 	alt[0] = event;
 	nalt = 1;
+	nlim = power5p_limited_pmc_event(event);
 	i = find_alternative(event);
 	if (i >= 0) {
 		for (j = 0; j < MAX_ALT; ++j) {
 			ae = event_alternatives[i][j];
 			if (ae && ae != event)
 				alt[nalt++] = ae;
+			nlim += power5p_limited_pmc_event(ae);
 		}
 	} else {
 		ae = find_alternative_bdecode(event);
 		if (ae > 0)
 			alt[nalt++] = ae;
 	}
+
+	if (flags & PPMU_ONLY_COUNT_RUN) {
+		/*
+		 * We're only counting in RUN state,
+		 * so PM_CYC is equivalent to PM_RUN_CYC
+		 * and PM_INST_CMPL === PM_RUN_INST_CMPL.
+		 * This doesn't include alternatives that don't provide
+		 * any extra flexibility in assigning PMCs (e.g.
+		 * 0x100005 for PM_RUN_CYC vs. 0xf for PM_CYC).
+		 * Note that even with these additional alternatives
+		 * we never end up with more than 3 alternatives for any event.
+		 */
+		j = nalt;
+		for (i = 0; i < nalt; ++i) {
+			switch (alt[i]) {
+			case 0xf:	/* PM_CYC */
+				alt[j++] = 0x600005;	/* PM_RUN_CYC */
+				++nlim;
+				break;
+			case 0x600005:	/* PM_RUN_CYC */
+				alt[j++] = 0xf;
+				break;
+			case 0x100009:	/* PM_INST_CMPL */
+				alt[j++] = 0x500009;	/* PM_RUN_INST_CMPL */
+				++nlim;
+				break;
+			case 0x500009:	/* PM_RUN_INST_CMPL */
+				alt[j++] = 0x100009;	/* PM_INST_CMPL */
+				alt[j++] = 0x200009;
+				break;
+			}
+		}
+		nalt = j;
+	}
+
+	if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
+		/* remove the limited PMC events */
+		j = 0;
+		for (i = 0; i < nalt; ++i) {
+			if (!power5p_limited_pmc_event(alt[i])) {
+				alt[j] = alt[i];
+				++j;
+			}
+		}
+		nalt = j;
+	} else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
+		/* remove all but the limited PMC events */
+		j = 0;
+		for (i = 0; i < nalt; ++i) {
+			if (power5p_limited_pmc_event(alt[i])) {
+				alt[j] = alt[i];
+				++j;
+			}
+		}
+		nalt = j;
+	}
+
 	return nalt;
 }
 
@@ -390,7 +464,7 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev,
 	unsigned char unituse[16];
 	int ttmuse;
 
-	if (n_ev > 4)
+	if (n_ev > 6)
 		return -1;
 
 	/* First pass to count resource use */
@@ -399,7 +473,7 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev,
 	for (i = 0; i < n_ev; ++i) {
 		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
 		if (pmc) {
-			if (pmc > 4)
+			if (pmc > 6)
 				return -1;
 			if (pmc_inuse & (1 << (pmc - 1)))
 				return -1;
@@ -488,13 +562,16 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev,
 			if (pmc >= 4)
 				return -1;
 			pmc_inuse |= 1 << pmc;
-		} else {
+		} else if (pmc <= 4) {
 			/* Direct event */
 			--pmc;
 			if (isbus && (byte & 2) &&
 			    (psel == 8 || psel == 0x10 || psel == 0x28))
 				/* add events on higher-numbered bus */
 				mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
+		} else {
+			/* Instructions or run cycles on PMC5/6 */
+			--pmc;
 		}
 		if (isbus && unit == PM_GRS) {
 			bit = psel & 7;
@@ -538,7 +615,7 @@ static int power5p_generic_events[] = {
 };
 
 struct power_pmu power5p_pmu = {
-	.n_counter = 4,
+	.n_counter = 6,
 	.max_alternatives = MAX_ALT,
 	.add_fields = 0x7000000000055ull,
 	.test_adder = 0x3000040000000ull,
@@ -548,4 +625,6 @@ struct power_pmu power5p_pmu = {
 	.disable_pmc = power5p_disable_pmc,
 	.n_generic = ARRAY_SIZE(power5p_generic_events),
 	.generic_events = power5p_generic_events,
+	.limited_pmc5_6 = 1,
+	.limited_pmc_event = power5p_limited_pmc_event,
 };
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 116c4bb1809..6e667dc8647 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -269,7 +269,8 @@ static int find_alternative_bdecode(unsigned int event)
 	return -1;
 }
 
-static int power5_get_alternatives(unsigned int event, unsigned int alt[])
+static int power5_get_alternatives(unsigned int event, unsigned int flags,
+				   unsigned int alt[])
 {
 	int i, j, ae, nalt = 1;
 
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index fce1fc290a1..d44049f0ae2 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -182,7 +182,7 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
 	unsigned int ttmset = 0;
 	unsigned int pmc_inuse = 0;
 
-	if (n_ev > 4)
+	if (n_ev > 6)
 		return -1;
 	for (i = 0; i < n_ev; ++i) {
 		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
@@ -202,6 +202,8 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
 			for (pmc = 0; pmc < 4; ++pmc)
 				if (!(pmc_inuse & (1 << pmc)))
 					break;
+			if (pmc >= 4)
+				return -1;
 			pmc_inuse |= 1 << pmc;
 		}
 		hwc[i] = pmc;
@@ -240,7 +242,8 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
 		}
 		if (power6_marked_instr_event(event[i]))
 			mmcra |= MMCRA_SAMPLE_ENABLE;
-		mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
+		if (pmc < 4)
+			mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
 	}
 	mmcr[0] = 0;
 	if (pmc_inuse & 1)
@@ -256,19 +259,20 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
  * Layout of constraint bits:
  *
  *	0-1	add field: number of uses of PMC1 (max 1)
- *	2-3, 4-5, 6-7: ditto for PMC2, 3, 4
- *	8-10	select field: nest (subunit) event selector
+ *	2-3, 4-5, 6-7, 8-9, 10-11: ditto for PMC2, 3, 4, 5, 6
+ *	12-15	add field: number of uses of PMC1-4 (max 4)
  *	16-19	select field: unit on byte 0 of event bus
  *	20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
+ *	32-34	select field: nest (subunit) event selector
  */
 static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 {
-	int pmc, byte, sh;
-	unsigned int mask = 0, value = 0;
+	int pmc, byte, sh, subunit;
+	u64 mask = 0, value = 0;
 
 	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
 	if (pmc) {
-		if (pmc > 4)
+		if (pmc > 4 && !(event == 0x500009 || event == 0x600005))
 			return -1;
 		sh = (pmc - 1) * 2;
 		mask |= 2 << sh;
@@ -276,26 +280,38 @@ static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 	}
 	if (event & PM_BUSEVENT_MSK) {
 		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
-		sh = byte * 4;
+		sh = byte * 4 + (16 - PM_UNIT_SH);
 		mask |= PM_UNIT_MSKS << sh;
-		value |= (event & PM_UNIT_MSKS) << sh;
+		value |= (u64)(event & PM_UNIT_MSKS) << sh;
 		if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) {
-			mask |= PM_SUBUNIT_MSKS;
-			value |= event & PM_SUBUNIT_MSKS;
+			subunit = (event >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
+			mask  |= (u64)PM_SUBUNIT_MSK << 32;
+			value |= (u64)subunit << 32;
 		}
 	}
+	if (pmc <= 4) {
+		mask  |= 0x8000;	/* add field for count of PMC1-4 uses */
+		value |= 0x1000;
+	}
 	*maskp = mask;
 	*valp = value;
 	return 0;
 }
 
+static int p6_limited_pmc_event(unsigned int event)
+{
+	int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+
+	return pmc == 5 || pmc == 6;
+}
+
 #define MAX_ALT	4	/* at most 4 alternatives for any event */
 
 static const unsigned int event_alternatives[][MAX_ALT] = {
 	{ 0x0130e8, 0x2000f6, 0x3000fc },	/* PM_PTEG_RELOAD_VALID */
 	{ 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */
 	{ 0x080088, 0x200054, 0x3000f0 },	/* PM_ST_MISS_L1 */
-	{ 0x10000a, 0x2000f4 },			/* PM_RUN_CYC */
+	{ 0x10000a, 0x2000f4, 0x600005 },	/* PM_RUN_CYC */
 	{ 0x10000b, 0x2000f5 },			/* PM_RUN_COUNT */
 	{ 0x10000e, 0x400010 },			/* PM_PURR */
 	{ 0x100010, 0x4000f8 },			/* PM_FLUSH */
@@ -340,13 +356,15 @@ static int find_alternatives_list(unsigned int event)
 	return -1;
 }
 
-static int p6_get_alternatives(unsigned int event, unsigned int alt[])
+static int p6_get_alternatives(unsigned int event, unsigned int flags,
+			       unsigned int alt[])
 {
-	int i, j;
+	int i, j, nlim;
 	unsigned int aevent, psel, pmc;
 	unsigned int nalt = 1;
 
 	alt[0] = event;
+	nlim = p6_limited_pmc_event(event);
 
 	/* check the alternatives table */
 	i = find_alternatives_list(event);
@@ -358,6 +376,7 @@ static int p6_get_alternatives(unsigned int event, unsigned int alt[])
 				break;
 			if (aevent != event)
 				alt[nalt++] = aevent;
+			nlim += p6_limited_pmc_event(aevent);
 		}
 
 	} else {
@@ -375,13 +394,75 @@ static int p6_get_alternatives(unsigned int event, unsigned int alt[])
 				((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH);
 	}
 
+	if (flags & PPMU_ONLY_COUNT_RUN) {
+		/*
+		 * We're only counting in RUN state,
+		 * so PM_CYC is equivalent to PM_RUN_CYC,
+		 * PM_INST_CMPL === PM_RUN_INST_CMPL, PM_PURR === PM_RUN_PURR.
+		 * This doesn't include alternatives that don't provide
+		 * any extra flexibility in assigning PMCs (e.g.
+		 * 0x10000a for PM_RUN_CYC vs. 0x1e for PM_CYC).
+		 * Note that even with these additional alternatives
+		 * we never end up with more than 4 alternatives for any event.
+		 */
+		j = nalt;
+		for (i = 0; i < nalt; ++i) {
+			switch (alt[i]) {
+			case 0x1e:	/* PM_CYC */
+				alt[j++] = 0x600005;	/* PM_RUN_CYC */
+				++nlim;
+				break;
+			case 0x10000a:	/* PM_RUN_CYC */
+				alt[j++] = 0x1e;	/* PM_CYC */
+				break;
+			case 2:		/* PM_INST_CMPL */
+				alt[j++] = 0x500009;	/* PM_RUN_INST_CMPL */
+				++nlim;
+				break;
+			case 0x500009:	/* PM_RUN_INST_CMPL */
+				alt[j++] = 2;		/* PM_INST_CMPL */
+				break;
+			case 0x10000e:	/* PM_PURR */
+				alt[j++] = 0x4000f4;	/* PM_RUN_PURR */
+				break;
+			case 0x4000f4:	/* PM_RUN_PURR */
+				alt[j++] = 0x10000e;	/* PM_PURR */
+				break;
+			}
+		}
+		nalt = j;
+	}
+
+	if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
+		/* remove the limited PMC events */
+		j = 0;
+		for (i = 0; i < nalt; ++i) {
+			if (!p6_limited_pmc_event(alt[i])) {
+				alt[j] = alt[i];
+				++j;
+			}
+		}
+		nalt = j;
+	} else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
+		/* remove all but the limited PMC events */
+		j = 0;
+		for (i = 0; i < nalt; ++i) {
+			if (p6_limited_pmc_event(alt[i])) {
+				alt[j] = alt[i];
+				++j;
+			}
+		}
+		nalt = j;
+	}
+
 	return nalt;
 }
 
 static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
 {
 	/* Set PMCxSEL to 0 to disable PMCx */
-	mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
+	if (pmc <= 3)
+		mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
 }
 
 static int power6_generic_events[] = {
@@ -394,14 +475,16 @@ static int power6_generic_events[] = {
 };
 
 struct power_pmu power6_pmu = {
-	.n_counter = 4,
+	.n_counter = 6,
 	.max_alternatives = MAX_ALT,
-	.add_fields = 0x55,
-	.test_adder = 0,
+	.add_fields = 0x1555,
+	.test_adder = 0x3000,
 	.compute_mmcr = p6_compute_mmcr,
 	.get_constraint = p6_get_constraint,
 	.get_alternatives = p6_get_alternatives,
 	.disable_pmc = p6_disable_pmc,
 	.n_generic = ARRAY_SIZE(power6_generic_events),
 	.generic_events = power6_generic_events,
+	.limited_pmc5_6 = 1,
+	.limited_pmc_event = p6_limited_pmc_event,
 };
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index aed8ccd7c07..af2d1884058 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -243,7 +243,8 @@ static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 	return 0;
 }
 
-static int p970_get_alternatives(unsigned int event, unsigned int alt[])
+static int p970_get_alternatives(unsigned int event, unsigned int flags,
+				 unsigned int alt[])
 {
 	alt[0] = event;
 
-- 
cgit v1.2.3


From 43f6201a22dbf1c5abe1cab96b49bd56fa9df8f4 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 16:55:56 +0200
Subject: perf_counter, x86: rename bitmasks to ->used_mask and ->active_mask

Standardize on explicitly mentioning '_mask' in fields that
are not plain flags but masks. This avoids typos like:

       if (cpuc->used)

(which could easily slip through review unnoticed), while if a
typo looks like this:

       if (cpuc->used_mask)

it might get noticed during review.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1241016956-24648-1-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 47e563bfd4c..fc06f4d3264 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -28,8 +28,8 @@ static u64 perf_counter_mask __read_mostly;
 
 struct cpu_hw_counters {
 	struct perf_counter	*counters[X86_PMC_IDX_MAX];
-	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-	unsigned long		active[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	unsigned long		used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		interrupts;
 	u64			throttle_ctrl;
 	int			enabled;
@@ -332,7 +332,7 @@ static u64 amd_pmu_save_disable_all(void)
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
-		if (!test_bit(idx, cpuc->active))
+		if (!test_bit(idx, cpuc->active_mask))
 			continue;
 		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
 		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
@@ -373,7 +373,7 @@ static void amd_pmu_restore_all(u64 ctrl)
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
-		if (!test_bit(idx, cpuc->active))
+		if (!test_bit(idx, cpuc->active_mask))
 			continue;
 		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
 		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
@@ -576,7 +576,7 @@ static int x86_pmu_enable(struct perf_counter *counter)
 		 * Try to get the fixed counter, if that is already taken
 		 * then try to get a generic counter:
 		 */
-		if (test_and_set_bit(idx, cpuc->used))
+		if (test_and_set_bit(idx, cpuc->used_mask))
 			goto try_generic;
 
 		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
@@ -590,14 +590,14 @@ static int x86_pmu_enable(struct perf_counter *counter)
 	} else {
 		idx = hwc->idx;
 		/* Try to get the previous generic counter again */
-		if (test_and_set_bit(idx, cpuc->used)) {
+		if (test_and_set_bit(idx, cpuc->used_mask)) {
 try_generic:
-			idx = find_first_zero_bit(cpuc->used,
+			idx = find_first_zero_bit(cpuc->used_mask,
 						  x86_pmu.num_counters);
 			if (idx == x86_pmu.num_counters)
 				return -EAGAIN;
 
-			set_bit(idx, cpuc->used);
+			set_bit(idx, cpuc->used_mask);
 			hwc->idx = idx;
 		}
 		hwc->config_base  = x86_pmu.eventsel;
@@ -609,7 +609,7 @@ try_generic:
 	x86_pmu.disable(hwc, idx);
 
 	cpuc->counters[idx] = counter;
-	set_bit(idx, cpuc->active);
+	set_bit(idx, cpuc->active_mask);
 
 	x86_perf_counter_set_period(counter, hwc, idx);
 	x86_pmu.enable(hwc, idx);
@@ -643,7 +643,7 @@ void perf_counter_print_debug(void)
 		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
 		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
 	}
-	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
+	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used_mask);
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
@@ -677,7 +677,7 @@ static void x86_pmu_disable(struct perf_counter *counter)
 	 * Must be done before we disable, otherwise the nmi handler
 	 * could reenable again:
 	 */
-	clear_bit(idx, cpuc->active);
+	clear_bit(idx, cpuc->active_mask);
 	x86_pmu.disable(hwc, idx);
 
 	/*
@@ -692,7 +692,7 @@ static void x86_pmu_disable(struct perf_counter *counter)
 	 */
 	x86_perf_counter_update(counter, hwc, idx);
 	cpuc->counters[idx] = NULL;
-	clear_bit(idx, cpuc->used);
+	clear_bit(idx, cpuc->used_mask);
 }
 
 /*
@@ -741,7 +741,7 @@ again:
 		struct perf_counter *counter = cpuc->counters[bit];
 
 		clear_bit(bit, (unsigned long *) &status);
-		if (!test_bit(bit, cpuc->active))
+		if (!test_bit(bit, cpuc->active_mask))
 			continue;
 
 		intel_pmu_save_and_restart(counter);
@@ -779,7 +779,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 
 	++cpuc->interrupts;
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-		if (!test_bit(idx, cpuc->active))
+		if (!test_bit(idx, cpuc->active_mask))
 			continue;
 		counter = cpuc->counters[idx];
 		hwc = &counter->hw;
-- 
cgit v1.2.3


From 63a809a2dc53b91268dd915bbcbd425063893676 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 1 May 2009 12:23:17 +0200
Subject: perf_counter: fix nmi-watchdog interaction

When we don't have any perf-counters active, don't act like we know
what the NMI is for.

[ Impact: fix hard hang with nmi_watchdog=2 ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090501102533.109867793@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index fc06f4d3264..d4c0cc9d326 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -871,6 +871,9 @@ perf_counter_nmi_handler(struct notifier_block *self,
 	struct pt_regs *regs;
 	int ret;
 
+	if (!atomic_read(&num_counters))
+		return NOTIFY_DONE;
+
 	switch (cmd) {
 	case DIE_NMI:
 	case DIE_NMI_IPI:
-- 
cgit v1.2.3


From ba77813a2a22d631fe5bc0bf1ec0d11350544b70 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 4 May 2009 18:47:44 +0200
Subject: perf_counter: x86: fixup nmi_watchdog vs perf_counter boo-boo

Invert the atomic_inc_not_zero() test so that we will indeed detect the
first activation.

Also rename the global num_counters, since its easy to confuse with
x86_pmu.num_counters.

[ Impact: fix non-working perfcounters on AMD CPUs, cleanup ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241455664.7620.4938.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d4c0cc9d326..196b58f0444 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -171,7 +171,7 @@ again:
 	return new_raw_count;
 }
 
-static atomic_t num_counters;
+static atomic_t active_counters;
 static DEFINE_MUTEX(pmc_reserve_mutex);
 
 static bool reserve_pmc_hardware(void)
@@ -224,7 +224,7 @@ static void release_pmc_hardware(void)
 
 static void hw_perf_counter_destroy(struct perf_counter *counter)
 {
-	if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) {
+	if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
 		release_pmc_hardware();
 		mutex_unlock(&pmc_reserve_mutex);
 	}
@@ -248,12 +248,12 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		return -ENODEV;
 
 	err = 0;
-	if (atomic_inc_not_zero(&num_counters)) {
+	if (!atomic_inc_not_zero(&active_counters)) {
 		mutex_lock(&pmc_reserve_mutex);
-		if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware())
+		if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
 			err = -EBUSY;
 		else
-			atomic_inc(&num_counters);
+			atomic_inc(&active_counters);
 		mutex_unlock(&pmc_reserve_mutex);
 	}
 	if (err)
@@ -280,7 +280,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
 		hwc->nmi = 1;
 
-	hwc->irq_period		= hw_event->irq_period;
+	hwc->irq_period	= hw_event->irq_period;
 	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period)
 		hwc->irq_period = x86_pmu.max_period;
 
@@ -871,7 +871,7 @@ perf_counter_nmi_handler(struct notifier_block *self,
 	struct pt_regs *regs;
 	int ret;
 
-	if (!atomic_read(&num_counters))
+	if (!atomic_read(&active_counters))
 		return NOTIFY_DONE;
 
 	switch (cmd) {
-- 
cgit v1.2.3


From 066d7dea32c9bffe6decc0abe465627656cdd84e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 4 May 2009 19:04:09 +0200
Subject: perf_counter: fix fixed-purpose counter support on v2 Intel-PERFMON

Fixed-purpose counters stopped working in a simple 'perf stat ls' run:

   <not counted>  cache references
   <not counted>  cache misses

Due to:

  ef7b3e0: perf_counter, x86: remove vendor check in fixed_mode_idx()

Which made x86_pmu.num_counters_fixed matter: if it's nonzero, the
fixed-purpose counters are utilized.

But on v2 perfmon this field is not set (despite there being
fixed-purpose PMCs). So add a quirk to set the number of fixed-purpose
counters to at least three.

[ Impact: add quirk for three fixed-purpose counters on certain Intel CPUs ]

Cc: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-28-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 196b58f0444..a6878b0798e 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -962,7 +962,13 @@ static int intel_pmu_init(void)
 	x86_pmu = intel_pmu;
 	x86_pmu.version = version;
 	x86_pmu.num_counters = eax.split.num_counters;
-	x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
+
+	/*
+	 * Quirk: v2 perfmon does not report fixed-purpose counters, so
+	 * assume at least 3 counters:
+	 */
+	x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
+
 	x86_pmu.counter_bits = eax.split.bit_width;
 	x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
 
-- 
cgit v1.2.3


From 8823392360dc4992f87bf4c623834d315f297493 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Sun, 10 May 2009 10:53:05 +0200
Subject: perf_counter, x86: clean up throttling printk

s/PERFMON/perfcounters for perfcounter interrupt throttling warning.

'perfmon' is the CPU feature name that is Intel-only, while we do
throttling in a generic way.

[ Impact: cleanup ]

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a6878b0798e..da27419923a 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -814,7 +814,7 @@ void perf_counter_unthrottle(void)
 	cpuc = &__get_cpu_var(cpu_hw_counters);
 	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
 		if (printk_ratelimit())
-			printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
+			printk(KERN_WARNING "perfcounters: max interrupts exceeded!\n");
 		hw_perf_restore(cpuc->throttle_ctrl);
 	}
 	cpuc->interrupts = 0;
-- 
cgit v1.2.3


From 5bb9efe33ea4001a17ab98186a40a134a3061d67 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 May 2009 08:12:51 +0200
Subject: perf_counter: fix print debug irq disable

inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage.
bash/15802 [HC0[0]:SC0[0]:HE1:SE1] takes:
 (sysrq_key_table_lock){?.....},

Don't unconditionally enable interrupts in the perf_counter_print_debug()
path.

[ Impact: fix potential deadlock pointed out by lockdep ]

LKML-Reference: <new-submission>
Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 arch/x86/kernel/cpu/perf_counter.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index da27419923a..f7772ff7936 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -621,12 +621,13 @@ void perf_counter_print_debug(void)
 {
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
 	struct cpu_hw_counters *cpuc;
+	unsigned long flags;
 	int cpu, idx;
 
 	if (!x86_pmu.num_counters)
 		return;
 
-	local_irq_disable();
+	local_irq_save(flags);
 
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
@@ -664,7 +665,7 @@ void perf_counter_print_debug(void)
 		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
 			cpu, idx, pmc_count);
 	}
-	local_irq_enable();
+	local_irq_restore(flags);
 }
 
 static void x86_pmu_disable(struct perf_counter *counter)
-- 
cgit v1.2.3


From ec3232bdf8518bea8410f0027f870b24d3aa8753 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 May 2009 09:45:19 +0200
Subject: perf_counter: x86: More accurate counter update

Take the counter width into account instead of assuming 32 bits.

In particular Nehalem has 44 bit wide counters, and all
arithmetics should happen on a 44-bit signed integer basis.

[ Impact: fix rare event imprecision, warning message on Nehalem ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index f7772ff7936..3a92a2b2a80 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -138,7 +138,9 @@ static u64
 x86_perf_counter_update(struct perf_counter *counter,
 			struct hw_perf_counter *hwc, int idx)
 {
-	u64 prev_raw_count, new_raw_count, delta;
+	int shift = 64 - x86_pmu.counter_bits;
+	u64 prev_raw_count, new_raw_count;
+	s64 delta;
 
 	/*
 	 * Careful: an NMI might modify the previous counter value.
@@ -161,9 +163,10 @@ again:
 	 * (counter-)time and add that to the generic counter.
 	 *
 	 * Careful, not all hw sign-extends above the physical width
-	 * of the count, so we do that by clipping the delta to 32 bits:
+	 * of the count.
 	 */
-	delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
+	delta = (new_raw_count << shift) - (prev_raw_count << shift);
+	delta >>= shift;
 
 	atomic64_add(delta, &counter->count);
 	atomic64_sub(delta, &hwc->period_left);
-- 
cgit v1.2.3


From f5a5a2f6e69e88647ae12da39f0ff3a510bcf0a6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 13 May 2009 12:54:01 +0200
Subject: perf_counter: x86: Fix throttling

If counters are disabled globally when a perfcounter IRQ/NMI hits,
and if we throttle in that case, we'll promote the '0' value to
the next lapic IRQ and disable all perfcounters at that point,
permanently ...

Fix it.

[ Impact: fix hung perfcounters under load ]

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3a92a2b2a80..88ae8cebf3c 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -765,8 +765,13 @@ out:
 	/*
 	 * Restore - do not reenable when global enable is off or throttled:
 	 */
-	if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
-		intel_pmu_restore_all(cpuc->throttle_ctrl);
+	if (cpuc->throttle_ctrl) {
+		if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) {
+			intel_pmu_restore_all(cpuc->throttle_ctrl);
+		} else {
+			pr_info("CPU#%d: perfcounters: max interrupt rate exceeded! Throttle on.\n", smp_processor_id());
+		}
+	}
 
 	return ret;
 }
@@ -817,11 +822,16 @@ void perf_counter_unthrottle(void)
 
 	cpuc = &__get_cpu_var(cpu_hw_counters);
 	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
-		if (printk_ratelimit())
-			printk(KERN_WARNING "perfcounters: max interrupts exceeded!\n");
+		pr_info("CPU#%d: perfcounters: throttle off.\n", smp_processor_id());
+
+		/*
+		 * Clear them before re-enabling irqs/NMIs again:
+		 */
+		cpuc->interrupts = 0;
 		hw_perf_restore(cpuc->throttle_ctrl);
+	} else {
+		cpuc->interrupts = 0;
 	}
-	cpuc->interrupts = 0;
 }
 
 void smp_perf_counter_interrupt(struct pt_regs *regs)
-- 
cgit v1.2.3


From a026dfecc035f213c1cfa0bf6407ce3155f6a9df Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 May 2009 10:02:57 +0200
Subject: perf_counter: x86: Allow unpriviliged use of NMIs

Apply sysctl_perf_counter_priv to NMIs. Also, fail the counter
creation instead of silently down-grading to regular interrupts.

[ Impact: allow wider perf-counter usage ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 88ae8cebf3c..c19e927b697 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -280,8 +280,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * If privileged enough, allow NMI events:
 	 */
 	hwc->nmi = 0;
-	if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
+	if (hw_event->nmi) {
+		if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
+			return -EACCES;
 		hwc->nmi = 1;
+	}
 
 	hwc->irq_period	= hw_event->irq_period;
 	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period)
-- 
cgit v1.2.3


From 962bf7a66edca4d36a730a38ff8410a67f560e40 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 May 2009 13:21:36 +0200
Subject: perf_counter: x86: Fix up the amd NMI/INT throttle

perf_counter_unthrottle() restores throttle_ctrl, buts its never set.
Also, we fail to disable all counters when throttling.

[ Impact: fix rare stuck perf-counters when they are throttled ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 38 ++++++++++++++++++++++++++------------
 1 file changed, 26 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index c19e927b697..7601c014f8f 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -334,6 +334,8 @@ static u64 amd_pmu_save_disable_all(void)
 	 * right thing.
 	 */
 	barrier();
+	if (!enabled)
+		goto out;
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
@@ -347,6 +349,7 @@ static u64 amd_pmu_save_disable_all(void)
 		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
 	}
 
+out:
 	return enabled;
 }
 
@@ -787,32 +790,43 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	int handled = 0;
 	struct perf_counter *counter;
 	struct hw_perf_counter *hwc;
-	int idx;
+	int idx, throttle = 0;
+
+	cpuc->throttle_ctrl = cpuc->enabled;
+	cpuc->enabled = 0;
+	barrier();
+
+	if (cpuc->throttle_ctrl) {
+		if (++cpuc->interrupts >= PERFMON_MAX_INTERRUPTS)
+			throttle = 1;
+	}
 
-	++cpuc->interrupts;
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		int disable = 0;
+
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
+
 		counter = cpuc->counters[idx];
 		hwc = &counter->hw;
 		val = x86_perf_counter_update(counter, hwc, idx);
 		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
-			continue;
+			goto next;
+
 		/* counter overflow */
 		x86_perf_counter_set_period(counter, hwc, idx);
 		handled = 1;
 		inc_irq_stat(apic_perf_irqs);
-		if (perf_counter_overflow(counter, nmi, regs, 0))
-			amd_pmu_disable_counter(hwc, idx);
-		else if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS)
-			/*
-			 * do not reenable when throttled, but reload
-			 * the register
-			 */
+		disable = perf_counter_overflow(counter, nmi, regs, 0);
+
+next:
+		if (disable || throttle)
 			amd_pmu_disable_counter(hwc, idx);
-		else if (counter->state == PERF_COUNTER_STATE_ACTIVE)
-			amd_pmu_enable_counter(hwc, idx);
 	}
+
+	if (cpuc->throttle_ctrl && !throttle)
+		cpuc->enabled = 1;
+
 	return handled;
 }
 
-- 
cgit v1.2.3


From 9e35ad388bea89f7d6f375af4c0ae98803688666 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 May 2009 16:21:38 +0200
Subject: perf_counter: Rework the perf counter disable/enable

The current disable/enable mechanism is:

	token = hw_perf_save_disable();
	...
	/* do bits */
	...
	hw_perf_restore(token);

This works well, provided that the use nests properly. Except we don't.

x86 NMI/INT throttling has non-nested use of this, breaking things. Therefore
provide a reference counter disable/enable interface, where the first disable
disables the hardware, and the last enable enables the hardware again.

[ Impact: refactor, simplify the PMU disable/enable logic ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c |  24 ++++----
 arch/x86/kernel/cpu/perf_counter.c | 113 ++++++++++++++-----------------------
 2 files changed, 54 insertions(+), 83 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 15cdc8e6722..bb1b463c136 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -386,7 +386,7 @@ static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
  * Disable all counters to prevent PMU interrupts and to allow
  * counters to be added or removed.
  */
-u64 hw_perf_save_disable(void)
+void hw_perf_disable(void)
 {
 	struct cpu_hw_counters *cpuhw;
 	unsigned long ret;
@@ -428,7 +428,6 @@ u64 hw_perf_save_disable(void)
 		mb();
 	}
 	local_irq_restore(flags);
-	return ret;
 }
 
 /*
@@ -436,7 +435,7 @@ u64 hw_perf_save_disable(void)
  * If we were previously disabled and counters were added, then
  * put the new config on the PMU.
  */
-void hw_perf_restore(u64 disable)
+void hw_perf_enable(void)
 {
 	struct perf_counter *counter;
 	struct cpu_hw_counters *cpuhw;
@@ -448,9 +447,12 @@ void hw_perf_restore(u64 disable)
 	int n_lim;
 	int idx;
 
-	if (disable)
-		return;
 	local_irq_save(flags);
+	if (!cpuhw->disabled) {
+		local_irq_restore(flags);
+		return;
+	}
+
 	cpuhw = &__get_cpu_var(cpu_hw_counters);
 	cpuhw->disabled = 0;
 
@@ -649,19 +651,18 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
 /*
  * Add a counter to the PMU.
  * If all counters are not already frozen, then we disable and
- * re-enable the PMU in order to get hw_perf_restore to do the
+ * re-enable the PMU in order to get hw_perf_enable to do the
  * actual work of reconfiguring the PMU.
  */
 static int power_pmu_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuhw;
 	unsigned long flags;
-	u64 pmudis;
 	int n0;
 	int ret = -EAGAIN;
 
 	local_irq_save(flags);
-	pmudis = hw_perf_save_disable();
+	perf_disable();
 
 	/*
 	 * Add the counter to the list (if there is room)
@@ -685,7 +686,7 @@ static int power_pmu_enable(struct perf_counter *counter)
 
 	ret = 0;
  out:
-	hw_perf_restore(pmudis);
+	perf_enable();
 	local_irq_restore(flags);
 	return ret;
 }
@@ -697,11 +698,10 @@ static void power_pmu_disable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuhw;
 	long i;
-	u64 pmudis;
 	unsigned long flags;
 
 	local_irq_save(flags);
-	pmudis = hw_perf_save_disable();
+	perf_disable();
 
 	power_pmu_read(counter);
 
@@ -735,7 +735,7 @@ static void power_pmu_disable(struct perf_counter *counter)
 		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
 	}
 
-	hw_perf_restore(pmudis);
+	perf_enable();
 	local_irq_restore(flags);
 }
 
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 7601c014f8f..313638cecbb 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -31,7 +31,6 @@ struct cpu_hw_counters {
 	unsigned long		used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		interrupts;
-	u64			throttle_ctrl;
 	int			enabled;
 };
 
@@ -42,8 +41,8 @@ struct x86_pmu {
 	const char	*name;
 	int		version;
 	int		(*handle_irq)(struct pt_regs *, int);
-	u64		(*save_disable_all)(void);
-	void		(*restore_all)(u64);
+	void		(*disable_all)(void);
+	void		(*enable_all)(void);
 	void		(*enable)(struct hw_perf_counter *, int);
 	void		(*disable)(struct hw_perf_counter *, int);
 	unsigned	eventsel;
@@ -56,6 +55,7 @@ struct x86_pmu {
 	int		counter_bits;
 	u64		counter_mask;
 	u64		max_period;
+	u64		intel_ctrl;
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -311,22 +311,19 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	return 0;
 }
 
-static u64 intel_pmu_save_disable_all(void)
+static void intel_pmu_disable_all(void)
 {
-	u64 ctrl;
-
-	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-
-	return ctrl;
 }
 
-static u64 amd_pmu_save_disable_all(void)
+static void amd_pmu_disable_all(void)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-	int enabled, idx;
+	int idx;
+
+	if (!cpuc->enabled)
+		return;
 
-	enabled = cpuc->enabled;
 	cpuc->enabled = 0;
 	/*
 	 * ensure we write the disable before we start disabling the
@@ -334,8 +331,6 @@ static u64 amd_pmu_save_disable_all(void)
 	 * right thing.
 	 */
 	barrier();
-	if (!enabled)
-		goto out;
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
@@ -348,37 +343,31 @@ static u64 amd_pmu_save_disable_all(void)
 		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
 		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
 	}
-
-out:
-	return enabled;
 }
 
-u64 hw_perf_save_disable(void)
+void hw_perf_disable(void)
 {
 	if (!x86_pmu_initialized())
-		return 0;
-	return x86_pmu.save_disable_all();
+		return;
+	return x86_pmu.disable_all();
 }
-/*
- * Exported because of ACPI idle
- */
-EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
-static void intel_pmu_restore_all(u64 ctrl)
+static void intel_pmu_enable_all(void)
 {
-	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
 }
 
-static void amd_pmu_restore_all(u64 ctrl)
+static void amd_pmu_enable_all(void)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	int idx;
 
-	cpuc->enabled = ctrl;
-	barrier();
-	if (!ctrl)
+	if (cpuc->enabled)
 		return;
 
+	cpuc->enabled = 1;
+	barrier();
+
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
@@ -392,16 +381,12 @@ static void amd_pmu_restore_all(u64 ctrl)
 	}
 }
 
-void hw_perf_restore(u64 ctrl)
+void hw_perf_enable(void)
 {
 	if (!x86_pmu_initialized())
 		return;
-	x86_pmu.restore_all(ctrl);
+	x86_pmu.enable_all();
 }
-/*
- * Exported because of ACPI idle
- */
-EXPORT_SYMBOL_GPL(hw_perf_restore);
 
 static inline u64 intel_pmu_get_status(void)
 {
@@ -735,15 +720,14 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	int bit, cpu = smp_processor_id();
 	u64 ack, status;
 	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
-	int ret = 0;
-
-	cpuc->throttle_ctrl = intel_pmu_save_disable_all();
 
+	perf_disable();
 	status = intel_pmu_get_status();
-	if (!status)
-		goto out;
+	if (!status) {
+		perf_enable();
+		return 0;
+	}
 
-	ret = 1;
 again:
 	inc_irq_stat(apic_perf_irqs);
 	ack = status;
@@ -767,19 +751,11 @@ again:
 	status = intel_pmu_get_status();
 	if (status)
 		goto again;
-out:
-	/*
-	 * Restore - do not reenable when global enable is off or throttled:
-	 */
-	if (cpuc->throttle_ctrl) {
-		if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) {
-			intel_pmu_restore_all(cpuc->throttle_ctrl);
-		} else {
-			pr_info("CPU#%d: perfcounters: max interrupt rate exceeded! Throttle on.\n", smp_processor_id());
-		}
-	}
 
-	return ret;
+	if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS)
+		perf_enable();
+
+	return 1;
 }
 
 static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
@@ -792,13 +768,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	struct hw_perf_counter *hwc;
 	int idx, throttle = 0;
 
-	cpuc->throttle_ctrl = cpuc->enabled;
-	cpuc->enabled = 0;
-	barrier();
-
-	if (cpuc->throttle_ctrl) {
-		if (++cpuc->interrupts >= PERFMON_MAX_INTERRUPTS)
-			throttle = 1;
+	if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) {
+		throttle = 1;
+		__perf_disable();
+		cpuc->enabled = 0;
+		barrier();
 	}
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
@@ -824,9 +798,6 @@ next:
 			amd_pmu_disable_counter(hwc, idx);
 	}
 
-	if (cpuc->throttle_ctrl && !throttle)
-		cpuc->enabled = 1;
-
 	return handled;
 }
 
@@ -839,13 +810,11 @@ void perf_counter_unthrottle(void)
 
 	cpuc = &__get_cpu_var(cpu_hw_counters);
 	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
-		pr_info("CPU#%d: perfcounters: throttle off.\n", smp_processor_id());
-
 		/*
 		 * Clear them before re-enabling irqs/NMIs again:
 		 */
 		cpuc->interrupts = 0;
-		hw_perf_restore(cpuc->throttle_ctrl);
+		perf_enable();
 	} else {
 		cpuc->interrupts = 0;
 	}
@@ -931,8 +900,8 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 static struct x86_pmu intel_pmu = {
 	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
-	.save_disable_all	= intel_pmu_save_disable_all,
-	.restore_all		= intel_pmu_restore_all,
+	.disable_all		= intel_pmu_disable_all,
+	.enable_all		= intel_pmu_enable_all,
 	.enable			= intel_pmu_enable_counter,
 	.disable		= intel_pmu_disable_counter,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
@@ -951,8 +920,8 @@ static struct x86_pmu intel_pmu = {
 static struct x86_pmu amd_pmu = {
 	.name			= "AMD",
 	.handle_irq		= amd_pmu_handle_irq,
-	.save_disable_all	= amd_pmu_save_disable_all,
-	.restore_all		= amd_pmu_restore_all,
+	.disable_all		= amd_pmu_disable_all,
+	.enable_all		= amd_pmu_enable_all,
 	.enable			= amd_pmu_enable_counter,
 	.disable		= amd_pmu_disable_counter,
 	.eventsel		= MSR_K7_EVNTSEL0,
@@ -1003,6 +972,8 @@ static int intel_pmu_init(void)
 	x86_pmu.counter_bits = eax.split.bit_width;
 	x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
 
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From a4016a79fcbd139e7378944c0d86a39fdbc70ecc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 14 May 2009 14:52:17 +0200
Subject: perf_counter: x86: Robustify interrupt handling

Two consecutive NMIs could daze and confuse the machine when the
first would handle the overflow of both counters.

[ Impact: fix false-positive syslog messages under multi-session profiling ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 313638cecbb..1dcf67057f1 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -783,6 +783,10 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 
 		counter = cpuc->counters[idx];
 		hwc = &counter->hw;
+
+		if (counter->hw_event.nmi != nmi)
+			goto next;
+
 		val = x86_perf_counter_update(counter, hwc, idx);
 		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
 			goto next;
@@ -869,7 +873,6 @@ perf_counter_nmi_handler(struct notifier_block *self,
 {
 	struct die_args *args = __args;
 	struct pt_regs *regs;
-	int ret;
 
 	if (!atomic_read(&active_counters))
 		return NOTIFY_DONE;
@@ -886,9 +889,16 @@ perf_counter_nmi_handler(struct notifier_block *self,
 	regs = args->regs;
 
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	ret = x86_pmu.handle_irq(regs, 1);
+	/*
+	 * Can't rely on the handled return value to say it was our NMI, two
+	 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
+	 *
+	 * If the first NMI handles both, the latter will be empty and daze
+	 * the CPU.
+	 */
+	x86_pmu.handle_irq(regs, 1);
 
-	return ret ? NOTIFY_STOP : NOTIFY_OK;
+	return NOTIFY_STOP;
 }
 
 static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
-- 
cgit v1.2.3


From 1c80f4b598d9b075a2a0be694e28be93a6702bcc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 15 May 2009 08:25:22 +0200
Subject: perf_counter: x86: Disallow interval of 1

On certain CPUs i have observed a stuck PMU if interval was set to
1 and NMIs were used. The PMU had PMC0 set in MSR_CORE_PERF_GLOBAL_STATUS,
but it was not possible to ack it via MSR_CORE_PERF_GLOBAL_OVF_CTRL,
and the NMI loop got stuck infinitely.

[ Impact: fix rare hangs during high perfcounter load ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 1dcf67057f1..46a82d1e4cb 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -473,6 +473,11 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 		left += period;
 		atomic64_set(&hwc->period_left, left);
 	}
+	/*
+	 * Quirk: certain CPUs dont like it if just 1 event is left:
+	 */
+	if (unlikely(left < 2))
+		left = 2;
 
 	per_cpu(prev_left[idx], smp_processor_id()) = left;
 
-- 
cgit v1.2.3


From 9029a5e3801f1cc7cdaab80169d82427acf928d8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 15 May 2009 08:26:20 +0200
Subject: perf_counter: x86: Protect against infinite loops in
 intel_pmu_handle_irq()

intel_pmu_handle_irq() can lock up in an infinite loop if the hardware
does not allow the acking of irqs. Alas, this happened in testing so
make this robust and emit a warning if it happens in the future.

Also, clean up the IRQ handlers a bit.

[ Impact: improve perfcounter irq/nmi handling robustness ]

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 46a82d1e4cb..5a7f718eb1e 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -722,9 +722,13 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter)
  */
 static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 {
-	int bit, cpu = smp_processor_id();
+	struct cpu_hw_counters *cpuc;
+	struct cpu_hw_counters;
+	int bit, cpu, loops;
 	u64 ack, status;
-	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+	cpu = smp_processor_id();
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
 	perf_disable();
 	status = intel_pmu_get_status();
@@ -733,7 +737,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 		return 0;
 	}
 
+	loops = 0;
 again:
+	if (++loops > 100) {
+		WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
+		return 1;
+	}
+
 	inc_irq_stat(apic_perf_irqs);
 	ack = status;
 	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
@@ -765,13 +775,14 @@ again:
 
 static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 {
-	int cpu = smp_processor_id();
-	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
-	u64 val;
-	int handled = 0;
+	int cpu, idx, throttle = 0, handled = 0;
+	struct cpu_hw_counters *cpuc;
 	struct perf_counter *counter;
 	struct hw_perf_counter *hwc;
-	int idx, throttle = 0;
+	u64 val;
+
+	cpu = smp_processor_id();
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
 	if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) {
 		throttle = 1;
-- 
cgit v1.2.3


From 60db5e09c13109b13830cc9dcae688003fd39e79 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 May 2009 15:19:28 +0200
Subject: perf_counter: frequency based adaptive irq_period

Instead of specifying the irq_period for a counter, provide a target interrupt
frequency and dynamically adapt the irq_period to match this frequency.

[ Impact: new perf-counter attribute/feature ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <20090515132018.646195868@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 13 ++++++-------
 arch/x86/kernel/cpu/perf_counter.c |  9 +++------
 2 files changed, 9 insertions(+), 13 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index bb1b463c136..db8d5cafc15 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -534,7 +534,7 @@ void hw_perf_enable(void)
 			continue;
 		}
 		val = 0;
-		if (counter->hw_event.irq_period) {
+		if (counter->hw.irq_period) {
 			left = atomic64_read(&counter->hw.period_left);
 			if (left < 0x80000000L)
 				val = 0x80000000L - left;
@@ -829,8 +829,6 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	if (!ppmu)
 		return ERR_PTR(-ENXIO);
-	if ((s64)counter->hw_event.irq_period < 0)
-		return ERR_PTR(-EINVAL);
 	if (!perf_event_raw(&counter->hw_event)) {
 		ev = perf_event_id(&counter->hw_event);
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
@@ -901,7 +899,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	counter->hw.config = events[n];
 	counter->hw.counter_base = cflags[n];
-	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
+	atomic64_set(&counter->hw.period_left, counter->hw.irq_period);
 
 	/*
 	 * See if we need to reserve the PMU.
@@ -934,6 +932,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 static void record_and_restart(struct perf_counter *counter, long val,
 			       struct pt_regs *regs, int nmi)
 {
+	u64 period = counter->hw.irq_period;
 	s64 prev, delta, left;
 	int record = 0;
 
@@ -948,11 +947,11 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 */
 	val = 0;
 	left = atomic64_read(&counter->hw.period_left) - delta;
-	if (counter->hw_event.irq_period) {
+	if (period) {
 		if (left <= 0) {
-			left += counter->hw_event.irq_period;
+			left += period;
 			if (left <= 0)
-				left = counter->hw_event.irq_period;
+				left = period;
 			record = 1;
 		}
 		if (left < 0x80000000L)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 5a7f718eb1e..886dcf334bc 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -286,11 +286,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->nmi = 1;
 	}
 
-	hwc->irq_period	= hw_event->irq_period;
-	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period)
-		hwc->irq_period = x86_pmu.max_period;
-
-	atomic64_set(&hwc->period_left, hwc->irq_period);
+	atomic64_set(&hwc->period_left,
+			min(x86_pmu.max_period, hwc->irq_period));
 
 	/*
 	 * Raw event type provide the config in the event structure
@@ -458,7 +455,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 			     struct hw_perf_counter *hwc, int idx)
 {
 	s64 left = atomic64_read(&hwc->period_left);
-	s64 period = hwc->irq_period;
+	s64 period = min(x86_pmu.max_period, hwc->irq_period);
 	int err;
 
 	/*
-- 
cgit v1.2.3


From ef923214a4816c289e4af2d67a9ebb1a31e4ac61 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 14 May 2009 13:29:14 +1000
Subject: perf_counter: powerpc: use u64 for event codes internally

Although the perf_counter API allows 63-bit raw event codes,
internally in the powerpc back-end we had been using 32-bit
event codes.  This expands them to 64 bits so that we can add
bits for specifying threshold start/stop events and instruction
sampling modes later.

This also corrects the return value of can_go_on_limited_pmc;
we were returning an event code rather than just a 0/1 value in
some circumstances. That didn't particularly matter while event
codes were 32-bit, but now that event codes are 64-bit it
might, so this fixes it.

[ Impact: extend PowerPC perfcounter interfaces from u32 to u64 ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <18955.36874.472452.353104@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/include/asm/perf_counter.h | 10 +++++-----
 arch/powerpc/kernel/perf_counter.c      | 26 ++++++++++++--------------
 arch/powerpc/kernel/power4-pmu.c        |  9 ++++-----
 arch/powerpc/kernel/power5+-pmu.c       | 14 +++++++-------
 arch/powerpc/kernel/power5-pmu.c        | 16 ++++++++--------
 arch/powerpc/kernel/power6-pmu.c        | 16 ++++++++--------
 arch/powerpc/kernel/ppc970-pmu.c        |  9 ++++-----
 7 files changed, 48 insertions(+), 52 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index 56d66c38143..ceea76a48e3 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -23,13 +23,13 @@ struct power_pmu {
 	int	max_alternatives;
 	u64	add_fields;
 	u64	test_adder;
-	int	(*compute_mmcr)(unsigned int events[], int n_ev,
+	int	(*compute_mmcr)(u64 events[], int n_ev,
 				unsigned int hwc[], u64 mmcr[]);
-	int	(*get_constraint)(unsigned int event, u64 *mskp, u64 *valp);
-	int	(*get_alternatives)(unsigned int event, unsigned int flags,
-				    unsigned int alt[]);
+	int	(*get_constraint)(u64 event, u64 *mskp, u64 *valp);
+	int	(*get_alternatives)(u64 event, unsigned int flags,
+				    u64 alt[]);
 	void	(*disable_pmc)(unsigned int pmc, u64 mmcr[]);
-	int	(*limited_pmc_event)(unsigned int event);
+	int	(*limited_pmc_event)(u64 event);
 	int	limited_pmc5_6;	/* PMC5 and PMC6 have limited function */
 	int	n_generic;
 	int	*generic_events;
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index db8d5cafc15..8d4cafc84b8 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -26,7 +26,7 @@ struct cpu_hw_counters {
 	int n_limited;
 	u8  pmcs_enabled;
 	struct perf_counter *counter[MAX_HWCOUNTERS];
-	unsigned int events[MAX_HWCOUNTERS];
+	u64 events[MAX_HWCOUNTERS];
 	unsigned int flags[MAX_HWCOUNTERS];
 	u64 mmcr[3];
 	struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS];
@@ -131,11 +131,11 @@ static void write_pmc(int idx, unsigned long val)
  * and see if any combination of alternative codes is feasible.
  * The feasible set is returned in event[].
  */
-static int power_check_constraints(unsigned int event[], unsigned int cflags[],
+static int power_check_constraints(u64 event[], unsigned int cflags[],
 				   int n_ev)
 {
 	u64 mask, value, nv;
-	unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+	u64 alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
 	u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
 	u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
 	u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
@@ -564,7 +564,7 @@ void hw_perf_enable(void)
 }
 
 static int collect_events(struct perf_counter *group, int max_count,
-			  struct perf_counter *ctrs[], unsigned int *events,
+			  struct perf_counter *ctrs[], u64 *events,
 			  unsigned int *flags)
 {
 	int n = 0;
@@ -752,11 +752,11 @@ struct pmu power_pmu = {
  * that a limited PMC can count, doesn't require interrupts, and
  * doesn't exclude any processor mode.
  */
-static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev,
+static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev,
 				 unsigned int flags)
 {
 	int n;
-	unsigned int alt[MAX_EVENT_ALTERNATIVES];
+	u64 alt[MAX_EVENT_ALTERNATIVES];
 
 	if (counter->hw_event.exclude_user
 	    || counter->hw_event.exclude_kernel
@@ -776,10 +776,8 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev,
 
 	flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
 	n = ppmu->get_alternatives(ev, flags, alt);
-	if (n)
-		return alt[0];
 
-	return 0;
+	return n > 0;
 }
 
 /*
@@ -787,10 +785,9 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev,
  * and return the event code, or 0 if there is no such alternative.
  * (Note: event code 0 is "don't count" on all machines.)
  */
-static unsigned long normal_pmc_alternative(unsigned long ev,
-					    unsigned long flags)
+static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
 {
-	unsigned int alt[MAX_EVENT_ALTERNATIVES];
+	u64 alt[MAX_EVENT_ALTERNATIVES];
 	int n;
 
 	flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
@@ -820,9 +817,10 @@ static void hw_perf_counter_destroy(struct perf_counter *counter)
 
 const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 {
-	unsigned long ev, flags;
+	u64 ev;
+	unsigned long flags;
 	struct perf_counter *ctrs[MAX_HWCOUNTERS];
-	unsigned int events[MAX_HWCOUNTERS];
+	u64 events[MAX_HWCOUNTERS];
 	unsigned int cflags[MAX_HWCOUNTERS];
 	int n;
 	int err;
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 744a2756958..836fa118eb1 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -213,7 +213,7 @@ static unsigned char direct_marked_event[8] = {
  * Returns 1 if event counts things relating to marked instructions
  * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
  */
-static int p4_marked_instr_event(unsigned int event)
+static int p4_marked_instr_event(u64 event)
 {
 	int pmc, psel, unit, byte, bit;
 	unsigned int mask;
@@ -249,7 +249,7 @@ static int p4_marked_instr_event(unsigned int event)
 	return (mask >> (byte * 8 + bit)) & 1;
 }
 
-static int p4_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+static int p4_get_constraint(u64 event, u64 *maskp, u64 *valp)
 {
 	int pmc, byte, unit, lower, sh;
 	u64 mask = 0, value = 0;
@@ -320,8 +320,7 @@ static unsigned int ppc_inst_cmpl[] = {
 	0x1001, 0x4001, 0x6001, 0x7001, 0x8001
 };
 
-static int p4_get_alternatives(unsigned int event, unsigned int flags,
-			       unsigned int alt[])
+static int p4_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
 	int i, j, na;
 
@@ -353,7 +352,7 @@ static int p4_get_alternatives(unsigned int event, unsigned int flags,
 	return na;
 }
 
-static int p4_compute_mmcr(unsigned int event[], int n_ev,
+static int p4_compute_mmcr(u64 event[], int n_ev,
 			   unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index 8154eaa2404..3ac0654372a 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -135,7 +135,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = {
 	[PM_GRS] =   { 0x0e00000000ull, 0x0c40000000ull },
 };
 
-static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+static int power5p_get_constraint(u64 event, u64 *maskp, u64 *valp)
 {
 	int pmc, byte, unit, sh;
 	int bit, fmask;
@@ -188,7 +188,7 @@ static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 	return 0;
 }
 
-static int power5p_limited_pmc_event(unsigned int event)
+static int power5p_limited_pmc_event(u64 event)
 {
 	int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
 
@@ -273,11 +273,11 @@ static int find_alternative_bdecode(unsigned int event)
 	return -1;
 }
 
-static int power5p_get_alternatives(unsigned int event, unsigned int flags,
-				    unsigned int alt[])
+static int power5p_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
-	int i, j, ae, nalt = 1;
+	int i, j, nalt = 1;
 	int nlim;
+	u64 ae;
 
 	alt[0] = event;
 	nalt = 1;
@@ -402,7 +402,7 @@ static unsigned char direct_event_is_marked[0x28] = {
  * Returns 1 if event counts things relating to marked instructions
  * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
  */
-static int power5p_marked_instr_event(unsigned int event)
+static int power5p_marked_instr_event(u64 event)
 {
 	int pmc, psel;
 	int bit, byte, unit;
@@ -451,7 +451,7 @@ static int power5p_marked_instr_event(unsigned int event)
 	return (mask >> (byte * 8 + bit)) & 1;
 }
 
-static int power5p_compute_mmcr(unsigned int event[], int n_ev,
+static int power5p_compute_mmcr(u64 event[], int n_ev,
 				unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr1 = 0;
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 6e667dc8647..d5344968ee9 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -139,7 +139,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = {
 	[PM_GRS] =   { 0x30002000000000ull, 0x30000400000000ull },
 };
 
-static int power5_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+static int power5_get_constraint(u64 event, u64 *maskp, u64 *valp)
 {
 	int pmc, byte, unit, sh;
 	int bit, fmask;
@@ -224,7 +224,7 @@ static const unsigned int event_alternatives[][MAX_ALT] = {
  * Scan the alternatives table for a match and return the
  * index into the alternatives table if found, else -1.
  */
-static int find_alternative(unsigned int event)
+static int find_alternative(u64 event)
 {
 	int i, j;
 
@@ -250,7 +250,7 @@ static const unsigned char bytedecode_alternatives[4][4] = {
  * PMCSEL values on other counters.  This returns the alternative
  * event code for those that do, or -1 otherwise.
  */
-static int find_alternative_bdecode(unsigned int event)
+static u64 find_alternative_bdecode(u64 event)
 {
 	int pmc, altpmc, pp, j;
 
@@ -269,10 +269,10 @@ static int find_alternative_bdecode(unsigned int event)
 	return -1;
 }
 
-static int power5_get_alternatives(unsigned int event, unsigned int flags,
-				   unsigned int alt[])
+static int power5_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
-	int i, j, ae, nalt = 1;
+	int i, j, nalt = 1;
+	u64 ae;
 
 	alt[0] = event;
 	nalt = 1;
@@ -338,7 +338,7 @@ static unsigned char direct_event_is_marked[0x28] = {
  * Returns 1 if event counts things relating to marked instructions
  * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
  */
-static int power5_marked_instr_event(unsigned int event)
+static int power5_marked_instr_event(u64 event)
 {
 	int pmc, psel;
 	int bit, byte, unit;
@@ -382,7 +382,7 @@ static int power5_marked_instr_event(unsigned int event)
 	return (mask >> (byte * 8 + bit)) & 1;
 }
 
-static int power5_compute_mmcr(unsigned int event[], int n_ev,
+static int power5_compute_mmcr(u64 event[], int n_ev,
 			       unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr1 = 0;
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index d44049f0ae2..ab7c615c458 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -134,7 +134,7 @@ static u32 marked_bus_events[16] = {
  * Returns 1 if event counts things relating to marked instructions
  * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
  */
-static int power6_marked_instr_event(unsigned int event)
+static int power6_marked_instr_event(u64 event)
 {
 	int pmc, psel, ptype;
 	int bit, byte, unit;
@@ -172,7 +172,7 @@ static int power6_marked_instr_event(unsigned int event)
 /*
  * Assign PMC numbers and compute MMCR1 value for a set of events
  */
-static int p6_compute_mmcr(unsigned int event[], int n_ev,
+static int p6_compute_mmcr(u64 event[], int n_ev,
 			   unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr1 = 0;
@@ -265,7 +265,7 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
  *	20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
  *	32-34	select field: nest (subunit) event selector
  */
-static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+static int p6_get_constraint(u64 event, u64 *maskp, u64 *valp)
 {
 	int pmc, byte, sh, subunit;
 	u64 mask = 0, value = 0;
@@ -298,7 +298,7 @@ static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 	return 0;
 }
 
-static int p6_limited_pmc_event(unsigned int event)
+static int p6_limited_pmc_event(u64 event)
 {
 	int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
 
@@ -337,7 +337,7 @@ static const unsigned int event_alternatives[][MAX_ALT] = {
  * This could be made more efficient with a binary search on
  * a presorted list, if necessary
  */
-static int find_alternatives_list(unsigned int event)
+static int find_alternatives_list(u64 event)
 {
 	int i, j;
 	unsigned int alt;
@@ -356,12 +356,12 @@ static int find_alternatives_list(unsigned int event)
 	return -1;
 }
 
-static int p6_get_alternatives(unsigned int event, unsigned int flags,
-			       unsigned int alt[])
+static int p6_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
 	int i, j, nlim;
-	unsigned int aevent, psel, pmc;
+	unsigned int psel, pmc;
 	unsigned int nalt = 1;
+	u64 aevent;
 
 	alt[0] = event;
 	nlim = p6_limited_pmc_event(event);
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index af2d1884058..eed47c4523f 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -147,7 +147,7 @@ static unsigned char direct_marked_event[8] = {
  * Returns 1 if event counts things relating to marked instructions
  * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
  */
-static int p970_marked_instr_event(unsigned int event)
+static int p970_marked_instr_event(u64 event)
 {
 	int pmc, psel, unit, byte, bit;
 	unsigned int mask;
@@ -192,7 +192,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = {
 	[PM_STS] =   { 0x380000000000ull, 0x310000000000ull },
 };
 
-static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+static int p970_get_constraint(u64 event, u64 *maskp, u64 *valp)
 {
 	int pmc, byte, unit, sh, spcsel;
 	u64 mask = 0, value = 0;
@@ -243,8 +243,7 @@ static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 	return 0;
 }
 
-static int p970_get_alternatives(unsigned int event, unsigned int flags,
-				 unsigned int alt[])
+static int p970_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
 	alt[0] = event;
 
@@ -257,7 +256,7 @@ static int p970_get_alternatives(unsigned int event, unsigned int flags,
 	return 1;
 }
 
-static int p970_compute_mmcr(unsigned int event[], int n_ev,
+static int p970_compute_mmcr(u64 event[], int n_ev,
 			     unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
-- 
cgit v1.2.3


From 0bbd0d4be8d5d3676c126e06e3c75c16def00441 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 14 May 2009 13:31:48 +1000
Subject: perf_counter: powerpc: supply more precise information on counter
 overflow events

This uses values from the MMCRA, SIAR and SDAR registers on
powerpc to supply more precise information for overflow events,
including a data address when PERF_RECORD_ADDR is specified.

Since POWER6 uses different bit positions in MMCRA from earlier
processors, this converts the struct power_pmu limited_pmc5_6
field, which only had 0/1 values, into a flags field and
defines bit values for its previous use (PPMU_LIMITED_PMC5_6)
and a new flag (PPMU_ALT_SIPR) to indicate that the processor
uses the POWER6 bit positions rather than the earlier
positions.  It also adds definitions in reg.h for the new and
old positions of the bit that indicates that the SIAR and SDAR
values come from the same instruction.

For the data address, the SDAR value is supplied if we are not
doing instruction sampling.  In that case there is no guarantee
that the address given in the PERF_RECORD_ADDR subrecord will
correspond to the instruction whose address is given in the
PERF_RECORD_IP subrecord.

If instruction sampling is enabled (e.g. because this counter
is counting a marked instruction event), then we only supply
the SDAR value for the PERF_RECORD_ADDR subrecord if it
corresponds to the instruction whose address is in the
PERF_RECORD_IP subrecord.  Otherwise we supply 0.

[ Impact: support more PMU hardware features on PowerPC ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <18955.37028.48861.555309@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/include/asm/perf_counter.h | 14 +++++-
 arch/powerpc/include/asm/reg.h          |  2 +
 arch/powerpc/kernel/perf_counter.c      | 84 +++++++++++++++++++++++++++++++--
 arch/powerpc/kernel/power5+-pmu.c       |  2 +-
 arch/powerpc/kernel/power6-pmu.c        |  2 +-
 5 files changed, 97 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index ceea76a48e3..1c60f0ca792 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -30,13 +30,19 @@ struct power_pmu {
 				    u64 alt[]);
 	void	(*disable_pmc)(unsigned int pmc, u64 mmcr[]);
 	int	(*limited_pmc_event)(u64 event);
-	int	limited_pmc5_6;	/* PMC5 and PMC6 have limited function */
+	u32	flags;
 	int	n_generic;
 	int	*generic_events;
 };
 
 extern struct power_pmu *ppmu;
 
+/*
+ * Values for power_pmu.flags
+ */
+#define PPMU_LIMITED_PMC5_6	1	/* PMC5/6 have limited function */
+#define PPMU_ALT_SIPR		2	/* uses alternate posn for SIPR/HV */
+
 /*
  * Values for flags to get_alternatives()
  */
@@ -44,6 +50,12 @@ extern struct power_pmu *ppmu;
 #define PPMU_LIMITED_PMC_REQD	2	/* have to put this on a limited PMC */
 #define PPMU_ONLY_COUNT_RUN	4	/* only counting in run state */
 
+struct pt_regs;
+extern unsigned long perf_misc_flags(struct pt_regs *regs);
+#define perf_misc_flags(regs)	perf_misc_flags(regs)
+
+extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+
 /*
  * The power_pmu.get_constraint function returns a 64-bit value and
  * a 64-bit mask that express the constraints between this event and
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index e8018d540e8..fb359b0a693 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -492,11 +492,13 @@
 #define   MMCR0_FCHV	0x00000001UL /* freeze conditions in hypervisor mode */
 #define SPRN_MMCR1	798
 #define SPRN_MMCRA	0x312
+#define   MMCRA_SDSYNC	0x80000000UL /* SDAR synced with SIAR */
 #define   MMCRA_SIHV	0x10000000UL /* state of MSR HV when SIAR set */
 #define   MMCRA_SIPR	0x08000000UL /* state of MSR PR when SIAR set */
 #define   MMCRA_SLOT	0x07000000UL /* SLOT bits (37-39) */
 #define   MMCRA_SLOT_SHIFT	24
 #define   MMCRA_SAMPLE_ENABLE 0x00000001UL /* enable sampling */
+#define   POWER6_MMCRA_SDSYNC 0x0000080000000000ULL	/* SDAR/SIAR synced */
 #define   POWER6_MMCRA_SIHV   0x0000040000000000ULL
 #define   POWER6_MMCRA_SIPR   0x0000020000000000ULL
 #define   POWER6_MMCRA_THRM	0x00000020UL
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 8d4cafc84b8..6baae5a5c33 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -17,6 +17,7 @@
 #include <asm/pmc.h>
 #include <asm/machdep.h>
 #include <asm/firmware.h>
+#include <asm/ptrace.h>
 
 struct cpu_hw_counters {
 	int n_counters;
@@ -310,7 +311,8 @@ static void power_pmu_read(struct perf_counter *counter)
  */
 static int is_limited_pmc(int pmcnum)
 {
-	return ppmu->limited_pmc5_6 && (pmcnum == 5 || pmcnum == 6);
+	return (ppmu->flags & PPMU_LIMITED_PMC5_6)
+		&& (pmcnum == 5 || pmcnum == 6);
 }
 
 static void freeze_limited_counters(struct cpu_hw_counters *cpuhw,
@@ -860,7 +862,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 	 * If this machine has limited counters, check whether this
 	 * event could go on a limited counter.
 	 */
-	if (ppmu->limited_pmc5_6) {
+	if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
 		if (can_go_on_limited_pmc(counter, ev, flags)) {
 			flags |= PPMU_LIMITED_PMC_OK;
 		} else if (ppmu->limited_pmc_event(ev)) {
@@ -933,6 +935,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	u64 period = counter->hw.irq_period;
 	s64 prev, delta, left;
 	int record = 0;
+	u64 addr, mmcra, sdsync;
 
 	/* we don't have to worry about interrupts here */
 	prev = atomic64_read(&counter->hw.prev_count);
@@ -963,8 +966,76 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	/*
 	 * Finally record data if requested.
 	 */
-	if (record)
-		perf_counter_overflow(counter, nmi, regs, 0);
+	if (record) {
+		addr = 0;
+		if (counter->hw_event.record_type & PERF_RECORD_ADDR) {
+			/*
+			 * The user wants a data address recorded.
+			 * If we're not doing instruction sampling,
+			 * give them the SDAR (sampled data address).
+			 * If we are doing instruction sampling, then only
+			 * give them the SDAR if it corresponds to the
+			 * instruction pointed to by SIAR; this is indicated
+			 * by the [POWER6_]MMCRA_SDSYNC bit in MMCRA.
+			 */
+			mmcra = regs->dsisr;
+			sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
+				POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
+			if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
+				addr = mfspr(SPRN_SDAR);
+		}
+		perf_counter_overflow(counter, nmi, regs, addr);
+	}
+}
+
+/*
+ * Called from generic code to get the misc flags (i.e. processor mode)
+ * for an event.
+ */
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+	unsigned long mmcra;
+
+	if (TRAP(regs) != 0xf00) {
+		/* not a PMU interrupt */
+		return user_mode(regs) ? PERF_EVENT_MISC_USER :
+			PERF_EVENT_MISC_KERNEL;
+	}
+
+	mmcra = regs->dsisr;
+	if (ppmu->flags & PPMU_ALT_SIPR) {
+		if (mmcra & POWER6_MMCRA_SIHV)
+			return PERF_EVENT_MISC_HYPERVISOR;
+		return (mmcra & POWER6_MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
+			PERF_EVENT_MISC_KERNEL;
+	}
+	if (mmcra & MMCRA_SIHV)
+		return PERF_EVENT_MISC_HYPERVISOR;
+	return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
+			PERF_EVENT_MISC_KERNEL;
+}
+
+/*
+ * Called from generic code to get the instruction pointer
+ * for an event.
+ */
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+	unsigned long mmcra;
+	unsigned long ip;
+	unsigned long slot;
+
+	if (TRAP(regs) != 0xf00)
+		return regs->nip;	/* not a PMU interrupt */
+
+	ip = mfspr(SPRN_SIAR);
+	mmcra = regs->dsisr;
+	if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
+		slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
+		if (slot > 1)
+			ip += 4 * (slot - 1);
+	}
+	return ip;
 }
 
 /*
@@ -983,6 +1054,11 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 		freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
 					mfspr(SPRN_PMC6));
 
+	/*
+	 * Overload regs->dsisr to store MMCRA so we only need to read it once.
+	 */
+	regs->dsisr = mfspr(SPRN_MMCRA);
+
 	/*
 	 * If interrupts were soft-disabled when this PMU interrupt
 	 * occurred, treat it as an NMI.
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index 3ac0654372a..c6cdfc165d6 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -625,6 +625,6 @@ struct power_pmu power5p_pmu = {
 	.disable_pmc = power5p_disable_pmc,
 	.n_generic = ARRAY_SIZE(power5p_generic_events),
 	.generic_events = power5p_generic_events,
-	.limited_pmc5_6 = 1,
+	.flags = PPMU_LIMITED_PMC5_6,
 	.limited_pmc_event = power5p_limited_pmc_event,
 };
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index ab7c615c458..cd4fbe06c35 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -485,6 +485,6 @@ struct power_pmu power6_pmu = {
 	.disable_pmc = p6_disable_pmc,
 	.n_generic = ARRAY_SIZE(power6_generic_events),
 	.generic_events = power6_generic_events,
-	.limited_pmc5_6 = 1,
+	.flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR,
 	.limited_pmc_event = p6_limited_pmc_event,
 };
-- 
cgit v1.2.3


From d2517a49d55536b38c7a87e5289550cfedaa4dcc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 17 May 2009 10:04:45 +0200
Subject: perf_counter, x86: fix zero irq_period counters

The quirk to irq_period unearthed an unrobustness we had in the
hw_counter initialization sequence: we left irq_period at 0, which
was then quirked up to 2 ... which then generated a _lot_ of
interrupts during 'perf stat' runs, slowed them down and skewed
the counter results in general.

Initialize irq_period to the maximum instead.

[ Impact: fix perf stat results ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 886dcf334bc..5bfd30ab392 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -286,6 +286,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->nmi = 1;
 	}
 
+	if (!hwc->irq_period)
+		hwc->irq_period = x86_pmu.max_period;
+
 	atomic64_set(&hwc->period_left,
 			min(x86_pmu.max_period, hwc->irq_period));
 
-- 
cgit v1.2.3


From c0daaf3f1f672defa3a45ca449b76d0e86c55892 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 18 May 2009 14:02:12 +1000
Subject: perf_counter: powerpc: initialize cpuhw pointer before use

Commit 9e35ad38 ("perf_counter: Rework the perf counter
disable/enable") added code to the powerpc hw_perf_enable (renamed
from hw_perf_restore) to test cpuhw->disabled and return immediately
if it is not set (i.e. if the PMU is already enabled).

Unfortunately the test got added before cpuhw was initialized,
resulting in an oops the first time hw_perf_enable got called.
This fixes it by moving the initialization of cpuhw to before
cpuhw->disabled is tested.

[ Impact: fix oops-causing bug on powerpc ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <18960.56772.869734.304631@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 6baae5a5c33..fe21b2440f2 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -450,12 +450,11 @@ void hw_perf_enable(void)
 	int idx;
 
 	local_irq_save(flags);
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
 	if (!cpuhw->disabled) {
 		local_irq_restore(flags);
 		return;
 	}
-
-	cpuhw = &__get_cpu_var(cpu_hw_counters);
 	cpuhw->disabled = 0;
 
 	/*
-- 
cgit v1.2.3


From b68f1d2e7aa21029d73c7d453a8046e95d351740 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 17 May 2009 19:37:25 +0200
Subject: perf_counter, x86: speed up the scheduling fast-path

We have to set up the LVT entry only at counter init time, not at
every switch-in time.

There's friction between NMI and non-NMI use here - we'll probably
remove the per counter configurability of it - but until then, dont
slow down things ...

[ Impact: micro-optimization ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 5bfd30ab392..c109819c2cb 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -285,6 +285,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 			return -EACCES;
 		hwc->nmi = 1;
 	}
+	perf_counters_lapic_init(hwc->nmi);
 
 	if (!hwc->irq_period)
 		hwc->irq_period = x86_pmu.max_period;
@@ -603,8 +604,6 @@ try_generic:
 		hwc->counter_base = x86_pmu.perfctr;
 	}
 
-	perf_counters_lapic_init(hwc->nmi);
-
 	x86_pmu.disable(hwc, idx);
 
 	cpuc->counters[idx] = counter;
@@ -1054,7 +1053,7 @@ void __init init_hw_perf_counters(void)
 
 	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
 
-	perf_counters_lapic_init(0);
+	perf_counters_lapic_init(1);
 	register_die_notifier(&perf_counter_nmi_notifier);
 }
 
-- 
cgit v1.2.3


From 34adc8062227f41b04ade0ff3fbd1dbe3002669e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 20 May 2009 20:13:28 +0200
Subject: perf_counter: Fix context removal deadlock

Disable the PMU globally before removing a counter from a
context. This fixes the following lockup:

[22081.741922] ------------[ cut here ]------------
[22081.746668] WARNING: at arch/x86/kernel/cpu/perf_counter.c:803 intel_pmu_handle_irq+0x9b/0x24e()
[22081.755624] Hardware name: X8DTN
[22081.758903] perfcounters: irq loop stuck!
[22081.762985] Modules linked in:
[22081.766136] Pid: 11082, comm: perf Not tainted 2.6.30-rc6-tip #226
[22081.772432] Call Trace:
[22081.774940]  <NMI>  [<ffffffff81019aed>] ? intel_pmu_handle_irq+0x9b/0x24e
[22081.781993]  [<ffffffff81019aed>] ? intel_pmu_handle_irq+0x9b/0x24e
[22081.788368]  [<ffffffff8104505c>] ? warn_slowpath_common+0x77/0xa3
[22081.794649]  [<ffffffff810450d3>] ? warn_slowpath_fmt+0x40/0x45
[22081.800696]  [<ffffffff81019aed>] ? intel_pmu_handle_irq+0x9b/0x24e
[22081.807080]  [<ffffffff814d1a72>] ? perf_counter_nmi_handler+0x3f/0x4a
[22081.813751]  [<ffffffff814d2d09>] ? notifier_call_chain+0x58/0x86
[22081.819951]  [<ffffffff8105b250>] ? notify_die+0x2d/0x32
[22081.825392]  [<ffffffff814d1414>] ? do_nmi+0x8e/0x242
[22081.830538]  [<ffffffff814d0f0a>] ? nmi+0x1a/0x20
[22081.835342]  [<ffffffff8117e102>] ? selinux_file_free_security+0x0/0x1a
[22081.842105]  [<ffffffff81018793>] ? x86_pmu_disable_counter+0x15/0x41
[22081.848673]  <<EOE>>  [<ffffffff81018f3d>] ? x86_pmu_disable+0x86/0x103
[22081.855512]  [<ffffffff8108fedd>] ? __perf_counter_remove_from_context+0x0/0xfe
[22081.862926]  [<ffffffff8108fcbc>] ? counter_sched_out+0x30/0xce
[22081.868909]  [<ffffffff8108ff36>] ? __perf_counter_remove_from_context+0x59/0xfe
[22081.876382]  [<ffffffff8106808a>] ? smp_call_function_single+0x6c/0xe6
[22081.882955]  [<ffffffff81091b96>] ? perf_release+0x86/0x14c
[22081.888600]  [<ffffffff810c4c84>] ? __fput+0xe7/0x195
[22081.893718]  [<ffffffff810c213e>] ? filp_close+0x5b/0x62
[22081.899107]  [<ffffffff81046a70>] ? put_files_struct+0x64/0xc2
[22081.905031]  [<ffffffff8104841a>] ? do_exit+0x1e2/0x6ef
[22081.910360]  [<ffffffff814d0a60>] ? _spin_lock_irqsave+0x9/0xe
[22081.916292]  [<ffffffff8104898e>] ? do_group_exit+0x67/0x93
[22081.921953]  [<ffffffff810489cc>] ? sys_exit_group+0x12/0x16
[22081.927759]  [<ffffffff8100baab>] ? system_call_fastpath+0x16/0x1b
[22081.934076] ---[ end trace 3a3936ce3e1b4505 ]---

And could potentially also fix the lockup reported by Marcelo Tosatti.

Also, print more debug info in case of a detected lockup.

[ Impact: fix lockup ]

Reported-by: Marcelo Tosatti <mtosatti@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index c109819c2cb..6cc1660db8d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -740,6 +740,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 again:
 	if (++loops > 100) {
 		WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
+		perf_counter_print_debug();
 		return 1;
 	}
 
-- 
cgit v1.2.3


From a63eaf34ae60bdb067a354cc8def2e8f4a01f5f4 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 22 May 2009 14:17:31 +1000
Subject: perf_counter: Dynamically allocate tasks' perf_counter_context struct

This replaces the struct perf_counter_context in the task_struct with
a pointer to a dynamically allocated perf_counter_context struct.  The
main reason for doing is this is to allow us to transfer a
perf_counter_context from one task to another when we do lazy PMU
switching in a later patch.

This has a few side-benefits: the task_struct becomes a little smaller,
we save some memory because only tasks that have perf_counters attached
get a perf_counter_context allocated for them, and we can remove the
inclusion of <linux/perf_counter.h> in sched.h, meaning that we don't
end up recompiling nearly everything whenever perf_counter.h changes.

The perf_counter_context structures are reference-counted and freed
when the last reference is dropped.  A context can have references
from its task and the counters on its task.  Counters can outlive the
task so it is possible that a context will be freed well after its
task has exited.

Contexts are allocated on fork if the parent had a context, or
otherwise the first time that a per-task counter is created on a task.
In the latter case, we set the context pointer in the task struct
locklessly using an atomic compare-and-exchange operation in case we
raced with some other task in creating a context for the subject task.

This also removes the task pointer from the perf_counter struct.  The
task pointer was not used anywhere and would make it harder to move a
context from one task to another.  Anything that needed to know which
task a counter was attached to was already using counter->ctx->task.

The __perf_counter_init_context function moves up in perf_counter.c
so that it can be called from find_get_context, and now initializes
the refcount, but is otherwise unchanged.

We were potentially calling list_del_counter twice: once from
__perf_counter_exit_task when the task exits and once from
__perf_counter_remove_from_context when the counter's fd gets closed.
This adds a check in list_del_counter so it doesn't do anything if
the counter has already been removed from the lists.

Since perf_counter_task_sched_in doesn't do anything if the task doesn't
have a context, and leaves cpuctx->task_ctx = NULL, this adds code to
__perf_install_in_context to set cpuctx->task_ctx if necessary, i.e. in
the case where the current task adds the first counter to itself and
thus creates a context for itself.

This also adds similar code to __perf_counter_enable to handle a
similar situation which can arise when the counters have been disabled
using prctl; that also leaves cpuctx->task_ctx = NULL.

[ Impact: refactor counter context management to prepare for new feature ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <18966.10075.781053.231153@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/apic.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e9021a90802..b4f64402a82 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,6 +14,7 @@
  *	Mikael Pettersson	:	PM converted to driver model.
  */
 
+#include <linux/perf_counter.h>
 #include <linux/kernel_stat.h>
 #include <linux/mc146818rtc.h>
 #include <linux/acpi_pmtmr.h>
-- 
cgit v1.2.3


From ff99be573e02e9f7edc23b472c7f9a5ddba12795 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 May 2009 17:39:03 +0200
Subject: perf_counter: x86: Expose INV and EDGE bits

Expose the INV and EDGE bits of the PMU to raw configs.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525153931.494709027@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 6cc1660db8d..c14437faf5d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -87,11 +87,15 @@ static u64 intel_pmu_raw_event(u64 event)
 {
 #define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
 #define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
+#define CORE_EVNTSEL_EDGE_MASK		0x00040000ULL
+#define CORE_EVNTSEL_INV_MASK		0x00800000ULL
 #define CORE_EVNTSEL_COUNTER_MASK	0xFF000000ULL
 
 #define CORE_EVNTSEL_MASK 		\
 	(CORE_EVNTSEL_EVENT_MASK |	\
 	 CORE_EVNTSEL_UNIT_MASK  |	\
+	 CORE_EVNTSEL_EDGE_MASK  |	\
+	 CORE_EVNTSEL_INV_MASK  |	\
 	 CORE_EVNTSEL_COUNTER_MASK)
 
 	return event & CORE_EVNTSEL_MASK;
@@ -119,11 +123,15 @@ static u64 amd_pmu_raw_event(u64 event)
 {
 #define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL
 #define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
+#define K7_EVNTSEL_EDGE_MASK	0x000040000ULL
+#define K7_EVNTSEL_INV_MASK	0x000800000ULL
 #define K7_EVNTSEL_COUNTER_MASK	0x0FF000000ULL
 
 #define K7_EVNTSEL_MASK			\
 	(K7_EVNTSEL_EVENT_MASK |	\
 	 K7_EVNTSEL_UNIT_MASK  |	\
+	 K7_EVNTSEL_EDGE_MASK  |	\
+	 K7_EVNTSEL_INV_MASK   |	\
 	 K7_EVNTSEL_COUNTER_MASK)
 
 	return event & K7_EVNTSEL_MASK;
-- 
cgit v1.2.3


From 48e22d56ecdeddd1ffb42a02fccba5c6ef42b133 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 May 2009 17:39:04 +0200
Subject: perf_counter: x86: Remove interrupt throttle

remove the x86 specific interrupt throttle

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525153931.616671838@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/apic.c        |  2 --
 arch/x86/kernel/cpu/perf_counter.c | 47 ++++----------------------------------
 2 files changed, 5 insertions(+), 44 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b4f64402a82..89b63b5fad3 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -763,8 +763,6 @@ static void local_apic_timer_interrupt(void)
 	inc_irq_stat(apic_timer_irqs);
 
 	evt->event_handler(evt);
-
-	perf_counter_unthrottle();
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index c14437faf5d..8c8177f859f 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -718,11 +718,6 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter)
 		intel_pmu_enable_counter(hwc, idx);
 }
 
-/*
- * Maximum interrupt frequency of 100KHz per CPU
- */
-#define PERFMON_MAX_INTERRUPTS (100000/HZ)
-
 /*
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
@@ -775,15 +770,14 @@ again:
 	if (status)
 		goto again;
 
-	if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS)
-		perf_enable();
+	perf_enable();
 
 	return 1;
 }
 
 static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 {
-	int cpu, idx, throttle = 0, handled = 0;
+	int cpu, idx, handled = 0;
 	struct cpu_hw_counters *cpuc;
 	struct perf_counter *counter;
 	struct hw_perf_counter *hwc;
@@ -792,16 +786,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
-	if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) {
-		throttle = 1;
-		__perf_disable();
-		cpuc->enabled = 0;
-		barrier();
-	}
-
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-		int disable = 0;
-
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
 
@@ -809,45 +794,23 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 		hwc = &counter->hw;
 
 		if (counter->hw_event.nmi != nmi)
-			goto next;
+			continue;
 
 		val = x86_perf_counter_update(counter, hwc, idx);
 		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
-			goto next;
+			continue;
 
 		/* counter overflow */
 		x86_perf_counter_set_period(counter, hwc, idx);
 		handled = 1;
 		inc_irq_stat(apic_perf_irqs);
-		disable = perf_counter_overflow(counter, nmi, regs, 0);
-
-next:
-		if (disable || throttle)
+		if (perf_counter_overflow(counter, nmi, regs, 0))
 			amd_pmu_disable_counter(hwc, idx);
 	}
 
 	return handled;
 }
 
-void perf_counter_unthrottle(void)
-{
-	struct cpu_hw_counters *cpuc;
-
-	if (!x86_pmu_initialized())
-		return;
-
-	cpuc = &__get_cpu_var(cpu_hw_counters);
-	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
-		/*
-		 * Clear them before re-enabling irqs/NMIs again:
-		 */
-		cpuc->interrupts = 0;
-		perf_enable();
-	} else {
-		cpuc->interrupts = 0;
-	}
-}
-
 void smp_perf_counter_interrupt(struct pt_regs *regs)
 {
 	irq_enter();
-- 
cgit v1.2.3


From a78ac3258782f3e64cb40beb5990808e1febcc0c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 May 2009 17:39:05 +0200
Subject: perf_counter: Generic per counter interrupt throttle

Introduce a generic per counter interrupt throttle.

This uses the perf_counter_overflow() quick disable to throttle a specific
counter when its going too fast when a pmu->unthrottle() method is provided
which can undo the quick disable.

Power needs to implement both the quick disable and the unthrottle method.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525153931.703093461@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 8c8177f859f..c4b543d1a86 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -623,6 +623,18 @@ try_generic:
 	return 0;
 }
 
+static void x86_pmu_unthrottle(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	struct hw_perf_counter *hwc = &counter->hw;
+
+	if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
+				cpuc->counters[hwc->idx] != counter))
+		return;
+
+	x86_pmu.enable(hwc, hwc->idx);
+}
+
 void perf_counter_print_debug(void)
 {
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
@@ -1038,6 +1050,7 @@ static const struct pmu pmu = {
 	.enable		= x86_pmu_enable,
 	.disable	= x86_pmu_disable,
 	.read		= x86_pmu_read,
+	.unthrottle	= x86_pmu_unthrottle,
 };
 
 const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
-- 
cgit v1.2.3


From 53b441a565bf4036ab49c8ea04c5ad06ace7dd6b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 25 May 2009 21:41:28 +0200
Subject: Revert "perf_counter, x86: speed up the scheduling fast-path"

This reverts commit b68f1d2e7aa21029d73c7d453a8046e95d351740.

It is causing problems (stuck/stuttering profiling) - when mixed
NMI and non-NMI counters are used.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525153931.703093461@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index c4b543d1a86..189bf9d7cda 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -293,7 +293,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 			return -EACCES;
 		hwc->nmi = 1;
 	}
-	perf_counters_lapic_init(hwc->nmi);
 
 	if (!hwc->irq_period)
 		hwc->irq_period = x86_pmu.max_period;
@@ -612,6 +611,8 @@ try_generic:
 		hwc->counter_base = x86_pmu.perfctr;
 	}
 
+	perf_counters_lapic_init(hwc->nmi);
+
 	x86_pmu.disable(hwc, idx);
 
 	cpuc->counters[idx] = counter;
@@ -1037,7 +1038,7 @@ void __init init_hw_perf_counters(void)
 
 	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
 
-	perf_counters_lapic_init(1);
+	perf_counters_lapic_init(0);
 	register_die_notifier(&perf_counter_nmi_notifier);
 }
 
-- 
cgit v1.2.3


From 8a7b8cb91f26a671f22cedc7fd54508667f2d9b9 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 26 May 2009 16:27:59 +1000
Subject: perf_counter: powerpc: Implement interrupt throttling

This implements interrupt throttling on powerpc.  Since we don't have
individual count enable/disable or interrupt enable/disable controls
per counter, this simply sets the hardware counter to 0, meaning that
it will not interrupt again until it has counted 2^31 counts, which
will take at least 2^30 cycles assuming a maximum of 2 counts per
cycle.  Also, we set counter->hw.period_left to the maximum possible
value (2^63 - 1), so we won't report overflows for this counter for
the forseeable future.

The unthrottle operation restores counter->hw.period_left and the
hardware counter so that we will once again report a counter overflow
after counter->hw.irq_period counts.

[ Impact: new perfcounters robustness feature on PowerPC ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <18971.35823.643362.446774@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 48 ++++++++++++++++++++++++++++++++++----
 1 file changed, 43 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index fe21b2440f2..f96d55f55bd 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -740,10 +740,37 @@ static void power_pmu_disable(struct perf_counter *counter)
 	local_irq_restore(flags);
 }
 
+/*
+ * Re-enable interrupts on a counter after they were throttled
+ * because they were coming too fast.
+ */
+static void power_pmu_unthrottle(struct perf_counter *counter)
+{
+	s64 val, left;
+	unsigned long flags;
+
+	if (!counter->hw.idx || !counter->hw.irq_period)
+		return;
+	local_irq_save(flags);
+	perf_disable();
+	power_pmu_read(counter);
+	left = counter->hw.irq_period;
+	val = 0;
+	if (left < 0x80000000L)
+		val = 0x80000000L - left;
+	write_pmc(counter->hw.idx, val);
+	atomic64_set(&counter->hw.prev_count, val);
+	atomic64_set(&counter->hw.period_left, left);
+	perf_counter_update_userpage(counter);
+	perf_enable();
+	local_irq_restore(flags);
+}
+
 struct pmu power_pmu = {
 	.enable		= power_pmu_enable,
 	.disable	= power_pmu_disable,
 	.read		= power_pmu_read,
+	.unthrottle	= power_pmu_unthrottle,
 };
 
 /*
@@ -957,10 +984,6 @@ static void record_and_restart(struct perf_counter *counter, long val,
 		if (left < 0x80000000L)
 			val = 0x80000000L - left;
 	}
-	write_pmc(counter->hw.idx, val);
-	atomic64_set(&counter->hw.prev_count, val);
-	atomic64_set(&counter->hw.period_left, left);
-	perf_counter_update_userpage(counter);
 
 	/*
 	 * Finally record data if requested.
@@ -983,8 +1006,23 @@ static void record_and_restart(struct perf_counter *counter, long val,
 			if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
 				addr = mfspr(SPRN_SDAR);
 		}
-		perf_counter_overflow(counter, nmi, regs, addr);
+		if (perf_counter_overflow(counter, nmi, regs, addr)) {
+			/*
+			 * Interrupts are coming too fast - throttle them
+			 * by setting the counter to 0, so it will be
+			 * at least 2^30 cycles until the next interrupt
+			 * (assuming each counter counts at most 2 counts
+			 * per cycle).
+			 */
+			val = 0;
+			left = ~0ULL >> 1;
+		}
 	}
+
+	write_pmc(counter->hw.idx, val);
+	atomic64_set(&counter->hw.prev_count, val);
+	atomic64_set(&counter->hw.period_left, left);
+	perf_counter_update_userpage(counter);
 }
 
 /*
-- 
cgit v1.2.3


From 79202ba9ff8cf570a75596f42e011167734d1c4b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 08:10:00 +0200
Subject: perf_counter, x86: Fix APIC NMI programming

My Nehalem box locks up in certain situations (with an
always-asserted NMI causing a lockup) if the PMU LVT
entry is programmed between NMI and IRQ mode with a
high frequency.

Standardize exlusively on NMIs instead.

[ Impact: fix lockup ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 189bf9d7cda..ece3813c7a3 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -285,14 +285,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
 
 	/*
-	 * If privileged enough, allow NMI events:
+	 * Use NMI events all the time:
 	 */
-	hwc->nmi = 0;
-	if (hw_event->nmi) {
-		if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
-			return -EACCES;
-		hwc->nmi = 1;
-	}
+	hwc->nmi	= 1;
+	hw_event->nmi	= 1;
 
 	if (!hwc->irq_period)
 		hwc->irq_period = x86_pmu.max_period;
@@ -553,9 +549,6 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 	if (!x86_pmu.num_counters_fixed)
 		return -1;
 
-	if (unlikely(hwc->nmi))
-		return -1;
-
 	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
 
 	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS)))
@@ -806,9 +799,6 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 		counter = cpuc->counters[idx];
 		hwc = &counter->hw;
 
-		if (counter->hw_event.nmi != nmi)
-			continue;
-
 		val = x86_perf_counter_update(counter, hwc, idx);
 		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
 			continue;
-- 
cgit v1.2.3


From aaba98018b8295dfa2119345d17f833d74448cd0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 08:10:00 +0200
Subject: perf_counter, x86: Make NMI lockups more robust

We have a debug check that detects stuck NMIs and returns with
the PMU disabled in the global ctrl MSR - but i managed to trigger
a situation where this was not enough to deassert the NMI.

So clear/reset the full PMU and keep the disable count balanced when
exiting from here. This way the box produces a debug warning but
stays up and is more debuggable.

[ Impact: in case of PMU related bugs, recover more gracefully ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index ece3813c7a3..2eeaa99add1 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -724,6 +724,30 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter)
 		intel_pmu_enable_counter(hwc, idx);
 }
 
+static void intel_pmu_reset(void)
+{
+	unsigned long flags;
+	int idx;
+
+	if (!x86_pmu.num_counters)
+		return;
+
+	local_irq_save(flags);
+
+	printk("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
+		checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
+	}
+	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+	}
+
+	local_irq_restore(flags);
+}
+
+
 /*
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
@@ -750,6 +774,8 @@ again:
 	if (++loops > 100) {
 		WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
 		perf_counter_print_debug();
+		intel_pmu_reset();
+		perf_enable();
 		return 1;
 	}
 
-- 
cgit v1.2.3


From c323d95fa4dbe0b6bf6d59e24a0b7db067dd08a7 Mon Sep 17 00:00:00 2001
From: Yong Wang <yong.y.wang@linux.intel.com>
Date: Fri, 29 May 2009 13:28:35 +0800
Subject: perf_counter/x86: Always use NMI for performance-monitoring interrupt

Always use NMI for performance-monitoring interrupt as there could be
racy situations if we switch between irq and nmi mode frequently.

Signed-off-by: Yong Wang <yong.y.wang@intel.com>
LKML-Reference: <20090529052835.GA13657@ywang-moblin2.bj.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_counter.h |  4 ++--
 arch/x86/kernel/apic/apic.c         |  2 +-
 arch/x86/kernel/cpu/perf_counter.c  | 19 +++++--------------
 3 files changed, 8 insertions(+), 17 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index d08dd52cb8f..876ed97147b 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -91,10 +91,10 @@ extern void set_perf_counter_pending(void);
 
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
-extern void perf_counters_lapic_init(int nmi);
+extern void perf_counters_lapic_init(void);
 #else
 static inline void init_hw_perf_counters(void)		{ }
-static inline void perf_counters_lapic_init(int nmi)	{ }
+static inline void perf_counters_lapic_init(void)	{ }
 #endif
 
 #endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 89b63b5fad3..60df2efd7c8 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1135,7 +1135,7 @@ void __cpuinit setup_local_APIC(void)
 		apic_write(APIC_ESR, 0);
 	}
 #endif
-	perf_counters_lapic_init(0);
+	perf_counters_lapic_init();
 
 	preempt_disable();
 
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 2eeaa99add1..316b0c995f3 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -604,7 +604,7 @@ try_generic:
 		hwc->counter_base = x86_pmu.perfctr;
 	}
 
-	perf_counters_lapic_init(hwc->nmi);
+	perf_counters_lapic_init();
 
 	x86_pmu.disable(hwc, idx);
 
@@ -863,24 +863,15 @@ void set_perf_counter_pending(void)
 	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
 }
 
-void perf_counters_lapic_init(int nmi)
+void perf_counters_lapic_init(void)
 {
-	u32 apic_val;
-
 	if (!x86_pmu_initialized())
 		return;
 
 	/*
-	 * Enable the performance counter vector in the APIC LVT:
+	 * Always use NMI for PMU
 	 */
-	apic_val = apic_read(APIC_LVTERR);
-
-	apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
-	if (nmi)
-		apic_write(APIC_LVTPC, APIC_DM_NMI);
-	else
-		apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
-	apic_write(APIC_LVTERR, apic_val);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
 }
 
 static int __kprobes
@@ -1054,7 +1045,7 @@ void __init init_hw_perf_counters(void)
 
 	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
 
-	perf_counters_lapic_init(0);
+	perf_counters_lapic_init();
 	register_die_notifier(&perf_counter_nmi_notifier);
 }
 
-- 
cgit v1.2.3


From b23f3325ed465f1bd914384884269af0d106778c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 15:13:03 +0200
Subject: perf_counter: Rename various fields

A few renames:

  s/irq_period/sample_period/
  s/irq_freq/sample_freq/
  s/PERF_RECORD_/PERF_SAMPLE_/
  s/record_type/sample_type/

And change both the new sample_type and read_format to u64.

Reported-by: Stephane Eranian <eranian@googlemail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 12 ++++++------
 arch/x86/kernel/cpu/perf_counter.c |  8 ++++----
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index f96d55f55bd..c9633321e7a 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -535,7 +535,7 @@ void hw_perf_enable(void)
 			continue;
 		}
 		val = 0;
-		if (counter->hw.irq_period) {
+		if (counter->hw.sample_period) {
 			left = atomic64_read(&counter->hw.period_left);
 			if (left < 0x80000000L)
 				val = 0x80000000L - left;
@@ -749,12 +749,12 @@ static void power_pmu_unthrottle(struct perf_counter *counter)
 	s64 val, left;
 	unsigned long flags;
 
-	if (!counter->hw.idx || !counter->hw.irq_period)
+	if (!counter->hw.idx || !counter->hw.sample_period)
 		return;
 	local_irq_save(flags);
 	perf_disable();
 	power_pmu_read(counter);
-	left = counter->hw.irq_period;
+	left = counter->hw.sample_period;
 	val = 0;
 	if (left < 0x80000000L)
 		val = 0x80000000L - left;
@@ -789,7 +789,7 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev,
 	if (counter->hw_event.exclude_user
 	    || counter->hw_event.exclude_kernel
 	    || counter->hw_event.exclude_hv
-	    || counter->hw_event.irq_period)
+	    || counter->hw_event.sample_period)
 		return 0;
 
 	if (ppmu->limited_pmc_event(ev))
@@ -925,7 +925,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	counter->hw.config = events[n];
 	counter->hw.counter_base = cflags[n];
-	atomic64_set(&counter->hw.period_left, counter->hw.irq_period);
+	atomic64_set(&counter->hw.period_left, counter->hw.sample_period);
 
 	/*
 	 * See if we need to reserve the PMU.
@@ -958,7 +958,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 static void record_and_restart(struct perf_counter *counter, long val,
 			       struct pt_regs *regs, int nmi)
 {
-	u64 period = counter->hw.irq_period;
+	u64 period = counter->hw.sample_period;
 	s64 prev, delta, left;
 	int record = 0;
 	u64 addr, mmcra, sdsync;
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 316b0c995f3..ec06aa5e928 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -290,11 +290,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	hwc->nmi	= 1;
 	hw_event->nmi	= 1;
 
-	if (!hwc->irq_period)
-		hwc->irq_period = x86_pmu.max_period;
+	if (!hwc->sample_period)
+		hwc->sample_period = x86_pmu.max_period;
 
 	atomic64_set(&hwc->period_left,
-			min(x86_pmu.max_period, hwc->irq_period));
+			min(x86_pmu.max_period, hwc->sample_period));
 
 	/*
 	 * Raw event type provide the config in the event structure
@@ -462,7 +462,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 			     struct hw_perf_counter *hwc, int idx)
 {
 	s64 left = atomic64_read(&hwc->period_left);
-	s64 period = min(x86_pmu.max_period, hwc->irq_period);
+	s64 period = min(x86_pmu.max_period, hwc->sample_period);
 	int err;
 
 	/*
-- 
cgit v1.2.3


From 8a016db386195b193e2a8aeddff9fe937dcb7a40 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 15:27:45 +0200
Subject: perf_counter: Remove the last nmi/irq bits

IRQ (non-NMI) sampling is not used anymore - remove the last few bits.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index ec06aa5e928..9e144fbebd2 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -284,12 +284,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (!hw_event->exclude_kernel)
 		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
 
-	/*
-	 * Use NMI events all the time:
-	 */
-	hwc->nmi	= 1;
-	hw_event->nmi	= 1;
-
 	if (!hwc->sample_period)
 		hwc->sample_period = x86_pmu.max_period;
 
-- 
cgit v1.2.3


From e4abb5d4f7ddabc1fc7c392cf0a10d8e5868c9ca Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 16:08:20 +0200
Subject: perf_counter: x86: Emulate longer sample periods

Do as Power already does, emulate sample periods up to 2^63-1 by
composing them of smaller values limited by hardware capabilities.
Only once we wrap the software period do we generate an overflow
event.

Just 10 lines of new code.

Reported-by: Stephane Eranian <eranian@googlemail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 9e144fbebd2..904571bea71 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -287,8 +287,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (!hwc->sample_period)
 		hwc->sample_period = x86_pmu.max_period;
 
-	atomic64_set(&hwc->period_left,
-			min(x86_pmu.max_period, hwc->sample_period));
+	atomic64_set(&hwc->period_left, hwc->sample_period);
 
 	/*
 	 * Raw event type provide the config in the event structure
@@ -451,13 +450,13 @@ static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
  * Set the next IRQ period, based on the hwc->period_left value.
  * To be called with the counter disabled in hw:
  */
-static void
+static int
 x86_perf_counter_set_period(struct perf_counter *counter,
 			     struct hw_perf_counter *hwc, int idx)
 {
 	s64 left = atomic64_read(&hwc->period_left);
-	s64 period = min(x86_pmu.max_period, hwc->sample_period);
-	int err;
+	s64 period = hwc->sample_period;
+	int err, ret = 0;
 
 	/*
 	 * If we are way outside a reasoable range then just skip forward:
@@ -465,11 +464,13 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 	if (unlikely(left <= -period)) {
 		left = period;
 		atomic64_set(&hwc->period_left, left);
+		ret = 1;
 	}
 
 	if (unlikely(left <= 0)) {
 		left += period;
 		atomic64_set(&hwc->period_left, left);
+		ret = 1;
 	}
 	/*
 	 * Quirk: certain CPUs dont like it if just 1 event is left:
@@ -477,6 +478,9 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 	if (unlikely(left < 2))
 		left = 2;
 
+	if (left > x86_pmu.max_period)
+		left = x86_pmu.max_period;
+
 	per_cpu(prev_left[idx], smp_processor_id()) = left;
 
 	/*
@@ -487,6 +491,8 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 
 	err = checking_wrmsrl(hwc->counter_base + idx,
 			     (u64)(-left) & x86_pmu.counter_mask);
+
+	return ret;
 }
 
 static inline void
@@ -706,16 +712,19 @@ static void x86_pmu_disable(struct perf_counter *counter)
  * Save and restart an expired counter. Called by NMI contexts,
  * so it has to be careful about preempting normal counter ops:
  */
-static void intel_pmu_save_and_restart(struct perf_counter *counter)
+static int intel_pmu_save_and_restart(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 	int idx = hwc->idx;
+	int ret;
 
 	x86_perf_counter_update(counter, hwc, idx);
-	x86_perf_counter_set_period(counter, hwc, idx);
+	ret = x86_perf_counter_set_period(counter, hwc, idx);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		intel_pmu_enable_counter(hwc, idx);
+
+	return ret;
 }
 
 static void intel_pmu_reset(void)
@@ -782,7 +791,9 @@ again:
 		if (!test_bit(bit, cpuc->active_mask))
 			continue;
 
-		intel_pmu_save_and_restart(counter);
+		if (!intel_pmu_save_and_restart(counter))
+			continue;
+
 		if (perf_counter_overflow(counter, nmi, regs, 0))
 			intel_pmu_disable_counter(&counter->hw, bit);
 	}
@@ -824,9 +835,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 			continue;
 
 		/* counter overflow */
-		x86_perf_counter_set_period(counter, hwc, idx);
 		handled = 1;
 		inc_irq_stat(apic_perf_irqs);
+		if (!x86_perf_counter_set_period(counter, hwc, idx))
+			continue;
+
 		if (perf_counter_overflow(counter, nmi, regs, 0))
 			amd_pmu_disable_counter(hwc, idx);
 	}
-- 
cgit v1.2.3


From 0d48696f87e3618b0d35bd3e4e9d7c188d51e7de Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 19:22:16 +0200
Subject: perf_counter: Rename perf_counter_hw_event => perf_counter_attr

The structure isn't hw only and when I read event, I think about those
things that fall out the other end. Rename the thing.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
Cc: Stephane Eranian <eranian@googlemail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 38 +++++++++++++++++++-------------------
 arch/x86/kernel/cpu/perf_counter.c | 16 ++++++++--------
 2 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index c9633321e7a..ea54686cb78 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -262,13 +262,13 @@ static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[],
 		}
 		counter = ctrs[i];
 		if (first) {
-			eu = counter->hw_event.exclude_user;
-			ek = counter->hw_event.exclude_kernel;
-			eh = counter->hw_event.exclude_hv;
+			eu = counter->attr.exclude_user;
+			ek = counter->attr.exclude_kernel;
+			eh = counter->attr.exclude_hv;
 			first = 0;
-		} else if (counter->hw_event.exclude_user != eu ||
-			   counter->hw_event.exclude_kernel != ek ||
-			   counter->hw_event.exclude_hv != eh) {
+		} else if (counter->attr.exclude_user != eu ||
+			   counter->attr.exclude_kernel != ek ||
+			   counter->attr.exclude_hv != eh) {
 			return -EAGAIN;
 		}
 	}
@@ -483,16 +483,16 @@ void hw_perf_enable(void)
 
 	/*
 	 * Add in MMCR0 freeze bits corresponding to the
-	 * hw_event.exclude_* bits for the first counter.
+	 * attr.exclude_* bits for the first counter.
 	 * We have already checked that all counters have the
 	 * same values for these bits as the first counter.
 	 */
 	counter = cpuhw->counter[0];
-	if (counter->hw_event.exclude_user)
+	if (counter->attr.exclude_user)
 		cpuhw->mmcr[0] |= MMCR0_FCP;
-	if (counter->hw_event.exclude_kernel)
+	if (counter->attr.exclude_kernel)
 		cpuhw->mmcr[0] |= freeze_counters_kernel;
-	if (counter->hw_event.exclude_hv)
+	if (counter->attr.exclude_hv)
 		cpuhw->mmcr[0] |= MMCR0_FCHV;
 
 	/*
@@ -786,10 +786,10 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev,
 	int n;
 	u64 alt[MAX_EVENT_ALTERNATIVES];
 
-	if (counter->hw_event.exclude_user
-	    || counter->hw_event.exclude_kernel
-	    || counter->hw_event.exclude_hv
-	    || counter->hw_event.sample_period)
+	if (counter->attr.exclude_user
+	    || counter->attr.exclude_kernel
+	    || counter->attr.exclude_hv
+	    || counter->attr.sample_period)
 		return 0;
 
 	if (ppmu->limited_pmc_event(ev))
@@ -855,13 +855,13 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	if (!ppmu)
 		return ERR_PTR(-ENXIO);
-	if (!perf_event_raw(&counter->hw_event)) {
-		ev = perf_event_id(&counter->hw_event);
+	if (!perf_event_raw(&counter->attr)) {
+		ev = perf_event_id(&counter->attr);
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
 			return ERR_PTR(-EOPNOTSUPP);
 		ev = ppmu->generic_events[ev];
 	} else {
-		ev = perf_event_config(&counter->hw_event);
+		ev = perf_event_config(&counter->attr);
 	}
 	counter->hw.config_base = ev;
 	counter->hw.idx = 0;
@@ -872,7 +872,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 	 * the user set it to.
 	 */
 	if (!firmware_has_feature(FW_FEATURE_LPAR))
-		counter->hw_event.exclude_hv = 0;
+		counter->attr.exclude_hv = 0;
 
 	/*
 	 * If this is a per-task counter, then we can use
@@ -990,7 +990,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 */
 	if (record) {
 		addr = 0;
-		if (counter->hw_event.record_type & PERF_RECORD_ADDR) {
+		if (counter->attr.record_type & PERF_RECORD_ADDR) {
 			/*
 			 * The user wants a data address recorded.
 			 * If we're not doing instruction sampling,
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 904571bea71..e16e8c13132 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -247,11 +247,11 @@ static inline int x86_pmu_initialized(void)
 }
 
 /*
- * Setup the hardware configuration for a given hw_event_type
+ * Setup the hardware configuration for a given attr_type
  */
 static int __hw_perf_counter_init(struct perf_counter *counter)
 {
-	struct perf_counter_hw_event *hw_event = &counter->hw_event;
+	struct perf_counter_attr *attr = &counter->attr;
 	struct hw_perf_counter *hwc = &counter->hw;
 	int err;
 
@@ -279,9 +279,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	/*
 	 * Count user and OS events unless requested not to.
 	 */
-	if (!hw_event->exclude_user)
+	if (!attr->exclude_user)
 		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
-	if (!hw_event->exclude_kernel)
+	if (!attr->exclude_kernel)
 		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
 
 	if (!hwc->sample_period)
@@ -292,15 +292,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	/*
 	 * Raw event type provide the config in the event structure
 	 */
-	if (perf_event_raw(hw_event)) {
-		hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event));
+	if (perf_event_raw(attr)) {
+		hwc->config |= x86_pmu.raw_event(perf_event_config(attr));
 	} else {
-		if (perf_event_id(hw_event) >= x86_pmu.max_events)
+		if (perf_event_id(attr) >= x86_pmu.max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= x86_pmu.event_map(perf_event_id(hw_event));
+		hwc->config |= x86_pmu.event_map(perf_event_id(attr));
 	}
 
 	counter->destroy = hw_perf_counter_destroy;
-- 
cgit v1.2.3


From a32881066e58346f2901afe0ebdfbf0c562877e5 Mon Sep 17 00:00:00 2001
From: Yong Wang <yong.y.wang@linux.intel.com>
Date: Wed, 3 Jun 2009 13:12:55 +0800
Subject: perf_counter/x86: Remove the IRQ (non-NMI) handling bits

Remove the IRQ (non-NMI) handling bits as NMI will be used always.

Signed-off-by: Yong Wang <yong.y.wang@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090603051255.GA2791@ywang-moblin2.bj.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/entry_arch.h  |  1 -
 arch/x86/include/asm/hw_irq.h      |  1 -
 arch/x86/include/asm/irq_vectors.h |  5 -----
 arch/x86/kernel/cpu/perf_counter.c | 21 ++++++---------------
 arch/x86/kernel/entry_64.S         |  2 --
 arch/x86/kernel/irqinit_32.c       |  1 -
 arch/x86/kernel/irqinit_64.c       |  1 -
 7 files changed, 6 insertions(+), 26 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index fe24d280249..d750a10ccad 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -49,7 +49,6 @@ BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
 BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 
 #ifdef CONFIG_PERF_COUNTERS
-BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
 BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
 #endif
 
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 7309c0ad690..4b4921d7a28 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,7 +29,6 @@
 extern void apic_timer_interrupt(void);
 extern void generic_interrupt(void);
 extern void error_interrupt(void);
-extern void perf_counter_interrupt(void);
 extern void perf_pending_interrupt(void);
 
 extern void spurious_interrupt(void);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 545bb811ccb..4492e19f839 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -106,11 +106,6 @@
  */
 #define LOCAL_TIMER_VECTOR		0xef
 
-/*
- * Performance monitoring interrupt vector:
- */
-#define LOCAL_PERF_VECTOR		0xee
-
 /*
  * Generic system vector for platform specific use
  */
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index e16e8c13132..12cc05ed9f4 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -40,7 +40,7 @@ struct cpu_hw_counters {
 struct x86_pmu {
 	const char	*name;
 	int		version;
-	int		(*handle_irq)(struct pt_regs *, int);
+	int		(*handle_irq)(struct pt_regs *);
 	void		(*disable_all)(void);
 	void		(*enable_all)(void);
 	void		(*enable)(struct hw_perf_counter *, int);
@@ -755,7 +755,7 @@ static void intel_pmu_reset(void)
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
  */
-static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
+static int intel_pmu_handle_irq(struct pt_regs *regs)
 {
 	struct cpu_hw_counters *cpuc;
 	struct cpu_hw_counters;
@@ -794,7 +794,7 @@ again:
 		if (!intel_pmu_save_and_restart(counter))
 			continue;
 
-		if (perf_counter_overflow(counter, nmi, regs, 0))
+		if (perf_counter_overflow(counter, 1, regs, 0))
 			intel_pmu_disable_counter(&counter->hw, bit);
 	}
 
@@ -812,7 +812,7 @@ again:
 	return 1;
 }
 
-static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
+static int amd_pmu_handle_irq(struct pt_regs *regs)
 {
 	int cpu, idx, handled = 0;
 	struct cpu_hw_counters *cpuc;
@@ -840,22 +840,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 		if (!x86_perf_counter_set_period(counter, hwc, idx))
 			continue;
 
-		if (perf_counter_overflow(counter, nmi, regs, 0))
+		if (perf_counter_overflow(counter, 1, regs, 0))
 			amd_pmu_disable_counter(hwc, idx);
 	}
 
 	return handled;
 }
 
-void smp_perf_counter_interrupt(struct pt_regs *regs)
-{
-	irq_enter();
-	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
-	ack_APIC_irq();
-	x86_pmu.handle_irq(regs, 0);
-	irq_exit();
-}
-
 void smp_perf_pending_interrupt(struct pt_regs *regs)
 {
 	irq_enter();
@@ -910,7 +901,7 @@ perf_counter_nmi_handler(struct notifier_block *self,
 	 * If the first NMI handles both, the latter will be empty and daze
 	 * the CPU.
 	 */
-	x86_pmu.handle_irq(regs, 1);
+	x86_pmu.handle_irq(regs);
 
 	return NOTIFY_STOP;
 }
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 89100461914..7985c010f8a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1026,8 +1026,6 @@ apicinterrupt SPURIOUS_APIC_VECTOR \
 	spurious_interrupt smp_spurious_interrupt
 
 #ifdef CONFIG_PERF_COUNTERS
-apicinterrupt LOCAL_PERF_VECTOR \
-	perf_counter_interrupt smp_perf_counter_interrupt
 apicinterrupt LOCAL_PENDING_VECTOR \
 	perf_pending_interrupt smp_perf_pending_interrupt
 #endif
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 3190a6b961e..205bdd880d3 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -165,7 +165,6 @@ static void __init apic_intr_init(void)
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 # ifdef CONFIG_PERF_COUNTERS
-	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
 	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
 # endif
 
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 53ceb26f80f..fa6ef692000 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -155,7 +155,6 @@ static void __init apic_intr_init(void)
 
 	/* Performance monitoring interrupt: */
 #ifdef CONFIG_PERF_COUNTERS
-	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
 	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
 #endif
 }
-- 
cgit v1.2.3


From 6984efb692e97ce5f75f26e595685c04c2061bac Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 3 Jun 2009 19:38:58 +1000
Subject: perf_counter: powerpc: Fix event alternative code generation on
 POWER5/5+

Commit ef923214 ("perf_counter: powerpc: use u64 for event
codes internally") introduced a bug where the return value from
function find_alternative_bdecode gets put into a u64 variable
and later tested to see if it is < 0.  The effect is that we
get extra, bogus event code alternatives on POWER5 and POWER5+,
leading to error messages such as "oops compute_mmcr failed"
being printed and counters not counting properly.

This fixes it by using s64 for the return type of
find_alternative_bdecode and for the local variable that the
caller puts the value in.  It also makes the event argument a
u64 on POWER5+ for consistency.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
Cc: Stephane Eranian <eranian@googlemail.com>
LKML-Reference: <18982.17586.666132.90983@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/power5+-pmu.c | 4 ++--
 arch/powerpc/kernel/power5-pmu.c  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index c6cdfc165d6..8471e3c2e46 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -242,7 +242,7 @@ static const unsigned char bytedecode_alternatives[4][4] = {
  * event code for those that do, or -1 otherwise.  This also handles
  * alternative PCMSEL values for add events.
  */
-static int find_alternative_bdecode(unsigned int event)
+static s64 find_alternative_bdecode(u64 event)
 {
 	int pmc, altpmc, pp, j;
 
@@ -277,7 +277,7 @@ static int power5p_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
 	int i, j, nalt = 1;
 	int nlim;
-	u64 ae;
+	s64 ae;
 
 	alt[0] = event;
 	nalt = 1;
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index d5344968ee9..1b44c5fca18 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -250,7 +250,7 @@ static const unsigned char bytedecode_alternatives[4][4] = {
  * PMCSEL values on other counters.  This returns the alternative
  * event code for those that do, or -1 otherwise.
  */
-static u64 find_alternative_bdecode(u64 event)
+static s64 find_alternative_bdecode(u64 event)
 {
 	int pmc, altpmc, pp, j;
 
@@ -272,7 +272,7 @@ static u64 find_alternative_bdecode(u64 event)
 static int power5_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
 	int i, j, nalt = 1;
-	u64 ae;
+	s64 ae;
 
 	alt[0] = event;
 	nalt = 1;
-- 
cgit v1.2.3


From dcd945e0d8a6d654e3e1de51faea9f98f1504aa5 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 3 Jun 2009 19:40:36 +1000
Subject: perf_counter: powerpc: Fix race causing "oops trying to read PMC0"
 errors

When using interrupting counters and limited (non-interrupting)
counters at the same time, it's possible that we get an
interrupt in write_mmcr0() after writing MMCR0 but before we
have set up the counters using limited PMCs.  What happens then
is that we get into perf_counter_interrupt() with
counter->hw.idx = 0 for the limited counters, leading to the
"oops trying to read PMC0" error message being printed.

This fixes the problem by making perf_counter_interrupt()
robust against counter->hw.idx being zero (the counter is just
ignored in that case) and also by changing write_mmcr0() to
write MMCR0 initially with the counter overflow interrupt
enable bits masked (set to 0).  If the MMCR0 value requested by
the caller has either of those bits set, we write MMCR0 again
with the requested value of those bits after setting up the
limited counters properly.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
Cc: Stephane Eranian <eranian@googlemail.com>
LKML-Reference: <18982.17684.138182.954599@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index ea54686cb78..4cc4ac5c791 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -372,16 +372,28 @@ static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
 
 	/*
 	 * Write MMCR0, then read PMC5 and PMC6 immediately.
+	 * To ensure we don't get a performance monitor interrupt
+	 * between writing MMCR0 and freezing/thawing the limited
+	 * counters, we first write MMCR0 with the counter overflow
+	 * interrupt enable bits turned off.
 	 */
 	asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
 		     : "=&r" (pmc5), "=&r" (pmc6)
-		     : "r" (mmcr0), "i" (SPRN_MMCR0),
+		     : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
+		       "i" (SPRN_MMCR0),
 		       "i" (SPRN_PMC5), "i" (SPRN_PMC6));
 
 	if (mmcr0 & MMCR0_FC)
 		freeze_limited_counters(cpuhw, pmc5, pmc6);
 	else
 		thaw_limited_counters(cpuhw, pmc5, pmc6);
+
+	/*
+	 * Write the full MMCR0 including the counter overflow interrupt
+	 * enable bits, if necessary.
+	 */
+	if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
+		mtspr(SPRN_MMCR0, mmcr0);
 }
 
 /*
@@ -1108,7 +1120,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
-		if (is_limited_pmc(counter->hw.idx))
+		if (!counter->hw.idx || is_limited_pmc(counter->hw.idx))
 			continue;
 		val = read_pmc(counter->hw.idx);
 		if ((int)val < 0) {
-- 
cgit v1.2.3


From 128f048f0f0d2a477ad2555e7acd2ad15a1b6061 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 22:19:36 +0200
Subject: perf_counter: Fix throttling lock-up

Throttling logic is broken and we can lock up with too small
hw sampling intervals.

Make the throttling code more robust: disable counters even
if we already disabled them.

( Also clean up whitespace damage i noticed while reading
  various pieces of code related to throttling. )

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 12cc05ed9f4..8f53f3a7da2 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -91,7 +91,7 @@ static u64 intel_pmu_raw_event(u64 event)
 #define CORE_EVNTSEL_INV_MASK		0x00800000ULL
 #define CORE_EVNTSEL_COUNTER_MASK	0xFF000000ULL
 
-#define CORE_EVNTSEL_MASK 		\
+#define CORE_EVNTSEL_MASK		\
 	(CORE_EVNTSEL_EVENT_MASK |	\
 	 CORE_EVNTSEL_UNIT_MASK  |	\
 	 CORE_EVNTSEL_EDGE_MASK  |	\
-- 
cgit v1.2.3


From 1b58c2515be48d5df79d20210ac5a86e30094de2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 4 Jun 2009 09:49:59 +1000
Subject: perf_counter: powerpc: Use new identifier names in powerpc-specific
 code

Commit b23f3325 ("perf_counter: Rename various fields") fixed up
most of the uses of the renamed fields, but missed one instance
of "record_type" in powerpc-specific code which needs to be changed
to "sample_type", and a "PERF_RECORD_ADDR" in the same statement that
needs to be changed to "PERF_SAMPLE_ADDR", causing compilation
errors on powerpc.  This fixes it.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18983.3111.770392.800486@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 4cc4ac5c791..232b00a36f7 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -1002,7 +1002,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 */
 	if (record) {
 		addr = 0;
-		if (counter->attr.record_type & PERF_RECORD_ADDR) {
+		if (counter->attr.sample_type & PERF_SAMPLE_ADDR) {
 			/*
 			 * The user wants a data address recorded.
 			 * If we're not doing instruction sampling,
-- 
cgit v1.2.3


From f7b6eb3fa07269da20dbbde8ba37a0273fdbd9c9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Jun 2009 14:04:51 +0200
Subject: x86: Set context.vdso before installing the mapping

In order to make arch_vma_name() work from inside
install_special_mapping() we need to set the context.vdso
before calling it.

( This is needed for performance counters to be able to track
  this special executable area. )

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/vdso/vdso32-setup.c | 6 +++++-
 arch/x86/vdso/vma.c          | 7 +++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 1241f118ab5..58bc00f68b1 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -338,6 +338,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 		}
 	}
 
+	current->mm->context.vdso = (void *)addr;
+
 	if (compat_uses_vma || !compat) {
 		/*
 		 * MAYWRITE to allow gdb to COW and set breakpoints
@@ -358,11 +360,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 			goto up_fail;
 	}
 
-	current->mm->context.vdso = (void *)addr;
 	current_thread_info()->sysenter_return =
 		VDSO32_SYMBOL(addr, SYSENTER_RETURN);
 
   up_fail:
+	if (ret)
+		current->mm->context.vdso = NULL;
+
 	up_write(&mm->mmap_sem);
 
 	return ret;
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 7133cdf9098..93b7a2938b2 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -115,15 +115,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 		goto up_fail;
 	}
 
+	current->mm->context.vdso = (void *)addr;
+
 	ret = install_special_mapping(mm, addr, vdso_size,
 				      VM_READ|VM_EXEC|
 				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
 				      VM_ALWAYSDUMP,
 				      vdso_pages);
-	if (ret)
+	if (ret) {
+		current->mm->context.vdso = NULL;
 		goto up_fail;
+	}
 
-	current->mm->context.vdso = (void *)addr;
 up_fail:
 	up_write(&mm->mmap_sem);
 	return ret;
-- 
cgit v1.2.3


From a21ca2cac582886a3e95c8bb84ff7c52d4d15e54 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 09:58:57 +0200
Subject: perf_counter: Separate out attr->type from attr->config

Counter type is a frequently used value and we do a lot of
bit juggling by encoding and decoding it from attr->config.

Clean this up by creating a separate attr->type field.

Also clean up the various similarly complex user-space bits
all around counter attribute management.

The net improvement is significant, and it will be easier
to add a new major type (which is what triggered this cleanup).

(This changes the ABI, all tools are adapted.)
(PowerPC build-tested.)

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 6 +++---
 arch/x86/kernel/cpu/perf_counter.c | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 232b00a36f7..4786ad9a288 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -867,13 +867,13 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	if (!ppmu)
 		return ERR_PTR(-ENXIO);
-	if (!perf_event_raw(&counter->attr)) {
-		ev = perf_event_id(&counter->attr);
+	if (counter->attr.type != PERF_TYPE_RAW) {
+		ev = counter->attr.config;
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
 			return ERR_PTR(-EOPNOTSUPP);
 		ev = ppmu->generic_events[ev];
 	} else {
-		ev = perf_event_config(&counter->attr);
+		ev = counter->attr.config;
 	}
 	counter->hw.config_base = ev;
 	counter->hw.idx = 0;
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 8f53f3a7da2..430e048f285 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -292,15 +292,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	/*
 	 * Raw event type provide the config in the event structure
 	 */
-	if (perf_event_raw(attr)) {
-		hwc->config |= x86_pmu.raw_event(perf_event_config(attr));
+	if (attr->type == PERF_TYPE_RAW) {
+		hwc->config |= x86_pmu.raw_event(attr->config);
 	} else {
-		if (perf_event_id(attr) >= x86_pmu.max_events)
+		if (attr->config >= x86_pmu.max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= x86_pmu.event_map(perf_event_id(attr));
+		hwc->config |= x86_pmu.event_map(attr->config);
 	}
 
 	counter->destroy = hw_perf_counter_destroy;
-- 
cgit v1.2.3


From 8326f44da090d6d304d29b9fdc7fb3e20889e329 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 5 Jun 2009 20:22:46 +0200
Subject: perf_counter: Implement generalized cache event types

Extend generic event enumeration with the PERF_TYPE_HW_CACHE
method.

This is a 3-dimensional space:

       { L1-D, L1-I, L2, ITLB, DTLB, BPU } x
       { load, store, prefetch } x
       { accesses, misses }

User-space passes in the 3 coordinates and the kernel provides
a counter. (if the hardware supports that type and if the
combination makes sense.)

Combinations that make no sense produce a -EINVAL.
Combinations that are not supported by the hardware produce -ENOTSUP.

Extend the tools to deal with this, and rewrite the event symbol
parsing code with various popular aliases for the units and
access methods above. So 'l1-cache-miss' and 'l1d-read-ops' are
both valid aliases.

( x86 is supported for now, with the Nehalem event table filled in,
  and with Core2 and Atom having placeholder tables. )

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 201 +++++++++++++++++++++++++++++++++++--
 1 file changed, 193 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 430e048f285..e86679fa521 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -83,6 +83,128 @@ static u64 intel_pmu_event_map(int event)
 	return intel_perfmon_event_map[event];
 }
 
+/*
+ * Generalized hw caching related event table, filled
+ * in on a per model basis. A value of 0 means
+ * 'not supported', -1 means 'event makes no sense on
+ * this CPU', any other value means the raw event
+ * ID.
+ */
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+static u64 __read_mostly hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+static const u64 nehalem_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI            */
+		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE         */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI            */
+		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE         */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
+		[ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0480, /* L1I.READS                    */
+		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(L2  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */
+		[ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS                */
+		[ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES          */
+		[ C(RESULT_MISS)   ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS       */
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */
+		[ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */
+		[ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
+		[ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISS_RETIRED            */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+		[ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
+static const u64 core2_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+	/* To be filled in */
+};
+
+static const u64 atom_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+	/* To be filled in */
+};
+
 static u64 intel_pmu_raw_event(u64 event)
 {
 #define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
@@ -246,6 +368,39 @@ static inline int x86_pmu_initialized(void)
 	return x86_pmu.handle_irq != NULL;
 }
 
+static inline int
+set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+{
+	unsigned int cache_type, cache_op, cache_result;
+	u64 config, val;
+
+	config = attr->config;
+
+	cache_type = (config >>  0) & 0xff;
+	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
+		return -EINVAL;
+
+	cache_op = (config >>  8) & 0xff;
+	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
+		return -EINVAL;
+
+	cache_result = (config >> 16) & 0xff;
+	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+		return -EINVAL;
+
+	val = hw_cache_event_ids[cache_type][cache_op][cache_result];
+
+	if (val == 0)
+		return -ENOENT;
+
+	if (val == -1)
+		return -EINVAL;
+
+	hwc->config |= val;
+
+	return 0;
+}
+
 /*
  * Setup the hardware configuration for a given attr_type
  */
@@ -288,22 +443,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->sample_period = x86_pmu.max_period;
 
 	atomic64_set(&hwc->period_left, hwc->sample_period);
+	counter->destroy = hw_perf_counter_destroy;
 
 	/*
 	 * Raw event type provide the config in the event structure
 	 */
 	if (attr->type == PERF_TYPE_RAW) {
 		hwc->config |= x86_pmu.raw_event(attr->config);
-	} else {
-		if (attr->config >= x86_pmu.max_events)
-			return -EINVAL;
-		/*
-		 * The generic map:
-		 */
-		hwc->config |= x86_pmu.event_map(attr->config);
+		return 0;
 	}
 
-	counter->destroy = hw_perf_counter_destroy;
+	if (attr->type == PERF_TYPE_HW_CACHE)
+		return set_ext_hw_attr(hwc, attr);
+
+	if (attr->config >= x86_pmu.max_events)
+		return -EINVAL;
+	/*
+	 * The generic map:
+	 */
+	hwc->config |= x86_pmu.event_map(attr->config);
 
 	return 0;
 }
@@ -989,6 +1147,33 @@ static int intel_pmu_init(void)
 
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
 
+	/*
+	 * Nehalem:
+	 */
+	switch (boot_cpu_data.x86_model) {
+	case 17:
+		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
+		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
+			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+
+		pr_info("... installed Core2 event tables\n");
+		break;
+	default:
+	case 26:
+		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
+		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
+			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+
+		pr_info("... installed Nehalem/Corei7 event tables\n");
+		break;
+	case 28:
+		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
+		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
+			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+
+		pr_info("... installed Atom event tables\n");
+		break;
+	}
 	return 0;
 }
 
-- 
cgit v1.2.3


From 0312af84164215a452f2a94957ebd9bce86e0204 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 8 Jun 2009 07:42:04 +0200
Subject: perf_counter, x86: Implement generalized cache event types, add Core2
 support

Fill in core2_hw_cache_event_id[] with the Core2 model specific events.

The events can be used in all the tools via the -e (--event) parameter,
for example "-e l1-misses" or -"-e l2-accesses" or "-e l2-write-misses".

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 85 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 84 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index e86679fa521..b1f71ff5025 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -194,7 +194,90 @@ static const u64 core2_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
 {
-	/* To be filled in */
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */
+		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */
+		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
+		[ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L2  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+		[ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+		[ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
 };
 
 static const u64 atom_hw_cache_event_ids
-- 
cgit v1.2.3


From ad689220614b6c7c0b13b70d742f358e9310e71e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 8 Jun 2009 09:30:41 +0200
Subject: perf_counter, x86: Implement generalized cache event types, add Atom
 support

Fill in core2_hw_cache_event_id[] with the Atom model specific events.

The events can be used in all the tools via the -e (--event) parameter,
for example "-e l1-misses" or -"-e l2-accesses" or "-e l2-write-misses".

( Note: these are straight from the Intel manuals - not tested yet.)

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 85 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 84 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b1f71ff5025..71590e09d16 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -285,7 +285,90 @@ static const u64 atom_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
 {
-	/* To be filled in */
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD               */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x2241, /* L1D_CACHE.ST               */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
+		[ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L2  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+		[ C(RESULT_MISS)   ] = 0x0282, /* ITLB.MISSES                */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+		[ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
 };
 
 static u64 intel_pmu_raw_event(u64 event)
-- 
cgit v1.2.3


From 1123e3ad73697d64ad99f0104bbe49f8b52d7d65 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 29 May 2009 11:25:09 +0200
Subject: perf_counter: Clean up x86 boot messages

Standardize and tidy up all the messages we print during
perfcounter initialization.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 46 ++++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 71590e09d16..0339d195a3f 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1298,23 +1298,22 @@ static int intel_pmu_init(void)
 	if (version < 2)
 		return -ENODEV;
 
-	x86_pmu = intel_pmu;
-	x86_pmu.version = version;
-	x86_pmu.num_counters = eax.split.num_counters;
+	x86_pmu				= intel_pmu;
+	x86_pmu.version			= version;
+	x86_pmu.num_counters		= eax.split.num_counters;
+	x86_pmu.counter_bits		= eax.split.bit_width;
+	x86_pmu.counter_mask		= (1ULL << eax.split.bit_width) - 1;
 
 	/*
 	 * Quirk: v2 perfmon does not report fixed-purpose counters, so
 	 * assume at least 3 counters:
 	 */
-	x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
-
-	x86_pmu.counter_bits = eax.split.bit_width;
-	x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
+	x86_pmu.num_counters_fixed	= max((int)edx.split.num_counters_fixed, 3);
 
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
 
 	/*
-	 * Nehalem:
+	 * Install the hw-cache-events table:
 	 */
 	switch (boot_cpu_data.x86_model) {
 	case 17:
@@ -1322,7 +1321,7 @@ static int intel_pmu_init(void)
 		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
 			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
 
-		pr_info("... installed Core2 event tables\n");
+		pr_cont("Core2 events, ");
 		break;
 	default:
 	case 26:
@@ -1330,14 +1329,14 @@ static int intel_pmu_init(void)
 		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
 			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
 
-		pr_info("... installed Nehalem/Corei7 event tables\n");
+		pr_cont("Nehalem/Corei7 events, ");
 		break;
 	case 28:
 		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
 		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
 			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
 
-		pr_info("... installed Atom event tables\n");
+		pr_cont("Atom events, ");
 		break;
 	}
 	return 0;
@@ -1353,6 +1352,8 @@ void __init init_hw_perf_counters(void)
 {
 	int err;
 
+	pr_info("Performance Counters: ");
+
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_INTEL:
 		err = intel_pmu_init();
@@ -1363,14 +1364,13 @@ void __init init_hw_perf_counters(void)
 	default:
 		return;
 	}
-	if (err != 0)
+	if (err != 0) {
+		pr_cont("no PMU driver, software counters only.\n");
 		return;
+	}
 
-	pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name);
-	pr_info("... version:         %d\n", x86_pmu.version);
-	pr_info("... bit width:       %d\n", x86_pmu.counter_bits);
+	pr_cont("%s PMU driver.\n", x86_pmu.name);
 
-	pr_info("... num counters:    %d\n", x86_pmu.num_counters);
 	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
 		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
 		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
@@ -1379,23 +1379,25 @@ void __init init_hw_perf_counters(void)
 	perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
 	perf_max_counters = x86_pmu.num_counters;
 
-	pr_info("... value mask:      %016Lx\n", x86_pmu.counter_mask);
-	pr_info("... max period:      %016Lx\n", x86_pmu.max_period);
-
 	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
 		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
 		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
 		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
 	}
-	pr_info("... fixed counters:  %d\n", x86_pmu.num_counters_fixed);
 
 	perf_counter_mask |=
 		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
-	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
-
 	perf_counters_lapic_init();
 	register_die_notifier(&perf_counter_nmi_notifier);
+
+	pr_info("... version:                 %d\n",     x86_pmu.version);
+	pr_info("... bit width:               %d\n",     x86_pmu.counter_bits);
+	pr_info("... generic counters:        %d\n",     x86_pmu.num_counters);
+	pr_info("... value mask:              %016Lx\n", x86_pmu.counter_mask);
+	pr_info("... max period:              %016Lx\n", x86_pmu.max_period);
+	pr_info("... fixed-purpose counters:  %d\n",     x86_pmu.num_counters_fixed);
+	pr_info("... counter mask:            %016Lx\n", perf_counter_mask);
 }
 
 static inline void x86_pmu_read(struct perf_counter *counter)
-- 
cgit v1.2.3


From f86748e91a14bd6cc49477560f33ed5d59896e89 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 8 Jun 2009 22:33:10 +0200
Subject: perf_counter, x86: Implement generalized cache event types, add AMD
 support

Fill in amd_hw_cache_event_id[] with the AMD CPU specific events,
for family 0x0f, 0x10 and 0x11.

There's apparently no distinction between load and store events, so
we only fill in the load events.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 102 +++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 0339d195a3f..93af821ebe5 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -389,6 +389,97 @@ static u64 intel_pmu_raw_event(u64 event)
 	return event & CORE_EVNTSEL_MASK;
 }
 
+static const u64 amd_0f_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */
+		[ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L2  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes        */
+		[ C(RESULT_MISS)   ] = 0x0085, /* Instr. fetch ITLB misses   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */
+		[ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
 /*
  * AMD Performance Monitor K7 and later.
  */
@@ -1345,6 +1436,17 @@ static int intel_pmu_init(void)
 static int amd_pmu_init(void)
 {
 	x86_pmu = amd_pmu;
+
+	switch (boot_cpu_data.x86) {
+	case 0x0f:
+	case 0x10:
+	case 0x11:
+		memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids,
+		       sizeof(hw_cache_event_ids));
+
+		pr_cont("AMD Family 0f/10/11 events, ");
+		break;
+	}
 	return 0;
 }
 
-- 
cgit v1.2.3


From 820a644211bc1ac7715333abdb0f0b9ea4fbb549 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 8 Jun 2009 19:10:25 +0200
Subject: perf_counter, x86: Clean up hw_cache_event ids copies

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 93af821ebe5..56001feeffc 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1409,23 +1409,20 @@ static int intel_pmu_init(void)
 	switch (boot_cpu_data.x86_model) {
 	case 17:
 		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
-		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
-			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+		       sizeof(hw_cache_event_ids));
 
 		pr_cont("Core2 events, ");
 		break;
 	default:
 	case 26:
 		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
-		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
-			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+		       sizeof(hw_cache_event_ids));
 
 		pr_cont("Nehalem/Corei7 events, ");
 		break;
 	case 28:
 		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
-		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
-			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+		       sizeof(hw_cache_event_ids));
 
 		pr_cont("Atom events, ");
 		break;
-- 
cgit v1.2.3


From fecc8ac8496fce96069724f54daba8e7078b0082 Mon Sep 17 00:00:00 2001
From: Yong Wang <yong.y.wang@linux.intel.com>
Date: Tue, 9 Jun 2009 21:15:53 +0800
Subject: perf_counter, x86: Correct some event and umask values for Intel
 processors

Correct some event and UMASK values according to Intel SDM,
in the Nehalem and Atom tables.

Signed-off-by: Yong Wang <yong.y.wang@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <20090609131553.GA12489@ywang-moblin2.bj.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 56001feeffc..40978aac6e0 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -119,7 +119,7 @@ static const u64 nehalem_hw_cache_event_ids
  },
  [ C(L1I ) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0480, /* L1I.READS                    */
+		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
 		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
 	},
 	[ C(OP_WRITE) ] = {
@@ -162,7 +162,7 @@ static const u64 nehalem_hw_cache_event_ids
  [ C(ITLB) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
-		[ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISS_RETIRED            */
+		[ C(RESULT_MISS)   ] = 0x20c8, /* ITLB_MISS_RETIRED            */
 	},
 	[ C(OP_WRITE) ] = {
 		[ C(RESULT_ACCESS) ] = -1,
@@ -291,7 +291,7 @@ static const u64 atom_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = 0,
 	},
 	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x2241, /* L1D_CACHE.ST               */
+		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST               */
 		[ C(RESULT_MISS)   ] = 0,
 	},
 	[ C(OP_PREFETCH) ] = {
@@ -301,8 +301,8 @@ static const u64 atom_hw_cache_event_ids
  },
  [ C(L1I ) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
-		[ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
+		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                  */
+		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                 */
 	},
 	[ C(OP_WRITE) ] = {
 		[ C(RESULT_ACCESS) ] = -1,
@@ -329,11 +329,11 @@ static const u64 atom_hw_cache_event_ids
  },
  [ C(DTLB) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
+		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI  (alias) */
 		[ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */
 	},
 	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
+		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI  (alias) */
 		[ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */
 	},
 	[ C(OP_PREFETCH) ] = {
-- 
cgit v1.2.3


From dc81081b2d9a6a9d64dad1bef1e5fc9fb660e53e Mon Sep 17 00:00:00 2001
From: Yong Wang <yong.y.wang@linux.intel.com>
Date: Wed, 10 Jun 2009 17:06:12 +0800
Subject: perf_counter/x86: Fix the model number of Intel Core2 processors

Fix the model number of Intel Core2 processors according to the
documentation: Intel Processor Identification with the CPUID
Instruction: http://www.intel.com/support/processors/sb/cs-009861.htm

Signed-off-by: Yong Wang <yong.y.wang@intel.com>
Also-Reported-by: Arnd Bergmann <arnd@arndb.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <20090610090612.GA26580@ywang-moblin2.bj.intel.com>
[ Added two more model numbers suggested by Arnd Bergmann ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 40978aac6e0..49f258537cb 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1407,7 +1407,10 @@ static int intel_pmu_init(void)
 	 * Install the hw-cache-events table:
 	 */
 	switch (boot_cpu_data.x86_model) {
-	case 17:
+	case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+	case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
+	case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
+	case 29: /* six-core 45 nm xeon "Dunnington" */
 		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
-- 
cgit v1.2.3


From bd2b5b12849a3446abad0b25e920f86f5480b309 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 10 Jun 2009 13:40:57 +0200
Subject: perf_counter: More aggressive frequency adjustment

Also employ the overflow handler to adjust the frequency, this results
in a stable frequency in about 40~50 samples, instead of that many ticks.

This also means we can start sampling at a sample period of 1 without
running head-first into the throttle.

It relies on sched_clock() to accurately measure the time difference
between the overflow NMIs.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 49f258537cb..240ca563063 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -696,10 +696,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (!attr->exclude_kernel)
 		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
 
-	if (!hwc->sample_period)
+	if (!hwc->sample_period) {
 		hwc->sample_period = x86_pmu.max_period;
+		atomic64_set(&hwc->period_left, hwc->sample_period);
+	}
 
-	atomic64_set(&hwc->period_left, hwc->sample_period);
 	counter->destroy = hw_perf_counter_destroy;
 
 	/*
-- 
cgit v1.2.3


From df1a132bf3d3508f863336c80a27806a2ac947e0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 10 Jun 2009 21:02:22 +0200
Subject: perf_counter: Introduce struct for sample data

For easy extension of the sample data, put it in a structure.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 10 +++++++---
 arch/x86/kernel/cpu/perf_counter.c | 15 +++++++++++----
 2 files changed, 18 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 4786ad9a288..5e0bf399c43 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -1001,7 +1001,11 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 * Finally record data if requested.
 	 */
 	if (record) {
-		addr = 0;
+		struct perf_sample_data data = {
+			.regs = regs,
+			.addr = 0,
+		};
+
 		if (counter->attr.sample_type & PERF_SAMPLE_ADDR) {
 			/*
 			 * The user wants a data address recorded.
@@ -1016,9 +1020,9 @@ static void record_and_restart(struct perf_counter *counter, long val,
 			sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
 				POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
 			if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
-				addr = mfspr(SPRN_SDAR);
+				data.addr = mfspr(SPRN_SDAR);
 		}
-		if (perf_counter_overflow(counter, nmi, regs, addr)) {
+		if (perf_counter_overflow(counter, nmi, &data)) {
 			/*
 			 * Interrupts are coming too fast - throttle them
 			 * by setting the counter to 0, so it will be
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 240ca563063..82a23d487f9 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1173,11 +1173,14 @@ static void intel_pmu_reset(void)
  */
 static int intel_pmu_handle_irq(struct pt_regs *regs)
 {
+	struct perf_sample_data data;
 	struct cpu_hw_counters *cpuc;
-	struct cpu_hw_counters;
 	int bit, cpu, loops;
 	u64 ack, status;
 
+	data.regs = regs;
+	data.addr = 0;
+
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
@@ -1210,7 +1213,7 @@ again:
 		if (!intel_pmu_save_and_restart(counter))
 			continue;
 
-		if (perf_counter_overflow(counter, 1, regs, 0))
+		if (perf_counter_overflow(counter, 1, &data))
 			intel_pmu_disable_counter(&counter->hw, bit);
 	}
 
@@ -1230,12 +1233,16 @@ again:
 
 static int amd_pmu_handle_irq(struct pt_regs *regs)
 {
-	int cpu, idx, handled = 0;
+	struct perf_sample_data data;
 	struct cpu_hw_counters *cpuc;
 	struct perf_counter *counter;
 	struct hw_perf_counter *hwc;
+	int cpu, idx, handled = 0;
 	u64 val;
 
+	data.regs = regs;
+	data.addr = 0;
+
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
@@ -1256,7 +1263,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
 		if (!x86_perf_counter_set_period(counter, hwc, idx))
 			continue;
 
-		if (perf_counter_overflow(counter, 1, regs, 0))
+		if (perf_counter_overflow(counter, 1, &data))
 			amd_pmu_disable_counter(hwc, idx);
 	}
 
-- 
cgit v1.2.3


From 9e350de37ac9607012fcf9c5314a28fbddf8f43c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 10 Jun 2009 21:34:59 +0200
Subject: perf_counter: Accurate period data

We currently log hw.sample_period for PERF_SAMPLE_PERIOD, however this is
incorrect. When we adjust the period, it will only take effect the next
cycle but report it for the current cycle. So when we adjust the period
for every cycle, we're always wrong.

Solve this by keeping track of the last_period.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c |  9 ++++++---
 arch/x86/kernel/cpu/perf_counter.c | 15 ++++++++++++---
 2 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 5e0bf399c43..4990ce2e5f0 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -767,6 +767,7 @@ static void power_pmu_unthrottle(struct perf_counter *counter)
 	perf_disable();
 	power_pmu_read(counter);
 	left = counter->hw.sample_period;
+	counter->hw.last_period = left;
 	val = 0;
 	if (left < 0x80000000L)
 		val = 0x80000000L - left;
@@ -937,7 +938,8 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	counter->hw.config = events[n];
 	counter->hw.counter_base = cflags[n];
-	atomic64_set(&counter->hw.period_left, counter->hw.sample_period);
+	counter->hw.last_period = counter->hw.sample_period;
+	atomic64_set(&counter->hw.period_left, counter->hw.last_period);
 
 	/*
 	 * See if we need to reserve the PMU.
@@ -1002,8 +1004,9 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 */
 	if (record) {
 		struct perf_sample_data data = {
-			.regs = regs,
-			.addr = 0,
+			.regs	= regs,
+			.addr	= 0,
+			.period	= counter->hw.last_period,
 		};
 
 		if (counter->attr.sample_type & PERF_SAMPLE_ADDR) {
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 82a23d487f9..57ae1bec81b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -698,6 +698,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 
 	if (!hwc->sample_period) {
 		hwc->sample_period = x86_pmu.max_period;
+		hwc->last_period = hwc->sample_period;
 		atomic64_set(&hwc->period_left, hwc->sample_period);
 	}
 
@@ -880,12 +881,14 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 	if (unlikely(left <= -period)) {
 		left = period;
 		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
 		ret = 1;
 	}
 
 	if (unlikely(left <= 0)) {
 		left += period;
 		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
 		ret = 1;
 	}
 	/*
@@ -1257,9 +1260,12 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
 		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
 			continue;
 
-		/* counter overflow */
-		handled = 1;
-		inc_irq_stat(apic_perf_irqs);
+		/*
+		 * counter overflow
+		 */
+		handled		= 1;
+		data.period	= counter->hw.last_period;
+
 		if (!x86_perf_counter_set_period(counter, hwc, idx))
 			continue;
 
@@ -1267,6 +1273,9 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
 			amd_pmu_disable_counter(hwc, idx);
 	}
 
+	if (handled)
+		inc_irq_stat(apic_perf_irqs);
+
 	return handled;
 }
 
-- 
cgit v1.2.3


From 4da52960fd1ae3ddd14901bc88b608cbeaa4b9a6 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 11 Jun 2009 14:54:01 +1000
Subject: perf_counters: powerpc: Add support for POWER7 processors

This adds the back-end for the PMU on POWER7 processors.  POWER7
has 4 fully-programmable counters and two fixed-function counters
(which do respect the freeze conditions, can generate interrupts,
and are writable, unlike PMC5/6 on POWER5+/6).

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18992.36329.189378.17992@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/Makefile       |   3 +-
 arch/powerpc/kernel/perf_counter.c |   4 +
 arch/powerpc/kernel/power7-pmu.c   | 316 +++++++++++++++++++++++++++++++++++++
 3 files changed, 322 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/kernel/power7-pmu.c

(limited to 'arch')

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 9ba1bb731fc..a2c683403c2 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -95,7 +95,8 @@ obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
 obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o power4-pmu.o ppc970-pmu.o \
-				   power5-pmu.o power5+-pmu.o power6-pmu.o
+				   power5-pmu.o power5+-pmu.o power6-pmu.o \
+				   power7-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 4990ce2e5f0..5d12e68aac1 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -1181,6 +1181,7 @@ extern struct power_pmu ppc970_pmu;
 extern struct power_pmu power5_pmu;
 extern struct power_pmu power5p_pmu;
 extern struct power_pmu power6_pmu;
+extern struct power_pmu power7_pmu;
 
 static int init_perf_counters(void)
 {
@@ -1207,6 +1208,9 @@ static int init_perf_counters(void)
 	case 0x3e:
 		ppmu = &power6_pmu;
 		break;
+	case 0x3f:
+		ppmu = &power7_pmu;
+		break;
 	}
 
 	/*
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c
new file mode 100644
index 00000000000..dfac48d8ff4
--- /dev/null
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -0,0 +1,316 @@
+/*
+ * Performance counter support for POWER7 processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER7
+ */
+#define PM_PMC_SH	16	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH	12	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_COMBINE_SH	11	/* Combined event bit */
+#define PM_COMBINE_MSK	1
+#define PM_COMBINE_MSKS	0x800
+#define PM_L2SEL_SH	8	/* L2 event select */
+#define PM_L2SEL_MSK	7
+#define PM_PMCSEL_MSK	0xff
+
+/*
+ * Bits in MMCR1 for POWER7
+ */
+#define MMCR1_TTM0SEL_SH	60
+#define MMCR1_TTM1SEL_SH	56
+#define MMCR1_TTM2SEL_SH	52
+#define MMCR1_TTM3SEL_SH	48
+#define MMCR1_TTMSEL_MSK	0xf
+#define MMCR1_L2SEL_SH		45
+#define MMCR1_L2SEL_MSK		7
+#define MMCR1_PMC1_COMBINE_SH	35
+#define MMCR1_PMC2_COMBINE_SH	34
+#define MMCR1_PMC3_COMBINE_SH	33
+#define MMCR1_PMC4_COMBINE_SH	32
+#define MMCR1_PMC1SEL_SH	24
+#define MMCR1_PMC2SEL_SH	16
+#define MMCR1_PMC3SEL_SH	8
+#define MMCR1_PMC4SEL_SH	0
+#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK	0xff
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *                                                 [  ><><><><><><>
+ *                                                  NC P6P5P4P3P2P1
+ *
+ * NC - number of counters
+ *     15: NC error 0x8000
+ *     12-14: number of events needing PMC1-4 0x7000
+ *
+ * P6
+ *     11: P6 error 0x800
+ *     10-11: Count of events needing PMC6
+ *
+ * P1..P5
+ *     0-9: Count of events needing PMC1..PMC5
+ */
+
+static int power7_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+	int pmc, sh;
+	u64 mask = 0, value = 0;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 6)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+		if (pmc >= 5 && !(event == 0x500fa || event == 0x600f4))
+			return -1;
+	}
+	if (pmc < 5) {
+		/* need a counter from PMC1-4 set */
+		mask  |= 0x8000;
+		value |= 0x1000;
+	}
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+#define MAX_ALT	2	/* at most 2 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+	{ 0x200f2, 0x300f2 },		/* PM_INST_DISP */
+	{ 0x200f4, 0x600f4 },		/* PM_RUN_CYC */
+	{ 0x400fa, 0x500fa },		/* PM_RUN_INST_CMPL */
+};
+
+/*
+ * Scan the alternatives table for a match and return the
+ * index into the alternatives table if found, else -1.
+ */
+static int find_alternative(u64 event)
+{
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+		if (event < event_alternatives[i][0])
+			break;
+		for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
+			if (event == event_alternatives[i][j])
+				return i;
+	}
+	return -1;
+}
+
+static s64 find_alternative_decode(u64 event)
+{
+	int pmc, psel;
+
+	/* this only handles the 4x decode events */
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if ((pmc == 2 || pmc == 4) && (psel & ~7) == 0x40)
+		return event - (1 << PM_PMC_SH) + 8;
+	if ((pmc == 1 || pmc == 3) && (psel & ~7) == 0x48)
+		return event + (1 << PM_PMC_SH) - 8;
+	return -1;
+}
+
+static int power7_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+	int i, j, nalt = 1;
+	s64 ae;
+
+	alt[0] = event;
+	nalt = 1;
+	i = find_alternative(event);
+	if (i >= 0) {
+		for (j = 0; j < MAX_ALT; ++j) {
+			ae = event_alternatives[i][j];
+			if (ae && ae != event)
+				alt[nalt++] = ae;
+		}
+	} else {
+		ae = find_alternative_decode(event);
+		if (ae > 0)
+			alt[nalt++] = ae;
+	}
+
+	if (flags & PPMU_ONLY_COUNT_RUN) {
+		/*
+		 * We're only counting in RUN state,
+		 * so PM_CYC is equivalent to PM_RUN_CYC
+		 * and PM_INST_CMPL === PM_RUN_INST_CMPL.
+		 * This doesn't include alternatives that don't provide
+		 * any extra flexibility in assigning PMCs.
+		 */
+		j = nalt;
+		for (i = 0; i < nalt; ++i) {
+			switch (alt[i]) {
+			case 0x1e:	/* PM_CYC */
+				alt[j++] = 0x600f4;	/* PM_RUN_CYC */
+				break;
+			case 0x600f4:	/* PM_RUN_CYC */
+				alt[j++] = 0x1e;
+				break;
+			case 0x2:	/* PM_PPC_CMPL */
+				alt[j++] = 0x500fa;	/* PM_RUN_INST_CMPL */
+				break;
+			case 0x500fa:	/* PM_RUN_INST_CMPL */
+				alt[j++] = 0x2;	/* PM_PPC_CMPL */
+				break;
+			}
+		}
+		nalt = j;
+	}
+
+	return nalt;
+}
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power7_marked_instr_event(u64 event)
+{
+	int pmc, psel;
+	int unit;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	psel = event & PM_PMCSEL_MSK & ~1;	/* trim off edge/level bit */
+	if (pmc >= 5)
+		return 0;
+
+	switch (psel >> 4) {
+	case 2:
+		return pmc == 2 || pmc == 4;
+	case 3:
+		if (psel == 0x3c)
+			return pmc == 1;
+		if (psel == 0x3e)
+			return pmc != 2;
+		return 1;
+	case 4:
+	case 5:
+		return unit == 0xd;
+	case 6:
+		if (psel == 0x64)
+			return pmc >= 3;
+	case 8:
+		return unit == 0xd;
+	}
+	return 0;
+}
+
+static int power7_compute_mmcr(u64 event[], int n_ev,
+			       unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr1 = 0;
+	u64 mmcra = 0;
+	unsigned int pmc, unit, combine, l2sel, psel;
+	unsigned int pmc_inuse = 0;
+	int i;
+
+	/* First pass to count resource use */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc > 6)
+				return -1;
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+		}
+	}
+
+	/* Second pass: assign PMCs, set all MMCR1 fields */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		combine = (event[i] >> PM_COMBINE_SH) & PM_COMBINE_MSK;
+		l2sel = (event[i] >> PM_L2SEL_SH) & PM_L2SEL_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		if (!pmc) {
+			/* Bus event or any-PMC direct event */
+			for (pmc = 0; pmc < 4; ++pmc) {
+				if (!(pmc_inuse & (1 << pmc)))
+					break;
+			}
+			if (pmc >= 4)
+				return -1;
+			pmc_inuse |= 1 << pmc;
+		} else {
+			/* Direct or decoded event */
+			--pmc;
+		}
+		if (pmc <= 3) {
+			mmcr1 |= (u64) unit << (MMCR1_TTM0SEL_SH - 4 * pmc);
+			mmcr1 |= (u64) combine << (MMCR1_PMC1_COMBINE_SH - pmc);
+			mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
+			if (unit == 6)	/* L2 events */
+				mmcr1 |= (u64) l2sel << MMCR1_L2SEL_SH;
+		}
+		if (power7_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
+		hwc[i] = pmc;
+	}
+
+	/* Return MMCRx values */
+	mmcr[0] = 0;
+	if (pmc_inuse & 1)
+		mmcr[0] = MMCR0_PMC1CE;
+	if (pmc_inuse & 0x3e)
+		mmcr[0] |= MMCR0_PMCjCE;
+	mmcr[1] = mmcr1;
+	mmcr[2] = mmcra;
+	return 0;
+}
+
+static void power7_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	if (pmc <= 3)
+		mmcr[1] &= ~(0xffULL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power7_generic_events[] = {
+	[PERF_COUNT_CPU_CYCLES] = 0x1e,
+	[PERF_COUNT_INSTRUCTIONS] = 2,
+	[PERF_COUNT_CACHE_REFERENCES] = 0xc880,		/* LD_REF_L1_LSU */
+	[PERF_COUNT_CACHE_MISSES] = 0x400f0,		/* LD_MISS_L1 */
+	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x10068,	/* BRU_FIN */
+	[PERF_COUNT_BRANCH_MISSES] = 0x400f6,		/* BR_MPRED */
+};
+
+struct power_pmu power7_pmu = {
+	.n_counter = 6,
+	.max_alternatives = MAX_ALT + 1,
+	.add_fields = 0x1555ull,
+	.test_adder = 0x3000ull,
+	.compute_mmcr = power7_compute_mmcr,
+	.get_constraint = power7_get_constraint,
+	.get_alternatives = power7_get_alternatives,
+	.disable_pmc = power7_disable_pmc,
+	.n_generic = ARRAY_SIZE(power7_generic_events),
+	.generic_events = power7_generic_events,
+};
-- 
cgit v1.2.3


From 106b506c3a8b74daa5751e83ed3e46438fcf9a52 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 11 Jun 2009 14:55:42 +1000
Subject: perf_counter: powerpc: Implement generalized cache events for POWER
 processors

This adds tables of event codes for the generalized cache events for
all the currently supported powerpc processors: POWER{4,5,5+,6,7} and
PPC970*, plus powerpc-specific code to use these tables when a
generalized cache event is requested.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18992.36430.933526.742969@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/include/asm/perf_counter.h |  3 +++
 arch/powerpc/kernel/perf_counter.c      | 42 ++++++++++++++++++++++++++++--
 arch/powerpc/kernel/power4-pmu.c        | 41 +++++++++++++++++++++++++++++
 arch/powerpc/kernel/power5+-pmu.c       | 45 ++++++++++++++++++++++++++++++--
 arch/powerpc/kernel/power5-pmu.c        | 41 +++++++++++++++++++++++++++++
 arch/powerpc/kernel/power6-pmu.c        | 46 +++++++++++++++++++++++++++++++--
 arch/powerpc/kernel/power7-pmu.c        | 41 +++++++++++++++++++++++++++++
 arch/powerpc/kernel/ppc970-pmu.c        | 41 +++++++++++++++++++++++++++++
 8 files changed, 294 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index 1c60f0ca792..cc7c887705b 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -33,6 +33,9 @@ struct power_pmu {
 	u32	flags;
 	int	n_generic;
 	int	*generic_events;
+	int	(*cache_events)[PERF_COUNT_HW_CACHE_MAX]
+			       [PERF_COUNT_HW_CACHE_OP_MAX]
+			       [PERF_COUNT_HW_CACHE_RESULT_MAX];
 };
 
 extern struct power_pmu *ppmu;
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 5d12e68aac1..bb202388170 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -856,6 +856,36 @@ static void hw_perf_counter_destroy(struct perf_counter *counter)
 	}
 }
 
+/*
+ * Translate a generic cache event config to a raw event code.
+ */
+static int hw_perf_cache_event(u64 config, u64 *eventp)
+{
+	unsigned long type, op, result;
+	int ev;
+
+	if (!ppmu->cache_events)
+		return -EINVAL;
+
+	/* unpack config */
+	type = config & 0xff;
+	op = (config >> 8) & 0xff;
+	result = (config >> 16) & 0xff;
+
+	if (type >= PERF_COUNT_HW_CACHE_MAX ||
+	    op >= PERF_COUNT_HW_CACHE_OP_MAX ||
+	    result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+		return -EINVAL;
+
+	ev = (*ppmu->cache_events)[type][op][result];
+	if (ev == 0)
+		return -EOPNOTSUPP;
+	if (ev == -1)
+		return -EINVAL;
+	*eventp = ev;
+	return 0;
+}
+
 const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 {
 	u64 ev;
@@ -868,13 +898,21 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	if (!ppmu)
 		return ERR_PTR(-ENXIO);
-	if (counter->attr.type != PERF_TYPE_RAW) {
+	switch (counter->attr.type) {
+	case PERF_TYPE_HARDWARE:
 		ev = counter->attr.config;
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
 			return ERR_PTR(-EOPNOTSUPP);
 		ev = ppmu->generic_events[ev];
-	} else {
+		break;
+	case PERF_TYPE_HW_CACHE:
+		err = hw_perf_cache_event(counter->attr.config, &ev);
+		if (err)
+			return ERR_PTR(err);
+		break;
+	case PERF_TYPE_RAW:
 		ev = counter->attr.config;
+		break;
 	}
 	counter->hw.config_base = ev;
 	counter->hw.idx = 0;
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 836fa118eb1..0e94b685722 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -543,6 +543,46 @@ static int p4_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x331,		/* PM_BR_MPRED_CR */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x8c10,		0x3c10	},
+		[C(OP_WRITE)] = {	0x7c10,		0xc13	},
+		[C(OP_PREFETCH)] = {	0xc35,		0	},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	0,		0	},
+	},
+	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0	},
+		[C(OP_WRITE)] = {	0,		0	},
+		[C(OP_PREFETCH)] = {	0xc34,		0	},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x904	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x900	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x330,		0x331	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+};
+
 struct power_pmu power4_pmu = {
 	.n_counter = 8,
 	.max_alternatives = 5,
@@ -554,4 +594,5 @@ struct power_pmu power4_pmu = {
 	.disable_pmc = p4_disable_pmc,
 	.n_generic = ARRAY_SIZE(p4_generic_events),
 	.generic_events = p4_generic_events,
+	.cache_events = &power4_cache_events,
 };
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index 8471e3c2e46..bbf2cbb0738 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -614,6 +614,46 @@ static int power5p_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x1c10a8,	0x3c1088	},
+		[C(OP_WRITE)] = {	0x2c10a8,	0xc10c3		},
+		[C(OP_PREFETCH)] = {	0xc70e7,	-1		},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	0,		0		},
+	},
+	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0		},
+		[C(OP_WRITE)] = {	0,		0		},
+		[C(OP_PREFETCH)] = {	0xc50c3,	0		},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0xc20e4,	0x800c4		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x800c0		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x230e4,	0x230e5		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+};
+
 struct power_pmu power5p_pmu = {
 	.n_counter = 6,
 	.max_alternatives = MAX_ALT,
@@ -623,8 +663,9 @@ struct power_pmu power5p_pmu = {
 	.get_constraint = power5p_get_constraint,
 	.get_alternatives = power5p_get_alternatives,
 	.disable_pmc = power5p_disable_pmc,
+	.limited_pmc_event = power5p_limited_pmc_event,
+	.flags = PPMU_LIMITED_PMC5_6,
 	.n_generic = ARRAY_SIZE(power5p_generic_events),
 	.generic_events = power5p_generic_events,
-	.flags = PPMU_LIMITED_PMC5_6,
-	.limited_pmc_event = power5p_limited_pmc_event,
+	.cache_events = &power5p_cache_events,
 };
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 1b44c5fca18..670cf10b91e 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -556,6 +556,46 @@ static int power5_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x4c1090,	0x3c1088	},
+		[C(OP_WRITE)] = {	0x3c1090,	0xc10c3		},
+		[C(OP_PREFETCH)] = {	0xc70e7,	0		},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	0,		0		},
+	},
+	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x3c309b	},
+		[C(OP_WRITE)] = {	0,		0		},
+		[C(OP_PREFETCH)] = {	0xc50c3,	0		},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x2c4090,	0x800c4		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x800c0		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x230e4,	0x230e5		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+};
+
 struct power_pmu power5_pmu = {
 	.n_counter = 6,
 	.max_alternatives = MAX_ALT,
@@ -567,4 +607,5 @@ struct power_pmu power5_pmu = {
 	.disable_pmc = power5_disable_pmc,
 	.n_generic = ARRAY_SIZE(power5_generic_events),
 	.generic_events = power5_generic_events,
+	.cache_events = &power5_cache_events,
 };
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index cd4fbe06c35..4da70786609 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -474,6 +474,47 @@ static int power6_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x400052,		/* BR_MPRED */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ * The "DTLB" and "ITLB" events relate to the DERAT and IERAT.
+ */
+static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x80082,	0x80080		},
+		[C(OP_WRITE)] = {	0x80086,	0x80088		},
+		[C(OP_PREFETCH)] = {	0x810a4,	0		},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x100056 	},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	0x4008c,	0		},
+	},
+	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x150730,	0x250532	},
+		[C(OP_WRITE)] = {	0x250432,	0x150432	},
+		[C(OP_PREFETCH)] = {	0x810a6,	0		},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x20000e	},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x420ce		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x430e6,	0x400052	},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+};
+
 struct power_pmu power6_pmu = {
 	.n_counter = 6,
 	.max_alternatives = MAX_ALT,
@@ -483,8 +524,9 @@ struct power_pmu power6_pmu = {
 	.get_constraint = p6_get_constraint,
 	.get_alternatives = p6_get_alternatives,
 	.disable_pmc = p6_disable_pmc,
+	.limited_pmc_event = p6_limited_pmc_event,
+	.flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR,
 	.n_generic = ARRAY_SIZE(power6_generic_events),
 	.generic_events = power6_generic_events,
-	.flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR,
-	.limited_pmc_event = p6_limited_pmc_event,
+	.cache_events = &power6_cache_events,
 };
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c
index dfac48d8ff4..060e0deb399 100644
--- a/arch/powerpc/kernel/power7-pmu.c
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -302,6 +302,46 @@ static int power7_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x400f6,		/* BR_MPRED */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x400f0,	0xc880	},
+		[C(OP_WRITE)] = {	0,		0x300f0	},
+		[C(OP_PREFETCH)] = {	0xd8b8,		0	},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x200fc	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	0x408a,		0	},
+	},
+	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x6080,		0x6084	},
+		[C(OP_WRITE)] = {	0x6082,		0x6086	},
+		[C(OP_PREFETCH)] = {	0,		0	},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x300fc	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x400fc	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x10068,	0x400f6	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+};
+
 struct power_pmu power7_pmu = {
 	.n_counter = 6,
 	.max_alternatives = MAX_ALT + 1,
@@ -313,4 +353,5 @@ struct power_pmu power7_pmu = {
 	.disable_pmc = power7_disable_pmc,
 	.n_generic = ARRAY_SIZE(power7_generic_events),
 	.generic_events = power7_generic_events,
+	.cache_events = &power7_cache_events,
 };
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index eed47c4523f..336adf1736a 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -427,6 +427,46 @@ static int ppc970_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x327,		/* PM_GRP_BR_MPRED */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x8810,		0x3810	},
+		[C(OP_WRITE)] = {	0x7810,		0x813	},
+		[C(OP_PREFETCH)] = {	0x731,		0	},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	0,		0	},
+	},
+	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0	},
+		[C(OP_WRITE)] = {	0,		0	},
+		[C(OP_PREFETCH)] = {	0x733,		0	},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x704	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x700	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x431,		0x327	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+};
+
 struct power_pmu ppc970_pmu = {
 	.n_counter = 8,
 	.max_alternatives = 2,
@@ -438,4 +478,5 @@ struct power_pmu ppc970_pmu = {
 	.disable_pmc = p970_disable_pmc,
 	.n_generic = ARRAY_SIZE(ppc970_generic_events),
 	.generic_events = ppc970_generic_events,
+	.cache_events = &ppc970_cache_events,
 };
-- 
cgit v1.2.3


From f4dbfa8f3131a84257223393905f7efad0ca5996 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 11 Jun 2009 14:06:28 +0200
Subject: perf_counter: Standardize event names

Pure renames only, to PERF_COUNT_HW_* and PERF_COUNT_SW_*.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/power4-pmu.c   | 12 ++++++------
 arch/powerpc/kernel/power5+-pmu.c  | 12 ++++++------
 arch/powerpc/kernel/power5-pmu.c   | 12 ++++++------
 arch/powerpc/kernel/power6-pmu.c   | 12 ++++++------
 arch/powerpc/kernel/ppc970-pmu.c   | 12 ++++++------
 arch/powerpc/mm/fault.c            |  6 +++---
 arch/x86/kernel/cpu/perf_counter.c | 32 ++++++++++++++++----------------
 arch/x86/mm/fault.c                |  6 +++---
 8 files changed, 52 insertions(+), 52 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 0e94b685722..73956f084b2 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -535,12 +535,12 @@ static void p4_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int p4_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 7,
-	[PERF_COUNT_INSTRUCTIONS] = 0x1001,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x8c10,		/* PM_LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x3c10,		/* PM_LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x330,	/* PM_BR_ISSUED */
-	[PERF_COUNT_BRANCH_MISSES] = 0x331,		/* PM_BR_MPRED_CR */
+	[PERF_COUNT_HW_CPU_CYCLES]		= 7,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x1001,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x8c10, /* PM_LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3c10, /* PM_LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x330,  /* PM_BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x331,  /* PM_BR_MPRED_CR */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index bbf2cbb0738..5f8b7741e97 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -606,12 +606,12 @@ static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int power5p_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 0xf,
-	[PERF_COUNT_INSTRUCTIONS] = 0x100009,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x1c10a8,	/* LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x3c1088,		/* LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4,	/* BR_ISSUED */ 
-	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
+	[PERF_COUNT_HW_CPU_CYCLES]		= 0xf,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x100009,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x1c10a8, /* LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3c1088, /* LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x230e4,  /* BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x230e5,  /* BR_MPRED_CR */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 670cf10b91e..d54723ab627 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -548,12 +548,12 @@ static void power5_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int power5_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 0xf,
-	[PERF_COUNT_INSTRUCTIONS] = 0x100009,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x4c1090,	/* LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x3c1088,		/* LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4,	/* BR_ISSUED */ 
-	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
+	[PERF_COUNT_HW_CPU_CYCLES]		= 0xf,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x100009,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4c1090, /* LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3c1088, /* LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x230e4,  /* BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x230e5,  /* BR_MPRED_CR */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index 4da70786609..0cd406ee765 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -466,12 +466,12 @@ static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int power6_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 0x1e,
-	[PERF_COUNT_INSTRUCTIONS] = 2,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x280030,	/* LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x30000c,		/* LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0,	/* BR_PRED */ 
-	[PERF_COUNT_BRANCH_MISSES] = 0x400052,		/* BR_MPRED */
+	[PERF_COUNT_HW_CPU_CYCLES]		= 0x1e,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 2,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x280030, /* LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x30000c, /* LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x410a0,  /* BR_PRED */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x400052, /* BR_MPRED */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index 336adf1736a..46a20640942 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -419,12 +419,12 @@ static void p970_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int ppc970_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 7,
-	[PERF_COUNT_INSTRUCTIONS] = 1,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x8810,		/* PM_LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x3810,		/* PM_LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431,	/* PM_BR_ISSUED */
-	[PERF_COUNT_BRANCH_MISSES] = 0x327,		/* PM_GRP_BR_MPRED */
+	[PERF_COUNT_HW_CPU_CYCLES]		= 7,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 1,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x8810, /* PM_LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3810, /* PM_LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x431,  /* PM_BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES] 		= 0x327,  /* PM_GRP_BR_MPRED */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index ac0e112031b..5beffc8f481 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		die("Weird page fault", regs, SIGSEGV);
 	}
 
-	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
+	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
@@ -312,7 +312,7 @@ good_area:
 	}
 	if (ret & VM_FAULT_MAJOR) {
 		current->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
 				     regs, address);
 #ifdef CONFIG_PPC_SMLPAR
 		if (firmware_has_feature(FW_FEATURE_CMO)) {
@@ -323,7 +323,7 @@ good_area:
 #endif
 	} else {
 		current->min_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
 				     regs, address);
 	}
 	up_read(&mm->mmap_sem);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 57ae1bec81b..572fb434a66 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -69,13 +69,13 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
  */
 static const u64 intel_perfmon_event_map[] =
 {
-  [PERF_COUNT_CPU_CYCLES]		= 0x003c,
-  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
-  [PERF_COUNT_CACHE_REFERENCES]		= 0x4f2e,
-  [PERF_COUNT_CACHE_MISSES]		= 0x412e,
-  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
-  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
-  [PERF_COUNT_BUS_CYCLES]		= 0x013c,
+  [PERF_COUNT_HW_CPU_CYCLES]		= 0x003c,
+  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4f2e,
+  [PERF_COUNT_HW_CACHE_MISSES]		= 0x412e,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
+  [PERF_COUNT_HW_BUS_CYCLES]		= 0x013c,
 };
 
 static u64 intel_pmu_event_map(int event)
@@ -485,12 +485,12 @@ static const u64 amd_0f_hw_cache_event_ids
  */
 static const u64 amd_perfmon_event_map[] =
 {
-  [PERF_COUNT_CPU_CYCLES]		= 0x0076,
-  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
-  [PERF_COUNT_CACHE_REFERENCES]		= 0x0080,
-  [PERF_COUNT_CACHE_MISSES]		= 0x0081,
-  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
-  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
+  [PERF_COUNT_HW_CPU_CYCLES]		= 0x0076,
+  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0080,
+  [PERF_COUNT_HW_CACHE_MISSES]		= 0x0081,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
 };
 
 static u64 amd_pmu_event_map(int event)
@@ -970,11 +970,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 
 	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
 
-	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
 		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
-	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
 		return X86_PMC_IDX_FIXED_CPU_CYCLES;
-	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
 		return X86_PMC_IDX_FIXED_BUS_CYCLES;
 
 	return -1;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 6f9df2babe4..5c6d816f30b 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1045,7 +1045,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(regs, error_code, address);
 
-	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
+	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
@@ -1142,11 +1142,11 @@ good_area:
 
 	if (fault & VM_FAULT_MAJOR) {
 		tsk->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
 				     regs, address);
 	} else {
 		tsk->min_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
 				     regs, address);
 	}
 
-- 
cgit v1.2.3


From 8be6e8f3c3a13900169f1141870562d0c723b010 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 11 Jun 2009 14:19:11 +0200
Subject: perf_counter: Rename L2 to LL cache

The top (fastest) and last level (biggest) caches are the most
interesting ones, performance wise.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
[ Fixed the Nehalem LL table to LLC Reference/Miss events ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/power4-pmu.c   |  2 +-
 arch/powerpc/kernel/power5+-pmu.c  |  2 +-
 arch/powerpc/kernel/power5-pmu.c   |  2 +-
 arch/powerpc/kernel/power6-pmu.c   |  2 +-
 arch/powerpc/kernel/power7-pmu.c   |  2 +-
 arch/powerpc/kernel/ppc970-pmu.c   |  2 +-
 arch/x86/kernel/cpu/perf_counter.c | 12 ++++++------
 7 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 73956f084b2..07bd308a5fa 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -561,7 +561,7 @@ static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	0,		0	},
 	},
-	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0,		0	},
 		[C(OP_WRITE)] = {	0,		0	},
 		[C(OP_PREFETCH)] = {	0xc34,		0	},
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index 5f8b7741e97..41e5d2d958d 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -632,7 +632,7 @@ static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1		},
 		[C(OP_PREFETCH)] = {	0,		0		},
 	},
-	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0,		0		},
 		[C(OP_WRITE)] = {	0,		0		},
 		[C(OP_PREFETCH)] = {	0xc50c3,	0		},
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index d54723ab627..05600b66221 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -574,7 +574,7 @@ static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1		},
 		[C(OP_PREFETCH)] = {	0,		0		},
 	},
-	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0,		0x3c309b	},
 		[C(OP_WRITE)] = {	0,		0		},
 		[C(OP_PREFETCH)] = {	0xc50c3,	0		},
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index 0cd406ee765..46f74bebcfd 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -493,7 +493,7 @@ static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1		},
 		[C(OP_PREFETCH)] = {	0x4008c,	0		},
 	},
-	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0x150730,	0x250532	},
 		[C(OP_WRITE)] = {	0x250432,	0x150432	},
 		[C(OP_PREFETCH)] = {	0x810a6,	0		},
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c
index 060e0deb399..b3f7d1216ba 100644
--- a/arch/powerpc/kernel/power7-pmu.c
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -320,7 +320,7 @@ static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	0x408a,		0	},
 	},
-	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0x6080,		0x6084	},
 		[C(OP_WRITE)] = {	0x6082,		0x6086	},
 		[C(OP_PREFETCH)] = {	0,		0	},
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index 46a20640942..ba0a357a89f 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -445,7 +445,7 @@ static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	0,		0	},
 	},
-	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0,		0	},
 		[C(OP_WRITE)] = {	0,		0	},
 		[C(OP_PREFETCH)] = {	0x733,		0	},
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 572fb434a66..895c82e7845 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -131,7 +131,7 @@ static const u64 nehalem_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = 0x0,
 	},
  },
- [ C(L2  ) ] = {
+ [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */
 		[ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */
@@ -141,8 +141,8 @@ static const u64 nehalem_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */
 	},
 	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES          */
-		[ C(RESULT_MISS)   ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS       */
+		[ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference                */
+		[ C(RESULT_MISS)   ] = 0x412e, /* LLC Misses                   */
 	},
  },
  [ C(DTLB) ] = {
@@ -222,7 +222,7 @@ static const u64 core2_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = 0,
 	},
  },
- [ C(L2  ) ] = {
+ [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
 		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
@@ -313,7 +313,7 @@ static const u64 atom_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = 0,
 	},
  },
- [ C(L2  ) ] = {
+ [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
 		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
@@ -422,7 +422,7 @@ static const u64 amd_0f_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = 0,
 	},
  },
- [ C(L2  ) ] = {
+ [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0,
 		[ C(RESULT_MISS)   ] = 0,
-- 
cgit v1.2.3