aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/powerpc/include/asm/pgtable.h6
-rw-r--r--arch/powerpc/kernel/Makefile2
-rw-r--r--arch/powerpc/kernel/asm-offsets.c2
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S19
-rw-r--r--arch/powerpc/kernel/perf_callchain.c527
-rw-r--r--arch/powerpc/mm/slb.c37
-rw-r--r--arch/powerpc/mm/stab.c11
-rw-r--r--include/linux/perf_counter.h5
-rw-r--r--kernel/perf_counter.c143
-rw-r--r--tools/perf/Documentation/Makefile2
-rw-r--r--tools/perf/Documentation/examples.txt (renamed from tools/perf/Documentation/perf-examples.txt)0
-rw-r--r--tools/perf/builtin-annotate.c14
-rw-r--r--tools/perf/builtin-report.c32
-rw-r--r--tools/perf/builtin-top.c17
-rw-r--r--tools/perf/util/thread.c25
-rw-r--r--tools/perf/util/util.h4
16 files changed, 798 insertions, 48 deletions
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index eb17da78112..2a5da069714 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -104,8 +104,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
else
pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte));
-#elif defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT) && defined(CONFIG_SMP)
- /* Second case is 32-bit with 64-bit PTE in SMP mode. In this case, we
+#elif defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT)
+ /* Second case is 32-bit with 64-bit PTE. In this case, we
* can just store as long as we do the two halves in the right order
* with a barrier in between. This is possible because we take care,
* in the hash code, to pre-invalidate if the PTE was already hashed,
@@ -140,7 +140,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
#else
/* Anything else just stores the PTE normally. That covers all 64-bit
- * cases, and 32-bit non-hash with 64-bit PTEs in UP mode
+ * cases, and 32-bit non-hash with 32-bit PTEs.
*/
*ptep = pte;
#endif
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index b73396b9390..9619285f64e 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -97,7 +97,7 @@ obj64-$(CONFIG_AUDIT) += compat_audit.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
-obj-$(CONFIG_PPC_PERF_CTRS) += perf_counter.o
+obj-$(CONFIG_PPC_PERF_CTRS) += perf_counter.o perf_callchain.o
obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \
power5+-pmu.o power6-pmu.o power7-pmu.o
obj32-$(CONFIG_PPC_PERF_CTRS) += mpc7450-pmu.o
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 561b6465231..197b15646ee 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -67,6 +67,8 @@ int main(void)
DEFINE(MMCONTEXTID, offsetof(struct mm_struct, context.id));
#ifdef CONFIG_PPC64
DEFINE(AUDITCONTEXT, offsetof(struct task_struct, audit_context));
+ DEFINE(SIGSEGV, SIGSEGV);
+ DEFINE(NMI_MASK, NMI_MASK);
#else
DEFINE(THREAD_INFO, offsetof(struct task_struct, stack));
#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index eb898112e57..8ac85e08ffa 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -729,6 +729,11 @@ BEGIN_FTR_SECTION
bne- do_ste_alloc /* If so handle it */
END_FTR_SECTION_IFCLR(CPU_FTR_SLB)
+ clrrdi r11,r1,THREAD_SHIFT
+ lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */
+ andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */
+ bne 77f /* then don't call hash_page now */
+
/*
* On iSeries, we soft-disable interrupts here, then
* hard-enable interrupts so that the hash_page code can spin on
@@ -833,6 +838,20 @@ handle_page_fault:
bl .low_hash_fault
b .ret_from_except
+/*
+ * We come here as a result of a DSI at a point where we don't want
+ * to call hash_page, such as when we are accessing memory (possibly
+ * user memory) inside a PMU interrupt that occurred while interrupts
+ * were soft-disabled. We want to invoke the exception handler for
+ * the access, or panic if there isn't a handler.
+ */
+77: bl .save_nvgprs
+ mr r4,r3
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ li r5,SIGSEGV
+ bl .bad_page_fault
+ b .ret_from_except
+
/* here we have a segment miss */
do_ste_alloc:
bl .ste_allocate /* try to insert stab entry */
diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c
new file mode 100644
index 00000000000..f74b62c6751
--- /dev/null
+++ b/arch/powerpc/kernel/perf_callchain.c
@@ -0,0 +1,527 @@
+/*
+ * Performance counter callchain support - powerpc architecture code
+ *
+ * Copyright © 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_counter.h>
+#include <linux/percpu.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <asm/ptrace.h>
+#include <asm/pgtable.h>
+#include <asm/sigcontext.h>
+#include <asm/ucontext.h>
+#include <asm/vdso.h>
+#ifdef CONFIG_PPC64
+#include "ppc32.h"
+#endif
+
+/*
+ * Store another value in a callchain_entry.
+ */
+static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip)
+{
+ unsigned int nr = entry->nr;
+
+ if (nr < PERF_MAX_STACK_DEPTH) {
+ entry->ip[nr] = ip;
+ entry->nr = nr + 1;
+ }
+}
+
+/*
+ * Is sp valid as the address of the next kernel stack frame after prev_sp?
+ * The next frame may be in a different stack area but should not go
+ * back down in the same stack area.
+ */
+static int valid_next_sp(unsigned long sp, unsigned long prev_sp)
+{
+ if (sp & 0xf)
+ return 0; /* must be 16-byte aligned */
+ if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD))
+ return 0;
+ if (sp >= prev_sp + STACK_FRAME_OVERHEAD)
+ return 1;
+ /*
+ * sp could decrease when we jump off an interrupt stack
+ * back to the regular process stack.
+ */
+ if ((sp & ~(THREAD_SIZE - 1)) != (prev_sp & ~(THREAD_SIZE - 1)))
+ return 1;
+ return 0;
+}
+
+static void perf_callchain_kernel(struct pt_regs *regs,
+ struct perf_callchain_entry *entry)
+{
+ unsigned long sp, next_sp;
+ unsigned long next_ip;
+ unsigned long lr;
+ long level = 0;
+ unsigned long *fp;
+
+ lr = regs->link;
+ sp = regs->gpr[1];
+ callchain_store(entry, PERF_CONTEXT_KERNEL);
+ callchain_store(entry, regs->nip);
+
+ if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD))
+ return;
+
+ for (;;) {
+ fp = (unsigned long *) sp;
+ next_sp = fp[0];
+
+ if (next_sp == sp + STACK_INT_FRAME_SIZE &&
+ fp[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
+ /*
+ * This looks like an interrupt frame for an
+ * interrupt that occurred in the kernel
+ */
+ regs = (struct pt_regs *)(sp + STACK_FRAME_OVERHEAD);
+ next_ip = regs->nip;
+ lr = regs->link;
+ level = 0;
+ callchain_store(entry, PERF_CONTEXT_KERNEL);
+
+ } else {
+ if (level == 0)
+ next_ip = lr;
+ else
+ next_ip = fp[STACK_FRAME_LR_SAVE];
+
+ /*
+ * We can't tell which of the first two addresses
+ * we get are valid, but we can filter out the
+ * obviously bogus ones here. We replace them
+ * with 0 rather than removing them entirely so
+ * that userspace can tell which is which.
+ */
+ if ((level == 1 && next_ip == lr) ||
+ (level <= 1 && !kernel_text_address(next_ip)))
+ next_ip = 0;
+
+ ++level;
+ }
+
+ callchain_store(entry, next_ip);
+ if (!valid_next_sp(next_sp, sp))
+ return;
+ sp = next_sp;
+ }
+}
+
+#ifdef CONFIG_PPC64
+
+#ifdef CONFIG_HUGETLB_PAGE
+#define is_huge_psize(pagesize) (HPAGE_SHIFT && mmu_huge_psizes[pagesize])
+#else
+#define is_huge_psize(pagesize) 0
+#endif
+
+/*
+ * On 64-bit we don't want to invoke hash_page on user addresses from
+ * interrupt context, so if the access faults, we read the page tables
+ * to find which page (if any) is mapped and access it directly.
+ */
+static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
+{
+ pgd_t *pgdir;
+ pte_t *ptep, pte;
+ int pagesize;
+ unsigned long addr = (unsigned long) ptr;
+ unsigned long offset;
+ unsigned long pfn;
+ void *kaddr;
+
+ pgdir = current->mm->pgd;
+ if (!pgdir)
+ return -EFAULT;
+
+ pagesize = get_slice_psize(current->mm, addr);
+
+ /* align address to page boundary */
+ offset = addr & ((1ul << mmu_psize_defs[pagesize].shift) - 1);
+ addr -= offset;
+
+ if (is_huge_psize(pagesize))
+ ptep = huge_pte_offset(current->mm, addr);
+ else
+ ptep = find_linux_pte(pgdir, addr);
+
+ if (ptep == NULL)
+ return -EFAULT;
+ pte = *ptep;
+ if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER))
+ return -EFAULT;
+ pfn = pte_pfn(pte);
+ if (!page_is_ram(pfn))
+ return -EFAULT;
+
+ /* no highmem to worry about here */
+ kaddr = pfn_to_kaddr(pfn);
+ memcpy(ret, kaddr + offset, nb);
+ return 0;
+}
+
+static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)
+{
+ if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned long) ||
+ ((unsigned long)ptr & 7))
+ return -EFAULT;
+
+ if (!__get_user_inatomic(*ret, ptr))
+ return 0;
+
+ return read_user_stack_slow(ptr, ret, 8);
+}
+
+static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
+{
+ if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
+ ((unsigned long)ptr & 3))
+ return -EFAULT;
+
+ if (!__get_user_inatomic(*ret, ptr))
+ return 0;
+
+ return read_user_stack_slow(ptr, ret, 4);
+}
+
+static inline int valid_user_sp(unsigned long sp, int is_64)
+{
+ if (!sp || (sp & 7) || sp > (is_64 ? TASK_SIZE : 0x100000000UL) - 32)
+ return 0;
+ return 1;
+}
+
+/*
+ * 64-bit user processes use the same stack frame for RT and non-RT signals.
+ */
+struct signal_frame_64 {
+ char dummy[__SIGNAL_FRAMESIZE];
+ struct ucontext uc;
+ unsigned long unused[2];
+ unsigned int tramp[6];
+ struct siginfo *pinfo;
+ void *puc;
+ struct siginfo info;
+ char abigap[288];
+};
+
+static int is_sigreturn_64_address(unsigned long nip, unsigned long fp)
+{
+ if (nip == fp + offsetof(struct signal_frame_64, tramp))
+ return 1;
+ if (vdso64_rt_sigtramp && current->mm->context.vdso_base &&
+ nip == current->mm->context.vdso_base + vdso64_rt_sigtramp)
+ return 1;
+ return 0;
+}
+
+/*
+ * Do some sanity checking on the signal frame pointed to by sp.
+ * We check the pinfo and puc pointers in the frame.
+ */
+static int sane_signal_64_frame(unsigned long sp)
+{
+ struct signal_frame_64 __user *sf;
+ unsigned long pinfo, puc;
+
+ sf = (struct signal_frame_64 __user *) sp;
+ if (read_user_stack_64((unsigned long __user *) &sf->pinfo, &pinfo) ||
+ read_user_stack_64((unsigned long __user *) &sf->puc, &puc))
+ return 0;
+ return pinfo == (unsigned long) &sf->info &&
+ puc == (unsigned long) &sf->uc;
+}
+
+static void perf_callchain_user_64(struct pt_regs *regs,
+ struct perf_callchain_entry *entry)
+{
+ unsigned long sp, next_sp;
+ unsigned long next_ip;
+ unsigned long lr;
+ long level = 0;
+ struct signal_frame_64 __user *sigframe;
+ unsigned long __user *fp, *uregs;
+
+ next_ip = regs->nip;
+ lr = regs->link;
+ sp = regs->gpr[1];
+ callchain_store(entry, PERF_CONTEXT_USER);
+ callchain_store(entry, next_ip);
+
+ for (;;) {
+ fp = (unsigned long __user *) sp;
+ if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
+ return;
+ if (level > 0 && read_user_stack_64(&fp[2], &next_ip))
+ return;
+
+ /*
+ * Note: the next_sp - sp >= signal frame size check
+ * is true when next_sp < sp, which can happen when
+ * transitioning from an alternate signal stack to the
+ * normal stack.
+ */
+ if (next_sp - sp >= sizeof(struct signal_frame_64) &&
+ (is_sigreturn_64_address(next_ip, sp) ||
+ (level <= 1 && is_sigreturn_64_address(lr, sp))) &&
+ sane_signal_64_frame(sp)) {
+ /*
+ * This looks like an signal frame
+ */
+ sigframe = (struct signal_frame_64 __user *) sp;
+ uregs = sigframe->uc.uc_mcontext.gp_regs;
+ if (read_user_stack_64(&uregs[PT_NIP], &next_ip) ||
+ read_user_stack_64(&uregs[PT_LNK], &lr) ||
+ read_user_stack_64(&uregs[PT_R1], &sp))
+ return;
+ level = 0;
+ callchain_store(entry, PERF_CONTEXT_USER);
+ callchain_store(entry, next_ip);
+ continue;
+ }
+
+ if (level == 0)
+ next_ip = lr;
+ callchain_store(entry, next_ip);
+ ++level;
+ sp = next_sp;
+ }
+}
+
+static inline int current_is_64bit(void)
+{
+ /*
+ * We can't use test_thread_flag() here because we may be on an
+ * interrupt stack, and the thread flags don't get copied over
+ * from the thread_info on the main stack to the interrupt stack.
+ */
+ return !test_ti_thread_flag(task_thread_info(current), TIF_32BIT);
+}
+
+#else /* CONFIG_PPC64 */
+/*
+ * On 32-bit we just access the address and let hash_page create a
+ * HPTE if necessary, so there is no need to fall back to reading
+ * the page tables. Since this is called at interrupt level,
+ * do_page_fault() won't treat a DSI as a page fault.
+ */
+static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
+{
+ if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
+ ((unsigned long)ptr & 3))
+ return -EFAULT;
+
+ return __get_user_inatomic(*ret, ptr);
+}
+
+static inline void perf_callchain_user_64(struct pt_regs *regs,
+ struct perf_callchain_entry *entry)
+{
+}
+
+static inline int current_is_64bit(void)
+{
+ return 0;
+}
+
+static inline int valid_user_sp(unsigned long sp, int is_64)
+{
+ if (!sp || (sp & 7) || sp > TASK_SIZE - 32)
+ return 0;
+ return 1;
+}
+
+#define __SIGNAL_FRAMESIZE32 __SIGNAL_FRAMESIZE
+#define sigcontext32 sigcontext
+#define mcontext32 mcontext
+#define ucontext32 ucontext
+#define compat_siginfo_t struct siginfo
+
+#endif /* CONFIG_PPC64 */
+
+/*
+ * Layout for non-RT signal frames
+ */
+struct signal_frame_32 {
+ char dummy[__SIGNAL_FRAMESIZE32];
+ struct sigcontext32 sctx;
+ struct mcontext32 mctx;
+ int abigap[56];
+};
+
+/*
+ * Layout for RT signal frames
+ */
+struct rt_signal_frame_32 {
+ char dummy[__SIGNAL_FRAMESIZE32 + 16];
+ compat_siginfo_t info;
+ struct ucontext32 uc;
+ int abigap[56];
+};
+
+static int is_sigreturn_32_address(unsigned int nip, unsigned int fp)
+{
+ if (nip == fp + offsetof(struct signal_frame_32, mctx.mc_pad))
+ return 1;
+ if (vdso32_sigtramp && current->mm->context.vdso_base &&
+ nip == current->mm->context.vdso_base + vdso32_sigtramp)
+ return 1;
+ return 0;
+}
+
+static int is_rt_sigreturn_32_address(unsigned int nip, unsigned int fp)
+{
+ if (nip == fp + offsetof(struct rt_signal_frame_32,
+ uc.uc_mcontext.mc_pad))
+ return 1;
+ if (vdso32_rt_sigtramp && current->mm->context.vdso_base &&
+ nip == current->mm->context.vdso_base + vdso32_rt_sigtramp)
+ return 1;
+ return 0;
+}
+
+static int sane_signal_32_frame(unsigned int sp)
+{
+ struct signal_frame_32 __user *sf;
+ unsigned int regs;
+
+ sf = (struct signal_frame_32 __user *) (unsigned long) sp;
+ if (read_user_stack_32((unsigned int __user *) &sf->sctx.regs, &regs))
+ return 0;
+ return regs == (unsigned long) &sf->mctx;
+}
+
+static int sane_rt_signal_32_frame(unsigned int sp)
+{
+ struct rt_signal_frame_32 __user *sf;
+ unsigned int regs;
+
+ sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
+ if (read_user_stack_32((unsigned int __user *) &sf->uc.uc_regs, &regs))
+ return 0;
+ return regs == (unsigned long) &sf->uc.uc_mcontext;
+}
+
+static unsigned int __user *signal_frame_32_regs(unsigned int sp,
+ unsigned int next_sp, unsigned int next_ip)
+{
+ struct mcontext32 __user *mctx = NULL;
+ struct signal_frame_32 __user *sf;
+ struct rt_signal_frame_32 __user *rt_sf;
+
+ /*
+ * Note: the next_sp - sp >= signal frame size check
+ * is true when next_sp < sp, for example, when
+ * transitioning from an alternate signal stack to the
+ * normal stack.
+ */
+ if (next_sp - sp >= sizeof(struct signal_frame_32) &&
+ is_sigreturn_32_address(next_ip, sp) &&
+ sane_signal_32_frame(sp)) {
+ sf = (struct signal_frame_32 __user *) (unsigned long) sp;
+ mctx = &sf->mctx;
+ }
+
+ if (!mctx && next_sp - sp >= sizeof(struct rt_signal_frame_32) &&
+ is_rt_sigreturn_32_address(next_ip, sp) &&
+ sane_rt_signal_32_frame(sp)) {
+ rt_sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
+ mctx = &rt_sf->uc.uc_mcontext;
+ }
+
+ if (!mctx)
+ return NULL;
+ return mctx->mc_gregs;
+}
+
+static void perf_callchain_user_32(struct pt_regs *regs,
+ struct perf_callchain_entry *entry)
+{
+ unsigned int sp, next_sp;
+ unsigned int next_ip;
+ unsigned int lr;
+ long level = 0;
+ unsigned int __user *fp, *uregs;
+
+ next_ip = regs->nip;
+ lr = regs->link;
+ sp = regs->gpr[1];
+ callchain_store(entry, PERF_CONTEXT_USER);
+ callchain_store(entry, next_ip);
+
+ while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ fp = (unsigned int __user *) (unsigned long) sp;
+ if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
+ return;
+ if (level > 0 && read_user_stack_32(&fp[1], &next_ip))
+ return;
+
+ uregs = signal_frame_32_regs(sp, next_sp, next_ip);
+ if (!uregs && level <= 1)
+ uregs = signal_frame_32_regs(sp, next_sp, lr);
+ if (uregs) {
+ /*
+ * This looks like an signal frame, so restart
+ * the stack trace with the values in it.
+ */
+ if (read_user_stack_32(&uregs[PT_NIP], &next_ip) ||
+ read_user_stack_32(&uregs[PT_LNK], &lr) ||
+ read_user_stack_32(&uregs[PT_R1], &sp))
+ return;
+ level = 0;
+ callchain_store(entry, PERF_CONTEXT_USER);
+ callchain_store(entry, next_ip);
+ continue;
+ }
+
+ if (level == 0)
+ next_ip = lr;
+ callchain_store(entry, next_ip);
+ ++level;
+ sp = next_sp;
+ }
+}
+
+/*
+ * Since we can't get PMU interrupts inside a PMU interrupt handler,
+ * we don't need separate irq and nmi entries here.
+ */
+static DEFINE_PER_CPU(struct perf_callchain_entry, callchain);
+
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+ struct perf_callchain_entry *entry = &__get_cpu_var(callchain);
+
+ entry->nr = 0;
+
+ if (current->pid == 0) /* idle task? */
+ return entry;
+
+ if (!user_mode(regs)) {
+ perf_callchain_kernel(regs, entry);
+ if (current->mm)
+ regs = task_pt_regs(current);
+ else
+ regs = NULL;
+ }
+
+ if (regs) {
+ if (current_is_64bit())
+ perf_callchain_user_64(regs, entry);
+ else
+ perf_callchain_user_32(regs, entry);
+ }
+
+ return entry;
+}
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 5b7038f248b..a685652effe 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -92,15 +92,13 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize,
: "memory" );
}
-void slb_flush_and_rebolt(void)
+static void __slb_flush_and_rebolt(void)
{
/* If you change this make sure you change SLB_NUM_BOLTED
* appropriately too. */
unsigned long linear_llp, vmalloc_llp, lflags, vflags;
unsigned long ksp_esid_data, ksp_vsid_data;
- WARN_ON(!irqs_disabled());
-
linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
lflags = SLB_VSID_KERNEL | linear_llp;
@@ -117,12 +115,6 @@ void slb_flush_and_rebolt(void)
ksp_vsid_data = get_slb_shadow()->save_area[2].vsid;
}
- /*
- * We can't take a PMU exception in the following code, so hard
- * disable interrupts.
- */
- hard_irq_disable();
-
/* We need to do this all in asm, so we're sure we don't touch
* the stack between the slbia and rebolting it. */
asm volatile("isync\n"
@@ -139,6 +131,21 @@ void slb_flush_and_rebolt(void)
: "memory");
}
+void slb_flush_and_rebolt(void)
+{
+
+ WARN_ON(!irqs_disabled());
+
+ /*
+ * We can't take a PMU exception in the following code, so hard
+ * disable interrupts.
+ */
+ hard_irq_disable();
+
+ __slb_flush_and_rebolt();
+ get_paca()->slb_cache_ptr = 0;
+}
+
void slb_vmalloc_update(void)
{
unsigned long vflags;
@@ -180,12 +187,20 @@ static inline int esids_match(unsigned long addr1, unsigned long addr2)
/* Flush all user entries from the segment table of the current processor. */
void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
{
- unsigned long offset = get_paca()->slb_cache_ptr;
+ unsigned long offset;
unsigned long slbie_data = 0;
unsigned long pc = KSTK_EIP(tsk);
unsigned long stack = KSTK_ESP(tsk);
unsigned long unmapped_base;
+ /*
+ * We need interrupts hard-disabled here, not just soft-disabled,
+ * so that a PMU interrupt can't occur, which might try to access
+ * user memory (to get a stack trace) and possible cause an SLB miss
+ * which would update the slb_cache/slb_cache_ptr fields in the PACA.
+ */
+ hard_irq_disable();
+ offset = get_paca()->slb_cache_ptr;
if (!cpu_has_feature(CPU_FTR_NO_SLBIE_B) &&
offset <= SLB_CACHE_ENTRIES) {
int i;
@@ -200,7 +215,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
}
asm volatile("isync" : : : "memory");
} else {
- slb_flush_and_rebolt();
+ __slb_flush_and_rebolt();
}
/* Workaround POWER5 < DD2.1 issue */
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
index 98cd1dc2ae7..ab5fb48b3e9 100644
--- a/arch/powerpc/mm/stab.c
+++ b/arch/powerpc/mm/stab.c
@@ -164,7 +164,7 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
{
struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
struct stab_entry *ste;
- unsigned long offset = __get_cpu_var(stab_cache_ptr);
+ unsigned long offset;
unsigned long pc = KSTK_EIP(tsk);
unsigned long stack = KSTK_ESP(tsk);
unsigned long unmapped_base;
@@ -172,6 +172,15 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
/* Force previous translations to complete. DRENG */
asm volatile("isync" : : : "memory");
+ /*
+ * We need interrupts hard-disabled here, not just soft-disabled,
+ * so that a PMU interrupt can't occur, which might try to access
+ * user memory (to get a stack trace) and possible cause an STAB miss
+ * which would update the stab_cache/stab_cache_ptr per-cpu variables.
+ */
+ hard_irq_disable();
+
+ offset = __get_cpu_var(stab_cache_ptr);
if (offset <= NR_STAB_CACHE_ENTRIES) {
int i;
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index b53f7006cc4..e022b847c90 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -216,6 +216,7 @@ struct perf_counter_attr {
#define PERF_COUNTER_IOC_REFRESH _IO ('$', 2)
#define PERF_COUNTER_IOC_RESET _IO ('$', 3)
#define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64)
+#define PERF_COUNTER_IOC_SET_OUTPUT _IO ('$', 5)
enum perf_counter_ioc_flags {
PERF_IOC_FLAG_GROUP = 1U << 0,
@@ -415,6 +416,9 @@ enum perf_callchain_context {
PERF_CONTEXT_MAX = (__u64)-4095,
};
+#define PERF_FLAG_FD_NO_GROUP (1U << 0)
+#define PERF_FLAG_FD_OUTPUT (1U << 1)
+
#ifdef __KERNEL__
/*
* Kernel-internal data types and definitions:
@@ -536,6 +540,7 @@ struct perf_counter {
struct list_head sibling_list;
int nr_siblings;
struct perf_counter *group_leader;
+ struct perf_counter *output;
const struct pmu *pmu;
enum perf_counter_active_state state;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 534e20d14d6..53abcbefa0b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -469,7 +469,8 @@ static void update_counter_times(struct perf_counter *counter)
struct perf_counter_context *ctx = counter->ctx;
u64 run_end;
- if (counter->state < PERF_COUNTER_STATE_INACTIVE)
+ if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
+ counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
return;
counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
@@ -518,7 +519,7 @@ static void __perf_counter_disable(void *info)
*/
if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
update_context_time(ctx);
- update_counter_times(counter);
+ update_group_times(counter);
if (counter == counter->group_leader)
group_sched_out(counter, cpuctx, ctx);
else
@@ -573,7 +574,7 @@ static void perf_counter_disable(struct perf_counter *counter)
* in, so we can change the state safely.
*/
if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
- update_counter_times(counter);
+ update_group_times(counter);
counter->state = PERF_COUNTER_STATE_OFF;
}
@@ -851,6 +852,27 @@ retry:
}
/*
+ * Put a counter into inactive state and update time fields.
+ * Enabling the leader of a group effectively enables all
+ * the group members that aren't explicitly disabled, so we
+ * have to update their ->tstamp_enabled also.
+ * Note: this works for group members as well as group leaders
+ * since the non-leader members' sibling_lists will be empty.
+ */
+static void __perf_counter_mark_enabled(struct perf_counter *counter,
+ struct perf_counter_context *ctx)
+{
+ struct perf_counter *sub;
+
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+ list_for_each_entry(sub, &counter->sibling_list, list_entry)
+ if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
+ sub->tstamp_enabled =
+ ctx->time - sub->total_time_enabled;
+}
+
+/*
* Cross CPU call to enable a performance counter
*/
static void __perf_counter_enable(void *info)
@@ -877,8 +899,7 @@ static void __perf_counter_enable(void *info)
if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
goto unlock;
- counter->state = PERF_COUNTER_STATE_INACTIVE;
- counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+ __perf_counter_mark_enabled(counter, ctx);
/*
* If the counter is in a group and isn't the group leader,
@@ -971,11 +992,9 @@ static void perf_counter_enable(struct perf_counter *counter)
* Since we have the lock this context can't be scheduled
* in, so we can change the state safely.
*/
- if (counter->state == PERF_COUNTER_STATE_OFF) {
- counter->state = PERF_COUNTER_STATE_INACTIVE;
- counter->tstamp_enabled =
- ctx->time - counter->total_time_enabled;
- }
+ if (counter->state == PERF_COUNTER_STATE_OFF)
+ __perf_counter_mark_enabled(counter, ctx);
+
out:
spin_unlock_irq(&ctx->lock);
}
@@ -1479,9 +1498,7 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
counter->attr.enable_on_exec = 0;
if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
continue;
- counter->state = PERF_COUNTER_STATE_INACTIVE;
- counter->tstamp_enabled =
- ctx->time - counter->total_time_enabled;
+ __perf_counter_mark_enabled(counter, ctx);
enabled = 1;
}
@@ -1503,10 +1520,21 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
*/
static void __perf_counter_read(void *info)
{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
struct perf_counter *counter = info;
struct perf_counter_context *ctx = counter->ctx;
unsigned long flags;
+ /*
+ * If this is a task context, we need to check whether it is
+ * the current task context of this cpu. If not it has been
+ * scheduled out before the smp call arrived. In that case
+ * counter->count would have been updated to a recent sample
+ * when the counter was scheduled out.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
+ return;
+
local_irq_save(flags);
if (ctx->is_active)
update_context_time(ctx);
@@ -1664,6 +1692,11 @@ static void free_counter(struct perf_counter *counter)
atomic_dec(&nr_task_counters);
}
+ if (counter->output) {
+ fput(counter->output->filp);
+ counter->output = NULL;
+ }
+
if (counter->destroy)
counter->destroy(counter);
@@ -1780,7 +1813,7 @@ static int perf_counter_read_group(struct perf_counter *counter,
size += err;
list_for_each_entry(sub, &leader->sibling_list, list_entry) {
- err = perf_counter_read_entry(counter, read_format,
+ err = perf_counter_read_entry(sub, read_format,
buf + size);
if (err < 0)
return err;
@@ -1949,6 +1982,8 @@ unlock:
return ret;
}
+int perf_counter_set_output(struct perf_counter *counter, int output_fd);
+
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct perf_counter *counter = file->private_data;
@@ -1972,6 +2007,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case PERF_COUNTER_IOC_PERIOD:
return perf_counter_period(counter, (u64 __user *)arg);
+ case PERF_COUNTER_IOC_SET_OUTPUT:
+ return perf_counter_set_output(counter, arg);
+
default:
return -ENOTTY;
}
@@ -2008,6 +2046,10 @@ int perf_counter_task_disable(void)
return 0;
}
+#ifndef PERF_COUNTER_INDEX_OFFSET
+# define PERF_COUNTER_INDEX_OFFSET 0
+#endif
+
static int perf_counter_index(struct perf_counter *counter)
{
if (counter->state != PERF_COUNTER_STATE_ACTIVE)
@@ -2238,6 +2280,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
WARN_ON_ONCE(counter->ctx->parent_ctx);
mutex_lock(&counter->mmap_mutex);
+ if (counter->output) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
if (atomic_inc_not_zero(&counter->mmap_count)) {
if (nr_pages != counter->data->nr_pages)
ret = -EINVAL;
@@ -2623,6 +2670,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
struct perf_counter *counter, unsigned int size,
int nmi, int sample)
{
+ struct perf_counter *output_counter;
struct perf_mmap_data *data;
unsigned int offset, head;
int have_lost;
@@ -2632,13 +2680,17 @@ static int perf_output_begin(struct perf_output_handle *handle,
u64 lost;
} lost_event;
+ rcu_read_lock();
/*
* For inherited counters we send all the output towards the parent.
*/
if (counter->parent)
counter = counter->parent;
- rcu_read_lock();
+ output_counter = rcu_dereference(counter->output);
+ if (output_counter)
+ counter = output_counter;
+
data = rcu_dereference(counter->data);
if (!data)
goto out;
@@ -4186,6 +4238,57 @@ err_size:
goto out;
}
+int perf_counter_set_output(struct perf_counter *counter, int output_fd)
+{
+ struct perf_counter *output_counter = NULL;
+ struct file *output_file = NULL;
+ struct perf_counter *old_output;
+ int fput_needed = 0;
+ int ret = -EINVAL;
+
+ if (!output_fd)
+ goto set;
+
+ output_file = fget_light(output_fd, &fput_needed);
+ if (!output_file)
+ return -EBADF;
+
+ if (output_file->f_op != &perf_fops)
+ goto out;
+
+ output_counter = output_file->private_data;
+
+ /* Don't chain output fds */
+ if (output_counter->output)
+ goto out;
+
+ /* Don't set an output fd when we already have an output channel */
+ if (counter->data)
+ goto out;
+
+ atomic_long_inc(&output_file->f_count);
+
+set:
+ mutex_lock(&counter->mmap_mutex);
+ old_output = counter->output;
+ rcu_assign_pointer(counter->output, output_counter);
+ mutex_unlock(&counter->mmap_mutex);
+
+ if (old_output) {
+ /*
+ * we need to make sure no existing perf_output_*()
+ * is still referencing this counter.
+ */
+ synchronize_rcu();
+ fput(old_output->filp);
+ }
+
+ ret = 0;
+out:
+ fput_light(output_file, fput_needed);
+ return ret;
+}
+
/**
* sys_perf_counter_open - open a performance counter, associate it to a task/cpu
*
@@ -4208,7 +4311,7 @@ SYSCALL_DEFINE5(perf_counter_open,
int ret;
/* for future expandability... */
- if (flags)
+ if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
return -EINVAL;
ret = perf_copy_attr(attr_uptr, &attr);
@@ -4236,7 +4339,7 @@ SYSCALL_DEFINE5(perf_counter_open,
* Look up the group leader (we will attach this counter to it):
*/
group_leader = NULL;
- if (group_fd != -1) {
+ if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
ret = -EINVAL;
group_file = fget_light(group_fd, &fput_needed);
if (!group_file)
@@ -4278,6 +4381,12 @@ SYSCALL_DEFINE5(perf_counter_open,
if (!counter_file)
goto err_free_put_context;
+ if (flags & PERF_FLAG_FD_OUTPUT) {
+ ret = perf_counter_set_output(counter, group_fd);
+ if (ret)
+ goto err_free_put_context;
+ }
+
counter->filp = counter_file;
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile
index 5457192e1b4..bdd3b7ecad0 100644
--- a/tools/perf/Documentation/Makefile
+++ b/tools/perf/Documentation/Makefile
@@ -35,7 +35,7 @@ man7dir=$(mandir)/man7
# DESTDIR=
ASCIIDOC=asciidoc
-ASCIIDOC_EXTRA =
+ASCIIDOC_EXTRA = --unsafe
MANPAGE_XSL = manpage-normal.xsl
XMLTO_EXTRA =
INSTALL?=install
diff --git a/tools/perf/Documentation/perf-examples.txt b/tools/perf/Documentation/examples.txt
index 8eb6c489fb1..8eb6c489fb1 100644
--- a/tools/perf/Documentation/perf-examples.txt
+++ b/tools/perf/Documentation/examples.txt
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 96d421f7161..4c7bc443623 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -28,6 +28,7 @@ static char const *input_name = "perf.data";
static char default_sort_order[] = "comm,symbol";
static char *sort_order = default_sort_order;
+static int force;
static int input;
static int show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
@@ -629,6 +630,13 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head)
(void *)(long)(event->header.size),
event->fork.pid, event->fork.ppid);
+ /*
+ * A thread clone will have the same PID for both
+ * parent and child.
+ */
+ if (thread == parent)
+ return 0;
+
if (!thread || !parent || thread__fork(thread, parent)) {
dump_printf("problem processing PERF_EVENT_FORK, skipping event.\n");
return -1;
@@ -976,6 +984,11 @@ static int __cmd_annotate(void)
exit(-1);
}
+ if (!force && input_stat.st_uid && (input_stat.st_uid != geteuid())) {
+ fprintf(stderr, "file: %s not owned by current user or root\n", input_name);
+ exit(-1);
+ }
+
if (!input_stat.st_size) {
fprintf(stderr, "zero-sized file, nothing to do!\n");
exit(0);
@@ -1081,6 +1094,7 @@ static const struct option options[] = {
"input file name"),
OPT_STRING('s', "symbol", &sym_hist_filter, "symbol",
"symbol to annotate"),
+ OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
OPT_BOOLEAN('v', "verbose", &verbose,
"be more verbose (show symbol address, etc)"),
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index ed1fdab3a1f..cdd46ab11bd 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -37,6 +37,7 @@ static char *dso_list_str, *comm_list_str, *sym_list_str,
static struct strlist *dso_list, *comm_list, *sym_list;
static char *field_sep;
+static int force;
static int input;
static int show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
@@ -665,6 +666,27 @@ static void dso__calc_col_width(struct dso *self)
self->slen_calculated = 1;
}
+static int thread__set_comm_adjust(struct thread *self, const char *comm)
+{
+ int ret = thread__set_comm(self, comm);
+
+ if (ret)
+ return ret;
+
+ if (!col_width_list_str && !field_sep &&
+ (!comm_list || strlist__has_entry(comm_list, comm))) {
+ unsigned int slen = strlen(comm);
+
+ if (slen > comms__col_width) {
+ comms__col_width = slen;
+ threads__col_width = slen + 6;
+ }
+ }
+
+ return 0;
+}
+
+
static struct symbol *
resolve_symbol(struct thread *thread, struct map **mapp,
struct dso **dsop, u64 *ipp)
@@ -1056,7 +1078,7 @@ static void register_idle_thread(void)
struct thread *thread = threads__findnew(0, &threads, &last_match);
if (thread == NULL ||
- thread__set_comm(thread, "[idle]")) {
+ thread__set_comm_adjust(thread, "[idle]")) {
fprintf(stderr, "problem inserting idle task.\n");
exit(-1);
}
@@ -1226,7 +1248,7 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
event->comm.comm, event->comm.pid);
if (thread == NULL ||
- thread__set_comm(thread, event->comm.comm)) {
+ thread__set_comm_adjust(thread, event->comm.comm)) {
dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n");
return -1;
}
@@ -1383,6 +1405,11 @@ static int __cmd_report(void)
exit(-1);
}
+ if (!force && input_stat.st_uid && (input_stat.st_uid != geteuid())) {
+ fprintf(stderr, "file: %s not owned by current user or root\n", input_name);
+ exit(-1);
+ }
+
if (!input_stat.st_size) {
fprintf(stderr, "zero-sized file, nothing to do!\n");
exit(0);
@@ -1594,6 +1621,7 @@ static const struct option options[] = {
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
"dump raw trace in ASCII"),
OPT_STRING('k', "vmlinux", &vmlinux_name, "file", "vmlinux pathname"),
+ OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
OPT_BOOLEAN('m', "modules", &modules,
"load module symbols - WARNING: use only with -k and LIVE kernel"),
OPT_BOOLEAN('n', "show-nr-samples", &show_nr_samples,
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 62b55ecab2c..4002ccb3675 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -483,11 +483,16 @@ static void print_sym_table(void)
if (nr_counters == 1)
printf(" samples pcnt");
else
- printf(" weight samples pcnt");
+ printf(" weight samples pcnt");
- printf(" RIP kernel function\n"
- " ______ _______ _____ ________________ _______________\n\n"
- );
+ if (verbose)
+ printf(" RIP ");
+ printf(" kernel function\n");
+ printf(" %s _______ _____",
+ nr_counters == 1 ? " " : "______");
+ if (verbose)
+ printf(" ________________");
+ printf(" _______________\n\n");
for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
struct symbol *sym;
@@ -508,7 +513,9 @@ static void print_sym_table(void)
printf("%9.1f %10ld - ", syme->weight, syme->snap_count);
percent_color_fprintf(stdout, "%4.1f%%", pcnt);
- printf(" - %016llx : %s", sym->start, sym->name);
+ if (verbose)
+ printf(" - %016llx", sym->start);
+ printf(" : %s", sym->name);
if (sym->module)
printf("\t[%s]", sym->module->name);
printf("\n");
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 00c14b98d65..f98032c135c 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -4,6 +4,7 @@
#include <string.h>
#include "thread.h"
#include "util.h"
+#include "debug.h"
static struct thread *thread__new(pid_t pid)
{
@@ -85,9 +86,27 @@ void thread__insert_map(struct thread *self, struct map *map)
list_for_each_entry_safe(pos, tmp, &self->maps, node) {
if (map__overlap(pos, map)) {
- list_del_init(&pos->node);
- /* XXX leaks dsos */
- free(pos);
+ if (verbose >= 2) {
+ printf("overlapping maps:\n");
+ map__fprintf(map, stdout);
+ map__fprintf(pos, stdout);
+ }
+
+ if (map->start <= pos->start && map->end > pos->start)
+ pos->start = map->end;
+
+ if (map->end >= pos->end && map->start < pos->end)
+ pos->end = map->start;
+
+ if (verbose >= 2) {
+ printf("after collision:\n");
+ map__fprintf(pos, stdout);
+ }
+
+ if (pos->start >= pos->end) {
+ list_del_init(&pos->node);
+ free(pos);
+ }
}
}
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index c62ef9720c6..9de2329dd44 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -39,10 +39,6 @@
/* Approximation of the length of the decimal representation of this type. */
#define decimal_length(x) ((int)(sizeof(x) * 2.56 + 0.5) + 1)
-#if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__USLC__) && !defined(_M_UNIX)
-#define _XOPEN_SOURCE 600 /* glibc2 and AIX 5.3L need 500, OpenBSD needs 600 for S_ISLNK() */
-#define _XOPEN_SOURCE_EXTENDED 1 /* AIX 5.3L needs this */
-#endif
#define _ALL_SOURCE 1
#define _GNU_SOURCE 1
#define _BSD_SOURCE 1