From 9b51f66dcb09ac5eb6bc68fc111d5c7a1e0131d6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Dec 2008 13:49:45 +0100 Subject: perfcounters: implement "counter inheritance" Impact: implement new performance feature Counter inheritance can be used to run performance counters in a workload, transparently - and pipe back the counter results to the parent counter. Inheritance for performance counters works the following way: when creating a counter it can be marked with the .inherit=1 flag. Such counters are then 'inherited' by all child tasks (be they fork()-ed or clone()-ed). These counters get inherited through exec() boundaries as well (except through setuid boundaries). The counter values get added back to the parent counter(s) when the child task(s) exit - much like stime/utime statistics are gathered. So inherited counters are ideal to gather summary statistics about an application's behavior via shell commands, without having to modify that application. The timec.c command utilizes counter inheritance: http://redhat.com/~mingo/perfcounters/timec.c Sample output: $ ./timec -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null Performance counter stats for 'ls': 163516953 instructions 2295 cache-misses 2855182 branch-misses Signed-off-by: Ingo Molnar --- kernel/exit.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 2d8be7ebb0f..d336c90a5f1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1093,11 +1093,12 @@ NORET_TYPE void do_exit(long code) mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif -#ifdef CONFIG_FUTEX /* - * This must happen late, after the PID is not - * hashed anymore: + * These must happen late, after the PID is not + * hashed anymore, but still at a point that may sleep: */ + perf_counter_exit_task(tsk); +#ifdef CONFIG_FUTEX if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); if (unlikely(current->pi_state_cache)) -- cgit v1.2.3 From aa9c4c0f967fdb482ea95e8473ec3d201e6e0781 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 14:10:57 +0100 Subject: perfcounters: fix task clock counter Impact: fix per task clock counter precision Signed-off-by: Ingo Molnar --- kernel/exit.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index d336c90a5f1..244edfd9686 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -922,6 +922,12 @@ static void exit_notify(struct task_struct *tsk, int group_dead) forget_original_parent(tsk); exit_task_namespaces(tsk); + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(tsk); + write_lock_irq(&tasklist_lock); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); @@ -1093,11 +1099,6 @@ NORET_TYPE void do_exit(long code) mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif - /* - * These must happen late, after the PID is not - * hashed anymore, but still at a point that may sleep: - */ - perf_counter_exit_task(tsk); #ifdef CONFIG_FUTEX if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); @@ -1121,6 +1122,12 @@ NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); + /* + * These must happen late, after the PID is not + * hashed anymore, but still at a point that may sleep: + */ + perf_counter_exit_task(tsk); + preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; -- cgit v1.2.3 From eef6cbf5844c620d9db9be99e4908cdf92492fb9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 10:20:42 +0100 Subject: perfcounters: pull inherited counters Change counter inheritance from a 'push' to a 'pull' model: instead of child tasks pushing their final counts to the parent, reuse the wait4 infrastructure to pull counters as child tasks are exit-processed, much like how cutime/cstime is collected. Signed-off-by: Ingo Molnar --- kernel/exit.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 244edfd9686..101b7eeff44 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -153,6 +153,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); +#ifdef CONFIG_PERF_COUNTERS + WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); +#endif trace_sched_process_free(tsk); put_task_struct(tsk); } @@ -922,12 +925,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) forget_original_parent(tsk); exit_task_namespaces(tsk); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - */ - perf_counter_exit_task(tsk); - write_lock_irq(&tasklist_lock); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); @@ -1122,12 +1119,6 @@ NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); - /* - * These must happen late, after the PID is not - * hashed anymore, but still at a point that may sleep: - */ - perf_counter_exit_task(tsk); - preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; @@ -1371,6 +1362,12 @@ static int wait_task_zombie(struct task_struct *p, int options, */ read_unlock(&tasklist_lock); + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(p); + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; -- cgit v1.2.3 From a8d154b009168337494fbf345671bab74d3e4b8b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 09:36:00 -0400 Subject: tracing: create automated trace defines This patch lowers the number of places a developer must modify to add new tracepoints. The current method to add a new tracepoint into an existing system is to write the trace point macro in the trace header with one of the macros TRACE_EVENT, TRACE_FORMAT or DECLARE_TRACE, then they must add the same named item into the C file with the macro DEFINE_TRACE(name) and then add the trace point. This change cuts out the needing to add the DEFINE_TRACE(name). Every file that uses the tracepoint must still include the trace/.h file, but the one C file must also add a define before the including of that file. #define CREATE_TRACE_POINTS #include This will cause the trace/mytrace.h file to also produce the C code necessary to implement the trace point. Note, if more than one trace/.h is used to create the C code it is best to list them all together. #define CREATE_TRACE_POINTS #include #include #include Thanks to Mathieu Desnoyers and Christoph Hellwig for coming up with the cleaner solution of the define above the includes over my first design to have the C code include a "special" header. This patch converts sched, irq and lockdep and skb to use this new method. Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neil Horman Cc: Zhao Lei Cc: Eduard - Gabriel Munteanu Cc: Pekka Enberg Signed-off-by: Steven Rostedt --- kernel/exit.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index abf9cf3b95c..2fe9d2c7eee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -56,10 +56,6 @@ #include #include "cred-internals.h" -DEFINE_TRACE(sched_process_free); -DEFINE_TRACE(sched_process_exit); -DEFINE_TRACE(sched_process_wait); - static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p) -- cgit v1.2.3 From ad8d75fff811a6a230f7f43b05a6483099349533 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Apr 2009 19:39:12 -0400 Subject: tracing/events: move trace point headers into include/trace/events Impact: clean up Create a sub directory in include/trace called events to keep the trace point headers in their own separate directory. Only headers that declare trace points should be defined in this directory. Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neil Horman Cc: Zhao Lei Cc: Eduard - Gabriel Munteanu Cc: Pekka Enberg Signed-off-by: Steven Rostedt --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 2fe9d2c7eee..cab535c427b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,7 +48,7 @@ #include #include #include -#include +#include #include #include -- cgit v1.2.3 From 78a3d9d5654a7fd99cf8b2ab06b9497b9c7aad64 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 29 Apr 2009 18:01:23 +0200 Subject: do_wait: do take security_task_wait() into account I was never able to understand what should we actually do when security_task_wait() fails, but the current code doesn't look right. If ->task_wait() returns the error, we update *notask_error correctly. But then we either reap the child (despite the fact this was forbidden) or clear *notask_error (and hide the securiy policy problems). This patch assumes that "stolen by ptrace" doesn't matter. If selinux denies the child we should ignore it but make sure we report -EACCESS instead of -ECHLD if there are no other eligible children. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: James Morris --- kernel/exit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 167e1e3ad7c..d2e8239ea18 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1582,6 +1582,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace, */ if (*notask_error) *notask_error = ret; + return 0; } if (likely(!ptrace) && unlikely(p->ptrace)) { -- cgit v1.2.3 From 856d56b9e5de650a64a6c41c17aaed702b55d578 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 20:45:59 +0200 Subject: perf_counter: Fix counter inheritance Srivatsa Vaddagiri reported that a Java workload triggers this warning in kernel/exit.c: WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); Add the inherited counter propagation on self-detach, this could cause counter leaks and incomplete stats in threaded code like the below: #include #include void *thread(void *arg) { sleep(5); return NULL; } void main(void) { pthread_t thr; pthread_create(&thr, NULL, thread, NULL); } Reported-by: Srivatsa Vaddagiri Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar --- kernel/exit.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 4741376c8de..16d74f13a3e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -128,6 +128,12 @@ static void __exit_signal(struct task_struct *tsk) sig = NULL; /* Marker for below. */ } + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(tsk); + __unhash_process(tsk); /* -- cgit v1.2.3 From 0203026b58b4299ba7281c0b4b417207c1f05d0e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 11:24:08 +0200 Subject: perf_counter: fix threaded task exit Flushing counters in __exit_signal() with irqs disabled is not a good idea as perf_counter_exit_task() acquires mutexes. So flush it before acquiring the tasklist lock. (Note, we still need a fix for when the PID has been unhashed.) [ Impact: fix crash with inherited counters ] Cc: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar --- kernel/exit.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 16d74f13a3e..73affd35e76 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -128,12 +128,6 @@ static void __exit_signal(struct task_struct *tsk) sig = NULL; /* Marker for below. */ } - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - */ - perf_counter_exit_task(tsk); - __unhash_process(tsk); /* @@ -183,6 +177,13 @@ repeat: atomic_dec(&__task_cred(p)->user->processes); proc_flush_task(p); + + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(p); + write_lock_irq(&tasklist_lock); tracehook_finish_release_task(p); __exit_signal(p); -- cgit v1.2.3 From 33b2fb303fe7f6b08bbb32f708e67b96eaa94a7a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 11:08:41 +0200 Subject: perf_counter: fix counter freeing logic Fix counter lifetime bugs which explain the crashes reported by Marcelo Tosatti and Arnaldo Carvalho de Melo. The new rule is: flushing + freeing is only done for a task's own counters, never for other tasks. [ Impact: fix crashes/lockups with inherited counters ] Reported-by: Arnaldo Carvalho de Melo Reported-by: Marcelo Tosatti Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Signed-off-by: Ingo Molnar --- kernel/exit.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 73affd35e76..f9dfedd94af 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -178,12 +178,6 @@ repeat: proc_flush_task(p); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - */ - perf_counter_exit_task(p); - write_lock_irq(&tasklist_lock); tracehook_finish_release_task(p); __exit_signal(p); @@ -985,6 +979,13 @@ NORET_TYPE void do_exit(long code) module_put(tsk->binfmt->module); proc_exit_connector(tsk); + + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(tsk); + exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA mpol_put(tsk->mempolicy); @@ -1257,12 +1258,6 @@ static int wait_task_zombie(struct task_struct *p, int options, */ read_unlock(&tasklist_lock); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - */ - perf_counter_exit_task(p); - retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; -- cgit v1.2.3 From a63eaf34ae60bdb067a354cc8def2e8f4a01f5f4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 May 2009 14:17:31 +1000 Subject: perf_counter: Dynamically allocate tasks' perf_counter_context struct This replaces the struct perf_counter_context in the task_struct with a pointer to a dynamically allocated perf_counter_context struct. The main reason for doing is this is to allow us to transfer a perf_counter_context from one task to another when we do lazy PMU switching in a later patch. This has a few side-benefits: the task_struct becomes a little smaller, we save some memory because only tasks that have perf_counters attached get a perf_counter_context allocated for them, and we can remove the inclusion of in sched.h, meaning that we don't end up recompiling nearly everything whenever perf_counter.h changes. The perf_counter_context structures are reference-counted and freed when the last reference is dropped. A context can have references from its task and the counters on its task. Counters can outlive the task so it is possible that a context will be freed well after its task has exited. Contexts are allocated on fork if the parent had a context, or otherwise the first time that a per-task counter is created on a task. In the latter case, we set the context pointer in the task struct locklessly using an atomic compare-and-exchange operation in case we raced with some other task in creating a context for the subject task. This also removes the task pointer from the perf_counter struct. The task pointer was not used anywhere and would make it harder to move a context from one task to another. Anything that needed to know which task a counter was attached to was already using counter->ctx->task. The __perf_counter_init_context function moves up in perf_counter.c so that it can be called from find_get_context, and now initializes the refcount, but is otherwise unchanged. We were potentially calling list_del_counter twice: once from __perf_counter_exit_task when the task exits and once from __perf_counter_remove_from_context when the counter's fd gets closed. This adds a check in list_del_counter so it doesn't do anything if the counter has already been removed from the lists. Since perf_counter_task_sched_in doesn't do anything if the task doesn't have a context, and leaves cpuctx->task_ctx = NULL, this adds code to __perf_install_in_context to set cpuctx->task_ctx if necessary, i.e. in the case where the current task adds the first counter to itself and thus creates a context for itself. This also adds similar code to __perf_counter_enable to handle a similar situation which can arise when the counters have been disabled using prctl; that also leaves cpuctx->task_ctx = NULL. [ Impact: refactor counter context management to prepare for new feature ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10075.781053.231153@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/exit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index f9dfedd94af..99ad4063ee4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -159,7 +160,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); #ifdef CONFIG_PERF_COUNTERS - WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); + WARN_ON_ONCE(tsk->perf_counter_ctxp); #endif trace_sched_process_free(tsk); put_task_struct(tsk); -- cgit v1.2.3