From ba9372a8f306c4e53a5f61dcbcd6c1e4a8c2e9ac Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 13 Mar 2009 10:48:52 +0100
Subject: x86, hw-branch-tracer: keep resources on stop

Distinguish init/reset and start/stop:

init/reset will allocate and release bts tracing resources
stop/start will suspend and resume bts tracing

Return an error on init() if no cpu can be traced.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090313104852.A30168@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_hw_branches.c | 119 ++++++++++++++++++++++++++++-----------
 1 file changed, 85 insertions(+), 34 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 7bfdf4c2347..a99a04c5e9c 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -19,7 +19,7 @@
 #include "trace_output.h"
 
 
-#define SIZEOF_BTS (1 << 13)
+#define BTS_BUFFER_SIZE (1 << 13)
 
 /*
  * The tracer lock protects the below per-cpu tracer array.
@@ -33,53 +33,68 @@
  */
 static DEFINE_SPINLOCK(bts_tracer_lock);
 static DEFINE_PER_CPU(struct bts_tracer *, tracer);
-static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
+static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer);
 
 #define this_tracer per_cpu(tracer, smp_processor_id())
 #define this_buffer per_cpu(buffer, smp_processor_id())
 
-static int __read_mostly trace_hw_branches_enabled;
+static int trace_hw_branches_enabled __read_mostly;
+static int trace_hw_branches_suspended __read_mostly;
 static struct trace_array *hw_branch_trace __read_mostly;
 
 
 /*
- * Start tracing on the current cpu.
+ * Initialize the tracer for the current cpu.
  * The argument is ignored.
  *
  * pre: bts_tracer_lock must be locked.
  */
-static void bts_trace_start_cpu(void *arg)
+static void bts_trace_init_cpu(void *arg)
 {
 	if (this_tracer)
 		ds_release_bts(this_tracer);
 
-	this_tracer =
-		ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
-			       /* ovfl = */ NULL, /* th = */ (size_t)-1,
-			       BTS_KERNEL);
+	this_tracer = ds_request_bts(NULL, this_buffer, BTS_BUFFER_SIZE,
+				     NULL, (size_t)-1, BTS_KERNEL);
 	if (IS_ERR(this_tracer)) {
 		this_tracer = NULL;
 		return;
 	}
 }
 
-static void bts_trace_start(struct trace_array *tr)
+static int bts_trace_init(struct trace_array *tr)
 {
+	int cpu, avail;
+
 	spin_lock(&bts_tracer_lock);
 
-	on_each_cpu(bts_trace_start_cpu, NULL, 1);
-	trace_hw_branches_enabled = 1;
+	hw_branch_trace = tr;
+
+	on_each_cpu(bts_trace_init_cpu, NULL, 1);
+
+	/* Check on how many cpus we could enable tracing */
+	avail = 0;
+	for_each_online_cpu(cpu)
+		if (per_cpu(tracer, cpu))
+			avail++;
+
+	trace_hw_branches_enabled = (avail ? 1 : 0);
+	trace_hw_branches_suspended = 0;
 
 	spin_unlock(&bts_tracer_lock);
+
+
+	/* If we could not enable tracing on a single cpu, we fail. */
+	return avail ? 0 : -EOPNOTSUPP;
 }
 
 /*
- * Stop tracing on the current cpu.
+ * Release the tracer for the current cpu.
  * The argument is ignored.
  *
  * pre: bts_tracer_lock must be locked.
  */
-static void bts_trace_stop_cpu(void *arg)
+static void bts_trace_release_cpu(void *arg)
 {
 	if (this_tracer) {
 		ds_release_bts(this_tracer);
@@ -87,12 +102,57 @@ static void bts_trace_stop_cpu(void *arg)
 	}
 }
 
-static void bts_trace_stop(struct trace_array *tr)
+static void bts_trace_reset(struct trace_array *tr)
 {
 	spin_lock(&bts_tracer_lock);
 
+	on_each_cpu(bts_trace_release_cpu, NULL, 1);
 	trace_hw_branches_enabled = 0;
-	on_each_cpu(bts_trace_stop_cpu, NULL, 1);
+	trace_hw_branches_suspended = 0;
+
+	spin_unlock(&bts_tracer_lock);
+}
+
+/*
+ * Resume tracing on the current cpu.
+ * The argument is ignored.
+ *
+ * pre: bts_tracer_lock must be locked.
+ */
+static void bts_trace_resume_cpu(void *arg)
+{
+	if (this_tracer)
+		ds_resume_bts(this_tracer);
+}
+
+static void bts_trace_start(struct trace_array *tr)
+{
+	spin_lock(&bts_tracer_lock);
+
+	on_each_cpu(bts_trace_resume_cpu, NULL, 1);
+	trace_hw_branches_suspended = 0;
+
+	spin_unlock(&bts_tracer_lock);
+}
+
+/*
+ * Suspend tracing on the current cpu.
+ * The argument is ignored.
+ *
+ * pre: bts_tracer_lock must be locked.
+ */
+static void bts_trace_suspend_cpu(void *arg)
+{
+	if (this_tracer)
+		ds_suspend_bts(this_tracer);
+}
+
+static void bts_trace_stop(struct trace_array *tr)
+{
+	spin_lock(&bts_tracer_lock);
+
+	on_each_cpu(bts_trace_suspend_cpu, NULL, 1);
+	trace_hw_branches_suspended = 1;
 
 	spin_unlock(&bts_tracer_lock);
 }
@@ -110,10 +170,14 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
-		smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
+		smp_call_function_single(cpu, bts_trace_init_cpu, NULL, 1);
+
+		if (trace_hw_branches_suspended)
+			smp_call_function_single(cpu, bts_trace_suspend_cpu,
+						 NULL, 1);
 		break;
 	case CPU_DOWN_PREPARE:
-		smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
+		smp_call_function_single(cpu, bts_trace_release_cpu, NULL, 1);
 		break;
 	}
 
@@ -126,20 +190,6 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
 	.notifier_call = bts_hotcpu_handler
 };
 
-static int bts_trace_init(struct trace_array *tr)
-{
-	hw_branch_trace = tr;
-
-	bts_trace_start(tr);
-
-	return 0;
-}
-
-static void bts_trace_reset(struct trace_array *tr)
-{
-	bts_trace_stop(tr);
-}
-
 static void bts_trace_print_header(struct seq_file *m)
 {
 	seq_puts(m, "# CPU#        TO  <-  FROM\n");
@@ -228,7 +278,7 @@ static void trace_bts_at(const struct bts_trace *trace, void *at)
  */
 static void trace_bts_cpu(void *arg)
 {
-	struct trace_array *tr = (struct trace_array *) arg;
+	struct trace_array *tr = (struct trace_array *)arg;
 	const struct bts_trace *trace;
 	unsigned char *at;
 
@@ -276,7 +326,8 @@ void trace_hw_branch_oops(void)
 {
 	spin_lock(&bts_tracer_lock);
 
-	trace_bts_cpu(hw_branch_trace);
+	if (trace_hw_branches_enabled)
+		trace_bts_cpu(hw_branch_trace);
 
 	spin_unlock(&bts_tracer_lock);
 }
-- 
cgit v1.2.3


From 321bb5e1ac461c04b6a93f795010d6eb01d8c5ca Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 13 Mar 2009 10:50:27 +0100
Subject: x86, hw-branch-tracer: add selftest

Add a selftest for the hw-branch-tracer.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090313105027.A30183@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h             |  2 ++
 kernel/trace/trace_hw_branches.c |  5 +++-
 kernel/trace/trace_selftest.c    | 53 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 56ce34d90b0..e7fbc826f1e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -576,6 +576,8 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
 					       struct trace_array *tr);
 extern int trace_selftest_startup_branch(struct tracer *trace,
 					 struct trace_array *tr);
+extern int trace_selftest_startup_hw_branches(struct tracer *trace,
+					      struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 extern void *head_page(struct trace_array_cpu *data);
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index a99a04c5e9c..4ca82700c04 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -342,7 +342,10 @@ struct tracer bts_tracer __read_mostly =
 	.start		= bts_trace_start,
 	.stop		= bts_trace_stop,
 	.open		= trace_bts_prepare,
-	.close		= trace_bts_close
+	.close		= trace_bts_close,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest	= trace_selftest_startup_hw_branches,
+#endif /* CONFIG_FTRACE_SELFTEST */
 };
 
 __init static int init_bts_trace(void)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index f907a2b2902..3c7b797d0d2 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -16,6 +16,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_BRANCH:
 	case TRACE_GRAPH_ENT:
 	case TRACE_GRAPH_RET:
+	case TRACE_HW_BRANCHES:
 		return 1;
 	}
 	return 0;
@@ -691,3 +692,55 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
 	return ret;
 }
 #endif /* CONFIG_BRANCH_TRACER */
+
+#ifdef CONFIG_HW_BRANCH_TRACER
+int
+trace_selftest_startup_hw_branches(struct tracer *trace,
+				   struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+	struct trace_iterator iter;
+	struct tracer tracer;
+
+	if (!trace->open) {
+		printk(KERN_CONT "missing open function...");
+		return -1;
+	}
+
+	ret = tracer_init(trace, tr);
+	if (ret) {
+		warn_failed_init_tracer(trace, ret);
+		return ret;
+	}
+
+	/*
+	 * The hw-branch tracer needs to collect the trace from the various
+	 * cpu trace buffers - before tracing is stopped.
+	 */
+	memset(&iter, 0, sizeof(iter));
+	memcpy(&tracer, trace, sizeof(tracer));
+
+	iter.trace = &tracer;
+	iter.tr = tr;
+	iter.pos = -1;
+	mutex_init(&iter.mutex);
+
+	trace->open(&iter);
+
+	mutex_destroy(&iter.mutex);
+
+	tracing_stop();
+
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+	tracing_start();
+
+	if (!ret && !count) {
+		printk(KERN_CONT "no entries found..");
+		ret = -1;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_HW_BRANCH_TRACER */
-- 
cgit v1.2.3


From e9a22d1fb94050b7d600019c32e6b672d539054b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 11:54:40 +0100
Subject: x86, bts: cleanups

Impact: cleanup, no code changed

Cc: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_hw_branches.c | 6 +++---
 kernel/trace/trace_selftest.c    | 5 +++--
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 4ca82700c04..8b2109a6c61 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -1,5 +1,5 @@
 /*
- * h/w branch tracer for x86 based on bts
+ * h/w branch tracer for x86 based on BTS
  *
  * Copyright (C) 2008-2009 Intel Corporation.
  * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
@@ -15,8 +15,8 @@
 
 #include <asm/ds.h>
 
-#include "trace.h"
 #include "trace_output.h"
+#include "trace.h"
 
 
 #define BTS_BUFFER_SIZE (1 << 13)
@@ -197,10 +197,10 @@ static void bts_trace_print_header(struct seq_file *m)
 
 static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
 {
+	unsigned long symflags = TRACE_ITER_SYM_OFFSET;
 	struct trace_entry *entry = iter->ent;
 	struct trace_seq *seq = &iter->seq;
 	struct hw_branch_entry *it;
-	unsigned long symflags = TRACE_ITER_SYM_OFFSET;
 
 	trace_assign_type(it, entry);
 
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 3c7b797d0d2..b9109126706 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -189,6 +189,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 #else
 # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
 #endif /* CONFIG_DYNAMIC_FTRACE */
+
 /*
  * Simple verification test of ftrace function tracer.
  * Enable ftrace, sleep 1/10 second, and then read the trace
@@ -698,10 +699,10 @@ int
 trace_selftest_startup_hw_branches(struct tracer *trace,
 				   struct trace_array *tr)
 {
-	unsigned long count;
-	int ret;
 	struct trace_iterator iter;
 	struct tracer tracer;
+	unsigned long count;
+	int ret;
 
 	if (!trace->open) {
 		printk(KERN_CONT "missing open function...");
-- 
cgit v1.2.3


From 425480081e936d8725f0d44b8829d699bf088c6b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Mar 2009 13:38:36 -0400
Subject: tracing: add handler to trace_stat

Currently, if a trace_stat user wants a handle to some private data,
the trace_stat infrastructure does not supply a way to do that.

This patch passes the trace_stat structure to the start function of
the trace_stat code.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_branch.c    | 4 ++--
 kernel/trace/trace_stat.c      | 2 +-
 kernel/trace/trace_stat.h      | 2 +-
 kernel/trace/trace_workqueue.c | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index ad8c22efff4..e6e32912ffb 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -263,7 +263,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static void *annotated_branch_stat_start(void)
+static void *annotated_branch_stat_start(struct tracer_stat *trace)
 {
 	return __start_annotated_branch_profile;
 }
@@ -338,7 +338,7 @@ static int all_branch_stat_headers(struct seq_file *m)
 	return 0;
 }
 
-static void *all_branch_stat_start(void)
+static void *all_branch_stat_start(struct tracer_stat *trace)
 {
 	return __start_branch_profile;
 }
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index f71b85b22cf..f8f48d84b2c 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -85,7 +85,7 @@ static int stat_seq_init(struct tracer_stat_session *session)
 	if (!ts->stat_cmp)
 		ts->stat_cmp = dummy_cmp;
 
-	stat = ts->stat_start();
+	stat = ts->stat_start(ts);
 	if (!stat)
 		goto exit;
 
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index 202274cf7f3..f3546a2cd82 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -12,7 +12,7 @@ struct tracer_stat {
 	/* The name of your stat file */
 	const char		*name;
 	/* Iteration over statistic entries */
-	void			*(*stat_start)(void);
+	void			*(*stat_start)(struct tracer_stat *trace);
 	void			*(*stat_next)(void *prev, int idx);
 	/* Compare two entries for stats sorting */
 	int			(*stat_cmp)(void *p1, void *p2);
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 9ab035b58cf..ee533c2e161 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -152,7 +152,7 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
 	return ret;
 }
 
-static void *workqueue_stat_start(void)
+static void *workqueue_stat_start(struct tracer_stat *trace)
 {
 	int cpu;
 	void *ret = NULL;
-- 
cgit v1.2.3


From bac429f037f1a51a74d62bad6d1518c3be065df3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 20 Mar 2009 12:50:56 -0400
Subject: tracing: add function profiler

Impact: new profiling feature

This patch adds a function profiler. In debugfs/tracing/ two new
files are created.

  function_profile_enabled  - to enable or disable profiling

  trace_stat/functions   - the profiled functions.

For example:

  echo 1 > /debugfs/tracing/function_profile_enabled
  ./hackbench 50
  echo 0 > /debugfs/tracing/function_profile_enabled

yields:

  cat /debugfs/tracing/trace_stat/functions

  Function                               Hit
  --------                               ---
  _spin_lock                        10106442
  _spin_unlock                      10097492
  kfree                              6013704
  _spin_unlock_irqrestore            4423941
  _spin_lock_irqsave                 4406825
  __phys_addr                        4181686
  __slab_free                        4038222
  dput                               4030130
  path_put                           4023387
  unroll_tree_refs                   4019532
[...]

The most hit functions are listed first. Functions that are not
hit are not listed.

This feature depends on and uses dynamic function tracing. When the
function profiling is disabled, no overhead occurs. But it still
takes up around 300KB to hold the data, thus it is not recomended
to keep it enabled for systems low on memory.

When a '1' is echoed into the function_profile_enabled file, the
counters for is function is reset back to zero. Thus you can see what
functions are hit most by different programs.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/Kconfig  |  19 +++
 kernel/trace/ftrace.c | 313 +++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 330 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8a4d7293104..95e9ad5735d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -105,6 +105,7 @@ config FUNCTION_GRAPH_TRACER
 	  This is done by setting the current return address on the current
 	  task structure into a stack of calls.
 
+
 config IRQSOFF_TRACER
 	bool "Interrupts-off Latency Tracer"
 	default n
@@ -376,6 +377,24 @@ config DYNAMIC_FTRACE
 	 were made. If so, it runs stop_machine (stops all CPUS)
 	 and modifies the code to jump over the call to ftrace.
 
+config FUNCTION_PROFILER
+	bool "Kernel function profiler"
+	depends on DYNAMIC_FTRACE
+	default n
+	help
+	 This option enables the kernel function profiler. When the dynamic
+	 function tracing is enabled, a counter is added into the function
+	 records used by the dynamic function tracer. A file is created in
+	 debugfs called function_profile_enabled which defaults to zero.
+	 When a 1 is echoed into this file profiling begins, and when a
+	 zero is entered, profiling stops. A file in the trace_stats
+	 directory called functions, that show the list of functions that
+	 have been hit and their counters.
+
+	 This takes up around 320K more memory.
+
+	 If in doubt, say N
+
 config FTRACE_MCOUNT_RECORD
 	def_bool y
 	depends on DYNAMIC_FTRACE
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7b8722baf15..11f364c776d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -34,6 +34,7 @@
 #include <asm/ftrace.h>
 
 #include "trace.h"
+#include "trace_stat.h"
 
 #define FTRACE_WARN_ON(cond)			\
 	do {					\
@@ -261,7 +262,6 @@ struct ftrace_func_probe {
 	struct rcu_head		rcu;
 };
 
-
 enum {
 	FTRACE_ENABLE_CALLS		= (1 << 0),
 	FTRACE_DISABLE_CALLS		= (1 << 1),
@@ -309,6 +309,307 @@ static struct dyn_ftrace *ftrace_free_records;
 		}				\
 	}
 
+#ifdef CONFIG_FUNCTION_PROFILER
+static struct hlist_head *ftrace_profile_hash;
+static int ftrace_profile_bits;
+static int ftrace_profile_enabled;
+static DEFINE_MUTEX(ftrace_profile_lock);
+
+static void *
+function_stat_next(void *v, int idx)
+{
+	struct dyn_ftrace *rec = v;
+	struct ftrace_page *pg;
+
+	pg = (struct ftrace_page *)((unsigned long)rec & PAGE_MASK);
+
+ again:
+	rec++;
+	if ((void *)rec >= (void *)&pg->records[pg->index]) {
+		pg = pg->next;
+		if (!pg)
+			return NULL;
+		rec = &pg->records[0];
+	}
+
+	if (rec->flags & FTRACE_FL_FREE ||
+	    rec->flags & FTRACE_FL_FAILED ||
+	    !(rec->flags & FTRACE_FL_CONVERTED) ||
+	    /* ignore non hit functions */
+	    !rec->counter)
+		goto again;
+
+	return rec;
+}
+
+static void *function_stat_start(struct tracer_stat *trace)
+{
+	return function_stat_next(&ftrace_pages_start->records[0], 0);
+}
+
+static int function_stat_cmp(void *p1, void *p2)
+{
+	struct dyn_ftrace *a = p1;
+	struct dyn_ftrace *b = p2;
+
+	if (a->counter < b->counter)
+		return -1;
+	if (a->counter > b->counter)
+		return 1;
+	else
+		return 0;
+}
+
+static int function_stat_headers(struct seq_file *m)
+{
+	seq_printf(m, "  Function                               Hit\n"
+		      "  --------                               ---\n");
+	return 0;
+}
+
+static int function_stat_show(struct seq_file *m, void *v)
+{
+	struct dyn_ftrace *rec = v;
+	char str[KSYM_SYMBOL_LEN];
+
+	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+
+	seq_printf(m, "  %-30.30s  %10lu\n", str, rec->counter);
+	return 0;
+}
+
+static struct tracer_stat function_stats = {
+	.name = "functions",
+	.stat_start = function_stat_start,
+	.stat_next = function_stat_next,
+	.stat_cmp = function_stat_cmp,
+	.stat_headers = function_stat_headers,
+	.stat_show = function_stat_show
+};
+
+static void ftrace_profile_init(int nr_funcs)
+{
+	unsigned long addr;
+	int order;
+	int size;
+
+	/*
+	 * We are profiling all functions, lets make it 1/4th of the
+	 * number of functions that are in core kernel. So we have to
+	 * iterate 4 times.
+	 */
+	order = (sizeof(struct hlist_head) * nr_funcs) / 4;
+	order = get_order(order);
+	size = 1 << (PAGE_SHIFT + order);
+
+	pr_info("Allocating %d KB for profiler hash\n", size >> 10);
+
+	addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
+	if (!addr) {
+		pr_warning("Could not allocate function profiler hash\n");
+		return;
+	}
+
+	ftrace_profile_hash = (void *)addr;
+
+	/*
+	 * struct hlist_head should be a pointer of 4 or 8 bytes.
+	 * And a simple bit manipulation can be done, but if for
+	 * some reason struct hlist_head is not a mulitple of 2,
+	 * then we play it safe, and simply count. This function
+	 * is done once at boot up, so it is not that critical in
+	 * performance.
+	 */
+
+	size--;
+	size /= sizeof(struct hlist_head);
+
+	for (; size; size >>= 1)
+		ftrace_profile_bits++;
+
+	pr_info("Function profiler has %d hash buckets\n",
+		1 << ftrace_profile_bits);
+
+	return;
+}
+
+static ssize_t
+ftrace_profile_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%u\n", ftrace_profile_enabled);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static void ftrace_profile_reset(void)
+{
+	struct dyn_ftrace *rec;
+	struct ftrace_page *pg;
+
+	do_for_each_ftrace_rec(pg, rec) {
+		rec->counter = 0;
+	} while_for_each_ftrace_rec();
+}
+
+static struct dyn_ftrace *ftrace_find_profiled_func(unsigned long ip)
+{
+	struct dyn_ftrace *rec;
+	struct hlist_head *hhd;
+	struct hlist_node *n;
+	unsigned long flags;
+	unsigned long key;
+
+	if (!ftrace_profile_hash)
+		return NULL;
+
+	key = hash_long(ip, ftrace_profile_bits);
+	hhd = &ftrace_profile_hash[key];
+
+	if (hlist_empty(hhd))
+		return NULL;
+
+	local_irq_save(flags);
+	hlist_for_each_entry_rcu(rec, n, hhd, node) {
+		if (rec->ip == ip)
+			goto out;
+	}
+	rec = NULL;
+ out:
+	local_irq_restore(flags);
+
+	return rec;
+}
+
+static void
+function_profile_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct dyn_ftrace *rec;
+	unsigned long flags;
+
+	if (!ftrace_profile_enabled)
+		return;
+
+	local_irq_save(flags);
+	rec = ftrace_find_profiled_func(ip);
+	if (!rec)
+		goto out;
+
+	rec->counter++;
+ out:
+	local_irq_restore(flags);
+}
+
+static struct ftrace_ops ftrace_profile_ops __read_mostly =
+{
+	.func = function_profile_call,
+};
+
+static ssize_t
+ftrace_profile_write(struct file *filp, const char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	unsigned long val;
+	char buf[64];
+	int ret;
+
+	if (!ftrace_profile_hash) {
+		pr_info("Can not enable hash due to earlier problems\n");
+		return -ENODEV;
+	}
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	val = !!val;
+
+	mutex_lock(&ftrace_profile_lock);
+	if (ftrace_profile_enabled ^ val) {
+		if (val) {
+			ftrace_profile_reset();
+			register_ftrace_function(&ftrace_profile_ops);
+			ftrace_profile_enabled = 1;
+		} else {
+			ftrace_profile_enabled = 0;
+			unregister_ftrace_function(&ftrace_profile_ops);
+		}
+	}
+	mutex_unlock(&ftrace_profile_lock);
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static const struct file_operations ftrace_profile_fops = {
+	.open		= tracing_open_generic,
+	.read		= ftrace_profile_read,
+	.write		= ftrace_profile_write,
+};
+
+static void ftrace_profile_debugfs(struct dentry *d_tracer)
+{
+	struct dentry *entry;
+	int ret;
+
+	ret = register_stat_tracer(&function_stats);
+	if (ret) {
+		pr_warning("Warning: could not register "
+			   "function stats\n");
+		return;
+	}
+
+	entry = debugfs_create_file("function_profile_enabled", 0644,
+				    d_tracer, NULL, &ftrace_profile_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'function_profile_enabled' entry\n");
+}
+
+static void ftrace_add_profile(struct dyn_ftrace *rec)
+{
+	unsigned long key;
+
+	if (!ftrace_profile_hash)
+		return;
+
+	key = hash_long(rec->ip, ftrace_profile_bits);
+	hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]);
+}
+
+static void ftrace_profile_release(struct dyn_ftrace *rec)
+{
+	mutex_lock(&ftrace_profile_lock);
+	hlist_del(&rec->node);
+	mutex_unlock(&ftrace_profile_lock);
+}
+
+#else /* CONFIG_FUNCTION_PROFILER */
+static void ftrace_profile_init(int nr_funcs)
+{
+}
+static void ftrace_add_profile(struct dyn_ftrace *rec)
+{
+}
+static void ftrace_profile_debugfs(struct dentry *d_tracer)
+{
+}
+static void ftrace_profile_release(struct dyn_ftrace *rec)
+{
+}
+#endif /* CONFIG_FUNCTION_PROFILER */
+
 #ifdef CONFIG_KPROBES
 
 static int frozen_record_count;
@@ -359,8 +660,10 @@ void ftrace_release(void *start, unsigned long size)
 	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
 		if ((rec->ip >= s) && (rec->ip < e) &&
-		    !(rec->flags & FTRACE_FL_FREE))
+		    !(rec->flags & FTRACE_FL_FREE)) {
 			ftrace_free_rec(rec);
+			ftrace_profile_release(rec);
+		}
 	} while_for_each_ftrace_rec();
 	mutex_unlock(&ftrace_lock);
 }
@@ -414,6 +717,8 @@ ftrace_record_ip(unsigned long ip)
 	rec->newlist = ftrace_new_addrs;
 	ftrace_new_addrs = rec;
 
+	ftrace_add_profile(rec);
+
 	return rec;
 }
 
@@ -2157,6 +2462,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
 			   "'set_graph_function' entry\n");
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
+	ftrace_profile_debugfs(d_tracer);
+
 	return 0;
 }
 
@@ -2225,6 +2532,8 @@ void __init ftrace_init(void)
 	if (ret)
 		goto failed;
 
+	ftrace_profile_init(count);
+
 	last_ftrace_enabled = ftrace_enabled = 1;
 
 	ret = ftrace_convert_nops(NULL,
-- 
cgit v1.2.3


From 493762fc534c71d11d489f872c4b4a2c61173668 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 23 Mar 2009 17:12:36 -0400
Subject: tracing: move function profiler data out of function struct

Impact: reduce size of memory in function profiler

The function profiler originally introduces its counters into the
function records itself. There is 20 thousand different functions on
a normal system, and that is adding 20 thousand counters for profiling
event when not needed.

A normal run of the profiler yields only a couple of thousand functions
executed, depending on what is being profiled. This means we have around
18 thousand useless counters.

This patch rectifies this by moving the data out of the function
records used by dynamic ftrace. Data is preallocated to hold the functions
when the profiling begins. Checks are made during profiling to see if
more recorcds should be allocated, and they are allocated if it is safe
to do so.

This also removes the dependency from using dynamic ftrace, and also
removes the overhead by having it enabled.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/Kconfig  |  10 +-
 kernel/trace/ftrace.c | 440 +++++++++++++++++++++++++++++---------------------
 2 files changed, 263 insertions(+), 187 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 95e9ad5735d..8a4136096d7 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -379,20 +379,16 @@ config DYNAMIC_FTRACE
 
 config FUNCTION_PROFILER
 	bool "Kernel function profiler"
-	depends on DYNAMIC_FTRACE
+	depends on FUNCTION_TRACER
 	default n
 	help
-	 This option enables the kernel function profiler. When the dynamic
-	 function tracing is enabled, a counter is added into the function
-	 records used by the dynamic function tracer. A file is created in
-	 debugfs called function_profile_enabled which defaults to zero.
+	 This option enables the kernel function profiler. A file is created
+	 in debugfs called function_profile_enabled which defaults to zero.
 	 When a 1 is echoed into this file profiling begins, and when a
 	 zero is entered, profiling stops. A file in the trace_stats
 	 directory called functions, that show the list of functions that
 	 have been hit and their counters.
 
-	 This takes up around 320K more memory.
-
 	 If in doubt, say N
 
 config FTRACE_MCOUNT_RECORD
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 11f364c776d..24dac448cdc 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -241,87 +241,48 @@ static void ftrace_update_pid_func(void)
 #endif
 }
 
-/* set when tracing only a pid */
-struct pid *ftrace_pid_trace;
-static struct pid * const ftrace_swapper_pid = &init_struct_pid;
-
-#ifdef CONFIG_DYNAMIC_FTRACE
-
-#ifndef CONFIG_FTRACE_MCOUNT_RECORD
-# error Dynamic ftrace depends on MCOUNT_RECORD
-#endif
-
-static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
-
-struct ftrace_func_probe {
-	struct hlist_node	node;
-	struct ftrace_probe_ops	*ops;
-	unsigned long		flags;
-	unsigned long		ip;
-	void			*data;
-	struct rcu_head		rcu;
+#ifdef CONFIG_FUNCTION_PROFILER
+struct ftrace_profile {
+	struct hlist_node		node;
+	unsigned long			ip;
+	unsigned long			counter;
 };
 
-enum {
-	FTRACE_ENABLE_CALLS		= (1 << 0),
-	FTRACE_DISABLE_CALLS		= (1 << 1),
-	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2),
-	FTRACE_ENABLE_MCOUNT		= (1 << 3),
-	FTRACE_DISABLE_MCOUNT		= (1 << 4),
-	FTRACE_START_FUNC_RET		= (1 << 5),
-	FTRACE_STOP_FUNC_RET		= (1 << 6),
+struct ftrace_profile_page {
+	struct ftrace_profile_page	*next;
+	unsigned long			index;
+	struct ftrace_profile		records[];
 };
 
-static int ftrace_filtered;
+#define PROFILE_RECORDS_SIZE						\
+	(PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
 
-static struct dyn_ftrace *ftrace_new_addrs;
+#define PROFILES_PER_PAGE					\
+	(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
 
-static DEFINE_MUTEX(ftrace_regex_lock);
-
-struct ftrace_page {
-	struct ftrace_page	*next;
-	int			index;
-	struct dyn_ftrace	records[];
-};
+/* TODO: make these percpu, to prevent cache line bouncing */
+static struct ftrace_profile_page *profile_pages_start;
+static struct ftrace_profile_page *profile_pages;
 
-#define ENTRIES_PER_PAGE \
-  ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
-
-/* estimate from running different kernels */
-#define NR_TO_INIT		10000
-
-static struct ftrace_page	*ftrace_pages_start;
-static struct ftrace_page	*ftrace_pages;
-
-static struct dyn_ftrace *ftrace_free_records;
-
-/*
- * This is a double for. Do not use 'break' to break out of the loop,
- * you must use a goto.
- */
-#define do_for_each_ftrace_rec(pg, rec)					\
-	for (pg = ftrace_pages_start; pg; pg = pg->next) {		\
-		int _____i;						\
-		for (_____i = 0; _____i < pg->index; _____i++) {	\
-			rec = &pg->records[_____i];
-
-#define while_for_each_ftrace_rec()		\
-		}				\
-	}
-
-#ifdef CONFIG_FUNCTION_PROFILER
 static struct hlist_head *ftrace_profile_hash;
 static int ftrace_profile_bits;
 static int ftrace_profile_enabled;
 static DEFINE_MUTEX(ftrace_profile_lock);
 
+static DEFINE_PER_CPU(atomic_t, ftrace_profile_disable);
+
+#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
+
+static raw_spinlock_t ftrace_profile_rec_lock =
+	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
 static void *
 function_stat_next(void *v, int idx)
 {
-	struct dyn_ftrace *rec = v;
-	struct ftrace_page *pg;
+	struct ftrace_profile *rec = v;
+	struct ftrace_profile_page *pg;
 
-	pg = (struct ftrace_page *)((unsigned long)rec & PAGE_MASK);
+	pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
 
  again:
 	rec++;
@@ -330,27 +291,22 @@ function_stat_next(void *v, int idx)
 		if (!pg)
 			return NULL;
 		rec = &pg->records[0];
+		if (!rec->counter)
+			goto again;
 	}
 
-	if (rec->flags & FTRACE_FL_FREE ||
-	    rec->flags & FTRACE_FL_FAILED ||
-	    !(rec->flags & FTRACE_FL_CONVERTED) ||
-	    /* ignore non hit functions */
-	    !rec->counter)
-		goto again;
-
 	return rec;
 }
 
 static void *function_stat_start(struct tracer_stat *trace)
 {
-	return function_stat_next(&ftrace_pages_start->records[0], 0);
+	return function_stat_next(&profile_pages_start->records[0], 0);
 }
 
 static int function_stat_cmp(void *p1, void *p2)
 {
-	struct dyn_ftrace *a = p1;
-	struct dyn_ftrace *b = p2;
+	struct ftrace_profile *a = p1;
+	struct ftrace_profile *b = p2;
 
 	if (a->counter < b->counter)
 		return -1;
@@ -369,7 +325,7 @@ static int function_stat_headers(struct seq_file *m)
 
 static int function_stat_show(struct seq_file *m, void *v)
 {
-	struct dyn_ftrace *rec = v;
+	struct ftrace_profile *rec = v;
 	char str[KSYM_SYMBOL_LEN];
 
 	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -387,115 +343,191 @@ static struct tracer_stat function_stats = {
 	.stat_show = function_stat_show
 };
 
-static void ftrace_profile_init(int nr_funcs)
+static void ftrace_profile_reset(void)
 {
-	unsigned long addr;
-	int order;
-	int size;
+	struct ftrace_profile_page *pg;
 
-	/*
-	 * We are profiling all functions, lets make it 1/4th of the
-	 * number of functions that are in core kernel. So we have to
-	 * iterate 4 times.
-	 */
-	order = (sizeof(struct hlist_head) * nr_funcs) / 4;
-	order = get_order(order);
-	size = 1 << (PAGE_SHIFT + order);
-
-	pr_info("Allocating %d KB for profiler hash\n", size >> 10);
+	pg = profile_pages = profile_pages_start;
 
-	addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
-	if (!addr) {
-		pr_warning("Could not allocate function profiler hash\n");
-		return;
+	while (pg) {
+		memset(pg->records, 0, PROFILE_RECORDS_SIZE);
+		pg->index = 0;
+		pg = pg->next;
 	}
 
-	ftrace_profile_hash = (void *)addr;
+	memset(ftrace_profile_hash, 0,
+	       FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
+}
 
-	/*
-	 * struct hlist_head should be a pointer of 4 or 8 bytes.
-	 * And a simple bit manipulation can be done, but if for
-	 * some reason struct hlist_head is not a mulitple of 2,
-	 * then we play it safe, and simply count. This function
-	 * is done once at boot up, so it is not that critical in
-	 * performance.
-	 */
+int ftrace_profile_pages_init(void)
+{
+	struct ftrace_profile_page *pg;
+	int i;
 
-	size--;
-	size /= sizeof(struct hlist_head);
+	/* If we already allocated, do nothing */
+	if (profile_pages)
+		return 0;
 
-	for (; size; size >>= 1)
-		ftrace_profile_bits++;
+	profile_pages = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!profile_pages)
+		return -ENOMEM;
 
-	pr_info("Function profiler has %d hash buckets\n",
-		1 << ftrace_profile_bits);
+	pg = profile_pages_start = profile_pages;
 
-	return;
+	/* allocate 10 more pages to start */
+	for (i = 0; i < 10; i++) {
+		pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+		/*
+		 * We only care about allocating profile_pages, if
+		 * we failed to allocate here, hopefully we will allocate
+		 * later.
+		 */
+		if (!pg->next)
+			break;
+		pg = pg->next;
+	}
+
+	return 0;
 }
 
-static ssize_t
-ftrace_profile_read(struct file *filp, char __user *ubuf,
-		     size_t cnt, loff_t *ppos)
+static int ftrace_profile_init(void)
 {
-	char buf[64];
-	int r;
+	int size;
 
-	r = sprintf(buf, "%u\n", ftrace_profile_enabled);
-	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
+	if (ftrace_profile_hash) {
+		/* If the profile is already created, simply reset it */
+		ftrace_profile_reset();
+		return 0;
+	}
 
-static void ftrace_profile_reset(void)
-{
-	struct dyn_ftrace *rec;
-	struct ftrace_page *pg;
+	/*
+	 * We are profiling all functions, but usually only a few thousand
+	 * functions are hit. We'll make a hash of 1024 items.
+	 */
+	size = FTRACE_PROFILE_HASH_SIZE;
 
-	do_for_each_ftrace_rec(pg, rec) {
-		rec->counter = 0;
-	} while_for_each_ftrace_rec();
+	ftrace_profile_hash =
+		kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
+
+	if (!ftrace_profile_hash)
+		return -ENOMEM;
+
+	size--;
+
+	for (; size; size >>= 1)
+		ftrace_profile_bits++;
+
+	/* Preallocate a few pages */
+	if (ftrace_profile_pages_init() < 0) {
+		kfree(ftrace_profile_hash);
+		ftrace_profile_hash = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
 }
 
-static struct dyn_ftrace *ftrace_find_profiled_func(unsigned long ip)
+/* interrupts must be disabled */
+static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip)
 {
-	struct dyn_ftrace *rec;
+	struct ftrace_profile *rec;
 	struct hlist_head *hhd;
 	struct hlist_node *n;
-	unsigned long flags;
 	unsigned long key;
 
-	if (!ftrace_profile_hash)
-		return NULL;
-
 	key = hash_long(ip, ftrace_profile_bits);
 	hhd = &ftrace_profile_hash[key];
 
 	if (hlist_empty(hhd))
 		return NULL;
 
-	local_irq_save(flags);
 	hlist_for_each_entry_rcu(rec, n, hhd, node) {
 		if (rec->ip == ip)
-			goto out;
+			return rec;
+	}
+
+	return NULL;
+}
+
+static void ftrace_add_profile(struct ftrace_profile *rec)
+{
+	unsigned long key;
+
+	key = hash_long(rec->ip, ftrace_profile_bits);
+	hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]);
+}
+
+/* Interrupts must be disabled calling this */
+static struct ftrace_profile *
+ftrace_profile_alloc(unsigned long ip, bool alloc_safe)
+{
+	struct ftrace_profile *rec = NULL;
+
+	/* prevent recursion */
+	if (atomic_inc_return(&__get_cpu_var(ftrace_profile_disable)) != 1)
+		goto out;
+
+	__raw_spin_lock(&ftrace_profile_rec_lock);
+
+	/* Try to always keep another page available */
+	if (!profile_pages->next && alloc_safe)
+		profile_pages->next = (void *)get_zeroed_page(GFP_ATOMIC);
+
+	/*
+	 * Try to find the function again since another
+	 * task on another CPU could have added it
+	 */
+	rec = ftrace_find_profiled_func(ip);
+	if (rec)
+		goto out_unlock;
+
+	if (profile_pages->index == PROFILES_PER_PAGE) {
+		if (!profile_pages->next)
+			goto out_unlock;
+		profile_pages = profile_pages->next;
 	}
-	rec = NULL;
+
+	rec = &profile_pages->records[profile_pages->index++];
+	rec->ip = ip;
+	ftrace_add_profile(rec);
+
+ out_unlock:
+	__raw_spin_unlock(&ftrace_profile_rec_lock);
  out:
-	local_irq_restore(flags);
+	atomic_dec(&__get_cpu_var(ftrace_profile_disable));
 
 	return rec;
 }
 
+/*
+ * If we are not in an interrupt, or softirq and
+ * and interrupts are disabled and preemption is not enabled
+ * (not in a spinlock) then it should be safe to allocate memory.
+ */
+static bool ftrace_safe_to_allocate(void)
+{
+	return !in_interrupt() && irqs_disabled() && !preempt_count();
+}
+
 static void
 function_profile_call(unsigned long ip, unsigned long parent_ip)
 {
-	struct dyn_ftrace *rec;
+	struct ftrace_profile *rec;
 	unsigned long flags;
+	bool alloc_safe;
 
 	if (!ftrace_profile_enabled)
 		return;
 
+	alloc_safe = ftrace_safe_to_allocate();
+
 	local_irq_save(flags);
 	rec = ftrace_find_profiled_func(ip);
-	if (!rec)
-		goto out;
+	if (!rec) {
+		rec = ftrace_profile_alloc(ip, alloc_safe);
+		if (!rec)
+			goto out;
+	}
 
 	rec->counter++;
  out:
@@ -515,11 +547,6 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 	char buf[64];
 	int ret;
 
-	if (!ftrace_profile_hash) {
-		pr_info("Can not enable hash due to earlier problems\n");
-		return -ENODEV;
-	}
-
 	if (cnt >= sizeof(buf))
 		return -EINVAL;
 
@@ -537,7 +564,12 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 	mutex_lock(&ftrace_profile_lock);
 	if (ftrace_profile_enabled ^ val) {
 		if (val) {
-			ftrace_profile_reset();
+			ret = ftrace_profile_init();
+			if (ret < 0) {
+				cnt = ret;
+				goto out;
+			}
+
 			register_ftrace_function(&ftrace_profile_ops);
 			ftrace_profile_enabled = 1;
 		} else {
@@ -545,6 +577,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 			unregister_ftrace_function(&ftrace_profile_ops);
 		}
 	}
+ out:
 	mutex_unlock(&ftrace_profile_lock);
 
 	filp->f_pos += cnt;
@@ -552,6 +585,17 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
+static ssize_t
+ftrace_profile_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%u\n", ftrace_profile_enabled);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
 static const struct file_operations ftrace_profile_fops = {
 	.open		= tracing_open_generic,
 	.read		= ftrace_profile_read,
@@ -577,39 +621,80 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
 			   "'function_profile_enabled' entry\n");
 }
 
-static void ftrace_add_profile(struct dyn_ftrace *rec)
-{
-	unsigned long key;
-
-	if (!ftrace_profile_hash)
-		return;
-
-	key = hash_long(rec->ip, ftrace_profile_bits);
-	hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]);
-}
-
-static void ftrace_profile_release(struct dyn_ftrace *rec)
-{
-	mutex_lock(&ftrace_profile_lock);
-	hlist_del(&rec->node);
-	mutex_unlock(&ftrace_profile_lock);
-}
-
 #else /* CONFIG_FUNCTION_PROFILER */
-static void ftrace_profile_init(int nr_funcs)
-{
-}
-static void ftrace_add_profile(struct dyn_ftrace *rec)
-{
-}
 static void ftrace_profile_debugfs(struct dentry *d_tracer)
 {
 }
-static void ftrace_profile_release(struct dyn_ftrace *rec)
-{
-}
 #endif /* CONFIG_FUNCTION_PROFILER */
 
+/* set when tracing only a pid */
+struct pid *ftrace_pid_trace;
+static struct pid * const ftrace_swapper_pid = &init_struct_pid;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+#ifndef CONFIG_FTRACE_MCOUNT_RECORD
+# error Dynamic ftrace depends on MCOUNT_RECORD
+#endif
+
+static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
+
+struct ftrace_func_probe {
+	struct hlist_node	node;
+	struct ftrace_probe_ops	*ops;
+	unsigned long		flags;
+	unsigned long		ip;
+	void			*data;
+	struct rcu_head		rcu;
+};
+
+enum {
+	FTRACE_ENABLE_CALLS		= (1 << 0),
+	FTRACE_DISABLE_CALLS		= (1 << 1),
+	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2),
+	FTRACE_ENABLE_MCOUNT		= (1 << 3),
+	FTRACE_DISABLE_MCOUNT		= (1 << 4),
+	FTRACE_START_FUNC_RET		= (1 << 5),
+	FTRACE_STOP_FUNC_RET		= (1 << 6),
+};
+
+static int ftrace_filtered;
+
+static struct dyn_ftrace *ftrace_new_addrs;
+
+static DEFINE_MUTEX(ftrace_regex_lock);
+
+struct ftrace_page {
+	struct ftrace_page	*next;
+	int			index;
+	struct dyn_ftrace	records[];
+};
+
+#define ENTRIES_PER_PAGE \
+  ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
+
+/* estimate from running different kernels */
+#define NR_TO_INIT		10000
+
+static struct ftrace_page	*ftrace_pages_start;
+static struct ftrace_page	*ftrace_pages;
+
+static struct dyn_ftrace *ftrace_free_records;
+
+/*
+ * This is a double for. Do not use 'break' to break out of the loop,
+ * you must use a goto.
+ */
+#define do_for_each_ftrace_rec(pg, rec)					\
+	for (pg = ftrace_pages_start; pg; pg = pg->next) {		\
+		int _____i;						\
+		for (_____i = 0; _____i < pg->index; _____i++) {	\
+			rec = &pg->records[_____i];
+
+#define while_for_each_ftrace_rec()		\
+		}				\
+	}
+
 #ifdef CONFIG_KPROBES
 
 static int frozen_record_count;
@@ -660,10 +745,8 @@ void ftrace_release(void *start, unsigned long size)
 	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
 		if ((rec->ip >= s) && (rec->ip < e) &&
-		    !(rec->flags & FTRACE_FL_FREE)) {
+		    !(rec->flags & FTRACE_FL_FREE))
 			ftrace_free_rec(rec);
-			ftrace_profile_release(rec);
-		}
 	} while_for_each_ftrace_rec();
 	mutex_unlock(&ftrace_lock);
 }
@@ -717,8 +800,6 @@ ftrace_record_ip(unsigned long ip)
 	rec->newlist = ftrace_new_addrs;
 	ftrace_new_addrs = rec;
 
-	ftrace_add_profile(rec);
-
 	return rec;
 }
 
@@ -2462,8 +2543,6 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
 			   "'set_graph_function' entry\n");
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
-	ftrace_profile_debugfs(d_tracer);
-
 	return 0;
 }
 
@@ -2532,8 +2611,6 @@ void __init ftrace_init(void)
 	if (ret)
 		goto failed;
 
-	ftrace_profile_init(count);
-
 	last_ftrace_enabled = ftrace_enabled = 1;
 
 	ret = ftrace_convert_nops(NULL,
@@ -2734,6 +2811,9 @@ static __init int ftrace_init_debugfs(void)
 	if (!entry)
 		pr_warning("Could not create debugfs "
 			   "'set_ftrace_pid' entry\n");
+
+	ftrace_profile_debugfs(d_tracer);
+
 	return 0;
 }
 fs_initcall(ftrace_init_debugfs);
-- 
cgit v1.2.3


From 0706f1c48ca8a7ab478090b4e38f2e578ae2bfe0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 23 Mar 2009 23:12:58 -0400
Subject: tracing: adding function timings to function profiler

If the function graph trace is enabled, the function profiler will
use it to take the timing of the functions.

 cat /debug/tracing/trace_stat/functions

  Function                               Hit    Time
  --------                               ---    ----
  mwait_idle                             127    183028.4 us
  schedule                                26    151997.7 us
  __schedule                              31    151975.1 us
  sys_wait4                                2    74080.53 us
  do_wait                                  2    74077.80 us
  sys_newlstat                           138    39929.16 us
  do_path_lookup                         179    39845.79 us
  vfs_lstat_fd                           138    39761.97 us
  user_path_at                           153    39469.58 us
  path_walk                              179    39435.76 us
  __link_path_walk                       189    39143.73 us
[...]

Note the times are skewed due to the function graph tracer not taking
into account schedules.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c                | 93 ++++++++++++++++++++++++++++++++++--
 kernel/trace/trace.c                 | 11 -----
 kernel/trace/trace.h                 |  3 +-
 kernel/trace/trace_functions_graph.c | 17 +++++--
 kernel/trace/trace_output.c          | 10 ++++
 kernel/trace/trace_output.h          |  2 +
 6 files changed, 117 insertions(+), 19 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 24dac448cdc..a9ccd71fc92 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -33,7 +33,7 @@
 
 #include <asm/ftrace.h>
 
-#include "trace.h"
+#include "trace_output.h"
 #include "trace_stat.h"
 
 #define FTRACE_WARN_ON(cond)			\
@@ -246,6 +246,9 @@ struct ftrace_profile {
 	struct hlist_node		node;
 	unsigned long			ip;
 	unsigned long			counter;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	unsigned long long		time;
+#endif
 };
 
 struct ftrace_profile_page {
@@ -303,6 +306,22 @@ static void *function_stat_start(struct tracer_stat *trace)
 	return function_stat_next(&profile_pages_start->records[0], 0);
 }
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+/* function graph compares on total time */
+static int function_stat_cmp(void *p1, void *p2)
+{
+	struct ftrace_profile *a = p1;
+	struct ftrace_profile *b = p2;
+
+	if (a->time < b->time)
+		return -1;
+	if (a->time > b->time)
+		return 1;
+	else
+		return 0;
+}
+#else
+/* not function graph compares against hits */
 static int function_stat_cmp(void *p1, void *p2)
 {
 	struct ftrace_profile *a = p1;
@@ -315,11 +334,17 @@ static int function_stat_cmp(void *p1, void *p2)
 	else
 		return 0;
 }
+#endif
 
 static int function_stat_headers(struct seq_file *m)
 {
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	seq_printf(m, "  Function                               Hit    Time\n"
+		      "  --------                               ---    ----\n");
+#else
 	seq_printf(m, "  Function                               Hit\n"
 		      "  --------                               ---\n");
+#endif
 	return 0;
 }
 
@@ -327,10 +352,25 @@ static int function_stat_show(struct seq_file *m, void *v)
 {
 	struct ftrace_profile *rec = v;
 	char str[KSYM_SYMBOL_LEN];
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	static struct trace_seq s;
+	static DEFINE_MUTEX(mutex);
+
+	mutex_lock(&mutex);
+	trace_seq_init(&s);
+	trace_print_graph_duration(rec->time, &s);
+#endif
 
 	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+	seq_printf(m, "  %-30.30s  %10lu", str, rec->counter);
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	seq_printf(m, "    ");
+	trace_print_seq(m, &s);
+	mutex_unlock(&mutex);
+#endif
+	seq_putc(m, '\n');
 
-	seq_printf(m, "  %-30.30s  %10lu\n", str, rec->counter);
 	return 0;
 }
 
@@ -534,11 +574,52 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)
 	local_irq_restore(flags);
 }
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int profile_graph_entry(struct ftrace_graph_ent *trace)
+{
+	function_profile_call(trace->func, 0);
+	return 1;
+}
+
+static void profile_graph_return(struct ftrace_graph_ret *trace)
+{
+	unsigned long flags;
+	struct ftrace_profile *rec;
+
+	local_irq_save(flags);
+	rec = ftrace_find_profiled_func(trace->func);
+	if (rec)
+		rec->time += trace->rettime - trace->calltime;
+	local_irq_restore(flags);
+}
+
+static int register_ftrace_profiler(void)
+{
+	return register_ftrace_graph(&profile_graph_return,
+				     &profile_graph_entry);
+}
+
+static void unregister_ftrace_profiler(void)
+{
+	unregister_ftrace_graph();
+}
+#else
 static struct ftrace_ops ftrace_profile_ops __read_mostly =
 {
 	.func = function_profile_call,
 };
 
+static int register_ftrace_profiler(void)
+{
+	return register_ftrace_function(&ftrace_profile_ops);
+}
+
+static void unregister_ftrace_profiler(void)
+{
+	unregister_ftrace_function(&ftrace_profile_ops);
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
 static ssize_t
 ftrace_profile_write(struct file *filp, const char __user *ubuf,
 		     size_t cnt, loff_t *ppos)
@@ -570,11 +651,15 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 				goto out;
 			}
 
-			register_ftrace_function(&ftrace_profile_ops);
+			ret = register_ftrace_profiler();
+			if (ret < 0) {
+				cnt = ret;
+				goto out;
+			}
 			ftrace_profile_enabled = 1;
 		} else {
 			ftrace_profile_enabled = 0;
-			unregister_ftrace_function(&ftrace_profile_ops);
+			unregister_ftrace_profiler();
 		}
 	}
  out:
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 67c6a21dd42..821bf49771d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -402,17 +402,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
 	return cnt;
 }
 
-static void
-trace_print_seq(struct seq_file *m, struct trace_seq *s)
-{
-	int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
-
-	s->buffer[len] = 0;
-	seq_puts(m, s->buffer);
-
-	trace_seq_init(s);
-}
-
 /**
  * update_max_tr - snapshot all trace buffers from global_trace to max_tr
  * @tr: tracer
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d7410bbb9a8..c66ca3b6605 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -605,6 +605,8 @@ extern unsigned long trace_flags;
 /* Standard output formatting function used for function return traces */
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 extern enum print_line_t print_graph_function(struct trace_iterator *iter);
+extern enum print_line_t
+trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 /* TODO: make this variable */
@@ -636,7 +638,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
 	return 1;
 }
 #endif /* CONFIG_DYNAMIC_FTRACE */
-
 #else /* CONFIG_FUNCTION_GRAPH_TRACER */
 static inline enum print_line_t
 print_graph_function(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d28687e7b3a..85bba0f018b 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -426,8 +426,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
 	return TRACE_TYPE_HANDLED;
 }
 
-static enum print_line_t
-print_graph_duration(unsigned long long duration, struct trace_seq *s)
+enum print_line_t
+trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
 {
 	unsigned long nsecs_rem = do_div(duration, 1000);
 	/* log10(ULONG_MAX) + '\0' */
@@ -464,12 +464,23 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
+	return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+print_graph_duration(unsigned long long duration, struct trace_seq *s)
+{
+	int ret;
+
+	ret = trace_print_graph_duration(duration, s);
+	if (ret != TRACE_TYPE_HANDLED)
+		return ret;
 
 	ret = trace_seq_printf(s, "|  ");
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
-	return TRACE_TYPE_HANDLED;
 
+	return TRACE_TYPE_HANDLED;
 }
 
 /* Case of a leaf function on its call entry */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 19261fdd245..a3b6e3fd704 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -19,6 +19,16 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
 static int next_event_type = __TRACE_LAST_TYPE + 1;
 
+void trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+	int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+
+	s->buffer[len] = 0;
+	seq_puts(m, s->buffer);
+
+	trace_seq_init(s);
+}
+
 enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 35c422fb51a..1eac2973374 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -20,6 +20,8 @@ trace_print_bprintk_msg_only(struct trace_iterator *iter);
 extern enum print_line_t
 trace_print_printk_msg_only(struct trace_iterator *iter);
 
+extern void trace_print_seq(struct seq_file *m, struct trace_seq *s);
+
 extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
 extern int
-- 
cgit v1.2.3


From cafb168a1c92e4c9e1731fe3d666c39611762c49 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Mar 2009 20:50:39 -0400
Subject: tracing: make the function profiler per cpu

Impact: speed enhancement

By making the function profiler record in per cpu data we not only
get better readings, avoid races, we also do not have to take any
locks.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 199 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 130 insertions(+), 69 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a9ccd71fc92..ed1fc5021d4 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -257,28 +257,28 @@ struct ftrace_profile_page {
 	struct ftrace_profile		records[];
 };
 
+struct ftrace_profile_stat {
+	atomic_t			disabled;
+	struct hlist_head		*hash;
+	struct ftrace_profile_page	*pages;
+	struct ftrace_profile_page	*start;
+	struct tracer_stat		stat;
+};
+
 #define PROFILE_RECORDS_SIZE						\
 	(PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
 
 #define PROFILES_PER_PAGE					\
 	(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
 
-/* TODO: make these percpu, to prevent cache line bouncing */
-static struct ftrace_profile_page *profile_pages_start;
-static struct ftrace_profile_page *profile_pages;
-
-static struct hlist_head *ftrace_profile_hash;
 static int ftrace_profile_bits;
 static int ftrace_profile_enabled;
 static DEFINE_MUTEX(ftrace_profile_lock);
 
-static DEFINE_PER_CPU(atomic_t, ftrace_profile_disable);
+static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
 
 #define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
 
-static raw_spinlock_t ftrace_profile_rec_lock =
-	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
-
 static void *
 function_stat_next(void *v, int idx)
 {
@@ -303,7 +303,13 @@ function_stat_next(void *v, int idx)
 
 static void *function_stat_start(struct tracer_stat *trace)
 {
-	return function_stat_next(&profile_pages_start->records[0], 0);
+	struct ftrace_profile_stat *stat =
+		container_of(trace, struct ftrace_profile_stat, stat);
+
+	if (!stat || !stat->start)
+		return NULL;
+
+	return function_stat_next(&stat->start->records[0], 0);
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -374,20 +380,11 @@ static int function_stat_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct tracer_stat function_stats = {
-	.name = "functions",
-	.stat_start = function_stat_start,
-	.stat_next = function_stat_next,
-	.stat_cmp = function_stat_cmp,
-	.stat_headers = function_stat_headers,
-	.stat_show = function_stat_show
-};
-
-static void ftrace_profile_reset(void)
+static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
 {
 	struct ftrace_profile_page *pg;
 
-	pg = profile_pages = profile_pages_start;
+	pg = stat->pages = stat->start;
 
 	while (pg) {
 		memset(pg->records, 0, PROFILE_RECORDS_SIZE);
@@ -395,24 +392,24 @@ static void ftrace_profile_reset(void)
 		pg = pg->next;
 	}
 
-	memset(ftrace_profile_hash, 0,
+	memset(stat->hash, 0,
 	       FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
 }
 
-int ftrace_profile_pages_init(void)
+int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
 {
 	struct ftrace_profile_page *pg;
 	int i;
 
 	/* If we already allocated, do nothing */
-	if (profile_pages)
+	if (stat->pages)
 		return 0;
 
-	profile_pages = (void *)get_zeroed_page(GFP_KERNEL);
-	if (!profile_pages)
+	stat->pages = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!stat->pages)
 		return -ENOMEM;
 
-	pg = profile_pages_start = profile_pages;
+	pg = stat->start = stat->pages;
 
 	/* allocate 10 more pages to start */
 	for (i = 0; i < 10; i++) {
@@ -430,13 +427,16 @@ int ftrace_profile_pages_init(void)
 	return 0;
 }
 
-static int ftrace_profile_init(void)
+static int ftrace_profile_init_cpu(int cpu)
 {
+	struct ftrace_profile_stat *stat;
 	int size;
 
-	if (ftrace_profile_hash) {
+	stat = &per_cpu(ftrace_profile_stats, cpu);
+
+	if (stat->hash) {
 		/* If the profile is already created, simply reset it */
-		ftrace_profile_reset();
+		ftrace_profile_reset(stat);
 		return 0;
 	}
 
@@ -446,29 +446,45 @@ static int ftrace_profile_init(void)
 	 */
 	size = FTRACE_PROFILE_HASH_SIZE;
 
-	ftrace_profile_hash =
-		kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
+	stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
 
-	if (!ftrace_profile_hash)
+	if (!stat->hash)
 		return -ENOMEM;
 
-	size--;
+	if (!ftrace_profile_bits) {
+		size--;
 
-	for (; size; size >>= 1)
-		ftrace_profile_bits++;
+		for (; size; size >>= 1)
+			ftrace_profile_bits++;
+	}
 
 	/* Preallocate a few pages */
-	if (ftrace_profile_pages_init() < 0) {
-		kfree(ftrace_profile_hash);
-		ftrace_profile_hash = NULL;
+	if (ftrace_profile_pages_init(stat) < 0) {
+		kfree(stat->hash);
+		stat->hash = NULL;
 		return -ENOMEM;
 	}
 
 	return 0;
 }
 
+static int ftrace_profile_init(void)
+{
+	int cpu;
+	int ret = 0;
+
+	for_each_online_cpu(cpu) {
+		ret = ftrace_profile_init_cpu(cpu);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
 /* interrupts must be disabled */
-static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip)
+static struct ftrace_profile *
+ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
 {
 	struct ftrace_profile *rec;
 	struct hlist_head *hhd;
@@ -476,7 +492,7 @@ static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip)
 	unsigned long key;
 
 	key = hash_long(ip, ftrace_profile_bits);
-	hhd = &ftrace_profile_hash[key];
+	hhd = &stat->hash[key];
 
 	if (hlist_empty(hhd))
 		return NULL;
@@ -489,52 +505,50 @@ static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip)
 	return NULL;
 }
 
-static void ftrace_add_profile(struct ftrace_profile *rec)
+static void ftrace_add_profile(struct ftrace_profile_stat *stat,
+			       struct ftrace_profile *rec)
 {
 	unsigned long key;
 
 	key = hash_long(rec->ip, ftrace_profile_bits);
-	hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]);
+	hlist_add_head_rcu(&rec->node, &stat->hash[key]);
 }
 
 /* Interrupts must be disabled calling this */
 static struct ftrace_profile *
-ftrace_profile_alloc(unsigned long ip, bool alloc_safe)
+ftrace_profile_alloc(struct ftrace_profile_stat *stat,
+		     unsigned long ip, bool alloc_safe)
 {
 	struct ftrace_profile *rec = NULL;
 
 	/* prevent recursion */
-	if (atomic_inc_return(&__get_cpu_var(ftrace_profile_disable)) != 1)
+	if (atomic_inc_return(&stat->disabled) != 1)
 		goto out;
 
-	__raw_spin_lock(&ftrace_profile_rec_lock);
-
 	/* Try to always keep another page available */
-	if (!profile_pages->next && alloc_safe)
-		profile_pages->next = (void *)get_zeroed_page(GFP_ATOMIC);
+	if (!stat->pages->next && alloc_safe)
+		stat->pages->next = (void *)get_zeroed_page(GFP_ATOMIC);
 
 	/*
 	 * Try to find the function again since another
 	 * task on another CPU could have added it
 	 */
-	rec = ftrace_find_profiled_func(ip);
+	rec = ftrace_find_profiled_func(stat, ip);
 	if (rec)
-		goto out_unlock;
+		goto out;
 
-	if (profile_pages->index == PROFILES_PER_PAGE) {
-		if (!profile_pages->next)
-			goto out_unlock;
-		profile_pages = profile_pages->next;
+	if (stat->pages->index == PROFILES_PER_PAGE) {
+		if (!stat->pages->next)
+			goto out;
+		stat->pages = stat->pages->next;
 	}
 
-	rec = &profile_pages->records[profile_pages->index++];
+	rec = &stat->pages->records[stat->pages->index++];
 	rec->ip = ip;
-	ftrace_add_profile(rec);
+	ftrace_add_profile(stat, rec);
 
- out_unlock:
-	__raw_spin_unlock(&ftrace_profile_rec_lock);
  out:
-	atomic_dec(&__get_cpu_var(ftrace_profile_disable));
+	atomic_dec(&stat->disabled);
 
 	return rec;
 }
@@ -552,6 +566,7 @@ static bool ftrace_safe_to_allocate(void)
 static void
 function_profile_call(unsigned long ip, unsigned long parent_ip)
 {
+	struct ftrace_profile_stat *stat;
 	struct ftrace_profile *rec;
 	unsigned long flags;
 	bool alloc_safe;
@@ -562,9 +577,14 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)
 	alloc_safe = ftrace_safe_to_allocate();
 
 	local_irq_save(flags);
-	rec = ftrace_find_profiled_func(ip);
+
+	stat = &__get_cpu_var(ftrace_profile_stats);
+	if (!stat->hash)
+		goto out;
+
+	rec = ftrace_find_profiled_func(stat, ip);
 	if (!rec) {
-		rec = ftrace_profile_alloc(ip, alloc_safe);
+		rec = ftrace_profile_alloc(stat, ip, alloc_safe);
 		if (!rec)
 			goto out;
 	}
@@ -583,13 +603,19 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
 
 static void profile_graph_return(struct ftrace_graph_ret *trace)
 {
-	unsigned long flags;
+	struct ftrace_profile_stat *stat;
 	struct ftrace_profile *rec;
+	unsigned long flags;
 
 	local_irq_save(flags);
-	rec = ftrace_find_profiled_func(trace->func);
+	stat = &__get_cpu_var(ftrace_profile_stats);
+	if (!stat->hash)
+		goto out;
+
+	rec = ftrace_find_profiled_func(stat, trace->func);
 	if (rec)
 		rec->time += trace->rettime - trace->calltime;
+ out:
 	local_irq_restore(flags);
 }
 
@@ -687,16 +713,51 @@ static const struct file_operations ftrace_profile_fops = {
 	.write		= ftrace_profile_write,
 };
 
+/* used to initialize the real stat files */
+static struct tracer_stat function_stats __initdata = {
+	.name = "functions",
+	.stat_start = function_stat_start,
+	.stat_next = function_stat_next,
+	.stat_cmp = function_stat_cmp,
+	.stat_headers = function_stat_headers,
+	.stat_show = function_stat_show
+};
+
 static void ftrace_profile_debugfs(struct dentry *d_tracer)
 {
+	struct ftrace_profile_stat *stat;
 	struct dentry *entry;
+	char *name;
 	int ret;
+	int cpu;
 
-	ret = register_stat_tracer(&function_stats);
-	if (ret) {
-		pr_warning("Warning: could not register "
-			   "function stats\n");
-		return;
+	for_each_possible_cpu(cpu) {
+		stat = &per_cpu(ftrace_profile_stats, cpu);
+
+		/* allocate enough for function name + cpu number */
+		name = kmalloc(32, GFP_KERNEL);
+		if (!name) {
+			/*
+			 * The files created are permanent, if something happens
+			 * we still do not free memory.
+			 */
+			kfree(stat);
+			WARN(1,
+			     "Could not allocate stat file for cpu %d\n",
+			     cpu);
+			return;
+		}
+		stat->stat = function_stats;
+		snprintf(name, 32, "function%d", cpu);
+		stat->stat.name = name;
+		ret = register_stat_tracer(&stat->stat);
+		if (ret) {
+			WARN(1,
+			     "Could not register function stat for cpu %d\n",
+			     cpu);
+			kfree(name);
+			return;
+		}
 	}
 
 	entry = debugfs_create_file("function_profile_enabled", 0644,
-- 
cgit v1.2.3


From a2a16d6a3156ef7309ca7328a20c35df9418e670 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Mar 2009 23:17:58 -0400
Subject: function-graph: add option to calculate graph time or not

graph time is the time that a function is executing another function.
Thus if function A calls B, if graph-time is set, then the time for
A includes B. This is the default behavior. But if graph-time is off,
then the time spent executing B is subtracted from A.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c                | 21 ++++++++++++++++++++-
 kernel/trace/trace.c                 |  4 +++-
 kernel/trace/trace.h                 |  1 +
 kernel/trace/trace_functions_graph.c |  8 ++++----
 4 files changed, 28 insertions(+), 6 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ed1fc5021d4..71e5faef12a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -604,6 +604,7 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
 static void profile_graph_return(struct ftrace_graph_ret *trace)
 {
 	struct ftrace_profile_stat *stat;
+	unsigned long long calltime;
 	struct ftrace_profile *rec;
 	unsigned long flags;
 
@@ -612,9 +613,27 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
 	if (!stat->hash)
 		goto out;
 
+	calltime = trace->rettime - trace->calltime;
+
+	if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
+		int index;
+
+		index = trace->depth;
+
+		/* Append this call time to the parent time to subtract */
+		if (index)
+			current->ret_stack[index - 1].subtime += calltime;
+
+		if (current->ret_stack[index].subtime < calltime)
+			calltime -= current->ret_stack[index].subtime;
+		else
+			calltime = 0;
+	}
+
 	rec = ftrace_find_profiled_func(stat, trace->func);
 	if (rec)
-		rec->time += trace->rettime - trace->calltime;
+		rec->time += calltime;
+
  out:
 	local_irq_restore(flags);
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 821bf49771d..5d1a16cae37 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -255,7 +255,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
 /* trace_flags holds trace_options default values */
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
-	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME;
+	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
+	TRACE_ITER_GRAPH_TIME;
 
 /**
  * trace_wake_up - wake up tasks waiting for trace input
@@ -317,6 +318,7 @@ static const char *trace_options[] = {
 	"latency-format",
 	"global-clock",
 	"sleep-time",
+	"graph-time",
 	NULL
 };
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c66ca3b6605..e3429a8ab05 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -685,6 +685,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_LATENCY_FMT		= 0x40000,
 	TRACE_ITER_GLOBAL_CLK		= 0x80000,
 	TRACE_ITER_SLEEP_TIME		= 0x100000,
+	TRACE_ITER_GRAPH_TIME		= 0x200000,
 };
 
 /*
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 85bba0f018b..10f6ad7d85f 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -78,13 +78,14 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
 	current->ret_stack[index].ret = ret;
 	current->ret_stack[index].func = func;
 	current->ret_stack[index].calltime = calltime;
+	current->ret_stack[index].subtime = 0;
 	*depth = index;
 
 	return 0;
 }
 
 /* Retrieve a function return address to the trace stack on thread info.*/
-void
+static void
 ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
 {
 	int index;
@@ -104,9 +105,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
 	trace->calltime = current->ret_stack[index].calltime;
 	trace->overrun = atomic_read(&current->trace_overrun);
 	trace->depth = index;
-	barrier();
-	current->curr_ret_stack--;
-
 }
 
 /*
@@ -121,6 +119,8 @@ unsigned long ftrace_return_to_handler(void)
 	ftrace_pop_return_trace(&trace, &ret);
 	trace.rettime = trace_clock_local();
 	ftrace_graph_return(&trace);
+	barrier();
+	current->curr_ret_stack--;
 
 	if (unlikely(!ret)) {
 		ftrace_graph_stop();
-- 
cgit v1.2.3


From fb9fb015e92123fa3a8e0c2e2fff491d4a56b470 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 25 Mar 2009 13:26:41 -0400
Subject: tracing: clean up tracing profiler

Ingo Molnar suggested clean ups for the profiling code. This patch
makes those updates.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 71e5faef12a..a141d8499ab 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -69,7 +69,7 @@ static DEFINE_MUTEX(ftrace_lock);
 
 static struct ftrace_ops ftrace_list_end __read_mostly =
 {
-	.func = ftrace_stub,
+	.func		= ftrace_stub,
 };
 
 static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
@@ -271,8 +271,10 @@ struct ftrace_profile_stat {
 #define PROFILES_PER_PAGE					\
 	(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
 
-static int ftrace_profile_bits;
-static int ftrace_profile_enabled;
+static int ftrace_profile_bits __read_mostly;
+static int ftrace_profile_enabled __read_mostly;
+
+/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
 static DEFINE_MUTEX(ftrace_profile_lock);
 
 static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
@@ -651,7 +653,7 @@ static void unregister_ftrace_profiler(void)
 #else
 static struct ftrace_ops ftrace_profile_ops __read_mostly =
 {
-	.func = function_profile_call,
+	.func		= function_profile_call,
 };
 
 static int register_ftrace_profiler(void)
@@ -670,7 +672,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 		     size_t cnt, loff_t *ppos)
 {
 	unsigned long val;
-	char buf[64];
+	char buf[64];		/* big enough to hold a number */
 	int ret;
 
 	if (cnt >= sizeof(buf))
@@ -719,7 +721,7 @@ static ssize_t
 ftrace_profile_read(struct file *filp, char __user *ubuf,
 		     size_t cnt, loff_t *ppos)
 {
-	char buf[64];
+	char buf[64];		/* big enough to hold a number */
 	int r;
 
 	r = sprintf(buf, "%u\n", ftrace_profile_enabled);
@@ -734,12 +736,12 @@ static const struct file_operations ftrace_profile_fops = {
 
 /* used to initialize the real stat files */
 static struct tracer_stat function_stats __initdata = {
-	.name = "functions",
-	.stat_start = function_stat_start,
-	.stat_next = function_stat_next,
-	.stat_cmp = function_stat_cmp,
-	.stat_headers = function_stat_headers,
-	.stat_show = function_stat_show
+	.name		= "functions",
+	.stat_start	= function_stat_start,
+	.stat_next	= function_stat_next,
+	.stat_cmp	= function_stat_cmp,
+	.stat_headers	= function_stat_headers,
+	.stat_show	= function_stat_show
 };
 
 static void ftrace_profile_debugfs(struct dentry *d_tracer)
@@ -1954,7 +1956,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
 
 static struct ftrace_ops trace_probe_ops __read_mostly =
 {
-	.func = function_trace_probe_call,
+	.func		= function_trace_probe_call,
 };
 
 static int ftrace_probe_registered;
-- 
cgit v1.2.3


From 318e0a73c9e41b9a17241829bcd0605a39b87cb9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 25 Mar 2009 20:06:34 -0400
Subject: tracing: remove on the fly allocator from function profiler

Impact: safer code

The on the fly allocator for the function profiler was to save
memory. But at the expense of stability. Although it survived several
tests, allocating from the function tracer is just too risky, just
to save space.

This patch removes the allocator and simply allocates enough entries
at start up.

Each function gets a profiling structure of 40 bytes. With an average
of 20K functions, and this is for each CPU, we have 800K per online
CPU. This is not too bad, at least for non-embedded.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 76 +++++++++++++++++++++++++++++----------------------
 1 file changed, 43 insertions(+), 33 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a141d8499ab..4d90c916b2b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -401,6 +401,8 @@ static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
 int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
 {
 	struct ftrace_profile_page *pg;
+	int functions;
+	int pages;
 	int i;
 
 	/* If we already allocated, do nothing */
@@ -411,22 +413,46 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
 	if (!stat->pages)
 		return -ENOMEM;
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+	functions = ftrace_update_tot_cnt;
+#else
+	/*
+	 * We do not know the number of functions that exist because
+	 * dynamic tracing is what counts them. With past experience
+	 * we have around 20K functions. That should be more than enough.
+	 * It is highly unlikely we will execute every function in
+	 * the kernel.
+	 */
+	functions = 20000;
+#endif
+
 	pg = stat->start = stat->pages;
 
-	/* allocate 10 more pages to start */
-	for (i = 0; i < 10; i++) {
+	pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
+
+	for (i = 0; i < pages; i++) {
 		pg->next = (void *)get_zeroed_page(GFP_KERNEL);
-		/*
-		 * We only care about allocating profile_pages, if
-		 * we failed to allocate here, hopefully we will allocate
-		 * later.
-		 */
 		if (!pg->next)
-			break;
+			goto out_free;
 		pg = pg->next;
 	}
 
 	return 0;
+
+ out_free:
+	pg = stat->start;
+	while (pg) {
+		unsigned long tmp = (unsigned long)pg;
+
+		pg = pg->next;
+		free_page(tmp);
+	}
+
+	free_page((unsigned long)stat->pages);
+	stat->pages = NULL;
+	stat->start = NULL;
+
+	return -ENOMEM;
 }
 
 static int ftrace_profile_init_cpu(int cpu)
@@ -460,7 +486,7 @@ static int ftrace_profile_init_cpu(int cpu)
 			ftrace_profile_bits++;
 	}
 
-	/* Preallocate a few pages */
+	/* Preallocate the function profiling pages */
 	if (ftrace_profile_pages_init(stat) < 0) {
 		kfree(stat->hash);
 		stat->hash = NULL;
@@ -516,24 +542,21 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat,
 	hlist_add_head_rcu(&rec->node, &stat->hash[key]);
 }
 
-/* Interrupts must be disabled calling this */
+/*
+ * The memory is already allocated, this simply finds a new record to use.
+ */
 static struct ftrace_profile *
-ftrace_profile_alloc(struct ftrace_profile_stat *stat,
-		     unsigned long ip, bool alloc_safe)
+ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
 {
 	struct ftrace_profile *rec = NULL;
 
-	/* prevent recursion */
+	/* prevent recursion (from NMIs) */
 	if (atomic_inc_return(&stat->disabled) != 1)
 		goto out;
 
-	/* Try to always keep another page available */
-	if (!stat->pages->next && alloc_safe)
-		stat->pages->next = (void *)get_zeroed_page(GFP_ATOMIC);
-
 	/*
-	 * Try to find the function again since another
-	 * task on another CPU could have added it
+	 * Try to find the function again since an NMI
+	 * could have added it
 	 */
 	rec = ftrace_find_profiled_func(stat, ip);
 	if (rec)
@@ -555,29 +578,16 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat,
 	return rec;
 }
 
-/*
- * If we are not in an interrupt, or softirq and
- * and interrupts are disabled and preemption is not enabled
- * (not in a spinlock) then it should be safe to allocate memory.
- */
-static bool ftrace_safe_to_allocate(void)
-{
-	return !in_interrupt() && irqs_disabled() && !preempt_count();
-}
-
 static void
 function_profile_call(unsigned long ip, unsigned long parent_ip)
 {
 	struct ftrace_profile_stat *stat;
 	struct ftrace_profile *rec;
 	unsigned long flags;
-	bool alloc_safe;
 
 	if (!ftrace_profile_enabled)
 		return;
 
-	alloc_safe = ftrace_safe_to_allocate();
-
 	local_irq_save(flags);
 
 	stat = &__get_cpu_var(ftrace_profile_stats);
@@ -586,7 +596,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)
 
 	rec = ftrace_find_profiled_func(stat, ip);
 	if (!rec) {
-		rec = ftrace_profile_alloc(stat, ip, alloc_safe);
+		rec = ftrace_profile_alloc(stat, ip);
 		if (!rec)
 			goto out;
 	}
-- 
cgit v1.2.3


From 34886c8bc590f078d4c0b88f50d061326639198d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 25 Mar 2009 21:00:47 -0400
Subject: tracing: add average time in function to function profiler

Show the average time in the function (Time / Hit)

  Function                               Hit    Time            Avg
  --------                               ---    ----            ---
  mwait_idle                              51    140326.6 us     2751.503 us
  smp_apic_timer_interrupt                47    3517.735 us     74.845 us
  schedule                                10    2738.754 us     273.875 us
  __schedule                              10    2732.857 us     273.285 us
  hrtimer_interrupt                       47    1896.104 us     40.342 us
  irq_exit                                56    1711.833 us     30.568 us
  __run_hrtimer                           47    1315.589 us     27.991 us
  tick_sched_timer                        47    1138.690 us     24.227 us
  do_softirq                              56    1116.829 us     19.943 us
  __do_softirq                            56    1066.932 us     19.052 us
  do_IRQ                                   9    926.153 us      102.905 us

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4d90c916b2b..c7f4a4be05d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -347,8 +347,10 @@ static int function_stat_cmp(void *p1, void *p2)
 static int function_stat_headers(struct seq_file *m)
 {
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	seq_printf(m, "  Function                               Hit    Time\n"
-		      "  --------                               ---    ----\n");
+	seq_printf(m, "  Function                               "
+		   "Hit    Time            Avg\n"
+		      "  --------                               "
+		   "---    ----            ---\n");
 #else
 	seq_printf(m, "  Function                               Hit\n"
 		      "  --------                               ---\n");
@@ -361,12 +363,9 @@ static int function_stat_show(struct seq_file *m, void *v)
 	struct ftrace_profile *rec = v;
 	char str[KSYM_SYMBOL_LEN];
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	static struct trace_seq s;
 	static DEFINE_MUTEX(mutex);
-
-	mutex_lock(&mutex);
-	trace_seq_init(&s);
-	trace_print_graph_duration(rec->time, &s);
+	static struct trace_seq s;
+	unsigned long long avg;
 #endif
 
 	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -374,6 +373,14 @@ static int function_stat_show(struct seq_file *m, void *v)
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	seq_printf(m, "    ");
+	avg = rec->time;
+	do_div(avg, rec->counter);
+
+	mutex_lock(&mutex);
+	trace_seq_init(&s);
+	trace_print_graph_duration(rec->time, &s);
+	trace_seq_puts(&s, "    ");
+	trace_print_graph_duration(avg, &s);
 	trace_print_seq(m, &s);
 	mutex_unlock(&mutex);
 #endif
-- 
cgit v1.2.3


From de79f54f5347ad7ec6ff55ccbb6d4ab2a21f6a93 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:40 +0200
Subject: x86, bts, hw-branch-tracer: add _noirq variants to the debug store
 interface

The hw-branch-tracer uses debug store functions from an on_each_cpu()
context, which is simply wrong since the functions may sleep.

Add _noirq variants for most functions, which  may be called with
interrupts disabled.

Separate per-cpu and per-task tracing and allow per-cpu tracing to be
controlled from any cpu.

Make the hw-branch-tracer use the new debug store interface, synchronize
with hotplug cpu event using get/put_online_cpus(), and remove the
unnecessary spinlock.

Make the ptrace bts and the ds selftest code use the new interface.

Defer the ds selftest.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144555.658136000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_hw_branches.c | 193 +++++++++++++++------------------------
 1 file changed, 72 insertions(+), 121 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 8b2109a6c61..50565d8cd2e 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -4,7 +4,6 @@
  * Copyright (C) 2008-2009 Intel Corporation.
  * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
  */
-#include <linux/spinlock.h>
 #include <linux/kallsyms.h>
 #include <linux/debugfs.h>
 #include <linux/ftrace.h>
@@ -21,168 +20,113 @@
 
 #define BTS_BUFFER_SIZE (1 << 13)
 
-/*
- * The tracer lock protects the below per-cpu tracer array.
- * It needs to be held to:
- * - start tracing on all cpus
- * - stop tracing on all cpus
- * - start tracing on a single hotplug cpu
- * - stop tracing on a single hotplug cpu
- * - read the trace from all cpus
- * - read the trace from a single cpu
- */
-static DEFINE_SPINLOCK(bts_tracer_lock);
 static DEFINE_PER_CPU(struct bts_tracer *, tracer);
 static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer);
 
 #define this_tracer per_cpu(tracer, smp_processor_id())
-#define this_buffer per_cpu(buffer, smp_processor_id())
 
 static int trace_hw_branches_enabled __read_mostly;
 static int trace_hw_branches_suspended __read_mostly;
 static struct trace_array *hw_branch_trace __read_mostly;
 
 
-/*
- * Initialize the tracer for the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_init_cpu(void *arg)
+static void bts_trace_init_cpu(int cpu)
 {
-	if (this_tracer)
-		ds_release_bts(this_tracer);
+	per_cpu(tracer, cpu) =
+		ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE,
+				   NULL, (size_t)-1, BTS_KERNEL);
 
-	this_tracer = ds_request_bts(NULL, this_buffer, BTS_BUFFER_SIZE,
-				     NULL, (size_t)-1, BTS_KERNEL);
-	if (IS_ERR(this_tracer)) {
-		this_tracer = NULL;
-		return;
-	}
+	if (IS_ERR(per_cpu(tracer, cpu)))
+		per_cpu(tracer, cpu) = NULL;
 }
 
 static int bts_trace_init(struct trace_array *tr)
 {
-	int cpu, avail;
-
-	spin_lock(&bts_tracer_lock);
+	int cpu;
 
 	hw_branch_trace = tr;
+	trace_hw_branches_enabled = 0;
 
-	on_each_cpu(bts_trace_init_cpu, NULL, 1);
-
-	/* Check on how many cpus we could enable tracing */
-	avail = 0;
-	for_each_online_cpu(cpu)
-		if (per_cpu(tracer, cpu))
-			avail++;
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		bts_trace_init_cpu(cpu);
 
-	trace_hw_branches_enabled = (avail ? 1 : 0);
+		if (likely(per_cpu(tracer, cpu)))
+			trace_hw_branches_enabled = 1;
+	}
 	trace_hw_branches_suspended = 0;
-
-	spin_unlock(&bts_tracer_lock);
-
+	put_online_cpus();
 
 	/* If we could not enable tracing on a single cpu, we fail. */
-	return avail ? 0 : -EOPNOTSUPP;
-}
-
-/*
- * Release the tracer for the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_release_cpu(void *arg)
-{
-	if (this_tracer) {
-		ds_release_bts(this_tracer);
-		this_tracer = NULL;
-	}
+	return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
 }
 
 static void bts_trace_reset(struct trace_array *tr)
 {
-	spin_lock(&bts_tracer_lock);
+	int cpu;
 
-	on_each_cpu(bts_trace_release_cpu, NULL, 1);
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		if (likely(per_cpu(tracer, cpu))) {
+			ds_release_bts(per_cpu(tracer, cpu));
+			per_cpu(tracer, cpu) = NULL;
+		}
+	}
 	trace_hw_branches_enabled = 0;
 	trace_hw_branches_suspended = 0;
-
-	spin_unlock(&bts_tracer_lock);
-}
-
-/*
- * Resume tracing on the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_resume_cpu(void *arg)
-{
-	if (this_tracer)
-		ds_resume_bts(this_tracer);
+	put_online_cpus();
 }
 
 static void bts_trace_start(struct trace_array *tr)
 {
-	spin_lock(&bts_tracer_lock);
+	int cpu;
 
-	on_each_cpu(bts_trace_resume_cpu, NULL, 1);
+	get_online_cpus();
+	for_each_online_cpu(cpu)
+		if (likely(per_cpu(tracer, cpu)))
+			ds_resume_bts(per_cpu(tracer, cpu));
 	trace_hw_branches_suspended = 0;
-
-	spin_unlock(&bts_tracer_lock);
-}
-
-/*
- * Suspend tracing on the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_suspend_cpu(void *arg)
-{
-	if (this_tracer)
-		ds_suspend_bts(this_tracer);
+	put_online_cpus();
 }
 
 static void bts_trace_stop(struct trace_array *tr)
 {
-	spin_lock(&bts_tracer_lock);
+	int cpu;
 
-	on_each_cpu(bts_trace_suspend_cpu, NULL, 1);
+	get_online_cpus();
+	for_each_online_cpu(cpu)
+		if (likely(per_cpu(tracer, cpu)))
+			ds_suspend_bts(per_cpu(tracer, cpu));
 	trace_hw_branches_suspended = 1;
-
-	spin_unlock(&bts_tracer_lock);
+	put_online_cpus();
 }
 
 static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
 				     unsigned long action, void *hcpu)
 {
-	unsigned int cpu = (unsigned long)hcpu;
-
-	spin_lock(&bts_tracer_lock);
-
-	if (!trace_hw_branches_enabled)
-		goto out;
+	int cpu = (long)hcpu;
 
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
-		smp_call_function_single(cpu, bts_trace_init_cpu, NULL, 1);
-
-		if (trace_hw_branches_suspended)
-			smp_call_function_single(cpu, bts_trace_suspend_cpu,
-						 NULL, 1);
+		/* The notification is sent with interrupts enabled. */
+		if (trace_hw_branches_enabled) {
+			bts_trace_init_cpu(cpu);
+
+			if (trace_hw_branches_suspended &&
+			    likely(per_cpu(tracer, cpu)))
+				ds_suspend_bts(per_cpu(tracer, cpu));
+		}
 		break;
+
 	case CPU_DOWN_PREPARE:
-		smp_call_function_single(cpu, bts_trace_release_cpu, NULL, 1);
-		break;
+		/* The notification is sent with interrupts enabled. */
+		if (likely(per_cpu(tracer, cpu))) {
+			ds_release_bts(per_cpu(tracer, cpu));
+			per_cpu(tracer, cpu) = NULL;
+		}
 	}
 
- out:
-	spin_unlock(&bts_tracer_lock);
 	return NOTIFY_DONE;
 }
 
@@ -274,7 +218,7 @@ static void trace_bts_at(const struct bts_trace *trace, void *at)
 /*
  * Collect the trace on the current cpu and write it into the ftrace buffer.
  *
- * pre: bts_tracer_lock must be locked
+ * pre: tracing must be suspended on the current cpu
  */
 static void trace_bts_cpu(void *arg)
 {
@@ -291,10 +235,9 @@ static void trace_bts_cpu(void *arg)
 	if (unlikely(!this_tracer))
 		return;
 
-	ds_suspend_bts(this_tracer);
 	trace = ds_read_bts(this_tracer);
 	if (!trace)
-		goto out;
+		return;
 
 	for (at = trace->ds.top; (void *)at < trace->ds.end;
 	     at += trace->ds.size)
@@ -303,18 +246,27 @@ static void trace_bts_cpu(void *arg)
 	for (at = trace->ds.begin; (void *)at < trace->ds.top;
 	     at += trace->ds.size)
 		trace_bts_at(trace, at);
-
-out:
-	ds_resume_bts(this_tracer);
 }
 
 static void trace_bts_prepare(struct trace_iterator *iter)
 {
-	spin_lock(&bts_tracer_lock);
+	int cpu;
 
+	get_online_cpus();
+	for_each_online_cpu(cpu)
+		if (likely(per_cpu(tracer, cpu)))
+			ds_suspend_bts(per_cpu(tracer, cpu));
+	/*
+	 * We need to collect the trace on the respective cpu since ftrace
+	 * implicitly adds the record for the current cpu.
+	 * Once that is more flexible, we could collect the data from any cpu.
+	 */
 	on_each_cpu(trace_bts_cpu, iter->tr, 1);
 
-	spin_unlock(&bts_tracer_lock);
+	for_each_online_cpu(cpu)
+		if (likely(per_cpu(tracer, cpu)))
+			ds_resume_bts(per_cpu(tracer, cpu));
+	put_online_cpus();
 }
 
 static void trace_bts_close(struct trace_iterator *iter)
@@ -324,12 +276,11 @@ static void trace_bts_close(struct trace_iterator *iter)
 
 void trace_hw_branch_oops(void)
 {
-	spin_lock(&bts_tracer_lock);
-
-	if (trace_hw_branches_enabled)
+	if (this_tracer) {
+		ds_suspend_bts_noirq(this_tracer);
 		trace_bts_cpu(hw_branch_trace);
-
-	spin_unlock(&bts_tracer_lock);
+		ds_resume_bts_noirq(this_tracer);
+	}
 }
 
 struct tracer bts_tracer __read_mostly =
-- 
cgit v1.2.3


From 4d657e51dfc042216febd4a007c6f36881f9256d Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:41 +0200
Subject: x86, hw-branch-tracer: allocate selftest iterator on heap

Allocate the trace_iterator for the hw-branch-tracer selftest on the heap.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144556.578777000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_selftest.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 499d01c44cd..00dd6485bdd 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -757,7 +757,7 @@ int
 trace_selftest_startup_hw_branches(struct tracer *trace,
 				   struct trace_array *tr)
 {
-	struct trace_iterator iter;
+	struct trace_iterator *iter;
 	struct tracer tracer;
 	unsigned long count;
 	int ret;
@@ -777,17 +777,21 @@ trace_selftest_startup_hw_branches(struct tracer *trace,
 	 * The hw-branch tracer needs to collect the trace from the various
 	 * cpu trace buffers - before tracing is stopped.
 	 */
-	memset(&iter, 0, sizeof(iter));
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
+
 	memcpy(&tracer, trace, sizeof(tracer));
 
-	iter.trace = &tracer;
-	iter.tr = tr;
-	iter.pos = -1;
-	mutex_init(&iter.mutex);
+	iter->trace = &tracer;
+	iter->tr = tr;
+	iter->pos = -1;
+	mutex_init(&iter->mutex);
 
-	trace->open(&iter);
+	trace->open(iter);
 
-	mutex_destroy(&iter.mutex);
+	mutex_destroy(&iter->mutex);
+	kfree(iter);
 
 	tracing_stop();
 
-- 
cgit v1.2.3


From a5dec5573f3c7e63f2f9b5852b9759ea342a5ff9 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 27 Mar 2009 14:55:44 +0800
Subject: tracing: use macros to denote usec and nsec per second

Impact: cleanup

Use USEC_PER_SEC and NSEC_PER_SEC instead of 1000000 and 1000000000.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <49CC7870.9000309@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_boot.c      | 5 +++--
 kernel/trace/trace_mmiotrace.c | 6 ++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 7a30fc4c364..a29ef23ffb4 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -9,6 +9,7 @@
 #include <linux/debugfs.h>
 #include <linux/ftrace.h>
 #include <linux/kallsyms.h>
+#include <linux/time.h>
 
 #include "trace.h"
 #include "trace_output.h"
@@ -67,7 +68,7 @@ initcall_call_print_line(struct trace_iterator *iter)
 	trace_assign_type(field, entry);
 	call = &field->boot_call;
 	ts = iter->ts;
-	nsec_rem = do_div(ts, 1000000000);
+	nsec_rem = do_div(ts, NSEC_PER_SEC);
 
 	ret = trace_seq_printf(s, "[%5ld.%09ld] calling  %s @ %i\n",
 			(unsigned long)ts, nsec_rem, call->func, call->caller);
@@ -92,7 +93,7 @@ initcall_ret_print_line(struct trace_iterator *iter)
 	trace_assign_type(field, entry);
 	init_ret = &field->boot_ret;
 	ts = iter->ts;
-	nsec_rem = do_div(ts, 1000000000);
+	nsec_rem = do_div(ts, NSEC_PER_SEC);
 
 	ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
 			"returned %d after %llu msecs\n",
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 8e37fcddd8b..d53b45ed080 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,8 @@
 #include <linux/kernel.h>
 #include <linux/mmiotrace.h>
 #include <linux/pci.h>
+#include <linux/time.h>
+
 #include <asm/atomic.h>
 
 #include "trace.h"
@@ -174,7 +176,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
 	struct mmiotrace_rw *rw;
 	struct trace_seq *s	= &iter->seq;
 	unsigned long long t	= ns2usecs(iter->ts);
-	unsigned long usec_rem	= do_div(t, 1000000ULL);
+	unsigned long usec_rem	= do_div(t, USEC_PER_SEC);
 	unsigned secs		= (unsigned long)t;
 	int ret = 1;
 
@@ -221,7 +223,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
 	struct mmiotrace_map *m;
 	struct trace_seq *s	= &iter->seq;
 	unsigned long long t	= ns2usecs(iter->ts);
-	unsigned long usec_rem	= do_div(t, 1000000ULL);
+	unsigned long usec_rem	= do_div(t, USEC_PER_SEC);
 	unsigned secs		= (unsigned long)t;
 	int ret;
 
-- 
cgit v1.2.3


From 5452af664f6fba26b80eb2c8c4ceae2999d5cf56 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 27 Mar 2009 00:25:38 +0100
Subject: tracing/ftrace: factorize the tracing files creation

Impact: cleanup

Most of the tracing files creation follow the same pattern:

ret = debugfs_create_file(...)
if (!ret)
	pr_warning("Couldn't create ... entry\n")

Unify it!

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1238109938-11840-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c              |  39 +++------
 kernel/trace/ring_buffer.c         |   7 +-
 kernel/trace/trace.c               | 159 +++++++++++++++----------------------
 kernel/trace/trace.h               |   6 ++
 kernel/trace/trace_event_profile.c |   1 -
 kernel/trace/trace_printk.c        |   6 +-
 kernel/trace/trace_stack.c         |  13 +--
 kernel/trace/trace_sysprof.c       |   6 +-
 8 files changed, 86 insertions(+), 151 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 678e3d6caf8..6ea5a1ae6a9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2698,38 +2698,23 @@ static const struct file_operations ftrace_graph_fops = {
 
 static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
 {
-	struct dentry *entry;
 
-	entry = debugfs_create_file("available_filter_functions", 0444,
-				    d_tracer, NULL, &ftrace_avail_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'available_filter_functions' entry\n");
+	trace_create_file("available_filter_functions", 0444,
+			d_tracer, NULL, &ftrace_avail_fops);
 
-	entry = debugfs_create_file("failures", 0444,
-				    d_tracer, NULL, &ftrace_failures_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'failures' entry\n");
+	trace_create_file("failures", 0444,
+			d_tracer, NULL, &ftrace_failures_fops);
 
-	entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
-				    NULL, &ftrace_filter_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'set_ftrace_filter' entry\n");
+	trace_create_file("set_ftrace_filter", 0644, d_tracer,
+			NULL, &ftrace_filter_fops);
 
-	entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer,
+	trace_create_file("set_ftrace_notrace", 0644, d_tracer,
 				    NULL, &ftrace_notrace_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'set_ftrace_notrace' entry\n");
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	entry = debugfs_create_file("set_graph_function", 0444, d_tracer,
+	trace_create_file("set_graph_function", 0444, d_tracer,
 				    NULL,
 				    &ftrace_graph_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'set_graph_function' entry\n");
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 	return 0;
@@ -2987,7 +2972,6 @@ static const struct file_operations ftrace_pid_fops = {
 static __init int ftrace_init_debugfs(void)
 {
 	struct dentry *d_tracer;
-	struct dentry *entry;
 
 	d_tracer = tracing_init_dentry();
 	if (!d_tracer)
@@ -2995,11 +2979,8 @@ static __init int ftrace_init_debugfs(void)
 
 	ftrace_init_dyn_debugfs(d_tracer);
 
-	entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer,
-				    NULL, &ftrace_pid_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'set_ftrace_pid' entry\n");
+	trace_create_file("set_ftrace_pid", 0644, d_tracer,
+			    NULL, &ftrace_pid_fops);
 
 	ftrace_profile_debugfs(d_tracer);
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 960cbf44c84..74a11808c28 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2845,14 +2845,11 @@ static const struct file_operations rb_simple_fops = {
 static __init int rb_init_debugfs(void)
 {
 	struct dentry *d_tracer;
-	struct dentry *entry;
 
 	d_tracer = tracing_init_dentry();
 
-	entry = debugfs_create_file("tracing_on", 0644, d_tracer,
-				    &ring_buffer_flags, &rb_simple_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'tracing_on' entry\n");
+	trace_create_file("tracing_on", 0644, d_tracer,
+			    &ring_buffer_flags, &rb_simple_fops);
 
 	return 0;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 32653c8c6e2..0615751a3ed 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3581,7 +3581,7 @@ struct dentry *tracing_dentry_percpu(void)
 static void tracing_init_debugfs_percpu(long cpu)
 {
 	struct dentry *d_percpu = tracing_dentry_percpu();
-	struct dentry *entry, *d_cpu;
+	struct dentry *d_cpu;
 	/* strlen(cpu) + MAX(log10(cpu)) + '\0' */
 	char cpu_dir[7];
 
@@ -3596,21 +3596,15 @@ static void tracing_init_debugfs_percpu(long cpu)
 	}
 
 	/* per cpu trace_pipe */
-	entry = debugfs_create_file("trace_pipe", 0444, d_cpu,
-				(void *) cpu, &tracing_pipe_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace_pipe' entry\n");
+	trace_create_file("trace_pipe", 0444, d_cpu,
+			(void *) cpu, &tracing_pipe_fops);
 
 	/* per cpu trace */
-	entry = debugfs_create_file("trace", 0644, d_cpu,
-				(void *) cpu, &tracing_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace' entry\n");
+	trace_create_file("trace", 0644, d_cpu,
+			(void *) cpu, &tracing_fops);
 
-	entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu,
-				    (void *) cpu, &tracing_buffers_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n");
+	trace_create_file("trace_pipe_raw", 0444, d_cpu,
+			(void *) cpu, &tracing_buffers_fops);
 }
 
 #ifdef CONFIG_FTRACE_SELFTEST
@@ -3766,6 +3760,22 @@ static const struct file_operations trace_options_core_fops = {
 	.write = trace_options_core_write,
 };
 
+struct dentry *trace_create_file(const char *name,
+				 mode_t mode,
+				 struct dentry *parent,
+				 void *data,
+				 const struct file_operations *fops)
+{
+	struct dentry *ret;
+
+	ret = debugfs_create_file(name, mode, parent, data, fops);
+	if (!ret)
+		pr_warning("Could not create debugfs '%s' entry\n", name);
+
+	return ret;
+}
+
+
 static struct dentry *trace_options_init_dentry(void)
 {
 	struct dentry *d_tracer;
@@ -3793,7 +3803,6 @@ create_trace_option_file(struct trace_option_dentry *topt,
 			 struct tracer_opt *opt)
 {
 	struct dentry *t_options;
-	struct dentry *entry;
 
 	t_options = trace_options_init_dentry();
 	if (!t_options)
@@ -3802,11 +3811,9 @@ create_trace_option_file(struct trace_option_dentry *topt,
 	topt->flags = flags;
 	topt->opt = opt;
 
-	entry = debugfs_create_file(opt->name, 0644, t_options, topt,
+	topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
 				    &trace_options_fops);
 
-	topt->entry = entry;
-
 }
 
 static struct trace_option_dentry *
@@ -3861,123 +3868,81 @@ static struct dentry *
 create_trace_option_core_file(const char *option, long index)
 {
 	struct dentry *t_options;
-	struct dentry *entry;
 
 	t_options = trace_options_init_dentry();
 	if (!t_options)
 		return NULL;
 
-	entry = debugfs_create_file(option, 0644, t_options, (void *)index,
+	return trace_create_file(option, 0644, t_options, (void *)index,
 				    &trace_options_core_fops);
-
-	return entry;
 }
 
 static __init void create_trace_options_dir(void)
 {
 	struct dentry *t_options;
-	struct dentry *entry;
 	int i;
 
 	t_options = trace_options_init_dentry();
 	if (!t_options)
 		return;
 
-	for (i = 0; trace_options[i]; i++) {
-		entry = create_trace_option_core_file(trace_options[i], i);
-		if (!entry)
-			pr_warning("Could not create debugfs %s entry\n",
-				   trace_options[i]);
-	}
+	for (i = 0; trace_options[i]; i++)
+		create_trace_option_core_file(trace_options[i], i);
 }
 
 static __init int tracer_init_debugfs(void)
 {
 	struct dentry *d_tracer;
-	struct dentry *entry;
 	int cpu;
 
 	d_tracer = tracing_init_dentry();
 
-	entry = debugfs_create_file("tracing_enabled", 0644, d_tracer,
-				    &global_trace, &tracing_ctrl_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
+	trace_create_file("tracing_enabled", 0644, d_tracer,
+			&global_trace, &tracing_ctrl_fops);
 
-	entry = debugfs_create_file("trace_options", 0644, d_tracer,
-				    NULL, &tracing_iter_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace_options' entry\n");
+	trace_create_file("trace_options", 0644, d_tracer,
+			NULL, &tracing_iter_fops);
 
-	create_trace_options_dir();
+	trace_create_file("tracing_cpumask", 0644, d_tracer,
+			NULL, &tracing_cpumask_fops);
+
+	trace_create_file("trace", 0644, d_tracer,
+			(void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
+
+	trace_create_file("available_tracers", 0444, d_tracer,
+			&global_trace, &show_traces_fops);
+
+	trace_create_file("current_tracer", 0444, d_tracer,
+			&global_trace, &set_tracer_fops);
+
+	trace_create_file("tracing_max_latency", 0644, d_tracer,
+			&tracing_max_latency, &tracing_max_lat_fops);
+
+	trace_create_file("tracing_thresh", 0644, d_tracer,
+			&tracing_thresh, &tracing_max_lat_fops);
 
-	entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
-				    NULL, &tracing_cpumask_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
-
-	entry = debugfs_create_file("trace", 0644, d_tracer,
-				 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace' entry\n");
-
-	entry = debugfs_create_file("available_tracers", 0444, d_tracer,
-				    &global_trace, &show_traces_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'available_tracers' entry\n");
-
-	entry = debugfs_create_file("current_tracer", 0444, d_tracer,
-				    &global_trace, &set_tracer_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'current_tracer' entry\n");
-
-	entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
-				    &tracing_max_latency,
-				    &tracing_max_lat_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'tracing_max_latency' entry\n");
-
-	entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
-				    &tracing_thresh, &tracing_max_lat_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'tracing_thresh' entry\n");
-	entry = debugfs_create_file("README", 0644, d_tracer,
-				    NULL, &tracing_readme_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'README' entry\n");
-
-	entry = debugfs_create_file("trace_pipe", 0444, d_tracer,
+	trace_create_file("README", 0644, d_tracer,
+			NULL, &tracing_readme_fops);
+
+	trace_create_file("trace_pipe", 0444, d_tracer,
 			(void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'trace_pipe' entry\n");
-
-	entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
-				    &global_trace, &tracing_entries_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'buffer_size_kb' entry\n");
-
-	entry = debugfs_create_file("trace_marker", 0220, d_tracer,
-				    NULL, &tracing_mark_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'trace_marker' entry\n");
+
+	trace_create_file("buffer_size_kb", 0644, d_tracer,
+			&global_trace, &tracing_entries_fops);
+
+	trace_create_file("trace_marker", 0220, d_tracer,
+			NULL, &tracing_mark_fops);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
-	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
-				    &ftrace_update_tot_cnt,
-				    &tracing_dyn_info_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'dyn_ftrace_total_info' entry\n");
+	trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
+			&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
 #endif
 #ifdef CONFIG_SYSPROF_TRACER
 	init_tracer_sysprof_debugfs(d_tracer);
 #endif
 
+	create_trace_options_dir();
+
 	for_each_tracing_cpu(cpu)
 		tracing_init_debugfs_percpu(cpu);
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 47aa6d0c97a..f76a8f8689d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -470,6 +470,12 @@ void trace_wake_up(void);
 void tracing_reset(struct trace_array *tr, int cpu);
 void tracing_reset_online_cpus(struct trace_array *tr);
 int tracing_open_generic(struct inode *inode, struct file *filp);
+struct dentry *trace_create_file(const char *name,
+				 mode_t mode,
+				 struct dentry *parent,
+				 void *data,
+				 const struct file_operations *fops);
+
 struct dentry *tracing_init_dentry(void);
 void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
 
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 22cba997077..199de9c7422 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -28,4 +28,3 @@ void ftrace_profile_disable(int event_id)
 			return event->profile_disable(event);
 	}
 }
-
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index eb81556107f..9bece9687b6 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -245,17 +245,13 @@ static const struct file_operations ftrace_formats_fops = {
 static __init int init_trace_printk_function_export(void)
 {
 	struct dentry *d_tracer;
-	struct dentry *entry;
 
 	d_tracer = tracing_init_dentry();
 	if (!d_tracer)
 		return 0;
 
-	entry = debugfs_create_file("printk_formats", 0444, d_tracer,
+	trace_create_file("printk_formats", 0444, d_tracer,
 				    NULL, &ftrace_formats_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'printk_formats' entry\n");
 
 	return 0;
 }
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index c750f65f966..1796f00524e 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -352,19 +352,14 @@ __setup("stacktrace", enable_stacktrace);
 static __init int stack_trace_init(void)
 {
 	struct dentry *d_tracer;
-	struct dentry *entry;
 
 	d_tracer = tracing_init_dentry();
 
-	entry = debugfs_create_file("stack_max_size", 0644, d_tracer,
-				    &max_stack_size, &stack_max_size_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'stack_max_size' entry\n");
+	trace_create_file("stack_max_size", 0644, d_tracer,
+			&max_stack_size, &stack_max_size_fops);
 
-	entry = debugfs_create_file("stack_trace", 0444, d_tracer,
-				    NULL, &stack_trace_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'stack_trace' entry\n");
+	trace_create_file("stack_trace", 0444, d_tracer,
+			NULL, &stack_trace_fops);
 
 	if (stack_tracer_enabled)
 		register_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 91fd19c2149..e04b76cc238 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -321,11 +321,7 @@ static const struct file_operations sysprof_sample_fops = {
 
 void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
 {
-	struct dentry *entry;
 
-	entry = debugfs_create_file("sysprof_sample_period", 0644,
+	trace_create_file("sysprof_sample_period", 0644,
 			d_tracer, NULL, &sysprof_sample_fops);
-	if (entry)
-		return;
-	pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
 }
-- 
cgit v1.2.3


From 597af81537654097b67fd7a0c92775e66d4a86fe Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 3 Apr 2009 15:24:12 -0400
Subject: function-graph: use int instead of atomic for ftrace_graph_active

Impact: cleanup

The variable ftrace_graph_active is only modified under the
ftrace_lock mutex, thus an atomic is not necessary for modification.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6ea5a1ae6a9..8e6a0b5c994 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3092,7 +3092,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
-static atomic_t ftrace_graph_active;
+static int ftrace_graph_active;
 static struct notifier_block ftrace_suspend_notifier;
 
 int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
@@ -3244,7 +3244,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 	mutex_lock(&ftrace_lock);
 
 	/* we currently allow only one tracer registered at a time */
-	if (atomic_read(&ftrace_graph_active)) {
+	if (ftrace_graph_active) {
 		ret = -EBUSY;
 		goto out;
 	}
@@ -3252,10 +3252,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 	ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
 	register_pm_notifier(&ftrace_suspend_notifier);
 
-	atomic_inc(&ftrace_graph_active);
+	ftrace_graph_active++;
 	ret = start_graph_tracing();
 	if (ret) {
-		atomic_dec(&ftrace_graph_active);
+		ftrace_graph_active--;
 		goto out;
 	}
 
@@ -3273,10 +3273,10 @@ void unregister_ftrace_graph(void)
 {
 	mutex_lock(&ftrace_lock);
 
-	if (!unlikely(atomic_read(&ftrace_graph_active)))
+	if (unlikely(!ftrace_graph_active))
 		goto out;
 
-	atomic_dec(&ftrace_graph_active);
+	ftrace_graph_active--;
 	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
 	ftrace_graph_entry = ftrace_graph_entry_stub;
@@ -3290,7 +3290,7 @@ void unregister_ftrace_graph(void)
 /* Allocate a return stack for newly created task */
 void ftrace_graph_init_task(struct task_struct *t)
 {
-	if (atomic_read(&ftrace_graph_active)) {
+	if (ftrace_graph_active) {
 		t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
 				* sizeof(struct ftrace_ret_stack),
 				GFP_KERNEL);
-- 
cgit v1.2.3


From dcef788eb9659b61a2110284fcce3ca6e63480d2 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Tue, 31 Mar 2009 15:26:14 +0800
Subject: ftrace: clean up enable logic for sched_switch

Unify sched_switch and sched_wakeup's action to following logic:
Do record_cmdline when start_cmdline_record() is called.
Start tracing events when the tracer is started.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
LKML-Reference: <49D1C596.5050203@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_sched_switch.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 9117cea6f1a..9d8cccdfaa0 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -29,13 +29,13 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
 	int cpu;
 	int pc;
 
-	if (!sched_ref || sched_stopped)
+	if (unlikely(!sched_ref))
 		return;
 
 	tracing_record_cmdline(prev);
 	tracing_record_cmdline(next);
 
-	if (!tracer_enabled)
+	if (!tracer_enabled || sched_stopped)
 		return;
 
 	pc = preempt_count();
@@ -56,15 +56,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
 	unsigned long flags;
 	int cpu, pc;
 
-	if (!likely(tracer_enabled))
+	if (unlikely(!sched_ref))
 		return;
 
-	pc = preempt_count();
 	tracing_record_cmdline(current);
 
-	if (sched_stopped)
+	if (!tracer_enabled || sched_stopped)
 		return;
 
+	pc = preempt_count();
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = ctx_trace->data[cpu];
-- 
cgit v1.2.3


From 02af61bb50f5d5f0322dbe5ab2a0d75808d25c7b Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Fri, 10 Apr 2009 14:26:18 +0800
Subject: tracing, kmemtrace: Separate include/trace/kmemtrace.h to kmemtrace
 part and tracepoint part

Impact: refactor code for future changes

Current kmemtrace.h is used both as header file of kmemtrace and kmem's
tracepoints definition.

Tracepoints' definition file may be used by other code, and should only have
definition of tracepoint.

We can separate include/trace/kmemtrace.h into 2 files:

  include/linux/kmemtrace.h: header file for kmemtrace
  include/trace/kmem.h:      definition of kmem tracepoints

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Acked-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <49DEE68A.5040902@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c | 2 +-
 kernel/trace/trace.h     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 5011f4d91e3..7a0aa0e260d 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -12,7 +12,7 @@
 #include <linux/dcache.h>
 #include <linux/fs.h>
 
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 
 #include "trace_output.h"
 #include "trace.h"
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f76a8f8689d..34b94c3f40a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,7 +9,7 @@
 #include <linux/mmiotrace.h>
 #include <linux/ftrace.h>
 #include <trace/boot.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 #include <trace/power.h>
 
 enum trace_type {
-- 
cgit v1.2.3


From b78825d608f30a47e3154ab6872a03f0de0c9d45 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 1 Apr 2009 16:18:53 +0800
Subject: blktrace: fix output of unknown events

Not all events are pc (packet command) events. An event is a pc
event only if it has BLK_TC_PC bit set.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49D3236D.3090705@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 921ef5d1f0b..e45e1af1356 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1182,7 +1182,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
 	}
 
 	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
-		ret = trace_seq_printf(s, "Bad pc action %x\n", what);
+		ret = trace_seq_printf(s, "Unknown action %x\n", what);
 	else {
 		ret = log_action(iter, what2act[what].act[long_act]);
 		if (ret)
-- 
cgit v1.2.3


From 66de7792c02693b49671afe58c771fde3b092fc7 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 1 Apr 2009 16:19:19 +0800
Subject: blktrace: fix output of BLK_TC_PC events

BLK_TC_PC events should be treated differently with BLK_TC_FS events.

Before this patch:

 # echo 1 > /sys/block/sda/sda1/trace/enable
 # echo pc > /sys/block/sda/sda1/trace/act_mask
 # echo blk > /debugfs/tracing/current_tracer
 # (generate some BLK_TC_PC events)
 # cat trace
        bash-2184  [000]  1774.275413:   8,7    I   N [bash]
        bash-2184  [000]  1774.275435:   8,7    D   N [bash]
        bash-2184  [000]  1774.275540:   8,7    I   R [bash]
        bash-2184  [000]  1774.275547:   8,7    D   R [bash]
 ksoftirqd/0-4     [000]  1774.275580:   8,7    C   N 0 [0]
        bash-2184  [000]  1774.275648:   8,7    I   R [bash]
        bash-2184  [000]  1774.275653:   8,7    D   R [bash]
 ksoftirqd/0-4     [000]  1774.275682:   8,7    C   N 0 [0]
        bash-2184  [000]  1774.275739:   8,7    I   R [bash]
        bash-2184  [000]  1774.275744:   8,7    D   R [bash]
 ksoftirqd/0-4     [000]  1774.275771:   8,7    C   N 0 [0]
        bash-2184  [000]  1774.275804:   8,7    I   R [bash]
        bash-2184  [000]  1774.275808:   8,7    D   R [bash]
 ksoftirqd/0-4     [000]  1774.275836:   8,7    C   N 0 [0]

After this patch:

 # cat trace
        bash-2263  [000]   366.782149:   8,7    I   N 0 (00 ..) [bash]
        bash-2263  [000]   366.782323:   8,7    D   N 0 (00 ..) [bash]
        bash-2263  [000]   366.782557:   8,7    I   R 8 (25 00 ..) [bash]
        bash-2263  [000]   366.782560:   8,7    D   R 8 (25 00 ..) [bash]
 ksoftirqd/0-4     [000]   366.782582:   8,7    C   N (25 00 ..) [0]
        bash-2263  [000]   366.782648:   8,7    I   R 8 (5a 00 3f 00) [bash]
        bash-2263  [000]   366.782650:   8,7    D   R 8 (5a 00 3f 00) [bash]
 ksoftirqd/0-4     [000]   366.782669:   8,7    C   N (5a 00 3f 00) [0]
        bash-2263  [000]   366.782710:   8,7    I   R 8 (5a 00 08 00) [bash]
        bash-2263  [000]   366.782713:   8,7    D   R 8 (5a 00 08 00) [bash]
 ksoftirqd/0-4     [000]   366.782730:   8,7    C   N (5a 00 08 00) [0]
        bash-2263  [000]   366.783375:   8,7    I   R 36 (5a 00 08 00) [bash]
        bash-2263  [000]   366.783379:   8,7    D   R 36 (5a 00 08 00) [bash]
 ksoftirqd/0-4     [000]   366.783404:   8,7    C   N (5a 00 08 00) [0]

This is what we do with PC events in user-space blktrace.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49D32387.9040106@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 88 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 80 insertions(+), 8 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e45e1af1356..2b98195b338 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -971,6 +971,16 @@ static inline const void *pdu_start(const struct trace_entry *ent)
 	return te_blk_io_trace(ent) + 1;
 }
 
+static inline u32 t_action(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent)->action;
+}
+
+static inline u32 t_bytes(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent)->bytes;
+}
+
 static inline u32 t_sec(const struct trace_entry *ent)
 {
 	return te_blk_io_trace(ent)->bytes >> 9;
@@ -1031,25 +1041,87 @@ static int blk_log_action(struct trace_iterator *iter, const char *act)
 				MAJOR(t->device), MINOR(t->device), act, rwbs);
 }
 
+static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+{
+	const char *pdu_buf;
+	int pdu_len;
+	int i, end, ret;
+
+	pdu_buf = pdu_start(ent);
+	pdu_len = te_blk_io_trace(ent)->pdu_len;
+
+	if (!pdu_len)
+		return 1;
+
+	/* find the last zero that needs to be printed */
+	for (end = pdu_len - 1; end >= 0; end--)
+		if (pdu_buf[end])
+			break;
+	end++;
+
+	if (!trace_seq_putc(s, '('))
+		return 0;
+
+	for (i = 0; i < pdu_len; i++) {
+
+		ret = trace_seq_printf(s, "%s%02x",
+				       i == 0 ? "" : " ", pdu_buf[i]);
+		if (!ret)
+			return ret;
+
+		/*
+		 * stop when the rest is just zeroes and indicate so
+		 * with a ".." appended
+		 */
+		if (i == end && end != pdu_len - 1)
+			return trace_seq_puts(s, " ..) ");
+	}
+
+	return trace_seq_puts(s, ") ");
+}
+
 static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
 {
 	char cmd[TASK_COMM_LEN];
 
 	trace_find_cmdline(ent->pid, cmd);
 
-	if (t_sec(ent))
-		return trace_seq_printf(s, "%llu + %u [%s]\n",
-					t_sector(ent), t_sec(ent), cmd);
-	return trace_seq_printf(s, "[%s]\n", cmd);
+	if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
+		int ret;
+
+		ret = trace_seq_printf(s, "%u ", t_bytes(ent));
+		if (!ret)
+			return 0;
+		ret = blk_log_dump_pdu(s, ent);
+		if (!ret)
+			return 0;
+		return trace_seq_printf(s, "[%s]\n", cmd);
+	} else {
+		if (t_sec(ent))
+			return trace_seq_printf(s, "%llu + %u [%s]\n",
+						t_sector(ent), t_sec(ent), cmd);
+		return trace_seq_printf(s, "[%s]\n", cmd);
+	}
 }
 
 static int blk_log_with_error(struct trace_seq *s,
 			      const struct trace_entry *ent)
 {
-	if (t_sec(ent))
-		return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent),
-					t_sec(ent), t_error(ent));
-	return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent));
+	if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
+		int ret;
+
+		ret = blk_log_dump_pdu(s, ent);
+		if (ret)
+			return trace_seq_printf(s, "[%d]\n", t_error(ent));
+		return 0;
+	} else {
+		if (t_sec(ent))
+			return trace_seq_printf(s, "%llu + %u [%d]\n",
+						t_sector(ent),
+						t_sec(ent), t_error(ent));
+		return trace_seq_printf(s, "%llu [%d]\n",
+					t_sector(ent), t_error(ent));
+	}
 }
 
 static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
-- 
cgit v1.2.3


From e1112b4d96859367a93468027c9635e2ac04eb3f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 31 Mar 2009 00:48:49 -0500
Subject: tracing/filters: add run-time field descriptions to
 TRACE_EVENT_FORMAT events

This patch adds run-time field descriptions to all the event formats
exported using TRACE_EVENT_FORMAT.  It also hooks up all the tracers
that use them (i.e. the tracers in the 'ftrace subsystem') so they can
also have their output filtered by the event-filtering mechanism.

When I was testing this, there were a couple of things that fooled me
into thinking the filters weren't working, when actually they were -
I'll mention them here so others don't make the same mistakes (and file
bug reports. ;-)

One is that some of the tracers trace multiple events e.g. the
sched_switch tracer uses the context_switch and wakeup events, and if
you don't set filters on all of the traced events, the unfiltered output
from the events without filters on them can make it look like the
filtering as a whole isn't working properly, when actually it is doing
what it was asked to do - it just wasn't asked to do the right thing.

The other is that for the really high-volume tracers e.g. the function
tracer, the volume of filtered events can be so high that it pushes the
unfiltered events out of the ring buffer before they can be read so e.g.
cat'ing the trace file repeatedly shows either no output, or once in
awhile some output but that isn't there the next time you read the
trace, which isn't what you normally expect when reading the trace file.
If you read from the trace_pipe file though, you can catch them before
they disappear.

Changes from v1:

As suggested by Frederic Weisbecker:

- get rid of externs in functions
- added unlikely() to filter_check_discard()

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c            |  6 ++++
 kernel/trace/trace.c                | 25 ++++++++++++++++
 kernel/trace/trace.h                | 20 +++++++++++++
 kernel/trace/trace_branch.c         |  3 ++
 kernel/trace/trace_event_types.h    |  6 ++--
 kernel/trace/trace_events.c         |  7 +++++
 kernel/trace/trace_events_filter.c  |  4 +--
 kernel/trace/trace_events_stage_2.h |  7 -----
 kernel/trace/trace_export.c         | 57 +++++++++++++++++++++++++++++++++++--
 kernel/trace/trace_hw_branches.c    |  2 ++
 kernel/trace/trace_power.c          |  4 +++
 11 files changed, 127 insertions(+), 14 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 7a0aa0e260d..9419ad10541 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -42,6 +42,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
 				   gfp_t gfp_flags,
 				   int node)
 {
+	struct ftrace_event_call *call = &event_kmem_alloc;
 	struct trace_array *tr = kmemtrace_array;
 	struct kmemtrace_alloc_entry *entry;
 	struct ring_buffer_event *event;
@@ -62,6 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
 	entry->gfp_flags	= gfp_flags;
 	entry->node		= node;
 
+	filter_check_discard(call, entry, event);
+
 	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
@@ -71,6 +74,7 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
 				  unsigned long call_site,
 				  const void *ptr)
 {
+	struct ftrace_event_call *call = &event_kmem_free;
 	struct trace_array *tr = kmemtrace_array;
 	struct kmemtrace_free_entry *entry;
 	struct ring_buffer_event *event;
@@ -86,6 +90,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
 	entry->call_site	= call_site;
 	entry->ptr		= ptr;
 
+	filter_check_discard(call, entry, event);
+
 	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4865459f609..962e6179994 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -898,6 +898,7 @@ trace_function(struct trace_array *tr,
 	       unsigned long ip, unsigned long parent_ip, unsigned long flags,
 	       int pc)
 {
+	struct ftrace_event_call *call = &event_function;
 	struct ring_buffer_event *event;
 	struct ftrace_entry *entry;
 
@@ -912,6 +913,9 @@ trace_function(struct trace_array *tr,
 	entry	= ring_buffer_event_data(event);
 	entry->ip			= ip;
 	entry->parent_ip		= parent_ip;
+
+	filter_check_discard(call, entry, event);
+
 	ring_buffer_unlock_commit(tr->buffer, event);
 }
 
@@ -921,6 +925,7 @@ static int __trace_graph_entry(struct trace_array *tr,
 				unsigned long flags,
 				int pc)
 {
+	struct ftrace_event_call *call = &event_funcgraph_entry;
 	struct ring_buffer_event *event;
 	struct ftrace_graph_ent_entry *entry;
 
@@ -933,6 +938,7 @@ static int __trace_graph_entry(struct trace_array *tr,
 		return 0;
 	entry	= ring_buffer_event_data(event);
 	entry->graph_ent			= *trace;
+	filter_check_discard(call, entry, event);
 	ring_buffer_unlock_commit(global_trace.buffer, event);
 
 	return 1;
@@ -943,6 +949,7 @@ static void __trace_graph_return(struct trace_array *tr,
 				unsigned long flags,
 				int pc)
 {
+	struct ftrace_event_call *call = &event_funcgraph_exit;
 	struct ring_buffer_event *event;
 	struct ftrace_graph_ret_entry *entry;
 
@@ -955,6 +962,7 @@ static void __trace_graph_return(struct trace_array *tr,
 		return;
 	entry	= ring_buffer_event_data(event);
 	entry->ret				= *trace;
+	filter_check_discard(call, entry, event);
 	ring_buffer_unlock_commit(global_trace.buffer, event);
 }
 #endif
@@ -973,6 +981,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,
 				 int skip, int pc)
 {
 #ifdef CONFIG_STACKTRACE
+	struct ftrace_event_call *call = &event_kernel_stack;
 	struct ring_buffer_event *event;
 	struct stack_entry *entry;
 	struct stack_trace trace;
@@ -990,6 +999,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,
 	trace.entries		= entry->caller;
 
 	save_stack_trace(&trace);
+	filter_check_discard(call, entry, event);
 	ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 }
@@ -1015,6 +1025,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 				   unsigned long flags, int pc)
 {
 #ifdef CONFIG_STACKTRACE
+	struct ftrace_event_call *call = &event_user_stack;
 	struct ring_buffer_event *event;
 	struct userstack_entry *entry;
 	struct stack_trace trace;
@@ -1036,6 +1047,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 	trace.entries		= entry->caller;
 
 	save_stack_trace_user(&trace);
+	filter_check_discard(call, entry, event);
 	ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 }
@@ -1052,6 +1064,7 @@ ftrace_trace_special(void *__tr,
 		     unsigned long arg1, unsigned long arg2, unsigned long arg3,
 		     int pc)
 {
+	struct ftrace_event_call *call = &event_special;
 	struct ring_buffer_event *event;
 	struct trace_array *tr = __tr;
 	struct special_entry *entry;
@@ -1064,6 +1077,7 @@ ftrace_trace_special(void *__tr,
 	entry->arg1			= arg1;
 	entry->arg2			= arg2;
 	entry->arg3			= arg3;
+	filter_check_discard(call, entry, event);
 	trace_buffer_unlock_commit(tr, event, 0, pc);
 }
 
@@ -1080,6 +1094,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 			   struct task_struct *next,
 			   unsigned long flags, int pc)
 {
+	struct ftrace_event_call *call = &event_context_switch;
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
 
@@ -1095,6 +1110,9 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->next_prio		= next->prio;
 	entry->next_state		= next->state;
 	entry->next_cpu	= task_cpu(next);
+
+	filter_check_discard(call, entry, event);
+
 	trace_buffer_unlock_commit(tr, event, flags, pc);
 }
 
@@ -1104,6 +1122,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 			   struct task_struct *curr,
 			   unsigned long flags, int pc)
 {
+	struct ftrace_event_call *call = &event_wakeup;
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
 
@@ -1120,6 +1139,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_state		= wakee->state;
 	entry->next_cpu			= task_cpu(wakee);
 
+	filter_check_discard(call, entry, event);
+
 	ring_buffer_unlock_commit(tr->buffer, event);
 	ftrace_trace_stack(tr, flags, 6, pc);
 	ftrace_trace_userstack(tr, flags, pc);
@@ -1221,6 +1242,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 		(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 	static u32 trace_buf[TRACE_BUF_SIZE];
 
+	struct ftrace_event_call *call = &event_bprint;
 	struct ring_buffer_event *event;
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
@@ -1260,6 +1282,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	entry->fmt			= fmt;
 
 	memcpy(entry->buf, trace_buf, sizeof(u32) * len);
+	filter_check_discard(call, entry, event);
 	ring_buffer_unlock_commit(tr->buffer, event);
 
 out_unlock:
@@ -1279,6 +1302,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 	static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
 	static char trace_buf[TRACE_BUF_SIZE];
 
+	struct ftrace_event_call *call = &event_print;
 	struct ring_buffer_event *event;
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
@@ -1314,6 +1338,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 
 	memcpy(&entry->buf, trace_buf, len);
 	entry->buf[len] = 0;
+	filter_check_discard(call, entry, event);
 	ring_buffer_unlock_commit(tr->buffer, event);
 
  out_unlock:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 34b94c3f40a..e7737281953 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -866,6 +866,21 @@ extern void filter_free_subsystem_preds(struct event_subsystem *system);
 extern int filter_add_subsystem_pred(struct event_subsystem *system,
 				     struct filter_pred *pred);
 
+static inline void
+filter_check_discard(struct ftrace_event_call *call, void *rec,
+		     struct ring_buffer_event *event)
+{
+	if (unlikely(call->preds) && !filter_match_preds(call, rec))
+		ring_buffer_event_discard(event);
+}
+
+#define __common_field(type, item)					\
+	ret = trace_define_field(event_call, #type, "common_" #item,	\
+				 offsetof(typeof(field.ent), item),	\
+				 sizeof(field.ent.item));		\
+	if (ret)							\
+		return ret;
+
 void event_trace_printk(unsigned long ip, const char *fmt, ...);
 extern struct ftrace_event_call __start_ftrace_events[];
 extern struct ftrace_event_call __stop_ftrace_events[];
@@ -897,4 +912,9 @@ do {									\
 		__trace_printk(ip, fmt, ##args);			\
 } while (0)
 
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+	extern struct ftrace_event_call event_##call;
+#include "trace_event_types.h"
+
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index e6e32912ffb..c95c25d838e 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -30,6 +30,7 @@ static struct trace_array *branch_tracer;
 static void
 probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 {
+	struct ftrace_event_call *call = &event_branch;
 	struct trace_array *tr = branch_tracer;
 	struct ring_buffer_event *event;
 	struct trace_branch *entry;
@@ -73,6 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	entry->line = f->line;
 	entry->correct = val == expect;
 
+	filter_check_discard(call, entry, event);
+
 	ring_buffer_unlock_commit(tr->buffer, event);
 
  out:
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index fd78bee71dd..95b147aac22 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -122,8 +122,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
 TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned int, line, line)
-		TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func)
-		TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file)
+		TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
+				    TRACE_FUNC_SIZE+1, func)
+		TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
+				    TRACE_FUNC_SIZE+1, file)
 		TRACE_FIELD(char, correct, correct)
 	),
 	TP_RAW_FMT("%u:%s:%s (%u)")
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 64ec4d278ff..be9299a53e2 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -680,6 +680,7 @@ static struct dentry *
 event_subsystem_dir(const char *name, struct dentry *d_events)
 {
 	struct event_subsystem *system;
+	struct dentry *entry;
 
 	/* First see if we did not already create this dir */
 	list_for_each_entry(system, &event_subsystems, list) {
@@ -708,6 +709,12 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 
 	system->preds = NULL;
 
+	entry = debugfs_create_file("filter", 0644, system->entry, system,
+				    &ftrace_subsystem_filter_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'%s/filter' entry\n", name);
+
 	return system->entry;
 }
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 026be412f35..470ad9487ec 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -185,7 +185,7 @@ void filter_free_subsystem_preds(struct event_subsystem *system)
 	}
 
 	events_for_each(call) {
-		if (!call->name || !call->regfunc)
+		if (!call->define_fields)
 			continue;
 
 		if (!strcmp(call->system, system->name))
@@ -324,7 +324,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 	events_for_each(call) {
 		int err;
 
-		if (!call->name || !call->regfunc)
+		if (!call->define_fields)
 			continue;
 
 		if (strcmp(call->system, system->name))
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 30743f7d411..1c94b87c718 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -146,13 +146,6 @@ ftrace_format_##call(struct trace_seq *s)				\
 	if (ret)							\
 		return ret;
 
-#define __common_field(type, item)					\
-	ret = trace_define_field(event_call, #type, "common_" #item,	\
-				 offsetof(typeof(field.ent), item),	\
-				 sizeof(field.ent.item));		\
-	if (ret)							\
-		return ret;
-
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
 int									\
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 07a22c33ebf..f4e46616c48 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -30,7 +30,7 @@
 
 
 #undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd)			\
+#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd)			\
 	ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"	\
 			       "offset:%u;\tsize:%u;\n",		\
 			       (unsigned int)offsetof(typeof(field), item), \
@@ -85,18 +85,69 @@ ftrace_format_##call(struct trace_seq *s)				\
 #define TRACE_ENTRY	entry
 
 #undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
+#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd)	\
 	cmd;
 
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+int ftrace_define_fields_##call(void);					\
+static int ftrace_raw_init_event_##call(void);				\
 									\
-static struct ftrace_event_call __used					\
+struct ftrace_event_call __used						\
 __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name			= #call,				\
 	.id			= proto,				\
 	.system			= __stringify(TRACE_SYSTEM),		\
+	.raw_init		= ftrace_raw_init_event_##call,		\
 	.show_format		= ftrace_format_##call,			\
+	.define_fields		= ftrace_define_fields_##call,		\
+};									\
+static int ftrace_raw_init_event_##call(void)				\
+{									\
+	INIT_LIST_HEAD(&event_##call.fields);				\
+	return 0;							\
+}									\
+
+#include "trace_event_types.h"
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)					\
+	ret = trace_define_field(event_call, #type, #item,		\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item));			\
+	if (ret)							\
+		return ret;
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type, item, len, cmd)			\
+	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item));			\
+	if (ret)							\
+		return ret;
+
+#undef TRACE_FIELD_ZERO_CHAR
+#define TRACE_FIELD_ZERO_CHAR(item)
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+int									\
+ftrace_define_fields_##call(void)					\
+{									\
+	struct ftrace_event_call *event_call = &event_##call;		\
+	struct args field;						\
+	int ret;							\
+									\
+	__common_field(unsigned char, type);				\
+	__common_field(unsigned char, flags);				\
+	__common_field(unsigned char, preempt_count);			\
+	__common_field(int, pid);					\
+	__common_field(int, tgid);					\
+									\
+	tstruct;							\
+									\
+	return ret;							\
 }
+
 #include "trace_event_types.h"
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 7bfdf4c2347..e6b275b22ac 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -168,6 +168,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
 
 void trace_hw_branch(u64 from, u64 to)
 {
+	struct ftrace_event_call *call = &event_hw_branch;
 	struct trace_array *tr = hw_branch_trace;
 	struct ring_buffer_event *event;
 	struct hw_branch_entry *entry;
@@ -194,6 +195,7 @@ void trace_hw_branch(u64 from, u64 to)
 	entry->ent.type = TRACE_HW_BRANCHES;
 	entry->from = from;
 	entry->to   = to;
+	filter_check_discard(call, entry, event);
 	trace_buffer_unlock_commit(tr, event, 0, 0);
 
  out:
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index bae791ebcc5..8ce7d7d62c0 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -36,6 +36,7 @@ static void probe_power_start(struct power_trace *it, unsigned int type,
 
 static void probe_power_end(struct power_trace *it)
 {
+	struct ftrace_event_call *call = &event_power;
 	struct ring_buffer_event *event;
 	struct trace_power *entry;
 	struct trace_array_cpu *data;
@@ -54,6 +55,7 @@ static void probe_power_end(struct power_trace *it)
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	entry->state_data = *it;
+	filter_check_discard(call, entry, event);
 	trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
@@ -62,6 +64,7 @@ static void probe_power_end(struct power_trace *it)
 static void probe_power_mark(struct power_trace *it, unsigned int type,
 				unsigned int level)
 {
+	struct ftrace_event_call *call = &event_power;
 	struct ring_buffer_event *event;
 	struct trace_power *entry;
 	struct trace_array_cpu *data;
@@ -84,6 +87,7 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	entry->state_data = *it;
+	filter_check_discard(call, entry, event);
 	trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
-- 
cgit v1.2.3


From e45f2e2bd298e1ff687448e5fd15a3588b5807ec Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 31 Mar 2009 00:49:16 -0500
Subject: tracing/filters: add TRACE_EVENT_FORMAT_NOFILTER event macro

Frederic Weisbecker suggested that the trace_special event shouldn't be
filterable; this patch adds a TRACE_EVENT_FORMAT_NOFILTER event macro
that allows an event format to be exported without having a filter
attached, and removes filtering from the trace_special event.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c             |  2 --
 kernel/trace/trace.h             |  2 ++
 kernel/trace/trace_event_types.h |  2 +-
 kernel/trace/trace_export.c      | 33 +++++++++++++++++++++++++++++++++
 4 files changed, 36 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 962e6179994..c209d214169 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1064,7 +1064,6 @@ ftrace_trace_special(void *__tr,
 		     unsigned long arg1, unsigned long arg2, unsigned long arg3,
 		     int pc)
 {
-	struct ftrace_event_call *call = &event_special;
 	struct ring_buffer_event *event;
 	struct trace_array *tr = __tr;
 	struct special_entry *entry;
@@ -1077,7 +1076,6 @@ ftrace_trace_special(void *__tr,
 	entry->arg1			= arg1;
 	entry->arg2			= arg2;
 	entry->arg3			= arg3;
-	filter_check_discard(call, entry, event);
 	trace_buffer_unlock_commit(tr, event, 0, pc);
 }
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e7737281953..3cf856fa597 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -915,6 +915,8 @@ do {									\
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
 	extern struct ftrace_event_call event_##call;
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt)
 #include "trace_event_types.h"
 
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index 95b147aac22..cfcecc4fd86 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -57,7 +57,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
 	TP_RAW_FMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
 );
 
-TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore,
+TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned long, arg1, arg1)
 		TRACE_FIELD(unsigned long, arg2, arg2)
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index f4e46616c48..77c494f5e1d 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -65,6 +65,22 @@ ftrace_format_##call(struct trace_seq *s)				\
 	return ret;							\
 }
 
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\
+				    tpfmt)				\
+static int								\
+ftrace_format_##call(struct trace_seq *s)				\
+{									\
+	struct args field;						\
+	int ret;							\
+									\
+	tstruct;							\
+									\
+	trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);		\
+									\
+	return ret;							\
+}
+
 #include "trace_event_types.h"
 
 #undef TRACE_ZERO_CHAR
@@ -109,6 +125,19 @@ static int ftrace_raw_init_event_##call(void)				\
 	return 0;							\
 }									\
 
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\
+				    tpfmt)				\
+									\
+struct ftrace_event_call __used						\
+__attribute__((__aligned__(4)))						\
+__attribute__((section("_ftrace_events"))) event_##call = {		\
+	.name			= #call,				\
+	.id			= proto,				\
+	.system			= __stringify(TRACE_SYSTEM),		\
+	.show_format		= ftrace_format_##call,			\
+};
+
 #include "trace_event_types.h"
 
 #undef TRACE_FIELD
@@ -150,4 +179,8 @@ ftrace_define_fields_##call(void)					\
 	return ret;							\
 }
 
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\
+				    tpfmt)
+
 #include "trace_event_types.h"
-- 
cgit v1.2.3


From fa1b47dd85453ec7d4bcfe4aa4a2d172ba452fc3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 2 Apr 2009 00:09:41 -0400
Subject: ring-buffer: add ring_buffer_discard_commit

The ring_buffer_discard_commit is similar to ring_buffer_event_discard
but it can only be done on an event that has yet to be commited.
Unpredictable results can happen otherwise.

The main difference between ring_buffer_discard_commit and
ring_buffer_event_discard is that ring_buffer_discard_commit will try
to free the data in the ring buffer if nothing has addded data
after the reserved event. If something did, then it acts almost the
same as ring_buffer_event_discard followed by a
ring_buffer_unlock_commit.

Note, either ring_buffer_commit_discard and ring_buffer_unlock_commit
can be called on an event, not both.

This commit also exports both discard functions to be usable by
GPL modules.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 125 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 104 insertions(+), 21 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 74a11808c28..f935bd5ec3e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -205,27 +205,6 @@ static void rb_event_set_padding(struct ring_buffer_event *event)
 	event->time_delta = 0;
 }
 
-/**
- * ring_buffer_event_discard - discard an event in the ring buffer
- * @buffer: the ring buffer
- * @event: the event to discard
- *
- * Sometimes a event that is in the ring buffer needs to be ignored.
- * This function lets the user discard an event in the ring buffer
- * and then that event will not be read later.
- *
- * Note, it is up to the user to be careful with this, and protect
- * against races. If the user discards an event that has been consumed
- * it is possible that it could corrupt the ring buffer.
- */
-void ring_buffer_event_discard(struct ring_buffer_event *event)
-{
-	event->type = RINGBUF_TYPE_PADDING;
-	/* time delta must be non zero */
-	if (!event->time_delta)
-		event->time_delta = 1;
-}
-
 static unsigned
 rb_event_data_length(struct ring_buffer_event *event)
 {
@@ -1570,6 +1549,110 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
+/**
+ * ring_buffer_event_discard - discard any event in the ring buffer
+ * @event: the event to discard
+ *
+ * Sometimes a event that is in the ring buffer needs to be ignored.
+ * This function lets the user discard an event in the ring buffer
+ * and then that event will not be read later.
+ *
+ * Note, it is up to the user to be careful with this, and protect
+ * against races. If the user discards an event that has been consumed
+ * it is possible that it could corrupt the ring buffer.
+ */
+void ring_buffer_event_discard(struct ring_buffer_event *event)
+{
+	event->type = RINGBUF_TYPE_PADDING;
+	/* time delta must be non zero */
+	if (!event->time_delta)
+		event->time_delta = 1;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
+
+/**
+ * ring_buffer_commit_discard - discard an event that has not been committed
+ * @buffer: the ring buffer
+ * @event: non committed event to discard
+ *
+ * This is similar to ring_buffer_event_discard but must only be
+ * performed on an event that has not been committed yet. The difference
+ * is that this will also try to free the event from the ring buffer
+ * if another event has not been added behind it.
+ *
+ * If another event has been added behind it, it will set the event
+ * up as discarded, and perform the commit.
+ *
+ * If this function is called, do not call ring_buffer_unlock_commit on
+ * the event.
+ */
+void ring_buffer_discard_commit(struct ring_buffer *buffer,
+				struct ring_buffer_event *event)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long new_index, old_index;
+	struct buffer_page *bpage;
+	unsigned long index;
+	unsigned long addr;
+	int cpu;
+
+	/* The event is discarded regardless */
+	ring_buffer_event_discard(event);
+
+	/*
+	 * This must only be called if the event has not been
+	 * committed yet. Thus we can assume that preemption
+	 * is still disabled.
+	 */
+	RB_WARN_ON(buffer, !preempt_count());
+
+	cpu = smp_processor_id();
+	cpu_buffer = buffer->buffers[cpu];
+
+	new_index = rb_event_index(event);
+	old_index = new_index + rb_event_length(event);
+	addr = (unsigned long)event;
+	addr &= PAGE_MASK;
+
+	bpage = cpu_buffer->tail_page;
+
+	if (bpage == (void *)addr && rb_page_write(bpage) == old_index) {
+		/*
+		 * This is on the tail page. It is possible that
+		 * a write could come in and move the tail page
+		 * and write to the next page. That is fine
+		 * because we just shorten what is on this page.
+		 */
+		index = local_cmpxchg(&bpage->write, old_index, new_index);
+		if (index == old_index)
+			goto out;
+	}
+
+	/*
+	 * The commit is still visible by the reader, so we
+	 * must increment entries.
+	 */
+	cpu_buffer->entries++;
+ out:
+	/*
+	 * If a write came in and pushed the tail page
+	 * we still need to update the commit pointer
+	 * if we were the commit.
+	 */
+	if (rb_is_commit(cpu_buffer, event))
+		rb_set_commit_to_write(cpu_buffer);
+
+	/*
+	 * Only the last preempt count needs to restore preemption.
+	 */
+	if (preempt_count() == 1)
+		ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
+	else
+		preempt_enable_no_resched_notrace();
+
+}
+EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
+
 /**
  * ring_buffer_write - write data to the buffer without reserving
  * @buffer: The ring buffer to write to.
-- 
cgit v1.2.3


From 77d9f465d46fd67cdb82ee5e1ab99dd57a17c486 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 2 Apr 2009 01:16:59 -0400
Subject: tracing/filters: use ring_buffer_discard_commit for discarded events

The ring_buffer_discard_commit makes better usage of the ring_buffer
when an event has been discarded. It tries to remove it completely if
possible.

This patch converts the trace event filtering to use
ring_buffer_discard_commit instead of the ring_buffer_event_discard.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c                | 9 +++++++--
 kernel/trace/trace.h                | 1 +
 kernel/trace/trace_events_stage_3.h | 6 +++---
 3 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c209d214169..d880ab2772c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -884,13 +884,18 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
 void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
 {
-	return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
+	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
 }
 
 void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
 {
-	return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
+	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
+}
+
+void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
+{
+	ring_buffer_discard_commit(global_trace.buffer, event);
 }
 
 void
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3cf856fa597..dfefffd7ae3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -497,6 +497,7 @@ void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc);
 void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc);
+void trace_current_buffer_discard_commit(struct ring_buffer_event *event);
 
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 						struct trace_array_cpu *data);
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 9d2fa78cecc..d2f34bf30e5 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -223,9 +223,9 @@ static void ftrace_raw_event_##call(proto)				\
 	assign;								\
 									\
 	if (call->preds && !filter_match_preds(call, entry))		\
-		ring_buffer_event_discard(event);			\
-									\
-	trace_nowake_buffer_unlock_commit(event, irq_flags, pc);	\
+		trace_current_buffer_discard_commit(event);		\
+	else								\
+		trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
 									\
 }									\
 									\
-- 
cgit v1.2.3


From 5f77a88b3f8268b11940b51d2e03d26a663ceb90 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Wed, 8 Apr 2009 03:14:01 -0500
Subject: tracing/infrastructure: separate event tracer from event support

Add a new config option, CONFIG_EVENT_TRACING that gets selected
when CONFIG_TRACING is selected and adds everything needed by the stuff
in trace_export - basically all the event tracing support needed by e.g.
bprint, minus the actual events, which are only included if
CONFIG_EVENT_TRACER is selected.

So CONFIG_EVENT_TRACER can be used to turn on or off the generated events
(what I think of as the 'event tracer'), while CONFIG_EVENT_TRACING turns
on or off the base event tracing support used by both the event tracer and
the other things such as bprint that can't be configured out.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
LKML-Reference: <1239178441.10295.34.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig  | 4 ++++
 kernel/trace/Makefile | 6 +++---
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 23b96ebbf89..644606e899f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -48,6 +48,9 @@ config FTRACE_NMI_ENTER
        depends on HAVE_FTRACE_NMI_ENTER
        default y
 
+config EVENT_TRACING
+	bool
+
 config TRACING
 	bool
 	select DEBUG_FS
@@ -56,6 +59,7 @@ config TRACING
 	select TRACEPOINTS
 	select NOP_TRACER
 	select BINARY_PRINTF
+	select EVENT_TRACING
 
 #
 # Minimum requirements an architecture has to meet for us to
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 2630f5121ec..3ad367e7c97 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -40,11 +40,11 @@ obj-$(CONFIG_POWER_TRACER) += trace_power.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
-obj-$(CONFIG_EVENT_TRACER) += trace_events.o
+obj-$(CONFIG_EVENT_TRACING) += trace_events.o
 obj-$(CONFIG_EVENT_TRACER) += events.o
-obj-$(CONFIG_EVENT_TRACER) += trace_export.o
+obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
-obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o
+obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 
 libftrace-y := ftrace.o
-- 
cgit v1.2.3


From eb02ce017dd83985041a7e54c6449f92d53b026f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Wed, 8 Apr 2009 03:15:54 -0500
Subject: tracing/filters: use ring_buffer_discard_commit() in
 filter_check_discard()

This patch changes filter_check_discard() to make use of the new
ring_buffer_discard_commit() function and modifies the current users to
call the old commit function in the non-discard case.

It also introduces a version of filter_check_discard() that uses the
global trace buffer (filter_current_check_discard()) for those cases.

v2 changes:

- fix compile error noticed by Ingo Molnar

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
LKML-Reference: <1239178554.10295.36.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c            | 10 ++++-----
 kernel/trace/trace.c                | 45 ++++++++++++++++++++-----------------
 kernel/trace/trace.h                | 14 +++++++++---
 kernel/trace/trace_branch.c         |  5 ++---
 kernel/trace/trace_events_stage_3.h |  5 +----
 kernel/trace/trace_hw_branches.c    |  4 ++--
 kernel/trace/trace_power.c          |  8 +++----
 7 files changed, 48 insertions(+), 43 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 9419ad10541..86cdf671d7e 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -63,9 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
 	entry->gfp_flags	= gfp_flags;
 	entry->node		= node;
 
-	filter_check_discard(call, entry, event);
-
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 }
@@ -90,9 +89,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
 	entry->call_site	= call_site;
 	entry->ptr		= ptr;
 
-	filter_check_discard(call, entry, event);
-
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d880ab2772c..c0047fcf707 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -171,6 +171,12 @@ static struct trace_array	global_trace;
 
 static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
 
+int filter_current_check_discard(struct ftrace_event_call *call, void *rec,
+				 struct ring_buffer_event *event)
+{
+	return filter_check_discard(call, rec, global_trace.buffer, event);
+}
+
 cycle_t ftrace_now(int cpu)
 {
 	u64 ts;
@@ -919,9 +925,8 @@ trace_function(struct trace_array *tr,
 	entry->ip			= ip;
 	entry->parent_ip		= parent_ip;
 
-	filter_check_discard(call, entry, event);
-
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -943,8 +948,8 @@ static int __trace_graph_entry(struct trace_array *tr,
 		return 0;
 	entry	= ring_buffer_event_data(event);
 	entry->graph_ent			= *trace;
-	filter_check_discard(call, entry, event);
-	ring_buffer_unlock_commit(global_trace.buffer, event);
+	if (!filter_current_check_discard(call, entry, event))
+		ring_buffer_unlock_commit(global_trace.buffer, event);
 
 	return 1;
 }
@@ -967,8 +972,8 @@ static void __trace_graph_return(struct trace_array *tr,
 		return;
 	entry	= ring_buffer_event_data(event);
 	entry->ret				= *trace;
-	filter_check_discard(call, entry, event);
-	ring_buffer_unlock_commit(global_trace.buffer, event);
+	if (!filter_current_check_discard(call, entry, event))
+		ring_buffer_unlock_commit(global_trace.buffer, event);
 }
 #endif
 
@@ -1004,8 +1009,8 @@ static void __ftrace_trace_stack(struct trace_array *tr,
 	trace.entries		= entry->caller;
 
 	save_stack_trace(&trace);
-	filter_check_discard(call, entry, event);
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 }
 
@@ -1052,8 +1057,8 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 	trace.entries		= entry->caller;
 
 	save_stack_trace_user(&trace);
-	filter_check_discard(call, entry, event);
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 }
 
@@ -1114,9 +1119,8 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->next_state		= next->state;
 	entry->next_cpu	= task_cpu(next);
 
-	filter_check_discard(call, entry, event);
-
-	trace_buffer_unlock_commit(tr, event, flags, pc);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		trace_buffer_unlock_commit(tr, event, flags, pc);
 }
 
 void
@@ -1142,9 +1146,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_state		= wakee->state;
 	entry->next_cpu			= task_cpu(wakee);
 
-	filter_check_discard(call, entry, event);
-
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 	ftrace_trace_stack(tr, flags, 6, pc);
 	ftrace_trace_userstack(tr, flags, pc);
 }
@@ -1285,8 +1288,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	entry->fmt			= fmt;
 
 	memcpy(entry->buf, trace_buf, sizeof(u32) * len);
-	filter_check_discard(call, entry, event);
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
 out_unlock:
 	__raw_spin_unlock(&trace_buf_lock);
@@ -1341,8 +1344,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 
 	memcpy(&entry->buf, trace_buf, len);
 	entry->buf[len] = 0;
-	filter_check_discard(call, entry, event);
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
  out_unlock:
 	__raw_spin_unlock(&trace_buf_lock);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index dfefffd7ae3..9729d14767d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -866,13 +866,21 @@ extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
 extern void filter_free_subsystem_preds(struct event_subsystem *system);
 extern int filter_add_subsystem_pred(struct event_subsystem *system,
 				     struct filter_pred *pred);
+extern int filter_current_check_discard(struct ftrace_event_call *call,
+					void *rec,
+					struct ring_buffer_event *event);
 
-static inline void
+static inline int
 filter_check_discard(struct ftrace_event_call *call, void *rec,
+		     struct ring_buffer *buffer,
 		     struct ring_buffer_event *event)
 {
-	if (unlikely(call->preds) && !filter_match_preds(call, rec))
-		ring_buffer_event_discard(event);
+	if (unlikely(call->preds) && !filter_match_preds(call, rec)) {
+		ring_buffer_discard_commit(buffer, event);
+		return 1;
+	}
+
+	return 0;
 }
 
 #define __common_field(type, item)					\
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index c95c25d838e..8e64e604f5a 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -74,9 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	entry->line = f->line;
 	entry->correct = val == expect;
 
-	filter_check_discard(call, entry, event);
-
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
  out:
 	atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index d2f34bf30e5..b2b298269eb 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -222,11 +222,8 @@ static void ftrace_raw_event_##call(proto)				\
 									\
 	assign;								\
 									\
-	if (call->preds && !filter_match_preds(call, entry))		\
-		trace_current_buffer_discard_commit(event);		\
-	else								\
+	if (!filter_current_check_discard(call, entry, event))		\
 		trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
-									\
 }									\
 									\
 static int ftrace_raw_reg_event_##call(void)				\
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index e6b275b22ac..8683d50a753 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -195,8 +195,8 @@ void trace_hw_branch(u64 from, u64 to)
 	entry->ent.type = TRACE_HW_BRANCHES;
 	entry->from = from;
 	entry->to   = to;
-	filter_check_discard(call, entry, event);
-	trace_buffer_unlock_commit(tr, event, 0, 0);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		trace_buffer_unlock_commit(tr, event, 0, 0);
 
  out:
 	atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 8ce7d7d62c0..810a5b7cf1c 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -55,8 +55,8 @@ static void probe_power_end(struct power_trace *it)
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	entry->state_data = *it;
-	filter_check_discard(call, entry, event);
-	trace_buffer_unlock_commit(tr, event, 0, 0);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
 }
@@ -87,8 +87,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	entry->state_data = *it;
-	filter_check_discard(call, entry, event);
-	trace_buffer_unlock_commit(tr, event, 0, 0);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
 }
-- 
cgit v1.2.3


From 0a19e53c1514ad8e9c3cbab40c6c3f52c86f403d Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Mon, 13 Apr 2009 03:17:50 -0500
Subject: tracing/filters: allow on-the-fly filter switching

This patch allows event filters to be safely removed or switched
on-the-fly while avoiding the use of rcu or the suspension of tracing of
previous versions.

It does it by adding a new filter_pred_none() predicate function which
does nothing and by never deallocating either the predicates or any of
the filter_pred members used in matching; the predicate lists are
allocated and initialized during ftrace_event_calls initialization.

Whenever a filter is removed or replaced, the filter_pred_* functions
currently in use by the affected ftrace_event_call are immediately
switched over to to the filter_pred_none() function, while the rest of
the filter_pred members are left intact, allowing any currently
executing filter_pred_* functions to finish up, using the values they're
currently using.

In the case of filter replacement, the new predicate values are copied
into the old predicates after the above step, and the filter_pred_none()
functions are replaced by the filter_pred_* functions for the new
filter.  In this case, it is possible though very unlikely that a
previous filter_pred_* is still running even after the
filter_pred_none() switch and the switch to the new filter_pred_*.  In
that case, however, because nothing has been deallocated in the
filter_pred, the worst that can happen is that the old filter_pred_*
function sees the new values and as a result produces either a false
positive or a false negative, depending on the values it finds.

So one downside to this method is that rarely, it can produce a bad
match during the filter switch, but it should be possible to live with
that, IMHO.

The other downside is that at least in this patch the predicate lists
are always pre-allocated, taking up memory from the start.  They could
probably be allocated on first-use, and de-allocated when tracing is
completely stopped - if this patch makes sense, I could create another
one to do that later on.

Oh, and it also places a restriction on the size of __arrays in events,
currently set to 128, since they can't be larger than the now embedded
str_val arrays in the filter_pred struct.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: paulmck@linux.vnet.ibm.com
LKML-Reference: <1239610670.6660.49.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h                |  14 +-
 kernel/trace/trace_events.c         |   9 +-
 kernel/trace/trace_events_filter.c  | 252 +++++++++++++++++++-----------------
 kernel/trace/trace_events_stage_2.h |   1 +
 kernel/trace/trace_events_stage_3.h |   1 +
 kernel/trace/trace_export.c         |   1 +
 6 files changed, 150 insertions(+), 128 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9729d14767d..b05b6ac982a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -813,6 +813,7 @@ struct ftrace_event_call {
 	int			(*show_format)(struct trace_seq *s);
 	int			(*define_fields)(void);
 	struct list_head	fields;
+	int			n_preds;
 	struct filter_pred	**preds;
 
 #ifdef CONFIG_EVENT_PROFILE
@@ -826,6 +827,7 @@ struct event_subsystem {
 	struct list_head	list;
 	const char		*name;
 	struct dentry		*entry;
+	int			n_preds;
 	struct filter_pred	**preds;
 };
 
@@ -834,7 +836,8 @@ struct event_subsystem {
 	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
 	     event++)
 
-#define MAX_FILTER_PRED 8
+#define MAX_FILTER_PRED		8
+#define MAX_FILTER_STR_VAL	128
 
 struct filter_pred;
 
@@ -843,7 +846,7 @@ typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
 struct filter_pred {
 	filter_pred_fn_t fn;
 	u64 val;
-	char *str_val;
+	char str_val[MAX_FILTER_STR_VAL];
 	int str_len;
 	char *field_name;
 	int offset;
@@ -855,13 +858,14 @@ struct filter_pred {
 
 int trace_define_field(struct ftrace_event_call *call, char *type,
 		       char *name, int offset, int size);
+extern int init_preds(struct ftrace_event_call *call);
 extern void filter_free_pred(struct filter_pred *pred);
-extern void filter_print_preds(struct filter_pred **preds,
+extern void filter_print_preds(struct filter_pred **preds, int n_preds,
 			       struct trace_seq *s);
 extern int filter_parse(char **pbuf, struct filter_pred *pred);
 extern int filter_add_pred(struct ftrace_event_call *call,
 			   struct filter_pred *pred);
-extern void filter_free_preds(struct ftrace_event_call *call);
+extern void filter_disable_preds(struct ftrace_event_call *call);
 extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
 extern void filter_free_subsystem_preds(struct event_subsystem *system);
 extern int filter_add_subsystem_pred(struct event_subsystem *system,
@@ -875,7 +879,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 		     struct ring_buffer *buffer,
 		     struct ring_buffer_event *event)
 {
-	if (unlikely(call->preds) && !filter_match_preds(call, rec)) {
+	if (unlikely(call->n_preds) && !filter_match_preds(call, rec)) {
 		ring_buffer_discard_commit(buffer, event);
 		return 1;
 	}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 789e14eb09a..ead68ac9919 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -481,7 +481,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	filter_print_preds(call->preds, s);
+	filter_print_preds(call->preds, call->n_preds, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
@@ -516,7 +516,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	}
 
 	if (pred->clear) {
-		filter_free_preds(call);
+		filter_disable_preds(call);
 		filter_free_pred(pred);
 		return cnt;
 	}
@@ -527,6 +527,8 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		return err;
 	}
 
+	filter_free_pred(pred);
+
 	*ppos += cnt;
 
 	return cnt;
@@ -549,7 +551,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	filter_print_preds(system->preds, s);
+	filter_print_preds(system->preds, system->n_preds, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
@@ -712,6 +714,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 	list_add(&system->list, &event_subsystems);
 
 	system->preds = NULL;
+	system->n_preds = 0;
 
 	entry = debugfs_create_file("filter", 0644, system->entry, system,
 				    &ftrace_subsystem_filter_fops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 9f8ecca34a5..de42dad42a8 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -82,25 +82,27 @@ static int filter_pred_string(struct filter_pred *pred, void *event)
 	return match;
 }
 
+static int filter_pred_none(struct filter_pred *pred, void *event)
+{
+	return 0;
+}
+
 /* return 1 if event matches, 0 otherwise (discard) */
 int filter_match_preds(struct ftrace_event_call *call, void *rec)
 {
 	int i, matched, and_failed = 0;
 	struct filter_pred *pred;
 
-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (call->preds[i]) {
-			pred = call->preds[i];
-			if (and_failed && !pred->or)
-				continue;
-			matched = pred->fn(pred, rec);
-			if (!matched && !pred->or) {
-				and_failed = 1;
-				continue;
-			} else if (matched && pred->or)
-				return 1;
-		} else
-			break;
+	for (i = 0; i < call->n_preds; i++) {
+		pred = call->preds[i];
+		if (and_failed && !pred->or)
+			continue;
+		matched = pred->fn(pred, rec);
+		if (!matched && !pred->or) {
+			and_failed = 1;
+			continue;
+		} else if (matched && pred->or)
+			return 1;
 	}
 
 	if (and_failed)
@@ -109,31 +111,29 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec)
 	return 1;
 }
 
-void filter_print_preds(struct filter_pred **preds, struct trace_seq *s)
+void filter_print_preds(struct filter_pred **preds, int n_preds,
+			struct trace_seq *s)
 {
 	char *field_name;
 	struct filter_pred *pred;
 	int i;
 
-	if (!preds) {
+	if (!n_preds) {
 		trace_seq_printf(s, "none\n");
 		return;
 	}
 
-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (preds[i]) {
-			pred = preds[i];
-			field_name = pred->field_name;
-			if (i)
-				trace_seq_printf(s, pred->or ? "|| " : "&& ");
-			trace_seq_printf(s, "%s ", field_name);
-			trace_seq_printf(s, pred->not ? "!= " : "== ");
-			if (pred->str_val)
-				trace_seq_printf(s, "%s\n", pred->str_val);
-			else
-				trace_seq_printf(s, "%llu\n", pred->val);
-		} else
-			break;
+	for (i = 0; i < n_preds; i++) {
+		pred = preds[i];
+		field_name = pred->field_name;
+		if (i)
+			trace_seq_printf(s, pred->or ? "|| " : "&& ");
+		trace_seq_printf(s, "%s ", field_name);
+		trace_seq_printf(s, pred->not ? "!= " : "== ");
+		if (pred->str_len)
+			trace_seq_printf(s, "%s\n", pred->str_val);
+		else
+			trace_seq_printf(s, "%llu\n", pred->val);
 	}
 }
 
@@ -156,20 +156,69 @@ void filter_free_pred(struct filter_pred *pred)
 		return;
 
 	kfree(pred->field_name);
-	kfree(pred->str_val);
 	kfree(pred);
 }
 
-void filter_free_preds(struct ftrace_event_call *call)
+static void filter_clear_pred(struct filter_pred *pred)
+{
+	kfree(pred->field_name);
+	pred->field_name = NULL;
+	pred->str_len = 0;
+}
+
+static int filter_set_pred(struct filter_pred *dest,
+			   struct filter_pred *src,
+			   filter_pred_fn_t fn)
+{
+	*dest = *src;
+	dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
+	if (!dest->field_name)
+		return -ENOMEM;
+	dest->fn = fn;
+
+	return 0;
+}
+
+void filter_disable_preds(struct ftrace_event_call *call)
 {
 	int i;
 
-	if (call->preds) {
-		for (i = 0; i < MAX_FILTER_PRED; i++)
+	call->n_preds = 0;
+
+	for (i = 0; i < MAX_FILTER_PRED; i++)
+		call->preds[i]->fn = filter_pred_none;
+}
+
+int init_preds(struct ftrace_event_call *call)
+{
+	struct filter_pred *pred;
+	int i;
+
+	call->n_preds = 0;
+
+	call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
+	if (!call->preds)
+		return -ENOMEM;
+
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+		if (!pred)
+			goto oom;
+		pred->fn = filter_pred_none;
+		call->preds[i] = pred;
+	}
+
+	return 0;
+
+oom:
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		if (call->preds[i])
 			filter_free_pred(call->preds[i]);
-		kfree(call->preds);
-		call->preds = NULL;
 	}
+	kfree(call->preds);
+	call->preds = NULL;
+
+	return -ENOMEM;
 }
 
 void filter_free_subsystem_preds(struct event_subsystem *system)
@@ -177,11 +226,12 @@ void filter_free_subsystem_preds(struct event_subsystem *system)
 	struct ftrace_event_call *call = __start_ftrace_events;
 	int i;
 
-	if (system->preds) {
-		for (i = 0; i < MAX_FILTER_PRED; i++)
+	if (system->n_preds) {
+		for (i = 0; i < system->n_preds; i++)
 			filter_free_pred(system->preds[i]);
 		kfree(system->preds);
 		system->preds = NULL;
+		system->n_preds = 0;
 	}
 
 	events_for_each(call) {
@@ -189,33 +239,31 @@ void filter_free_subsystem_preds(struct event_subsystem *system)
 			continue;
 
 		if (!strcmp(call->system, system->name))
-			filter_free_preds(call);
+			filter_disable_preds(call);
 	}
 }
 
 static int __filter_add_pred(struct ftrace_event_call *call,
-			     struct filter_pred *pred)
+			     struct filter_pred *pred,
+			     filter_pred_fn_t fn)
 {
-	int i;
+	int idx, err;
 
-	if (call->preds && !pred->compound)
-		filter_free_preds(call);
+	if (call->n_preds && !pred->compound)
+		filter_disable_preds(call);
 
-	if (!call->preds) {
-		call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
-				      GFP_KERNEL);
-		if (!call->preds)
-			return -ENOMEM;
-	}
+	if (call->n_preds == MAX_FILTER_PRED)
+		return -ENOSPC;
 
-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (!call->preds[i]) {
-			call->preds[i] = pred;
-			return 0;
-		}
-	}
+	idx = call->n_preds;
+	filter_clear_pred(call->preds[idx]);
+	err = filter_set_pred(call->preds[idx], pred, fn);
+	if (err)
+		return err;
+
+	call->n_preds++;
 
-	return -ENOSPC;
+	return 0;
 }
 
 static int is_string_field(const char *type)
@@ -229,98 +277,66 @@ static int is_string_field(const char *type)
 int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
 {
 	struct ftrace_event_field *field;
+	filter_pred_fn_t fn;
 
 	field = find_event_field(call, pred->field_name);
 	if (!field)
 		return -EINVAL;
 
+	pred->fn = filter_pred_none;
 	pred->offset = field->offset;
 
 	if (is_string_field(field->type)) {
-		if (!pred->str_val)
+		if (!pred->str_len)
 			return -EINVAL;
-		pred->fn = filter_pred_string;
+		fn = filter_pred_string;
 		pred->str_len = field->size;
-		return __filter_add_pred(call, pred);
+		return __filter_add_pred(call, pred, fn);
 	} else {
-		if (pred->str_val)
+		if (pred->str_len)
 			return -EINVAL;
 	}
 
 	switch (field->size) {
 	case 8:
-		pred->fn = filter_pred_64;
+		fn = filter_pred_64;
 		break;
 	case 4:
-		pred->fn = filter_pred_32;
+		fn = filter_pred_32;
 		break;
 	case 2:
-		pred->fn = filter_pred_16;
+		fn = filter_pred_16;
 		break;
 	case 1:
-		pred->fn = filter_pred_8;
+		fn = filter_pred_8;
 		break;
 	default:
 		return -EINVAL;
 	}
 
-	return __filter_add_pred(call, pred);
-}
-
-static struct filter_pred *copy_pred(struct filter_pred *pred)
-{
-	struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL);
-	if (!new_pred)
-		return NULL;
-
-	memcpy(new_pred, pred, sizeof(*pred));
-
-	if (pred->field_name) {
-		new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
-		if (!new_pred->field_name) {
-			kfree(new_pred);
-			return NULL;
-		}
-	}
-
-	if (pred->str_val) {
-		new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL);
-		if (!new_pred->str_val) {
-			filter_free_pred(new_pred);
-			return NULL;
-		}
-	}
-
-	return new_pred;
+	return __filter_add_pred(call, pred, fn);
 }
 
 int filter_add_subsystem_pred(struct event_subsystem *system,
 			      struct filter_pred *pred)
 {
 	struct ftrace_event_call *call = __start_ftrace_events;
-	struct filter_pred *event_pred;
-	int i;
 
-	if (system->preds && !pred->compound)
+	if (system->n_preds && !pred->compound)
 		filter_free_subsystem_preds(system);
 
-	if (!system->preds) {
+	if (!system->n_preds) {
 		system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
 					GFP_KERNEL);
 		if (!system->preds)
 			return -ENOMEM;
 	}
 
-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (!system->preds[i]) {
-			system->preds[i] = pred;
-			break;
-		}
-	}
-
-	if (i == MAX_FILTER_PRED)
+	if (system->n_preds == MAX_FILTER_PRED)
 		return -ENOSPC;
 
+	system->preds[system->n_preds] = pred;
+
 	events_for_each(call) {
 		int err;
 
@@ -333,22 +349,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 		if (!find_event_field(call, pred->field_name))
 			continue;
 
-		event_pred = copy_pred(pred);
-		if (!event_pred)
-			goto oom;
-
-		err = filter_add_pred(call, event_pred);
-		if (err)
-			filter_free_pred(event_pred);
-		if (err == -ENOMEM)
-			goto oom;
+		err = filter_add_pred(call, pred);
+		if (err == -ENOMEM) {
+			system->preds[system->n_preds] = NULL;
+			return err;
+		}
 	}
 
-	return 0;
+	system->n_preds++;
 
-oom:
-	system->preds[i] = NULL;
-	return -ENOMEM;
+	return 0;
 }
 
 int filter_parse(char **pbuf, struct filter_pred *pred)
@@ -410,7 +420,8 @@ int filter_parse(char **pbuf, struct filter_pred *pred)
 		}
 	}
 
-	if (!val_str) {
+	if (!val_str || !strlen(val_str)
+	    || strlen(val_str) >= MAX_FILTER_STR_VAL) {
 		pred->field_name = NULL;
 		return -EINVAL;
 	}
@@ -419,11 +430,12 @@ int filter_parse(char **pbuf, struct filter_pred *pred)
 	if (!pred->field_name)
 		return -ENOMEM;
 
+	pred->str_len = 0;
 	pred->val = simple_strtoull(val_str, &tmp, 0);
 	if (tmp == val_str) {
-		pred->str_val = kstrdup(val_str, GFP_KERNEL);
-		if (!pred->str_val)
-			return -ENOMEM;
+		strncpy(pred->str_val, val_str, MAX_FILTER_STR_VAL);
+		pred->str_len = strlen(val_str);
+		pred->str_val[pred->str_len] = '\0';
 	} else if (*tmp != '\0')
 		return -EINVAL;
 
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 02fb710193e..59cfd7dfe68 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -140,6 +140,7 @@ ftrace_format_##call(struct trace_seq *s)				\
 
 #undef __array
 #define __array(type, item, len)					\
+	BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);				\
 	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
 				 offsetof(typeof(field), item),		\
 				 sizeof(field.item));			\
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index b2b298269eb..5bb1b7ffbdb 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -255,6 +255,7 @@ static int ftrace_raw_init_event_##call(void)				\
 		return -ENODEV;						\
 	event_##call.id = id;						\
 	INIT_LIST_HEAD(&event_##call.fields);				\
+	init_preds(&event_##call);					\
 	return 0;							\
 }									\
 									\
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 77c494f5e1d..48fc02fe73a 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -122,6 +122,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 static int ftrace_raw_init_event_##call(void)				\
 {									\
 	INIT_LIST_HEAD(&event_##call.fields);				\
+	init_preds(&event_##call);					\
 	return 0;							\
 }									\
 
-- 
cgit v1.2.3


From ea20d9293ce423a39717ed4375393129a2e701f9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 10 Apr 2009 08:54:16 -0400
Subject: tracing: consolidate trace and trace_event headers

Impact: clean up

Neil Horman (et. al.) criticized the way the trace events were broken up
into two files. The reason for that was that ftrace needed to separate out
the declarations from where the #include <linux/tracepoint.h> was used.
It then dawned on me that the tracepoint.h header only needs to define the
TRACE_EVENT macro if it is not already defined.

The solution is simply to test if TRACE_EVENT is defined, and if it is not
then the linux/tracepoint.h header can define it. This change consolidates
all the <traces>.h and <traces>_event_types.h into the <traces>.h file.

Reported-by: Neil Horman <nhorman@tuxdriver.com>
Reported-by: Theodore Tso <tytso@mit.edu>
Reported-by: Jiaying Zhang <jiayingz@google.com>
Cc: Zhaolei <zhaolei@cn.fujitsu.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/events.c               | 1 +
 kernel/trace/trace_events_stage_1.h | 4 ++--
 kernel/trace/trace_events_stage_2.h | 8 ++++----
 kernel/trace/trace_events_stage_3.h | 4 ++--
 4 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/events.c b/kernel/trace/events.c
index 246f2aa6dc4..5a35a914f0e 100644
--- a/kernel/trace/events.c
+++ b/kernel/trace/events.c
@@ -8,6 +8,7 @@
 
 #include "trace_output.h"
 
+#define TRACE_HEADER_MULTI_READ
 #include "trace_events_stage_1.h"
 #include "trace_events_stage_2.h"
 #include "trace_events_stage_3.h"
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
index 38985f9b379..475f46a047a 100644
--- a/kernel/trace/trace_events_stage_1.h
+++ b/kernel/trace/trace_events_stage_1.h
@@ -1,7 +1,7 @@
 /*
  * Stage 1 of the trace events.
  *
- * Override the macros in <trace/trace_event_types.h> to include the following:
+ * Override the macros in <trace/trace_events.h> to include the following:
  *
  * struct ftrace_raw_<call> {
  *	struct trace_entry		ent;
@@ -36,4 +36,4 @@
 	};							\
 	static struct ftrace_event_call event_##name
 
-#include <trace/trace_event_types.h>
+#include <trace/trace_events.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 59cfd7dfe68..aa4a67a0656 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -1,7 +1,7 @@
 /*
  * Stage 2 of the trace events.
  *
- * Override the macros in <trace/trace_event_types.h> to include the following:
+ * Override the macros in <trace/trace_events.h> to include the following:
  *
  * enum print_line_t
  * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
@@ -64,7 +64,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 	return TRACE_TYPE_HANDLED;					\
 }
 	
-#include <trace/trace_event_types.h>
+#include <trace/trace_events.h>
 
 /*
  * Setup the showing format of trace point.
@@ -128,7 +128,7 @@ ftrace_format_##call(struct trace_seq *s)				\
 	return ret;							\
 }
 
-#include <trace/trace_event_types.h>
+#include <trace/trace_events.h>
 
 #undef __field
 #define __field(type, item)						\
@@ -167,4 +167,4 @@ ftrace_define_fields_##call(void)					\
 	return ret;							\
 }
 
-#include <trace/trace_event_types.h>
+#include <trace/trace_events.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 5bb1b7ffbdb..45c04e1f38d 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -1,7 +1,7 @@
 /*
  * Stage 3 of the trace events.
  *
- * Override the macros in <trace/trace_event_types.h> to include the following:
+ * Override the macros in <trace/trace_events.h> to include the following:
  *
  * static void ftrace_event_<call>(proto)
  * {
@@ -272,7 +272,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	_TRACE_PROFILE_INIT(call)					\
 }
 
-#include <trace/trace_event_types.h>
+#include <trace/trace_events.h>
 
 #undef _TRACE_PROFILE
 #undef _TRACE_PROFILE_INIT
-- 
cgit v1.2.3


From 56449f437add737a1e5e1cb7e00f63ac8ead1938 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 14 Apr 2009 11:24:36 +0200
Subject: tracing: make the trace clocks available generally

Jeremy Fitzhardinge reported this build failure:

 LD	 .tmp_vmlinux1
 arch/x86/kernel/built-in.o: In function `ds_take_timestamp':
 git/linux/arch/x86/kernel/ds.c:1380: undefined reference to `trace_clock_global'
 git/linux/arch/x86/kernel/ds.c:1380: undefined reference to `trace_clock_global'

Which is due to !CONFIG_TRACING && CONFIG_X86_DS=y.

Expose the trace clock code to CONFIG_X86_DS as well.

[ Unfortunately librarizing doesnt work well - ancient architectures
  with no raw_local_irq_save() primitive break the build. ]

Reported-by: Jeremy Fitzhardinge <jeremy@goop.org>
LKML-Reference: <49E4413F.7070700@goop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Makefile | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 2630f5121ec..ecc671e9f14 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -15,11 +15,16 @@ ifdef CONFIG_TRACING_BRANCHES
 KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
 endif
 
+#
+# Make the trace clocks available generally: it's infrastructure
+# relied on by ptrace for example:
+#
+obj-y += trace_clock.o
+
 obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
 
 obj-$(CONFIG_TRACING) += trace.o
-obj-$(CONFIG_TRACING) += trace_clock.o
 obj-$(CONFIG_TRACING) += trace_output.o
 obj-$(CONFIG_TRACING) += trace_stat.o
 obj-$(CONFIG_TRACING) += trace_printk.o
-- 
cgit v1.2.3


From 9504504cbab29ecb694186b1c5b15d3579c43c51 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 11 Apr 2009 12:59:57 -0400
Subject: tracing: make trace_seq operations available for core kernel

In the process to make TRACE_EVENT macro work for modules, the trace_seq
operations must be available for core kernel code.

These operations are quite useful and can be used for other implementations.

The main idea is that we create a trace_seq handle that acts very much
like the seq_file handle.

	struct trace_seq *s = kmalloc(sizeof(*s, GFP_KERNEL);

	trace_seq_init(s);
	trace_seq_printf(s, "some data %d\n", variable);

	printk("%s", s->buffer);

The main use is to allow a top level function call several other functions
that may store printf like data into the buffer. Then at the end, the top
level function can process all the data with any method it would like to.
It could be passed to userspace, output via printk or even use seq_file:

	trace_seq_to_user(s, ubuf, cnt);
	seq_puts(m, s->buffer);

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h        | 15 ++-------------
 kernel/trace/trace_output.h | 16 +---------------
 2 files changed, 3 insertions(+), 28 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b05b6ac982a..1882846b738 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -12,6 +12,8 @@
 #include <linux/kmemtrace.h>
 #include <trace/power.h>
 
+#include <linux/trace_seq.h>
+
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
 
@@ -423,19 +425,6 @@ struct tracer {
 	struct tracer_stat	*stats;
 };
 
-struct trace_seq {
-	unsigned char		buffer[PAGE_SIZE];
-	unsigned int		len;
-	unsigned int		readpos;
-};
-
-static inline void
-trace_seq_init(struct trace_seq *s)
-{
-	s->len = 0;
-	s->readpos = 0;
-}
-
 
 #define TRACE_PIPE_ALL_CPU	-1
 
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 91630217fb4..5c7cbfb65c7 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -1,6 +1,7 @@
 #ifndef __TRACE_EVENTS_H
 #define __TRACE_EVENTS_H
 
+#include <linux/trace_seq.h>
 #include "trace.h"
 
 typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
@@ -20,24 +21,9 @@ trace_print_bprintk_msg_only(struct trace_iterator *iter);
 extern enum print_line_t
 trace_print_printk_msg_only(struct trace_iterator *iter);
 
-extern void trace_print_seq(struct seq_file *m, struct trace_seq *s);
-
-extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
-	__attribute__ ((format (printf, 2, 3)));
-extern int
-trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
 extern int
 seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
 		unsigned long sym_flags);
-extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
-				 size_t cnt);
-extern int trace_seq_puts(struct trace_seq *s, const char *str);
-extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
-extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
-extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
-				size_t len);
-extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
-extern int trace_seq_path(struct trace_seq *s, struct path *path);
 extern int seq_print_userip_objs(const struct userstack_entry *entry,
 				 struct trace_seq *s, unsigned long sym_flags);
 extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
-- 
cgit v1.2.3


From 97f2025153499faa17267a0d4e18c7afaf73f39d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 13 Apr 2009 11:20:49 -0400
Subject: tracing/events: move declarations from trace directory to core
 include

In preparation to allowing trace events to happen in modules, we need
to move some of the local declarations in the kernel/trace directory
into include/linux.

This patch simply moves the declarations and performs no context changes.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h        | 120 +-------------------------------------------
 kernel/trace/trace_output.h |  14 ------
 2 files changed, 1 insertion(+), 133 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1882846b738..6bcdf4af9b2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -13,6 +13,7 @@
 #include <trace/power.h>
 
 #include <linux/trace_seq.h>
+#include <linux/ftrace_event.h>
 
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
@@ -43,20 +44,6 @@ enum trace_type {
 	__TRACE_LAST_TYPE,
 };
 
-/*
- * The trace entry - the most basic unit of tracing. This is what
- * is printed in the end as a single line in the trace output, such as:
- *
- *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
- */
-struct trace_entry {
-	unsigned char		type;
-	unsigned char		flags;
-	unsigned char		preempt_count;
-	int			pid;
-	int			tgid;
-};
-
 /*
  * Function trace entry - function address and parent function addres:
  */
@@ -265,8 +252,6 @@ struct trace_array_cpu {
 	char			comm[TASK_COMM_LEN];
 };
 
-struct trace_iterator;
-
 /*
  * The trace array - an array of per-CPU trace arrays. This is the
  * highest level data structure that individual tracers deal with.
@@ -341,15 +326,6 @@ extern void __ftrace_bad_type(void);
 		__ftrace_bad_type();					\
 	} while (0)
 
-/* Return values for print_line callback */
-enum print_line_t {
-	TRACE_TYPE_PARTIAL_LINE	= 0,	/* Retry after flushing the seq */
-	TRACE_TYPE_HANDLED	= 1,
-	TRACE_TYPE_UNHANDLED	= 2,	/* Relay to other output functions */
-	TRACE_TYPE_NO_CONSUME	= 3	/* Handled but ask to not consume */
-};
-
-
 /*
  * An option specific to a tracer. This is a boolean value.
  * The bit is the bit index that sets its value on the
@@ -428,31 +404,6 @@ struct tracer {
 
 #define TRACE_PIPE_ALL_CPU	-1
 
-/*
- * Trace iterator - used by printout routines who present trace
- * results to users and which routines might sleep, etc:
- */
-struct trace_iterator {
-	struct trace_array	*tr;
-	struct tracer		*trace;
-	void			*private;
-	int			cpu_file;
-	struct mutex		mutex;
-	struct ring_buffer_iter	*buffer_iter[NR_CPUS];
-
-	/* The below is zeroed out in pipe_read */
-	struct trace_seq	seq;
-	struct trace_entry	*ent;
-	int			cpu;
-	u64			ts;
-
-	unsigned long		iter_flags;
-	loff_t			pos;
-	long			idx;
-
-	cpumask_var_t		started;
-};
-
 int tracer_init(struct tracer *t, struct trace_array *tr);
 int tracing_is_enabled(void);
 void trace_wake_up(void);
@@ -479,15 +430,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
 				struct ring_buffer_event *event,
 				unsigned long flags, int pc);
 
-struct ring_buffer_event *
-trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
-				  unsigned long flags, int pc);
-void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
-					unsigned long flags, int pc);
-void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
-					unsigned long flags, int pc);
-void trace_current_buffer_discard_commit(struct ring_buffer_event *event);
-
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 						struct trace_array_cpu *data);
 
@@ -510,7 +452,6 @@ void tracing_sched_switch_trace(struct trace_array *tr,
 				struct task_struct *prev,
 				struct task_struct *next,
 				unsigned long flags, int pc);
-void tracing_record_cmdline(struct task_struct *tsk);
 
 void tracing_sched_wakeup_trace(struct trace_array *tr,
 				struct task_struct *wakee,
@@ -790,28 +731,6 @@ struct ftrace_event_field {
 	int			size;
 };
 
-struct ftrace_event_call {
-	char			*name;
-	char			*system;
-	struct dentry		*dir;
-	int			enabled;
-	int			(*regfunc)(void);
-	void			(*unregfunc)(void);
-	int			id;
-	int			(*raw_init)(void);
-	int			(*show_format)(struct trace_seq *s);
-	int			(*define_fields)(void);
-	struct list_head	fields;
-	int			n_preds;
-	struct filter_pred	**preds;
-
-#ifdef CONFIG_EVENT_PROFILE
-	atomic_t	profile_count;
-	int		(*profile_enable)(struct ftrace_event_call *);
-	void		(*profile_disable)(struct ftrace_event_call *);
-#endif
-};
-
 struct event_subsystem {
 	struct list_head	list;
 	const char		*name;
@@ -825,9 +744,6 @@ struct event_subsystem {
 	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
 	     event++)
 
-#define MAX_FILTER_PRED		8
-#define MAX_FILTER_STR_VAL	128
-
 struct filter_pred;
 
 typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
@@ -845,9 +761,6 @@ struct filter_pred {
 	int clear;
 };
 
-int trace_define_field(struct ftrace_event_call *call, char *type,
-		       char *name, int offset, int size);
-extern int init_preds(struct ftrace_event_call *call);
 extern void filter_free_pred(struct filter_pred *pred);
 extern void filter_print_preds(struct filter_pred **preds, int n_preds,
 			       struct trace_seq *s);
@@ -855,13 +768,9 @@ extern int filter_parse(char **pbuf, struct filter_pred *pred);
 extern int filter_add_pred(struct ftrace_event_call *call,
 			   struct filter_pred *pred);
 extern void filter_disable_preds(struct ftrace_event_call *call);
-extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
 extern void filter_free_subsystem_preds(struct event_subsystem *system);
 extern int filter_add_subsystem_pred(struct event_subsystem *system,
 				     struct filter_pred *pred);
-extern int filter_current_check_discard(struct ftrace_event_call *call,
-					void *rec,
-					struct ring_buffer_event *event);
 
 static inline int
 filter_check_discard(struct ftrace_event_call *call, void *rec,
@@ -876,14 +785,6 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 	return 0;
 }
 
-#define __common_field(type, item)					\
-	ret = trace_define_field(event_call, #type, "common_" #item,	\
-				 offsetof(typeof(field.ent), item),	\
-				 sizeof(field.ent.item));		\
-	if (ret)							\
-		return ret;
-
-void event_trace_printk(unsigned long ip, const char *fmt, ...);
 extern struct ftrace_event_call __start_ftrace_events[];
 extern struct ftrace_event_call __stop_ftrace_events[];
 
@@ -895,25 +796,6 @@ extern struct ftrace_event_call __stop_ftrace_events[];
 extern const char *__start___trace_bprintk_fmt[];
 extern const char *__stop___trace_bprintk_fmt[];
 
-/*
- * The double __builtin_constant_p is because gcc will give us an error
- * if we try to allocate the static variable to fmt if it is not a
- * constant. Even with the outer if statement optimizing out.
- */
-#define event_trace_printk(ip, fmt, args...)				\
-do {									\
-	__trace_printk_check_format(fmt, ##args);			\
-	tracing_record_cmdline(current);				\
-	if (__builtin_constant_p(fmt)) {				\
-		static const char *trace_printk_fmt			\
-		  __attribute__((section("__trace_printk_fmt"))) =	\
-			__builtin_constant_p(fmt) ? fmt : NULL;		\
-									\
-		__trace_bprintk(ip, trace_printk_fmt, ##args);		\
-	} else								\
-		__trace_printk(ip, fmt, ##args);			\
-} while (0)
-
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
 	extern struct ftrace_event_call event_##call;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 5c7cbfb65c7..6e220a8e570 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -4,18 +4,6 @@
 #include <linux/trace_seq.h>
 #include "trace.h"
 
-typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
-					      int flags);
-
-struct trace_event {
-	struct hlist_node	node;
-	int			type;
-	trace_print_func	trace;
-	trace_print_func	raw;
-	trace_print_func	hex;
-	trace_print_func	binary;
-};
-
 extern enum print_line_t
 trace_print_bprintk_msg_only(struct trace_iterator *iter);
 extern enum print_line_t
@@ -33,8 +21,6 @@ extern int trace_print_context(struct trace_iterator *iter);
 extern int trace_print_lat_context(struct trace_iterator *iter);
 
 extern struct trace_event *ftrace_find_event(int type);
-extern int register_ftrace_event(struct trace_event *event);
-extern int unregister_ftrace_event(struct trace_event *event);
 
 extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
 					 int flags);
-- 
cgit v1.2.3


From f42c85e74faa422cf0bc747ed808681145448f88 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 13 Apr 2009 12:25:37 -0400
Subject: tracing/events: move the ftrace event tracing code to core

This patch moves the ftrace creation into include/trace/ftrace.h and
simplifies the work of developers in adding new tracepoints.
Just the act of creating the trace points in include/trace and including
define_trace.h will create the events in the debugfs/tracing/events
directory.

This patch removes the need of include/trace/trace_events.h

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Makefile               |   1 -
 kernel/trace/events.c               |  15 --
 kernel/trace/trace_events_stage_1.h |  39 -----
 kernel/trace/trace_events_stage_2.h | 170 ----------------------
 kernel/trace/trace_events_stage_3.h | 279 ------------------------------------
 5 files changed, 504 deletions(-)
 delete mode 100644 kernel/trace/events.c
 delete mode 100644 kernel/trace/trace_events_stage_1.h
 delete mode 100644 kernel/trace/trace_events_stage_2.h
 delete mode 100644 kernel/trace/trace_events_stage_3.h

(limited to 'kernel/trace')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 3ad367e7c97..fb9d7f96489 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events.o
-obj-$(CONFIG_EVENT_TRACER) += events.o
 obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
deleted file mode 100644
index 5a35a914f0e..00000000000
--- a/kernel/trace/events.c
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * This is the place to register all trace points as events.
- */
-
-#include <linux/stringify.h>
-
-#include <trace/trace_events.h>
-
-#include "trace_output.h"
-
-#define TRACE_HEADER_MULTI_READ
-#include "trace_events_stage_1.h"
-#include "trace_events_stage_2.h"
-#include "trace_events_stage_3.h"
-
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
deleted file mode 100644
index 475f46a047a..00000000000
--- a/kernel/trace/trace_events_stage_1.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Stage 1 of the trace events.
- *
- * Override the macros in <trace/trace_events.h> to include the following:
- *
- * struct ftrace_raw_<call> {
- *	struct trace_entry		ent;
- *	<type>				<item>;
- *	<type2>				<item2>[<len>];
- *	[...]
- * };
- *
- * The <type> <item> is created by the __field(type, item) macro or
- * the __array(type2, item2, len) macro.
- * We simply do "type item;", and that will create the fields
- * in the structure.
- */
-
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)
-
-#undef __array
-#define __array(type, item, len)	type	item[len];
-
-#undef __field
-#define __field(type, item)		type	item;
-
-#undef TP_STRUCT__entry
-#define TP_STRUCT__entry(args...) args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
-	struct ftrace_raw_##name {				\
-		struct trace_entry	ent;			\
-		tstruct						\
-	};							\
-	static struct ftrace_event_call event_##name
-
-#include <trace/trace_events.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
deleted file mode 100644
index aa4a67a0656..00000000000
--- a/kernel/trace/trace_events_stage_2.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Stage 2 of the trace events.
- *
- * Override the macros in <trace/trace_events.h> to include the following:
- *
- * enum print_line_t
- * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
- * {
- *	struct trace_seq *s = &iter->seq;
- *	struct ftrace_raw_<call> *field; <-- defined in stage 1
- *	struct trace_entry *entry;
- *	int ret;
- *
- *	entry = iter->ent;
- *
- *	if (entry->type != event_<call>.id) {
- *		WARN_ON_ONCE(1);
- *		return TRACE_TYPE_UNHANDLED;
- *	}
- *
- *	field = (typeof(field))entry;
- *
- *	ret = trace_seq_printf(s, <TP_printk> "\n");
- *	if (!ret)
- *		return TRACE_TYPE_PARTIAL_LINE;
- *
- *	return TRACE_TYPE_HANDLED;
- * }
- *
- * This is the method used to print the raw event to the trace
- * output format. Note, this is not needed if the data is read
- * in binary.
- */
-
-#undef __entry
-#define __entry field
-
-#undef TP_printk
-#define TP_printk(fmt, args...) fmt "\n", args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
-enum print_line_t							\
-ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
-{									\
-	struct trace_seq *s = &iter->seq;				\
-	struct ftrace_raw_##call *field;				\
-	struct trace_entry *entry;					\
-	int ret;							\
-									\
-	entry = iter->ent;						\
-									\
-	if (entry->type != event_##call.id) {				\
-		WARN_ON_ONCE(1);					\
-		return TRACE_TYPE_UNHANDLED;				\
-	}								\
-									\
-	field = (typeof(field))entry;					\
-									\
-	ret = trace_seq_printf(s, #call ": " print);			\
-	if (!ret)							\
-		return TRACE_TYPE_PARTIAL_LINE;				\
-									\
-	return TRACE_TYPE_HANDLED;					\
-}
-	
-#include <trace/trace_events.h>
-
-/*
- * Setup the showing format of trace point.
- *
- * int
- * ftrace_format_##call(struct trace_seq *s)
- * {
- *	struct ftrace_raw_##call field;
- *	int ret;
- *
- *	ret = trace_seq_printf(s, #type " " #item ";"
- *			       " offset:%u; size:%u;\n",
- *			       offsetof(struct ftrace_raw_##call, item),
- *			       sizeof(field.type));
- *
- * }
- */
-
-#undef TP_STRUCT__entry
-#define TP_STRUCT__entry(args...) args
-
-#undef __field
-#define __field(type, item)					\
-	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
-			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
-	if (!ret)							\
-		return 0;
-
-#undef __array
-#define __array(type, item, len)						\
-	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
-			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
-	if (!ret)							\
-		return 0;
-
-#undef __entry
-#define __entry REC
-
-#undef TP_printk
-#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
-
-#undef TP_fast_assign
-#define TP_fast_assign(args...) args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
-static int								\
-ftrace_format_##call(struct trace_seq *s)				\
-{									\
-	struct ftrace_raw_##call field;					\
-	int ret;							\
-									\
-	tstruct;							\
-									\
-	trace_seq_printf(s, "\nprint fmt: " print);			\
-									\
-	return ret;							\
-}
-
-#include <trace/trace_events.h>
-
-#undef __field
-#define __field(type, item)						\
-	ret = trace_define_field(event_call, #type, #item,		\
-				 offsetof(typeof(field), item),		\
-				 sizeof(field.item));			\
-	if (ret)							\
-		return ret;
-
-#undef __array
-#define __array(type, item, len)					\
-	BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);				\
-	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
-				 offsetof(typeof(field), item),		\
-				 sizeof(field.item));			\
-	if (ret)							\
-		return ret;
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
-int									\
-ftrace_define_fields_##call(void)					\
-{									\
-	struct ftrace_raw_##call field;					\
-	struct ftrace_event_call *event_call = &event_##call;		\
-	int ret;							\
-									\
-	__common_field(unsigned char, type);				\
-	__common_field(unsigned char, flags);				\
-	__common_field(unsigned char, preempt_count);			\
-	__common_field(int, pid);					\
-	__common_field(int, tgid);					\
-									\
-	tstruct;							\
-									\
-	return ret;							\
-}
-
-#include <trace/trace_events.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
deleted file mode 100644
index 45c04e1f38d..00000000000
--- a/kernel/trace/trace_events_stage_3.h
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- * Stage 3 of the trace events.
- *
- * Override the macros in <trace/trace_events.h> to include the following:
- *
- * static void ftrace_event_<call>(proto)
- * {
- *	event_trace_printk(_RET_IP_, "<call>: " <fmt>);
- * }
- *
- * static int ftrace_reg_event_<call>(void)
- * {
- *	int ret;
- *
- *	ret = register_trace_<call>(ftrace_event_<call>);
- *	if (!ret)
- *		pr_info("event trace: Could not activate trace point "
- *			"probe to  <call>");
- *	return ret;
- * }
- *
- * static void ftrace_unreg_event_<call>(void)
- * {
- *	unregister_trace_<call>(ftrace_event_<call>);
- * }
- *
- * For those macros defined with TRACE_FORMAT:
- *
- * static struct ftrace_event_call __used
- * __attribute__((__aligned__(4)))
- * __attribute__((section("_ftrace_events"))) event_<call> = {
- *	.name			= "<call>",
- *	.regfunc		= ftrace_reg_event_<call>,
- *	.unregfunc		= ftrace_unreg_event_<call>,
- * }
- *
- *
- * For those macros defined with TRACE_EVENT:
- *
- * static struct ftrace_event_call event_<call>;
- *
- * static void ftrace_raw_event_<call>(proto)
- * {
- *	struct ring_buffer_event *event;
- *	struct ftrace_raw_<call> *entry; <-- defined in stage 1
- *	unsigned long irq_flags;
- *	int pc;
- *
- *	local_save_flags(irq_flags);
- *	pc = preempt_count();
- *
- *	event = trace_current_buffer_lock_reserve(event_<call>.id,
- *				  sizeof(struct ftrace_raw_<call>),
- *				  irq_flags, pc);
- *	if (!event)
- *		return;
- *	entry	= ring_buffer_event_data(event);
- *
- *	<assign>;  <-- Here we assign the entries by the __field and
- *			__array macros.
- *
- *	trace_current_buffer_unlock_commit(event, irq_flags, pc);
- * }
- *
- * static int ftrace_raw_reg_event_<call>(void)
- * {
- *	int ret;
- *
- *	ret = register_trace_<call>(ftrace_raw_event_<call>);
- *	if (!ret)
- *		pr_info("event trace: Could not activate trace point "
- *			"probe to <call>");
- *	return ret;
- * }
- *
- * static void ftrace_unreg_event_<call>(void)
- * {
- *	unregister_trace_<call>(ftrace_raw_event_<call>);
- * }
- *
- * static struct trace_event ftrace_event_type_<call> = {
- *	.trace			= ftrace_raw_output_<call>, <-- stage 2
- * };
- *
- * static int ftrace_raw_init_event_<call>(void)
- * {
- *	int id;
- *
- *	id = register_ftrace_event(&ftrace_event_type_<call>);
- *	if (!id)
- *		return -ENODEV;
- *	event_<call>.id = id;
- *	return 0;
- * }
- *
- * static struct ftrace_event_call __used
- * __attribute__((__aligned__(4)))
- * __attribute__((section("_ftrace_events"))) event_<call> = {
- *	.name			= "<call>",
- *	.system			= "<system>",
- *	.raw_init		= ftrace_raw_init_event_<call>,
- *	.regfunc		= ftrace_reg_event_<call>,
- *	.unregfunc		= ftrace_unreg_event_<call>,
- *	.show_format		= ftrace_format_<call>,
- * }
- *
- */
-
-#undef TP_FMT
-#define TP_FMT(fmt, args...)	fmt "\n", ##args
-
-#ifdef CONFIG_EVENT_PROFILE
-#define _TRACE_PROFILE(call, proto, args)				\
-static void ftrace_profile_##call(proto)				\
-{									\
-	extern void perf_tpcounter_event(int);				\
-	perf_tpcounter_event(event_##call.id);				\
-}									\
-									\
-static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \
-{									\
-	int ret = 0;							\
-									\
-	if (!atomic_inc_return(&call->profile_count))			\
-		ret = register_trace_##call(ftrace_profile_##call);	\
-									\
-	return ret;							\
-}									\
-									\
-static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
-{									\
-	if (atomic_add_negative(-1, &call->profile_count))		\
-		unregister_trace_##call(ftrace_profile_##call);		\
-}
-
-#define _TRACE_PROFILE_INIT(call)					\
-	.profile_count = ATOMIC_INIT(-1),				\
-	.profile_enable = ftrace_profile_enable_##call,			\
-	.profile_disable = ftrace_profile_disable_##call,
-
-#else
-#define _TRACE_PROFILE(call, proto, args)
-#define _TRACE_PROFILE_INIT(call)
-#endif
-
-#define _TRACE_FORMAT(call, proto, args, fmt)				\
-static void ftrace_event_##call(proto)					\
-{									\
-	event_trace_printk(_RET_IP_, #call ": " fmt);			\
-}									\
-									\
-static int ftrace_reg_event_##call(void)				\
-{									\
-	int ret;							\
-									\
-	ret = register_trace_##call(ftrace_event_##call);		\
-	if (ret)							\
-		pr_info("event trace: Could not activate trace point "	\
-			"probe to " #call "\n");			\
-	return ret;							\
-}									\
-									\
-static void ftrace_unreg_event_##call(void)				\
-{									\
-	unregister_trace_##call(ftrace_event_##call);			\
-}									\
-									\
-static struct ftrace_event_call event_##call;				\
-									\
-static int ftrace_init_event_##call(void)				\
-{									\
-	int id;								\
-									\
-	id = register_ftrace_event(NULL);				\
-	if (!id)							\
-		return -ENODEV;						\
-	event_##call.id = id;						\
-	return 0;							\
-}
-
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)				\
-_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))		\
-_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\
-static struct ftrace_event_call __used					\
-__attribute__((__aligned__(4)))						\
-__attribute__((section("_ftrace_events"))) event_##call = {		\
-	.name			= #call,				\
-	.system			= __stringify(TRACE_SYSTEM),		\
-	.raw_init		= ftrace_init_event_##call,		\
-	.regfunc		= ftrace_reg_event_##call,		\
-	.unregfunc		= ftrace_unreg_event_##call,		\
-	_TRACE_PROFILE_INIT(call)					\
-}
-
-#undef __entry
-#define __entry entry
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
-_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\
-									\
-static struct ftrace_event_call event_##call;				\
-									\
-static void ftrace_raw_event_##call(proto)				\
-{									\
-	struct ftrace_event_call *call = &event_##call;			\
-	struct ring_buffer_event *event;				\
-	struct ftrace_raw_##call *entry;				\
-	unsigned long irq_flags;					\
-	int pc;								\
-									\
-	local_save_flags(irq_flags);					\
-	pc = preempt_count();						\
-									\
-	event = trace_current_buffer_lock_reserve(event_##call.id,	\
-				  sizeof(struct ftrace_raw_##call),	\
-				  irq_flags, pc);			\
-	if (!event)							\
-		return;							\
-	entry	= ring_buffer_event_data(event);			\
-									\
-	assign;								\
-									\
-	if (!filter_current_check_discard(call, entry, event))		\
-		trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
-}									\
-									\
-static int ftrace_raw_reg_event_##call(void)				\
-{									\
-	int ret;							\
-									\
-	ret = register_trace_##call(ftrace_raw_event_##call);		\
-	if (ret)							\
-		pr_info("event trace: Could not activate trace point "	\
-			"probe to " #call "\n");			\
-	return ret;							\
-}									\
-									\
-static void ftrace_raw_unreg_event_##call(void)				\
-{									\
-	unregister_trace_##call(ftrace_raw_event_##call);		\
-}									\
-									\
-static struct trace_event ftrace_event_type_##call = {			\
-	.trace			= ftrace_raw_output_##call,		\
-};									\
-									\
-static int ftrace_raw_init_event_##call(void)				\
-{									\
-	int id;								\
-									\
-	id = register_ftrace_event(&ftrace_event_type_##call);		\
-	if (!id)							\
-		return -ENODEV;						\
-	event_##call.id = id;						\
-	INIT_LIST_HEAD(&event_##call.fields);				\
-	init_preds(&event_##call);					\
-	return 0;							\
-}									\
-									\
-static struct ftrace_event_call __used					\
-__attribute__((__aligned__(4)))						\
-__attribute__((section("_ftrace_events"))) event_##call = {		\
-	.name			= #call,				\
-	.system			= __stringify(TRACE_SYSTEM),		\
-	.raw_init		= ftrace_raw_init_event_##call,		\
-	.regfunc		= ftrace_raw_reg_event_##call,		\
-	.unregfunc		= ftrace_raw_unreg_event_##call,	\
-	.show_format		= ftrace_format_##call,			\
-	.define_fields		= ftrace_define_fields_##call,		\
-	_TRACE_PROFILE_INIT(call)					\
-}
-
-#include <trace/trace_events.h>
-
-#undef _TRACE_PROFILE
-#undef _TRACE_PROFILE_INIT
-
-- 
cgit v1.2.3


From a59fd6027218bd7c994e39d14afe0242f895144f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 10 Apr 2009 13:52:20 -0400
Subject: tracing/events: convert event call sites to use a link list

Impact: makes it possible to define events in modules

The events are created by reading down the section that they are linked
in by the macros. But this is not scalable to modules. This patch converts
the manipulations to use a global link list, and on boot up it adds
the items in the section to the list.

This change will allow modules to add their tracing events to the list as
well.

Note, this change alone does not permit modules to use the TRACE_EVENT macros,
but the change is needed for them to eventually do so.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h               | 13 +---------
 kernel/trace/trace_event_profile.c |  4 +--
 kernel/trace/trace_events.c        | 51 +++++++++++++++++++++++---------------
 kernel/trace/trace_events_filter.c |  8 +++---
 4 files changed, 38 insertions(+), 38 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6bcdf4af9b2..8817c18ef97 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -739,11 +739,6 @@ struct event_subsystem {
 	struct filter_pred	**preds;
 };
 
-#define events_for_each(event)						\
-	for (event = __start_ftrace_events;				\
-	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
-	     event++)
-
 struct filter_pred;
 
 typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
@@ -785,13 +780,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 	return 0;
 }
 
-extern struct ftrace_event_call __start_ftrace_events[];
-extern struct ftrace_event_call __stop_ftrace_events[];
-
-#define for_each_event(event)						\
-	for (event = __start_ftrace_events;				\
-	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
-	     event++)
+extern struct list_head ftrace_events;
 
 extern const char *__start___trace_bprintk_fmt[];
 extern const char *__stop___trace_bprintk_fmt[];
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 199de9c7422..7bf2ad65eee 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -11,7 +11,7 @@ int ftrace_profile_enable(int event_id)
 {
 	struct ftrace_event_call *event;
 
-	for_each_event(event) {
+	list_for_each_entry(event, &ftrace_events, list) {
 		if (event->id == event_id)
 			return event->profile_enable(event);
 	}
@@ -23,7 +23,7 @@ void ftrace_profile_disable(int event_id)
 {
 	struct ftrace_event_call *event;
 
-	for_each_event(event) {
+	list_for_each_entry(event, &ftrace_events, list) {
 		if (event->id == event_id)
 			return event->profile_disable(event);
 	}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ead68ac9919..5c66aaff07c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -19,6 +19,8 @@
 
 static DEFINE_MUTEX(event_mutex);
 
+LIST_HEAD(ftrace_events);
+
 int trace_define_field(struct ftrace_event_call *call, char *type,
 		       char *name, int offset, int size)
 {
@@ -54,16 +56,14 @@ err:
 
 static void ftrace_clear_events(void)
 {
-	struct ftrace_event_call *call = (void *)__start_ftrace_events;
-
+	struct ftrace_event_call *call;
 
-	while ((unsigned long)call < (unsigned long)__stop_ftrace_events) {
+	list_for_each_entry(call, &ftrace_events, list) {
 
 		if (call->enabled) {
 			call->enabled = 0;
 			call->unregfunc();
 		}
-		call++;
 	}
 }
 
@@ -89,7 +89,7 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 
 static int ftrace_set_clr_event(char *buf, int set)
 {
-	struct ftrace_event_call *call = __start_ftrace_events;
+	struct ftrace_event_call *call;
 	char *event = NULL, *sub = NULL, *match;
 	int ret = -EINVAL;
 
@@ -118,7 +118,7 @@ static int ftrace_set_clr_event(char *buf, int set)
 	}
 
 	mutex_lock(&event_mutex);
-	for_each_event(call) {
+	list_for_each_entry(call, &ftrace_events, list) {
 
 		if (!call->name || !call->regfunc)
 			continue;
@@ -224,15 +224,17 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
 static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct ftrace_event_call *call = m->private;
-	struct ftrace_event_call *next = call;
+	struct list_head *list = m->private;
+	struct ftrace_event_call *call;
 
 	(*pos)++;
 
 	for (;;) {
-		if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+		if (list == &ftrace_events)
 			return NULL;
 
+		call = list_entry(list, struct ftrace_event_call, list);
+
 		/*
 		 * The ftrace subsystem is for showing formats only.
 		 * They can not be enabled or disabled via the event files.
@@ -240,11 +242,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 		if (call->regfunc)
 			break;
 
-		call++;
-		next = call;
+		list = list->next;
 	}
 
-	m->private = ++next;
+	m->private = list->next;
 
 	return call;
 }
@@ -257,22 +258,23 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 static void *
 s_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct ftrace_event_call *call = m->private;
-	struct ftrace_event_call *next;
+	struct list_head *list = m->private;
+	struct ftrace_event_call *call;
 
 	(*pos)++;
 
  retry:
-	if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+	if (list == &ftrace_events)
 		return NULL;
 
+	call = list_entry(list, struct ftrace_event_call, list);
+
 	if (!call->enabled) {
-		call++;
+		list = list->next;
 		goto retry;
 	}
 
-	next = call;
-	m->private = ++next;
+	m->private = list->next;
 
 	return call;
 }
@@ -312,7 +314,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
 	if (!ret) {
 		struct seq_file *m = file->private_data;
 
-		m->private = __start_ftrace_events;
+		m->private = ftrace_events.next;
 	}
 	return ret;
 }
@@ -797,9 +799,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 	return 0;
 }
 
+extern struct ftrace_event_call __start_ftrace_events[];
+extern struct ftrace_event_call __stop_ftrace_events[];
+
+#define for_each_event(event)						\
+	for (event = __start_ftrace_events;				\
+	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
+	     event++)
+
 static __init int event_trace_init(void)
 {
-	struct ftrace_event_call *call = __start_ftrace_events;
+	struct ftrace_event_call *call;
 	struct dentry *d_tracer;
 	struct dentry *entry;
 	struct dentry *d_events;
@@ -830,6 +840,7 @@ static __init int event_trace_init(void)
 		/* The linker may leave blanks */
 		if (!call->name)
 			continue;
+		list_add(&call->list, &ftrace_events);
 		event_create_dir(call, d_events);
 	}
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index de42dad42a8..d30b06b02b4 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -223,7 +223,7 @@ oom:
 
 void filter_free_subsystem_preds(struct event_subsystem *system)
 {
-	struct ftrace_event_call *call = __start_ftrace_events;
+	struct ftrace_event_call *call;
 	int i;
 
 	if (system->n_preds) {
@@ -234,7 +234,7 @@ void filter_free_subsystem_preds(struct event_subsystem *system)
 		system->n_preds = 0;
 	}
 
-	events_for_each(call) {
+	list_for_each_entry(call, &ftrace_events, list) {
 		if (!call->define_fields)
 			continue;
 
@@ -320,7 +320,7 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
 int filter_add_subsystem_pred(struct event_subsystem *system,
 			      struct filter_pred *pred)
 {
-	struct ftrace_event_call *call = __start_ftrace_events;
+	struct ftrace_event_call *call;
 
 	if (system->n_preds && !pred->compound)
 		filter_free_subsystem_preds(system);
@@ -337,7 +337,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 
 	system->preds[system->n_preds] = pred;
 
-	events_for_each(call) {
+	list_for_each_entry(call, &ftrace_events, list) {
 		int err;
 
 		if (!call->define_fields)
-- 
cgit v1.2.3


From 17c873ec280a03894bc718af817f7f24fa787ae1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 10 Apr 2009 18:12:50 -0400
Subject: tracing/events: add export symbols for trace events in modules

Impact: let modules add trace events

The trace event code requires some functions to be exported to allow
modules to use TRACE_EVENT. This patch adds EXPORT_SYMBOL_GPL to the
necessary functions.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c               | 3 +++
 kernel/trace/trace_events.c        | 1 +
 kernel/trace/trace_events_filter.c | 2 ++
 kernel/trace/trace_output.c        | 3 +++
 4 files changed, 9 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c0047fcf707..2d69b26b3cc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -176,6 +176,7 @@ int filter_current_check_discard(struct ftrace_event_call *call, void *rec,
 {
 	return filter_check_discard(call, rec, global_trace.buffer, event);
 }
+EXPORT_SYMBOL_GPL(filter_current_check_discard);
 
 cycle_t ftrace_now(int cpu)
 {
@@ -886,6 +887,7 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
 	return trace_buffer_lock_reserve(&global_trace,
 					 type, len, flags, pc);
 }
+EXPORT_SYMBOL(trace_current_buffer_lock_reserve);
 
 void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
@@ -903,6 +905,7 @@ void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
 {
 	ring_buffer_discard_commit(global_trace.buffer, event);
 }
+EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
 
 void
 trace_function(struct trace_array *tr,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5c66aaff07c..8b9e621b80b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -53,6 +53,7 @@ err:
 
 	return -ENOMEM;
 }
+EXPORT_SYMBOL_GPL(trace_define_field);
 
 static void ftrace_clear_events(void)
 {
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index d30b06b02b4..f8e5eab0424 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -110,6 +110,7 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec)
 
 	return 1;
 }
+EXPORT_SYMBOL_GPL(filter_match_preds);
 
 void filter_print_preds(struct filter_pred **preds, int n_preds,
 			struct trace_seq *s)
@@ -220,6 +221,7 @@ oom:
 
 	return -ENOMEM;
 }
+EXPORT_SYMBOL_GPL(init_preds);
 
 void filter_free_subsystem_preds(struct event_subsystem *system)
 {
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0e70fb07ca7..83a8abb9640 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -94,6 +94,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 
 	return len;
 }
+EXPORT_SYMBOL_GPL(trace_seq_printf);
 
 int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
 {
@@ -538,6 +539,7 @@ int register_ftrace_event(struct trace_event *event)
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(register_ftrace_event);
 
 /**
  * unregister_ftrace_event - remove a no longer used event
@@ -551,6 +553,7 @@ int unregister_ftrace_event(struct trace_event *event)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(unregister_ftrace_event);
 
 /*
  * Standard events
-- 
cgit v1.2.3


From 6d723736e472f7a0cd5b62c84152fceead241328 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 10 Apr 2009 14:53:50 -0400
Subject: tracing/events: add support for modules to TRACE_EVENT

Impact: allow modules to add TRACE_EVENTS on load

This patch adds the final hooks to allow modules to use the TRACE_EVENT
macro. A notifier and a data structure are used to link the TRACE_EVENTs
defined in the module to connect them with the ftrace event tracing system.

It also adds the necessary automated clean ups to the trace events when a
module is removed.

Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 128 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 96 insertions(+), 32 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 8b9e621b80b..a4b177720a6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -713,7 +713,13 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 		return d_events;
 	}
 
-	system->name = name;
+	system->name = kstrdup(name, GFP_KERNEL);
+	if (!system->name) {
+		debugfs_remove(system->entry);
+		kfree(system);
+		return d_events;
+	}
+
 	list_add(&system->list, &event_subsystems);
 
 	system->preds = NULL;
@@ -738,7 +744,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 	 * If the trace point header did not define TRACE_SYSTEM
 	 * then the system would be called "TRACE_SYSTEM".
 	 */
-	if (strcmp(call->system, "TRACE_SYSTEM") != 0)
+	if (strcmp(call->system, TRACE_SYSTEM) != 0)
 		d_events = event_subsystem_dir(call->system, d_events);
 
 	if (call->raw_init) {
@@ -757,21 +763,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 		return -1;
 	}
 
-	if (call->regfunc) {
-		entry = debugfs_create_file("enable", 0644, call->dir, call,
-					    &ftrace_enable_fops);
-		if (!entry)
-			pr_warning("Could not create debugfs "
-				   "'%s/enable' entry\n", call->name);
-	}
+	if (call->regfunc)
+		entry = trace_create_file("enable", 0644, call->dir, call,
+					  &ftrace_enable_fops);
 
-	if (call->id) {
-		entry = debugfs_create_file("id", 0444, call->dir, call,
-				&ftrace_event_id_fops);
-		if (!entry)
-			pr_warning("Could not create debugfs '%s/id' entry\n",
-					call->name);
-	}
+	if (call->id)
+		entry = trace_create_file("id", 0444, call->dir, call,
+					  &ftrace_event_id_fops);
 
 	if (call->define_fields) {
 		ret = call->define_fields();
@@ -780,40 +778,102 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 				   " events/%s\n", call->name);
 			return ret;
 		}
-		entry = debugfs_create_file("filter", 0644, call->dir, call,
-					    &ftrace_event_filter_fops);
-		if (!entry)
-			pr_warning("Could not create debugfs "
-				   "'%s/filter' entry\n", call->name);
+		entry = trace_create_file("filter", 0644, call->dir, call,
+					  &ftrace_event_filter_fops);
 	}
 
 	/* A trace may not want to export its format */
 	if (!call->show_format)
 		return 0;
 
-	entry = debugfs_create_file("format", 0444, call->dir, call,
-				    &ftrace_event_format_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'%s/format' entry\n", call->name);
+	entry = trace_create_file("format", 0444, call->dir, call,
+				  &ftrace_event_format_fops);
+
+	return 0;
+}
+
+#define for_each_event(event, start, end)			\
+	for (event = start;					\
+	     (unsigned long)event < (unsigned long)end;		\
+	     event++)
+
+static void trace_module_add_events(struct module *mod)
+{
+	struct ftrace_event_call *call, *start, *end;
+	struct dentry *d_events;
+
+	start = mod->trace_events;
+	end = mod->trace_events + mod->num_trace_events;
+
+	if (start == end)
+		return;
+
+	d_events = event_trace_events_dir();
+	if (!d_events)
+		return;
+
+	for_each_event(call, start, end) {
+		/* The linker may leave blanks */
+		if (!call->name)
+			continue;
+		call->mod = mod;
+		list_add(&call->list, &ftrace_events);
+		event_create_dir(call, d_events);
+	}
+}
+
+static void trace_module_remove_events(struct module *mod)
+{
+	struct ftrace_event_call *call, *p;
+
+	list_for_each_entry_safe(call, p, &ftrace_events, list) {
+		if (call->mod == mod) {
+			if (call->enabled) {
+				call->enabled = 0;
+				call->unregfunc();
+			}
+			if (call->event)
+				unregister_ftrace_event(call->event);
+			debugfs_remove_recursive(call->dir);
+			list_del(&call->list);
+		}
+	}
+}
+
+int trace_module_notify(struct notifier_block *self,
+			unsigned long val, void *data)
+{
+	struct module *mod = data;
+
+	mutex_lock(&event_mutex);
+	switch (val) {
+	case MODULE_STATE_COMING:
+		trace_module_add_events(mod);
+		break;
+	case MODULE_STATE_GOING:
+		trace_module_remove_events(mod);
+		break;
+	}
+	mutex_unlock(&event_mutex);
 
 	return 0;
 }
 
+struct notifier_block trace_module_nb = {
+	.notifier_call = trace_module_notify,
+	.priority = 0,
+};
+
 extern struct ftrace_event_call __start_ftrace_events[];
 extern struct ftrace_event_call __stop_ftrace_events[];
 
-#define for_each_event(event)						\
-	for (event = __start_ftrace_events;				\
-	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
-	     event++)
-
 static __init int event_trace_init(void)
 {
 	struct ftrace_event_call *call;
 	struct dentry *d_tracer;
 	struct dentry *entry;
 	struct dentry *d_events;
+	int ret;
 
 	d_tracer = tracing_init_dentry();
 	if (!d_tracer)
@@ -837,7 +897,7 @@ static __init int event_trace_init(void)
 	if (!d_events)
 		return 0;
 
-	for_each_event(call) {
+	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
 		/* The linker may leave blanks */
 		if (!call->name)
 			continue;
@@ -845,6 +905,10 @@ static __init int event_trace_init(void)
 		event_create_dir(call, d_events);
 	}
 
+	ret = register_module_notifier(&trace_module_nb);
+	if (!ret)
+		pr_warning("Failed to register trace events module notifier\n");
+
 	return 0;
 }
 fs_initcall(event_trace_init);
-- 
cgit v1.2.3


From 61f919a12fbdc3fd20f980a34a118d597198a392 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 14 Apr 2009 18:22:32 -0400
Subject: tracing/events: fix compile for modules disabled

Impact: compile fix

The addition of TRACE_EVENT for modules breaks the build for when
modules are disabled. This code fixes that.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a4b177720a6..6591d83e1e7 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -797,6 +797,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 	     (unsigned long)event < (unsigned long)end;		\
 	     event++)
 
+#ifdef CONFIG_MODULES
 static void trace_module_add_events(struct module *mod)
 {
 	struct ftrace_event_call *call, *start, *end;
@@ -840,8 +841,8 @@ static void trace_module_remove_events(struct module *mod)
 	}
 }
 
-int trace_module_notify(struct notifier_block *self,
-			unsigned long val, void *data)
+static int trace_module_notify(struct notifier_block *self,
+			       unsigned long val, void *data)
 {
 	struct module *mod = data;
 
@@ -858,6 +859,13 @@ int trace_module_notify(struct notifier_block *self,
 
 	return 0;
 }
+#else
+static int trace_module_notify(struct notifier_block *self,
+			       unsigned long val, void *data)
+{
+	return 0;
+}
+#endif /* CONFIG_MODULES */
 
 struct notifier_block trace_module_nb = {
 	.notifier_call = trace_module_notify,
-- 
cgit v1.2.3


From ad8d75fff811a6a230f7f43b05a6483099349533 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 14 Apr 2009 19:39:12 -0400
Subject: tracing/events: move trace point headers into include/trace/events

Impact: clean up

Create a sub directory in include/trace called events to keep the
trace point headers in their own separate directory. Only headers that
declare trace points should be defined in this directory.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c             | 2 +-
 kernel/trace/trace_sched_switch.c | 2 +-
 kernel/trace/trace_sched_wakeup.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8e6a0b5c994..a2348898858 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -29,7 +29,7 @@
 #include <linux/list.h>
 #include <linux/hash.h>
 
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include <asm/ftrace.h>
 
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 9d8cccdfaa0..a98106dd979 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -10,7 +10,7 @@
 #include <linux/kallsyms.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include "trace.h"
 
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 5bc00e8f153..b8b13c5540f 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,7 +15,7 @@
 #include <linux/kallsyms.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include "trace.h"
 
-- 
cgit v1.2.3


From d0deef5b14af7d5bbd0003a0a2a1a32326e20a6d Mon Sep 17 00:00:00 2001
From: Shawn Du <duyuyang@gmail.com>
Date: Tue, 14 Apr 2009 13:58:56 +0800
Subject: blktrace: support per-partition tracing

Though one can specify '-d /dev/sda1' when using blktrace, it still
traces the whole sda.

To support per-partition tracing, when we start tracing, we initialize
bt->start_lba and bt->end_lba to the start and end sector of that
partition.

Note some actions are per device, thus we don't filter 0-sector events.

The original patch and discussion can be found here:
	http://marc.info/?l=linux-btrace&m=122949374214540&w=2

Signed-off-by: Shawn Du <duyuyang@gmail.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
LKML-Reference: <49E42620.4050701@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2b98195b338..e932654cf59 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -147,7 +147,7 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
 {
 	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
 		return 1;
-	if (sector < bt->start_lba || sector > bt->end_lba)
+	if (sector && (sector < bt->start_lba || sector > bt->end_lba))
 		return 1;
 	if (bt->pid && pid != bt->pid)
 		return 1;
@@ -192,7 +192,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	what |= MASK_TC_BIT(rw, DISCARD);
 
 	pid = tsk->pid;
-	if (unlikely(act_log_check(bt, what, sector, pid)))
+	if (act_log_check(bt, what, sector, pid))
 		return;
 	cpu = raw_smp_processor_id();
 
@@ -407,11 +407,13 @@ static struct rchan_callbacks blk_relay_callbacks = {
  * Setup everything required to start tracing
  */
 int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
-			struct blk_user_trace_setup *buts)
+		       struct block_device *bdev,
+		       struct blk_user_trace_setup *buts)
 {
 	struct blk_trace *old_bt, *bt = NULL;
 	struct dentry *dir = NULL;
 	int ret, i;
+	struct hd_struct *part = NULL;
 
 	if (!buts->buf_size || !buts->buf_nr)
 		return -EINVAL;
@@ -480,11 +482,21 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (!bt->act_mask)
 		bt->act_mask = (u16) -1;
 
-	bt->start_lba = buts->start_lba;
-	bt->end_lba = buts->end_lba;
-	if (!bt->end_lba)
+	if (bdev)
+		part = bdev->bd_part;
+
+	if (part) {
+		bt->start_lba = part->start_sect;
+		bt->end_lba = part->start_sect + part->nr_sects;
+	} else
 		bt->end_lba = -1ULL;
 
+	/* overwrite with user settings */
+	if (buts->start_lba)
+		bt->start_lba = buts->start_lba;
+	if (buts->end_lba)
+		bt->end_lba = buts->end_lba;
+
 	bt->pid = buts->pid;
 	bt->trace_state = Blktrace_setup;
 
@@ -505,6 +517,7 @@ err:
 }
 
 int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+		    struct block_device *bdev,
 		    char __user *arg)
 {
 	struct blk_user_trace_setup buts;
@@ -514,7 +527,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (ret)
 		return -EFAULT;
 
-	ret = do_blk_trace_setup(q, name, dev, &buts);
+	ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
 	if (ret)
 		return ret;
 
@@ -582,7 +595,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	switch (cmd) {
 	case BLKTRACESETUP:
 		bdevname(bdev, b);
-		ret = blk_trace_setup(q, b, bdev->bd_dev, arg);
+		ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
 		break;
 	case BLKTRACESTART:
 		start = 1;
-- 
cgit v1.2.3


From 9908c30997b8a73c95f836170b9998dae9aa3f4a Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 14 Apr 2009 13:59:34 +0800
Subject: blktrace: support per-partition tracing for ftrace plugin

The previous patch adds support to trace a single partition for
relay+ioctl blktrace, and this patch is for ftrace plugin blktrace:

  # echo 1 > /sys/block/sda/sda7/enable
  # cat start_lba
  102398373
  # cat end_lba
  102703545

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Shawn Du <duyuyang@gmail.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
LKML-Reference: <49E42646.4060608@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 39 +++++++++++++++++++++++++--------------
 1 file changed, 25 insertions(+), 14 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e932654cf59..d1098988052 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -403,6 +403,23 @@ static struct rchan_callbacks blk_relay_callbacks = {
 	.remove_buf_file	= blk_remove_buf_file_callback,
 };
 
+static void blk_trace_setup_lba(struct blk_trace *bt,
+				struct block_device *bdev)
+{
+	struct hd_struct *part = NULL;
+
+	if (bdev)
+		part = bdev->bd_part;
+
+	if (part) {
+		bt->start_lba = part->start_sect;
+		bt->end_lba = part->start_sect + part->nr_sects;
+	} else {
+		bt->start_lba = 0;
+		bt->end_lba = -1ULL;
+	}
+}
+
 /*
  * Setup everything required to start tracing
  */
@@ -413,7 +430,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	struct blk_trace *old_bt, *bt = NULL;
 	struct dentry *dir = NULL;
 	int ret, i;
-	struct hd_struct *part = NULL;
 
 	if (!buts->buf_size || !buts->buf_nr)
 		return -EINVAL;
@@ -482,14 +498,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (!bt->act_mask)
 		bt->act_mask = (u16) -1;
 
-	if (bdev)
-		part = bdev->bd_part;
-
-	if (part) {
-		bt->start_lba = part->start_sect;
-		bt->end_lba = part->start_sect + part->nr_sects;
-	} else
-		bt->end_lba = -1ULL;
+	blk_trace_setup_lba(bt, bdev);
 
 	/* overwrite with user settings */
 	if (buts->start_lba)
@@ -1370,7 +1379,8 @@ static int blk_trace_remove_queue(struct request_queue *q)
 /*
  * Setup everything required to start tracing
  */
-static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
+static int blk_trace_setup_queue(struct request_queue *q,
+				 struct block_device *bdev)
 {
 	struct blk_trace *old_bt, *bt = NULL;
 	int ret = -ENOMEM;
@@ -1383,9 +1393,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
 	if (!bt->msg_data)
 		goto free_bt;
 
-	bt->dev = dev;
+	bt->dev = bdev->bd_dev;
 	bt->act_mask = (u16)-1;
-	bt->end_lba = -1ULL;
+
+	blk_trace_setup_lba(bt, bdev);
 
 	old_bt = xchg(&q->blk_trace, bt);
 	if (old_bt != NULL) {
@@ -1602,7 +1613,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 
 	if (attr == &dev_attr_enable) {
 		if (value)
-			ret = blk_trace_setup_queue(q, bdev->bd_dev);
+			ret = blk_trace_setup_queue(q, bdev);
 		else
 			ret = blk_trace_remove_queue(q);
 		goto out_unlock_bdev;
@@ -1610,7 +1621,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 
 	ret = 0;
 	if (q->blk_trace == NULL)
-		ret = blk_trace_setup_queue(q, bdev->bd_dev);
+		ret = blk_trace_setup_queue(q, bdev);
 
 	if (ret == 0) {
 		if (attr == &dev_attr_act_mask)
-- 
cgit v1.2.3


From 1d54ad6da9192fed5dd3b60224d9f2dfea0dcd82 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 14 Apr 2009 14:00:05 +0800
Subject: blktrace: add trace/ to /sys/block/sda

Impact: allow ftrace-plugin blktrace to trace device-mapper devices

To trace a single partition:
  # echo 1 > /sys/block/sda/sda1/enable

To trace the whole sda instead:
  # echo 1 > /sys/block/sda/enable

Thus we also fix an issue reported by Ted, that ftrace-plugin blktrace
can't be used to trace device-mapper devices.

Now:

  # echo 1 > /sys/block/dm-0/trace/enable
  echo: write error: No such device or address
  # mount -t ext4 /dev/dm-0 /mnt
  # echo 1 > /sys/block/dm-0/trace/enable
  # echo blk > /debug/tracing/current_tracer

Reported-by: Theodore Tso <tytso@mit.edu>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Shawn Du <duyuyang@gmail.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
LKML-Reference: <49E42665.6020506@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d1098988052..8e7c5da3a3e 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1644,3 +1644,8 @@ out:
 	return ret ? ret : count;
 }
 
+int blk_trace_init_sysfs(struct device *dev)
+{
+	return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
+}
+
-- 
cgit v1.2.3


From f3948f8857ef5de239f28a61dddb1554a0ae4c2c Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 15 Apr 2009 11:02:56 +0800
Subject: blktrace: fix context-info when mixed-using blk tracer and trace
 events

When current tracer is set to blk tracer, TRACE_ITER_CONTEXT_INFO is
unset, but actually context-info is printed:

    pdflush-431   [000]   821.181576:   8,0    P   N [pdflush]

And then if we enable TRACE_ITER_CONTEXT_INFO:

    # echo context-info > trace_options

We'll see context-info printed twice. What's worse, when we use blk
tracer and trace events at the same time, we'll see no context-info
for trace events at all:

    jbd2_commit_logging: dev dm-0:8 transaction 333227
    jbd2_end_commit: dev dm-0:8 transaction 333227 head 332814
      rm-25433 [001]  9578.307485:   8,18   m   N cfq25433 slice expired t=0
      rm-25433 [001]  9578.307486:   8,18   m   N cfq25433 put_queue

This patch adds blk_tracer->set_flags(), and context-info flag is unset
only when we set the output to classic mode.

Note after this patch, one should unset context-info explicitly if he
wants to get binary output that can be parsed by blkparse:

    # echo nocontext-info > trace_options
    # echo bin > trace_options
    # echo blk > current_tracer
    # cat trace_pipe | blkparse -i -

Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49E54E60.50408@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 8e7c5da3a3e..c32062bd10b 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1211,7 +1211,6 @@ static void blk_tracer_print_header(struct seq_file *m)
 static void blk_tracer_start(struct trace_array *tr)
 {
 	blk_tracer_enabled = true;
-	trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
 }
 
 static int blk_tracer_init(struct trace_array *tr)
@@ -1224,7 +1223,6 @@ static int blk_tracer_init(struct trace_array *tr)
 static void blk_tracer_stop(struct trace_array *tr)
 {
 	blk_tracer_enabled = false;
-	trace_flags |= TRACE_ITER_CONTEXT_INFO;
 }
 
 static void blk_tracer_reset(struct trace_array *tr)
@@ -1289,9 +1287,6 @@ out:
 static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
 					       int flags)
 {
-	if (!trace_print_context(iter))
-		return TRACE_TYPE_PARTIAL_LINE;
-
 	return print_one_line(iter, false);
 }
 
@@ -1326,6 +1321,18 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
 	return print_one_line(iter, true);
 }
 
+static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set)
+{
+	/* don't output context-info for blk_classic output */
+	if (bit == TRACE_BLK_OPT_CLASSIC) {
+		if (set)
+			trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
+		else
+			trace_flags |= TRACE_ITER_CONTEXT_INFO;
+	}
+	return 0;
+}
+
 static struct tracer blk_tracer __read_mostly = {
 	.name		= "blk",
 	.init		= blk_tracer_init,
@@ -1335,6 +1342,7 @@ static struct tracer blk_tracer __read_mostly = {
 	.print_header	= blk_tracer_print_header,
 	.print_line	= blk_tracer_print_line,
 	.flags		= &blk_tracer_flags,
+	.set_flag	= blk_tracer_set_flag,
 };
 
 static struct trace_event trace_blk_event = {
-- 
cgit v1.2.3


From 93eb677d74a4f7d3edfb678c94f6c0544d9fbad2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 15 Apr 2009 13:24:06 -0400
Subject: ftrace: use module notifier for function tracer

The hooks in the module code for the function tracer must be called
before any of that module code runs. The function tracer hooks
modify the module (replacing calls to mcount to nops). If the code
is executed while the change occurs, then the CPU can take a GPF.

To handle the above with a bit of paranoia, I originally implemented
the hooks as calls directly from the module code.

After examining the notifier calls, it looks as though the start up
notify is called before any of the module's code is executed. This makes
the use of the notify safe with ftrace.

Only the startup notify is required to be "safe". The shutdown simply
removes the entries from the ftrace function list, and does not modify
any code.

This change has another benefit. It removes a issue with a reverse dependency
in the mutexes of ftrace_lock and module_mutex.

[ Impact: fix lock dependency bug, cleanup ]

Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 90 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 64 insertions(+), 26 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a2348898858..5b606f45b6c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -916,30 +916,6 @@ static void ftrace_free_rec(struct dyn_ftrace *rec)
 	rec->flags |= FTRACE_FL_FREE;
 }
 
-void ftrace_release(void *start, unsigned long size)
-{
-	struct dyn_ftrace *rec;
-	struct ftrace_page *pg;
-	unsigned long s = (unsigned long)start;
-	unsigned long e = s + size;
-
-	if (ftrace_disabled || !start)
-		return;
-
-	mutex_lock(&ftrace_lock);
-	do_for_each_ftrace_rec(pg, rec) {
-		if ((rec->ip >= s) && (rec->ip < e)) {
-			/*
-			 * rec->ip is changed in ftrace_free_rec()
-			 * It should not between s and e if record was freed.
-			 */
-			FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
-			ftrace_free_rec(rec);
-		}
-	} while_for_each_ftrace_rec();
-	mutex_unlock(&ftrace_lock);
-}
-
 static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
 {
 	struct dyn_ftrace *rec;
@@ -2752,14 +2728,72 @@ static int ftrace_convert_nops(struct module *mod,
 	return 0;
 }
 
-void ftrace_init_module(struct module *mod,
-			unsigned long *start, unsigned long *end)
+#ifdef CONFIG_MODULES
+void ftrace_release(void *start, void *end)
+{
+	struct dyn_ftrace *rec;
+	struct ftrace_page *pg;
+	unsigned long s = (unsigned long)start;
+	unsigned long e = (unsigned long)end;
+
+	if (ftrace_disabled || !start || start == end)
+		return;
+
+	mutex_lock(&ftrace_lock);
+	do_for_each_ftrace_rec(pg, rec) {
+		if ((rec->ip >= s) && (rec->ip < e)) {
+			/*
+			 * rec->ip is changed in ftrace_free_rec()
+			 * It should not between s and e if record was freed.
+			 */
+			FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
+			ftrace_free_rec(rec);
+		}
+	} while_for_each_ftrace_rec();
+	mutex_unlock(&ftrace_lock);
+}
+
+static void ftrace_init_module(struct module *mod,
+			       unsigned long *start, unsigned long *end)
 {
 	if (ftrace_disabled || start == end)
 		return;
 	ftrace_convert_nops(mod, start, end);
 }
 
+static int ftrace_module_notify(struct notifier_block *self,
+				unsigned long val, void *data)
+{
+	struct module *mod = data;
+
+	switch (val) {
+	case MODULE_STATE_COMING:
+		ftrace_init_module(mod, mod->ftrace_callsites,
+				   mod->ftrace_callsites +
+				   mod->num_ftrace_callsites);
+		break;
+	case MODULE_STATE_GOING:
+		ftrace_release(mod->ftrace_callsites,
+			       mod->ftrace_callsites +
+			       mod->num_ftrace_callsites);
+		break;
+	}
+
+	return 0;
+}
+#else
+static int ftrace_module_notify(struct notifier_block *self,
+				unsigned long val, void *data)
+{
+	return 0;
+}
+#endif /* CONFIG_MODULES */
+
+struct notifier_block ftrace_module_nb = {
+	.notifier_call = ftrace_module_notify,
+	.priority = 0,
+};
+
 extern unsigned long __start_mcount_loc[];
 extern unsigned long __stop_mcount_loc[];
 
@@ -2791,6 +2825,10 @@ void __init ftrace_init(void)
 				  __start_mcount_loc,
 				  __stop_mcount_loc);
 
+	ret = register_module_notifier(&ftrace_module_nb);
+	if (!ret)
+		pr_warning("Failed to register trace ftrace module notifier\n");
+
 	return;
  failed:
 	ftrace_disabled = 1;
-- 
cgit v1.2.3


From e6187007d6c365b551c69ea3df46f06fd1c8bd19 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 15 Apr 2009 13:36:40 -0400
Subject: tracing/events: add startup tests for events
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As events start to become popular, and the new way to add tracing
infrastructure into ftrace, it is important to catch any problems
that might happen with a mistake in the TRACE_EVENT macro.

This patch introduces a startup self test on the registered trace
events. Note, it can only do a generic test, any type of testing that
needs more involement is needed to be implemented by the tracepoint
creators.

The test goes down one by one enabling a trace point and running
some random tasks (random in the sense that I just made them up).
Those tasks are creating threads, grabbing mutexes and spinlocks
and using workqueues.

After testing each event individually, it does the same test after
enabling each system of trace points. Like sched, irq, lockdep.

Then finally it enables all tracepoints and performs the tasks again.
The output to the console on bootup will look like this when everything
works:

Running tests on trace events:
Testing event kfree_skb: OK
Testing event kmalloc: OK
Testing event kmem_cache_alloc: OK
Testing event kmalloc_node: OK
Testing event kmem_cache_alloc_node: OK
Testing event kfree: OK
Testing event kmem_cache_free: OK
Testing event irq_handler_exit: OK
Testing event irq_handler_entry: OK
Testing event softirq_entry: OK
Testing event softirq_exit: OK
Testing event lock_acquire: OK
Testing event lock_release: OK
Testing event sched_kthread_stop: OK
Testing event sched_kthread_stop_ret: OK
Testing event sched_wait_task: OK
Testing event sched_wakeup: OK
Testing event sched_wakeup_new: OK
Testing event sched_switch: OK
Testing event sched_migrate_task: OK
Testing event sched_process_free: OK
Testing event sched_process_exit: OK
Testing event sched_process_wait: OK
Testing event sched_process_fork: OK
Testing event sched_signal_send: OK
Running tests on trace event systems:
Testing event system skb: OK
Testing event system kmem: OK
Testing event system irq: OK
Testing event system lockdep: OK
Testing event system sched: OK
Running tests on all trace events:
Testing all events: OK

[ folded in:

  tracing: add #include <linux/delay.h> to fix build failure in test_work()

  This build failure occured on a few rare configs:

   kernel/trace/trace_events.c: In function ‘test_work’:
   kernel/trace/trace_events.c:975: error: implicit declaration of function ‘udelay’
   kernel/trace/trace_events.c:980: error: implicit declaration of function ‘msleep’

  delay.h is included in way too many other headers, hiding cases
  where new usage is added without header inclusion.

  [ Impact: build fix ]

  Signed-off-by: Ingo Molnar <mingo@elte.hu>
]

[ Impact: add event tracer self-tests ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 178 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 178 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6591d83e1e7..f81d6eec4e4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -8,10 +8,14 @@
  *
  */
 
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
+#include <linux/delay.h>
 
 #include "trace_output.h"
 
@@ -920,3 +924,177 @@ static __init int event_trace_init(void)
 	return 0;
 }
 fs_initcall(event_trace_init);
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+static DEFINE_SPINLOCK(test_spinlock);
+static DEFINE_SPINLOCK(test_spinlock_irq);
+static DEFINE_MUTEX(test_mutex);
+
+static __init void test_work(struct work_struct *dummy)
+{
+	spin_lock(&test_spinlock);
+	spin_lock_irq(&test_spinlock_irq);
+	udelay(1);
+	spin_unlock_irq(&test_spinlock_irq);
+	spin_unlock(&test_spinlock);
+
+	mutex_lock(&test_mutex);
+	msleep(1);
+	mutex_unlock(&test_mutex);
+}
+
+static __init int event_test_thread(void *unused)
+{
+	void *test_malloc;
+
+	test_malloc = kmalloc(1234, GFP_KERNEL);
+	if (!test_malloc)
+		pr_info("failed to kmalloc\n");
+
+	schedule_on_each_cpu(test_work);
+
+	kfree(test_malloc);
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop())
+		schedule();
+
+	return 0;
+}
+
+/*
+ * Do various things that may trigger events.
+ */
+static __init void event_test_stuff(void)
+{
+	struct task_struct *test_thread;
+
+	test_thread = kthread_run(event_test_thread, NULL, "test-events");
+	msleep(1);
+	kthread_stop(test_thread);
+}
+
+/*
+ * For every trace event defined, we will test each trace point separately,
+ * and then by groups, and finally all trace points.
+ */
+static __init int event_trace_self_tests(void)
+{
+	struct ftrace_event_call *call;
+	struct event_subsystem *system;
+	char *sysname;
+	int ret;
+
+	pr_info("Running tests on trace events:\n");
+
+	list_for_each_entry(call, &ftrace_events, list) {
+
+		/* Only test those that have a regfunc */
+		if (!call->regfunc)
+			continue;
+
+		pr_info("Testing event %s: ", call->name);
+
+		/*
+		 * If an event is already enabled, someone is using
+		 * it and the self test should not be on.
+		 */
+		if (call->enabled) {
+			pr_warning("Enabled event during self test!\n");
+			WARN_ON_ONCE(1);
+			continue;
+		}
+
+		call->enabled = 1;
+		call->regfunc();
+
+		event_test_stuff();
+
+		call->unregfunc();
+		call->enabled = 0;
+
+		pr_cont("OK\n");
+	}
+
+	/* Now test at the sub system level */
+
+	pr_info("Running tests on trace event systems:\n");
+
+	list_for_each_entry(system, &event_subsystems, list) {
+
+		/* the ftrace system is special, skip it */
+		if (strcmp(system->name, "ftrace") == 0)
+			continue;
+
+		pr_info("Testing event system %s: ", system->name);
+
+		/* ftrace_set_clr_event can modify the name passed in. */
+		sysname = kstrdup(system->name, GFP_KERNEL);
+		if (WARN_ON(!sysname)) {
+			pr_warning("Can't allocate memory, giving up!\n");
+			return 0;
+		}
+		ret = ftrace_set_clr_event(sysname, 1);
+		kfree(sysname);
+		if (WARN_ON_ONCE(ret)) {
+			pr_warning("error enabling system %s\n",
+				   system->name);
+			continue;
+		}
+
+		event_test_stuff();
+
+		sysname = kstrdup(system->name, GFP_KERNEL);
+		if (WARN_ON(!sysname)) {
+			pr_warning("Can't allocate memory, giving up!\n");
+			return 0;
+		}
+		ret = ftrace_set_clr_event(sysname, 0);
+		kfree(sysname);
+
+		if (WARN_ON_ONCE(ret))
+			pr_warning("error disabling system %s\n",
+				   system->name);
+
+		pr_cont("OK\n");
+	}
+
+	/* Test with all events enabled */
+
+	pr_info("Running tests on all trace events:\n");
+	pr_info("Testing all events: ");
+
+	sysname = kmalloc(4, GFP_KERNEL);
+	if (WARN_ON(!sysname)) {
+		pr_warning("Can't allocate memory, giving up!\n");
+		return 0;
+	}
+	memcpy(sysname, "*:*", 4);
+	ret = ftrace_set_clr_event(sysname, 1);
+	if (WARN_ON_ONCE(ret)) {
+		kfree(sysname);
+		pr_warning("error enabling all events\n");
+		return 0;
+	}
+
+	event_test_stuff();
+
+	/* reset sysname */
+	memcpy(sysname, "*:*", 4);
+	ret = ftrace_set_clr_event(sysname, 0);
+	kfree(sysname);
+
+	if (WARN_ON_ONCE(ret)) {
+		pr_warning("error disabling all events\n");
+		return 0;
+	}
+
+	pr_cont("OK\n");
+
+	return 0;
+}
+
+late_initcall(event_trace_self_tests);
+
+#endif
-- 
cgit v1.2.3


From d1b182a8d49ed6416325b4e0a1cb0f17cd4e702a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 15 Apr 2009 16:53:47 -0400
Subject: tracing/events/ring-buffer: expose format of ring buffer headers to
 users

Currently, every thing needed to read the binary output from the
ring buffers is available, with the exception of the way the ring
buffers handles itself internally.

This patch creates two special files in the debugfs/tracing/events
directory:

 # cat /debug/tracing/events/header_page
        field: u64 timestamp;   offset:0;       size:8;
        field: local_t commit;  offset:8;       size:8;
        field: char data;       offset:16;      size:4080;

 # cat /debug/tracing/events/header_event
        type        :    2 bits
        len         :    3 bits
        time_delta  :   27 bits
        array       :   32 bits

        padding     : type == 0
        time_extend : type == 1
        data        : type == 3

This is to allow a userspace app to see if the ring buffer format changes
or not.

[ Impact: allow userspace apps to know of ringbuffer format changes ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c  | 44 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events.c | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f935bd5ec3e..84a6055f37c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -21,6 +21,28 @@
 
 #include "trace.h"
 
+/*
+ * The ring buffer header is special. We must manually up keep it.
+ */
+int ring_buffer_print_entry_header(struct trace_seq *s)
+{
+	int ret;
+
+	ret = trace_seq_printf(s, "\ttype        :    2 bits\n");
+	ret = trace_seq_printf(s, "\tlen         :    3 bits\n");
+	ret = trace_seq_printf(s, "\ttime_delta  :   27 bits\n");
+	ret = trace_seq_printf(s, "\tarray       :   32 bits\n");
+	ret = trace_seq_printf(s, "\n");
+	ret = trace_seq_printf(s, "\tpadding     : type == %d\n",
+			       RINGBUF_TYPE_PADDING);
+	ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
+			       RINGBUF_TYPE_TIME_EXTEND);
+	ret = trace_seq_printf(s, "\tdata        : type == %d\n",
+			       RINGBUF_TYPE_DATA);
+
+	return ret;
+}
+
 /*
  * The ring buffer is made up of a list of pages. A separate list of pages is
  * allocated for each CPU. A writer may only write to a buffer that is
@@ -340,6 +362,28 @@ static inline int test_time_stamp(u64 delta)
 
 #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
 
+int ring_buffer_print_page_header(struct trace_seq *s)
+{
+	struct buffer_data_page field;
+	int ret;
+
+	ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+			       "offset:0;\tsize:%u;\n",
+			       (unsigned int)sizeof(field.time_stamp));
+
+	ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
+			       "offset:%u;\tsize:%u;\n",
+			       (unsigned int)offsetof(typeof(field), commit),
+			       (unsigned int)sizeof(field.commit));
+
+	ret = trace_seq_printf(s, "\tfield: char data;\t"
+			       "offset:%u;\tsize:%u;\n",
+			       (unsigned int)offsetof(typeof(field), data),
+			       (unsigned int)BUF_PAGE_SIZE);
+
+	return ret;
+}
+
 /*
  * head_page == tail_page && head == tail then buffer is empty.
  */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f81d6eec4e4..7163a2bb021 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -610,6 +610,30 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 }
 
+static ssize_t
+show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	int (*func)(struct trace_seq *s) = filp->private_data;
+	struct trace_seq *s;
+	int r;
+
+	if (*ppos)
+		return 0;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	trace_seq_init(s);
+
+	func(s);
+	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+
+	kfree(s);
+
+	return r;
+}
+
 static const struct seq_operations show_event_seq_ops = {
 	.start = t_start,
 	.next = t_next,
@@ -667,6 +691,11 @@ static const struct file_operations ftrace_subsystem_filter_fops = {
 	.write = subsystem_filter_write,
 };
 
+static const struct file_operations ftrace_show_header_fops = {
+	.open = tracing_open_generic,
+	.read = show_header,
+};
+
 static struct dentry *event_trace_events_dir(void)
 {
 	static struct dentry *d_tracer;
@@ -909,6 +938,15 @@ static __init int event_trace_init(void)
 	if (!d_events)
 		return 0;
 
+	/* ring buffer internal formats */
+	trace_create_file("header_page", 0444, d_events,
+			  ring_buffer_print_page_header,
+			  &ftrace_show_header_fops);
+
+	trace_create_file("header_event", 0444, d_events,
+			  ring_buffer_print_entry_header,
+			  &ftrace_show_header_fops);
+
 	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
 		/* The linker may leave blanks */
 		if (!call->name)
-- 
cgit v1.2.3


From 69abe6a5d18a9394baa325bab8f57748b037c517 Mon Sep 17 00:00:00 2001
From: Avadh Patel <avadh4all@gmail.com>
Date: Fri, 10 Apr 2009 16:04:48 -0400
Subject: tracing: add saved_cmdlines file to show cached task comms

Export the cached task comms to userspace. This allows user apps to translate
the pids from a trace into their respective task command lines.

[ Impact: let userspace apps reading binary buffer know comm's of pids ]

Signed-off-by: Avadh Patel <avadh4all@gmail.com>
[ added error checking and use of buf pointer to index file_buf ]
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2d69b26b3cc..031c46f11bb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2421,6 +2421,56 @@ static const struct file_operations tracing_readme_fops = {
 	.read		= tracing_readme_read,
 };
 
+static ssize_t
+tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
+				size_t cnt, loff_t *ppos)
+{
+	char *buf_comm;
+	char *file_buf;
+	char *buf;
+	int len = 0;
+	int pid;
+	int i;
+
+	file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL);
+	if (!file_buf)
+		return -ENOMEM;
+
+	buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL);
+	if (!buf_comm) {
+		kfree(file_buf);
+		return -ENOMEM;
+	}
+
+	buf = file_buf;
+
+	for (i = 0; i < SAVED_CMDLINES; i++) {
+		int r;
+
+		pid = map_cmdline_to_pid[i];
+		if (pid == -1 || pid == NO_CMDLINE_MAP)
+			continue;
+
+		trace_find_cmdline(pid, buf_comm);
+		r = sprintf(buf, "%d %s\n", pid, buf_comm);
+		buf += r;
+		len += r;
+	}
+
+	len = simple_read_from_buffer(ubuf, cnt, ppos,
+				      file_buf, len);
+
+	kfree(file_buf);
+	kfree(buf_comm);
+
+	return len;
+}
+
+static const struct file_operations tracing_saved_cmdlines_fops = {
+    .open       = tracing_open_generic,
+    .read       = tracing_saved_cmdlines_read,
+};
+
 static ssize_t
 tracing_ctrl_read(struct file *filp, char __user *ubuf,
 		  size_t cnt, loff_t *ppos)
@@ -3973,6 +4023,9 @@ static __init int tracer_init_debugfs(void)
 	trace_create_file("trace_marker", 0220, d_tracer,
 			NULL, &tracing_mark_fops);
 
+	trace_create_file("saved_cmdlines", 0444, d_tracer,
+			NULL, &tracing_saved_cmdlines_fops);
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 	trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
 			&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
-- 
cgit v1.2.3


From 9ea21c1ecdb35ecdcac5fd9d95f62a1f6a7ffec0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 16 Apr 2009 12:15:44 -0400
Subject: tracing/events: perform function tracing in event selftests

We can find some bugs in the trace events if we stress the writes as well.
The function tracer is a good way to stress the events.

[ Impact: extend scope of event tracer self-tests ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090416161746.604786131@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 78 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 72 insertions(+), 6 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7163a2bb021..1137f951be4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1017,7 +1017,7 @@ static __init void event_test_stuff(void)
  * For every trace event defined, we will test each trace point separately,
  * and then by groups, and finally all trace points.
  */
-static __init int event_trace_self_tests(void)
+static __init void event_trace_self_tests(void)
 {
 	struct ftrace_event_call *call;
 	struct event_subsystem *system;
@@ -1071,7 +1071,7 @@ static __init int event_trace_self_tests(void)
 		sysname = kstrdup(system->name, GFP_KERNEL);
 		if (WARN_ON(!sysname)) {
 			pr_warning("Can't allocate memory, giving up!\n");
-			return 0;
+			return;
 		}
 		ret = ftrace_set_clr_event(sysname, 1);
 		kfree(sysname);
@@ -1086,7 +1086,7 @@ static __init int event_trace_self_tests(void)
 		sysname = kstrdup(system->name, GFP_KERNEL);
 		if (WARN_ON(!sysname)) {
 			pr_warning("Can't allocate memory, giving up!\n");
-			return 0;
+			return;
 		}
 		ret = ftrace_set_clr_event(sysname, 0);
 		kfree(sysname);
@@ -1106,14 +1106,14 @@ static __init int event_trace_self_tests(void)
 	sysname = kmalloc(4, GFP_KERNEL);
 	if (WARN_ON(!sysname)) {
 		pr_warning("Can't allocate memory, giving up!\n");
-		return 0;
+		return;
 	}
 	memcpy(sysname, "*:*", 4);
 	ret = ftrace_set_clr_event(sysname, 1);
 	if (WARN_ON_ONCE(ret)) {
 		kfree(sysname);
 		pr_warning("error enabling all events\n");
-		return 0;
+		return;
 	}
 
 	event_test_stuff();
@@ -1125,10 +1125,76 @@ static __init int event_trace_self_tests(void)
 
 	if (WARN_ON_ONCE(ret)) {
 		pr_warning("error disabling all events\n");
-		return 0;
+		return;
 	}
 
 	pr_cont("OK\n");
+}
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+static DEFINE_PER_CPU(atomic_t, test_event_disable);
+
+static void
+function_test_events_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct ring_buffer_event *event;
+	struct ftrace_entry *entry;
+	unsigned long flags;
+	long disabled;
+	int resched;
+	int cpu;
+	int pc;
+
+	pc = preempt_count();
+	resched = ftrace_preempt_disable();
+	cpu = raw_smp_processor_id();
+	disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
+
+	if (disabled != 1)
+		goto out;
+
+	local_save_flags(flags);
+
+	event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry),
+						  flags, pc);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	entry->ip			= ip;
+	entry->parent_ip		= parent_ip;
+
+	trace_current_buffer_unlock_commit(event, flags, pc);
+
+ out:
+	atomic_dec(&per_cpu(test_event_disable, cpu));
+	ftrace_preempt_enable(resched);
+}
+
+static struct ftrace_ops trace_ops __initdata  =
+{
+	.func = function_test_events_call,
+};
+
+static __init void event_trace_self_test_with_function(void)
+{
+	register_ftrace_function(&trace_ops);
+	pr_info("Running tests again, along with the function tracer\n");
+	event_trace_self_tests();
+	unregister_ftrace_function(&trace_ops);
+}
+#else
+static __init void event_trace_self_test_with_function(void)
+{
+}
+#endif
+
+static __init int event_trace_self_tests_init(void)
+{
+
+	event_trace_self_tests();
+
+	event_trace_self_test_with_function();
 
 	return 0;
 }
-- 
cgit v1.2.3


From 339ae5d3c3fc2025e3657637921495fd600027c7 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 17 Apr 2009 10:34:30 +0800
Subject: tracing: fix file mode of trace and README

trace is read-write and README is read-only.

[ Impact: fix /debug/tracing/ file permissions. ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49E7EAB6.4070605@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 031c46f11bb..f681f646aa0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4002,7 +4002,7 @@ static __init int tracer_init_debugfs(void)
 	trace_create_file("available_tracers", 0444, d_tracer,
 			&global_trace, &show_traces_fops);
 
-	trace_create_file("current_tracer", 0444, d_tracer,
+	trace_create_file("current_tracer", 0644, d_tracer,
 			&global_trace, &set_tracer_fops);
 
 	trace_create_file("tracing_max_latency", 0644, d_tracer,
@@ -4011,7 +4011,7 @@ static __init int tracer_init_debugfs(void)
 	trace_create_file("tracing_thresh", 0644, d_tracer,
 			&tracing_thresh, &tracing_max_lat_fops);
 
-	trace_create_file("README", 0644, d_tracer,
+	trace_create_file("README", 0444, d_tracer,
 			NULL, &tracing_readme_fops);
 
 	trace_create_file("trace_pipe", 0444, d_tracer,
-- 
cgit v1.2.3


From ac1adc55fc71c7515caa2eb0e63e49b3d1c6a47c Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Fri, 17 Apr 2009 00:27:08 -0500
Subject: tracing/filters: add filter_mutex to protect filter predicates

This patch adds a filter_mutex to prevent the filter predicates from
being accessed concurrently by various external functions.

It's based on a previous patch by Li Zefan:
        "[PATCH 7/7] tracing/filters: make filter preds RCU safe"

v2 changes:

- fixed wrong value returned in a add_subsystem_pred() failure case
  noticed by Li Zefan.

[ Impact: fix trace filter corruption/crashes on parallel access ]

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Tested-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: paulmck@linux.vnet.ibm.com
LKML-Reference: <1239946028.6639.13.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h               |  4 +-
 kernel/trace/trace_events.c        |  4 +-
 kernel/trace/trace_events_filter.c | 90 +++++++++++++++++++++++++++++---------
 3 files changed, 75 insertions(+), 23 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8817c18ef97..247948e81b0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -757,13 +757,15 @@ struct filter_pred {
 };
 
 extern void filter_free_pred(struct filter_pred *pred);
-extern void filter_print_preds(struct filter_pred **preds, int n_preds,
+extern void filter_print_preds(struct ftrace_event_call *call,
 			       struct trace_seq *s);
 extern int filter_parse(char **pbuf, struct filter_pred *pred);
 extern int filter_add_pred(struct ftrace_event_call *call,
 			   struct filter_pred *pred);
 extern void filter_disable_preds(struct ftrace_event_call *call);
 extern void filter_free_subsystem_preds(struct event_subsystem *system);
+extern void filter_print_subsystem_preds(struct event_subsystem *system,
+					 struct trace_seq *s);
 extern int filter_add_subsystem_pred(struct event_subsystem *system,
 				     struct filter_pred *pred);
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1137f951be4..64f9d6d2735 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -488,7 +488,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	filter_print_preds(call->preds, call->n_preds, s);
+	filter_print_preds(call, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
@@ -558,7 +558,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	filter_print_preds(system->preds, system->n_preds, s);
+	filter_print_subsystem_preds(system, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f8e5eab0424..e0fcfd2a16d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -22,10 +22,13 @@
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
+#include <linux/mutex.h>
 
 #include "trace.h"
 #include "trace_output.h"
 
+static DEFINE_MUTEX(filter_mutex);
+
 static int filter_pred_64(struct filter_pred *pred, void *event)
 {
 	u64 *addr = (u64 *)(event + pred->offset);
@@ -112,8 +115,8 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec)
 }
 EXPORT_SYMBOL_GPL(filter_match_preds);
 
-void filter_print_preds(struct filter_pred **preds, int n_preds,
-			struct trace_seq *s)
+static void __filter_print_preds(struct filter_pred **preds, int n_preds,
+				 struct trace_seq *s)
 {
 	char *field_name;
 	struct filter_pred *pred;
@@ -138,6 +141,21 @@ void filter_print_preds(struct filter_pred **preds, int n_preds,
 	}
 }
 
+void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s)
+{
+	mutex_lock(&filter_mutex);
+	__filter_print_preds(call->preds, call->n_preds, s);
+	mutex_unlock(&filter_mutex);
+}
+
+void filter_print_subsystem_preds(struct event_subsystem *system,
+				  struct trace_seq *s)
+{
+	mutex_lock(&filter_mutex);
+	__filter_print_preds(system->preds, system->n_preds, s);
+	mutex_unlock(&filter_mutex);
+}
+
 static struct ftrace_event_field *
 find_event_field(struct ftrace_event_call *call, char *name)
 {
@@ -180,7 +198,7 @@ static int filter_set_pred(struct filter_pred *dest,
 	return 0;
 }
 
-void filter_disable_preds(struct ftrace_event_call *call)
+static void __filter_disable_preds(struct ftrace_event_call *call)
 {
 	int i;
 
@@ -190,6 +208,13 @@ void filter_disable_preds(struct ftrace_event_call *call)
 		call->preds[i]->fn = filter_pred_none;
 }
 
+void filter_disable_preds(struct ftrace_event_call *call)
+{
+	mutex_lock(&filter_mutex);
+	__filter_disable_preds(call);
+	mutex_unlock(&filter_mutex);
+}
+
 int init_preds(struct ftrace_event_call *call)
 {
 	struct filter_pred *pred;
@@ -223,7 +248,7 @@ oom:
 }
 EXPORT_SYMBOL_GPL(init_preds);
 
-void filter_free_subsystem_preds(struct event_subsystem *system)
+static void __filter_free_subsystem_preds(struct event_subsystem *system)
 {
 	struct ftrace_event_call *call;
 	int i;
@@ -241,18 +266,25 @@ void filter_free_subsystem_preds(struct event_subsystem *system)
 			continue;
 
 		if (!strcmp(call->system, system->name))
-			filter_disable_preds(call);
+			__filter_disable_preds(call);
 	}
 }
 
-static int __filter_add_pred(struct ftrace_event_call *call,
-			     struct filter_pred *pred,
-			     filter_pred_fn_t fn)
+void filter_free_subsystem_preds(struct event_subsystem *system)
+{
+	mutex_lock(&filter_mutex);
+	__filter_free_subsystem_preds(system);
+	mutex_unlock(&filter_mutex);
+}
+
+static int filter_add_pred_fn(struct ftrace_event_call *call,
+			      struct filter_pred *pred,
+			      filter_pred_fn_t fn)
 {
 	int idx, err;
 
 	if (call->n_preds && !pred->compound)
-		filter_disable_preds(call);
+		__filter_disable_preds(call);
 
 	if (call->n_preds == MAX_FILTER_PRED)
 		return -ENOSPC;
@@ -276,7 +308,8 @@ static int is_string_field(const char *type)
 	return 0;
 }
 
-int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
+static int __filter_add_pred(struct ftrace_event_call *call,
+			     struct filter_pred *pred)
 {
 	struct ftrace_event_field *field;
 	filter_pred_fn_t fn;
@@ -293,7 +326,7 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
 			return -EINVAL;
 		fn = filter_pred_string;
 		pred->str_len = field->size;
-		return __filter_add_pred(call, pred, fn);
+		return filter_add_pred_fn(call, pred, fn);
 	} else {
 		if (pred->str_len)
 			return -EINVAL;
@@ -316,7 +349,18 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
 		return -EINVAL;
 	}
 
-	return __filter_add_pred(call, pred, fn);
+	return filter_add_pred_fn(call, pred, fn);
+}
+
+int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
+{
+	int err;
+
+	mutex_lock(&filter_mutex);
+	err = __filter_add_pred(call, pred);
+	mutex_unlock(&filter_mutex);
+
+	return err;
 }
 
 int filter_add_subsystem_pred(struct event_subsystem *system,
@@ -324,20 +368,27 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 {
 	struct ftrace_event_call *call;
 
+	mutex_lock(&filter_mutex);
+
 	if (system->n_preds && !pred->compound)
-		filter_free_subsystem_preds(system);
+		__filter_free_subsystem_preds(system);
 
 	if (!system->n_preds) {
 		system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
 					GFP_KERNEL);
-		if (!system->preds)
+		if (!system->preds) {
+			mutex_unlock(&filter_mutex);
 			return -ENOMEM;
+		}
 	}
 
-	if (system->n_preds == MAX_FILTER_PRED)
+	if (system->n_preds == MAX_FILTER_PRED) {
+		mutex_unlock(&filter_mutex);
 		return -ENOSPC;
+	}
 
 	system->preds[system->n_preds] = pred;
+	system->n_preds++;
 
 	list_for_each_entry(call, &ftrace_events, list) {
 		int err;
@@ -348,17 +399,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 		if (strcmp(call->system, system->name))
 			continue;
 
-		if (!find_event_field(call, pred->field_name))
-			continue;
-
-		err = filter_add_pred(call, pred);
+		err = __filter_add_pred(call, pred);
 		if (err == -ENOMEM) {
 			system->preds[system->n_preds] = NULL;
+			system->n_preds--;
+			mutex_unlock(&filter_mutex);
 			return err;
 		}
 	}
 
-	system->n_preds++;
+	mutex_unlock(&filter_mutex);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 12acd473d45cf2e40de3782cb2de712e5cd4d715 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 17 Apr 2009 16:01:56 -0400
Subject: tracing: add EXPORT_SYMBOL_GPL for trace commits

Not all the necessary symbols were exported to allow for tracing
by modules. This patch adds them in.

[ Impact: allow modules to commit data to the ring buffer ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f681f646aa0..183d788038e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -894,18 +894,20 @@ void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 {
 	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
 }
+EXPORT_SYMBOL(trace_current_buffer_unlock_commit);
 
 void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
 {
 	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
 }
+EXPORT_SYMBOL(trace_nowake_buffer_unlock_commit);
 
 void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
 {
 	ring_buffer_discard_commit(global_trace.buffer, event);
 }
-EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
+EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
 
 void
 trace_function(struct trace_array *tr,
-- 
cgit v1.2.3


From 261842b7c9099f56de2eb969c8ad65402d68e00e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 16 Apr 2009 21:41:52 -0400
Subject: tracing: add same level recursion detection

The tracing infrastructure allows for recursion. That is, an interrupt
may interrupt the act of tracing an event, and that interrupt may very well
perform its own trace. This is a recursive trace, and is fine to do.

The problem arises when there is a bug, and the utility doing the trace
calls something that recurses back into the tracer. This recursion is not
caused by an external event like an interrupt, but by code that is not
expected to recurse. The result could be a lockup.

This patch adds a bitmask to the task structure that keeps track
of the trace recursion. To find the interrupt depth, the following
algorithm is used:

  level = hardirq_count() + softirq_count() + in_nmi;

Here, level will be the depth of interrutps and softirqs, and even handles
the nmi. Then the corresponding bit is set in the recursion bitmask.
If the bit was already set, we know we had a recursion at the same level
and we warn about it and fail the writing to the buffer.

After the data has been committed to the buffer, we clear the bit.
No atomics are needed. The only races are with interrupts and they reset
the bitmask before returning anywy.

[ Impact: detect same irq level trace recursion ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 84a6055f37c..b421b0ea911 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1481,6 +1481,40 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	return event;
 }
 
+static int trace_irq_level(void)
+{
+	return hardirq_count() + softirq_count() + in_nmi();
+}
+
+static int trace_recursive_lock(void)
+{
+	int level;
+
+	level = trace_irq_level();
+
+	if (unlikely(current->trace_recursion & (1 << level))) {
+		/* Disable all tracing before we do anything else */
+		tracing_off_permanent();
+		WARN_ON_ONCE(1);
+		return -1;
+	}
+
+	current->trace_recursion |= 1 << level;
+
+	return 0;
+}
+
+static void trace_recursive_unlock(void)
+{
+	int level;
+
+	level = trace_irq_level();
+
+	WARN_ON_ONCE(!current->trace_recursion & (1 << level));
+
+	current->trace_recursion &= ~(1 << level);
+}
+
 static DEFINE_PER_CPU(int, rb_need_resched);
 
 /**
@@ -1514,6 +1548,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 	/* If we are tracing schedule, we don't want to recurse */
 	resched = ftrace_preempt_disable();
 
+	if (trace_recursive_lock())
+		goto out_nocheck;
+
 	cpu = raw_smp_processor_id();
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -1543,6 +1580,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 	return event;
 
  out:
+	trace_recursive_unlock();
+
+ out_nocheck:
 	ftrace_preempt_enable(resched);
 	return NULL;
 }
@@ -1581,6 +1621,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 
 	rb_commit(cpu_buffer, event);
 
+	trace_recursive_unlock();
+
 	/*
 	 * Only the last preempt count needs to restore preemption.
 	 */
-- 
cgit v1.2.3


From 3189cdb31622f4e40688ce5a6fc5d940b42bc805 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 17 Apr 2009 16:13:55 -0400
Subject: tracing: protect trace_printk from recursion

trace_printk can be called from any context, including NMIs.
If this happens, then we must test for for recursion before
grabbing any spinlocks.

This patch prevents trace_printk from being called recursively.

[ Impact: prevent hard lockup in lockdep event tracer ]

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 183d788038e..b9a3adce922 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1259,6 +1259,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	struct trace_array_cpu *data;
 	struct bprint_entry *entry;
 	unsigned long flags;
+	int disable;
 	int resched;
 	int cpu, len = 0, size, pc;
 
@@ -1273,7 +1274,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 
-	if (unlikely(atomic_read(&data->disabled)))
+	disable = atomic_inc_return(&data->disabled);
+	if (unlikely(disable != 1))
 		goto out;
 
 	/* Lockdep uses trace_printk for lock tracing */
@@ -1301,6 +1303,7 @@ out_unlock:
 	local_irq_restore(flags);
 
 out:
+	atomic_dec_return(&data->disabled);
 	ftrace_preempt_enable(resched);
 	unpause_graph_tracing();
 
@@ -1320,6 +1323,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 	int cpu, len = 0, size, pc;
 	struct print_entry *entry;
 	unsigned long irq_flags;
+	int disable;
 
 	if (tracing_disabled || tracing_selftest_running)
 		return 0;
@@ -1329,7 +1333,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 
-	if (unlikely(atomic_read(&data->disabled)))
+	disable = atomic_inc_return(&data->disabled);
+	if (unlikely(disable != 1))
 		goto out;
 
 	pause_graph_tracing();
@@ -1357,6 +1362,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 	raw_local_irq_restore(irq_flags);
 	unpause_graph_tracing();
  out:
+	atomic_dec_return(&data->disabled);
 	preempt_enable_notrace();
 
 	return len;
-- 
cgit v1.2.3


From e057a5e5647a1c9d0d0054fbd298bfa04b3d1cb4 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 19 Apr 2009 23:38:12 +0200
Subject: tracing/core: Add current context on tracing recursion warning

In case of tracing recursion detection, we only get the stacktrace.
But the current context may be very useful to debug the issue.

This patch adds the softirq/hardirq/nmi context with the warning
using lockdep context display to have a familiar output.

v2: Use printk_once()
v3: drop {hardirq,softirq}_context which depend on lockdep,
    only keep what is part of current->trace_recursion,
    sufficient to debug the warning source.

[ Impact: print context necessary to debug recursion ]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/ring_buffer.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b421b0ea911..bffde630c4e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1495,6 +1495,13 @@ static int trace_recursive_lock(void)
 	if (unlikely(current->trace_recursion & (1 << level))) {
 		/* Disable all tracing before we do anything else */
 		tracing_off_permanent();
+
+		printk_once(KERN_WARNING "Tracing recursion: "
+			    "HC[%lu]:SC[%lu]:NMI[%lu]\n",
+			    hardirq_count() >> HARDIRQ_SHIFT,
+			    softirq_count() >> SOFTIRQ_SHIFT,
+			    in_nmi());
+
 		WARN_ON_ONCE(1);
 		return -1;
 	}
-- 
cgit v1.2.3


From f3b9aae16219aaeca2dd5a9ca69f7a10faa063df Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 19 Apr 2009 23:39:33 +0200
Subject: tracing/ring-buffer: Add unlock recursion protection on discard

The pair of helpers trace_recursive_lock() and trace_recursive_unlock()
have been introduced recently to provide generic tracing recursion
protection.

They are used in a symetric way:

 - trace_recursive_lock() on buffer reserve
 - trace_recursive_unlock() on buffer commit

However sometimes, we don't commit but discard on entry
to the buffer, ie: in case of filter checking.

Then we must also unlock the recursion protection on discard time,
otherwise the tracing gets definitely deactivated and a warning
is raised spuriously, such as:

111.119821] ------------[ cut here ]------------
[  111.119829] WARNING: at kernel/trace/ring_buffer.c:1498 ring_buffer_lock_reserve+0x1b7/0x1d0()
[  111.119835] Hardware name: AMILO Li 2727
[  111.119839] Modules linked in:
[  111.119846] Pid: 5731, comm: Xorg Tainted: G        W  2.6.30-rc1 #69
[  111.119851] Call Trace:
[  111.119863]  [<ffffffff8025ce68>] warn_slowpath+0xd8/0x130
[  111.119873]  [<ffffffff8028a30f>] ? __lock_acquire+0x19f/0x1ae0
[  111.119882]  [<ffffffff8028a30f>] ? __lock_acquire+0x19f/0x1ae0
[  111.119891]  [<ffffffff802199b0>] ? native_sched_clock+0x20/0x70
[  111.119899]  [<ffffffff80286dee>] ? put_lock_stats+0xe/0x30
[  111.119906]  [<ffffffff80286eb8>] ? lock_release_holdtime+0xa8/0x150
[  111.119913]  [<ffffffff802c8ae7>] ring_buffer_lock_reserve+0x1b7/0x1d0
[  111.119921]  [<ffffffff802cd110>] trace_buffer_lock_reserve+0x30/0x70
[  111.119930]  [<ffffffff802ce000>] trace_current_buffer_lock_reserve+0x20/0x30
[  111.119939]  [<ffffffff802474e8>] ftrace_raw_event_sched_switch+0x58/0x100
[  111.119948]  [<ffffffff808103b7>] __schedule+0x3a7/0x4cd
[  111.119957]  [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b
[  111.119964]  [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b
[  111.119971]  [<ffffffff80810c08>] schedule+0x18/0x40
[  111.119977]  [<ffffffff80810e09>] preempt_schedule+0x39/0x60
[  111.119985]  [<ffffffff80813bd3>] _read_unlock+0x53/0x60
[  111.119993]  [<ffffffff807259d2>] sock_def_readable+0x72/0x80
[  111.120002]  [<ffffffff807ad5ed>] unix_stream_sendmsg+0x24d/0x3d0
[  111.120011]  [<ffffffff807219a3>] sock_aio_write+0x143/0x160
[  111.120019]  [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b
[  111.120026]  [<ffffffff80721860>] ? sock_aio_write+0x0/0x160
[  111.120033]  [<ffffffff80721860>] ? sock_aio_write+0x0/0x160
[  111.120042]  [<ffffffff8031c283>] do_sync_readv_writev+0xf3/0x140
[  111.120049]  [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b
[  111.120057]  [<ffffffff80276ff0>] ? autoremove_wake_function+0x0/0x40
[  111.120067]  [<ffffffff8045d489>] ? cap_file_permission+0x9/0x10
[  111.120074]  [<ffffffff8045c1e6>] ? security_file_permission+0x16/0x20
[  111.120082]  [<ffffffff8031cab4>] do_readv_writev+0xd4/0x1f0
[  111.120089]  [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b
[  111.120097]  [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b
[  111.120105]  [<ffffffff8031cc18>] vfs_writev+0x48/0x70
[  111.120111]  [<ffffffff8031cd65>] sys_writev+0x55/0xc0
[  111.120119]  [<ffffffff80211e32>] system_call_fastpath+0x16/0x1b
[  111.120125] ---[ end trace 15605f4e98d5ccb5 ]---

[ Impact: fix spurious warning triggering tracing shutdown ]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/ring_buffer.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bffde630c4e..e145969a8ed 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1642,6 +1642,14 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
+static inline void rb_event_discard(struct ring_buffer_event *event)
+{
+	event->type = RINGBUF_TYPE_PADDING;
+	/* time delta must be non zero */
+	if (!event->time_delta)
+		event->time_delta = 1;
+}
+
 /**
  * ring_buffer_event_discard - discard any event in the ring buffer
  * @event: the event to discard
@@ -1656,10 +1664,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
  */
 void ring_buffer_event_discard(struct ring_buffer_event *event)
 {
-	event->type = RINGBUF_TYPE_PADDING;
-	/* time delta must be non zero */
-	if (!event->time_delta)
-		event->time_delta = 1;
+	rb_event_discard(event);
+	trace_recursive_unlock();
 }
 EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
 
@@ -1690,7 +1696,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 	int cpu;
 
 	/* The event is discarded regardless */
-	ring_buffer_event_discard(event);
+	rb_event_discard(event);
 
 	/*
 	 * This must only be called if the event has not been
@@ -1735,6 +1741,8 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 	if (rb_is_commit(cpu_buffer, event))
 		rb_set_commit_to_write(cpu_buffer);
 
+	trace_recursive_unlock();
+
 	/*
 	 * Only the last preempt count needs to restore preemption.
 	 */
-- 
cgit v1.2.3


From 9ae5b8790037d05d32746f521af146c32089bfec Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 10:27:58 -0400
Subject: tracing: change branch profiling to a choice selection

This patch makes the branch profiling into a choice selection:

  None               - no branch profiling
  likely/unlikely    - only profile likely/unlikely branches
  all                - profile all branches

The all profiler will also enable the likely/unlikely branches.

This does not change the way the profiler works or the dependencies
between the profilers.

What this patch does, is keep the branch profiling from being selected
by an allyesconfig make. The branch profiler is very intrusive and
it is known to break various architecture builds when selected as an
allyesconfig.

[ Impact: prevent branch profiler from being selected in allyesconfig ]

Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 39 +++++++++++++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 6 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 57981d338d1..3ee28db69be 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -212,8 +212,36 @@ config BOOT_TRACER
 	  to enable this on bootup.
 
 config TRACE_BRANCH_PROFILING
-	bool "Trace likely/unlikely profiler"
+	bool
 	select TRACING
+
+choice
+	prompt "Branch Profiling"
+	default BRANCH_PROFILE_NONE
+	help
+	 The branch profiling is a software profiler. It will add hooks
+	 into the C conditionals to test which path a branch takes.
+
+	 The likely/unlikely profiler only looks at the conditions that
+	 are annotated with a likely or unlikely macro.
+
+	 The "all branch" profiler will profile every if statement in the
+	 kernel. This profiler will also enable the likely/unlikely
+	 profiler as well.
+
+	 Either of the above profilers add a bit of overhead to the system.
+	 If unsure choose "No branch profiling".
+
+config BRANCH_PROFILE_NONE
+	bool "No branch profiling"
+	help
+	 No branch profiling. Branch profiling adds a bit of overhead.
+	 Only enable it if you want to analyse the branching behavior.
+	 Otherwise keep it disabled.
+
+config PROFILE_ANNOTATED_BRANCHES
+	bool "Trace likely/unlikely profiler"
+	select TRACE_BRANCH_PROFILING
 	help
 	  This tracer profiles all the the likely and unlikely macros
 	  in the kernel. It will display the results in:
@@ -223,11 +251,9 @@ config TRACE_BRANCH_PROFILING
 	  Note: this will add a significant overhead, only turn this
 	  on if you need to profile the system's use of these macros.
 
-	  Say N if unsure.
-
 config PROFILE_ALL_BRANCHES
 	bool "Profile all if conditionals"
-	depends on TRACE_BRANCH_PROFILING
+	select TRACE_BRANCH_PROFILING
 	help
 	  This tracer profiles all branch conditions. Every if ()
 	  taken in the kernel is recorded whether it hit or miss.
@@ -235,11 +261,12 @@ config PROFILE_ALL_BRANCHES
 
 	  /debugfs/tracing/profile_branch
 
+	  This option also enables the likely/unlikely profiler.
+
 	  This configuration, when enabled, will impose a great overhead
 	  on the system. This should only be enabled when the system
 	  is to be analyzed
-
-	  Say N if unsure.
+endchoice
 
 config TRACING_BRANCHES
 	bool
-- 
cgit v1.2.3


From 4ed9f0716e46bb9646f26e73f4a1b5b24db7947a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 10:47:36 -0400
Subject: tracing: create menuconfig for tracing infrastructure

During testing we often use randconfig to test various kernels.
The current configuration set up does not give an easy way to disable
all tracing with a single config. The case where randconfig would
test all tracing disabled is very unlikely.

This patch adds a config option to enable or disable all tracing.
It is hooked into the tracing menu just like other submenus are done.

[ Impact: allow randconfig to easily produce all traces disabled ]

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 3ee28db69be..3fa36d2bc29 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -77,7 +77,12 @@ config TRACING_SUPPORT
 
 if TRACING_SUPPORT
 
-menu "Tracers"
+menuconfig FTRACE
+	bool "Tracers"
+	help
+	 Enable the kernel tracing infrastructure.
+
+if FTRACE
 
 config FUNCTION_TRACER
 	bool "Kernel Function Tracer"
@@ -462,7 +467,7 @@ config MMIOTRACE_TEST
 
 	  Say N, unless you absolutely know what you are doing.
 
-endmenu
+endif # FTRACE
 
 endif # TRACING_SUPPORT
 
-- 
cgit v1.2.3


From a7abe97fd8e7a6ccabba5a04a9f17be9211d418c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 10:59:34 -0400
Subject: tracing: rename EVENT_TRACER config to ENABLE_EVENT_TRACING

Currently we have two configs: EVENT_TRACING and EVENT_TRACER.
All tracers enable EVENT_TRACING. The EVENT_TRACER is only a
convenience to enable the EVENT_TRACING when no other tracers
are enabled.

The names EVENT_TRACER and EVENT_TRACING are too similar and confusing.
This patch renames EVENT_TRACER to ENABLE_EVENT_TRACING to be more
appropriate to what it actually does, as well as add a comment in
the help menu to explain the option's purpose.

[ Impact: rename config option to reduce confusion ]

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 3fa36d2bc29..450d3c2cfbd 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -183,7 +183,7 @@ config CONTEXT_SWITCH_TRACER
 	  This tracer gets called from the context switch and records
 	  all switching of tasks.
 
-config EVENT_TRACER
+config ENABLE_EVENT_TRACING
 	bool "Trace various events in the kernel"
 	select TRACING
 	help
@@ -191,6 +191,10 @@ config EVENT_TRACER
 	  allowing the user to pick and choose which trace point they
 	  want to trace.
 
+	  Note, all tracers enable event tracing. This option is
+	  only a convenience to enable event tracing when no other
+	  tracers are selected.
+
 config FTRACE_SYSCALLS
 	bool "Trace syscalls"
 	depends on HAVE_FTRACE_SYSCALLS
-- 
cgit v1.2.3


From 28d20e2d6e94434827e11c310788b87204b84559 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 12:12:44 -0400
Subject: tracing/events: call the correct event trace selftest init function

The late_initcall calls a helper function instead of the proper
init event selftest function.

This update may have been lost due to conflicting merges.

[ Impact: fix compiler warning and call extended event trace self tests ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 64f9d6d2735..98daf5dc74a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1199,6 +1199,6 @@ static __init int event_trace_self_tests_init(void)
 	return 0;
 }
 
-late_initcall(event_trace_self_tests);
+late_initcall(event_trace_self_tests_init);
 
 #endif
-- 
cgit v1.2.3


From 17487bfeb6cfb05920e6a9d5a54f345f2917b4e7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 13:24:21 -0400
Subject: tracing: fix recursive test level calculation

The recursive tests to detect same level recursion in the ring buffers
did not account for the hard/softirq_counts to be shifted. Thus the
numbers could be larger than then mask to be tested.

This patch includes the shift for the calculation of the irq depth.

[ Impact: stop false positives in trace recursion detection ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e145969a8ed..aa40ae92233 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1483,7 +1483,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 
 static int trace_irq_level(void)
 {
-	return hardirq_count() + softirq_count() + in_nmi();
+	return (hardirq_count() >> HARDIRQ_SHIFT) +
+		(softirq_count() >> + SOFTIRQ_SHIFT) +
+		!!in_nmi();
 }
 
 static int trace_recursive_lock(void)
-- 
cgit v1.2.3


From e395898e98119085f666febbc7b631dd69bc637f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 13:32:44 -0400
Subject: tracing: remove recursive test from ring_buffer_event_discard

The ring_buffer_event_discard is not tied to ring_buffer_lock_reserve.
It can be called inside or outside the reserve/commit. Even if it
is called inside the reserve/commit the commit part must also be called.

Only ring_buffer_discard_commit can be used as a replacement for
ring_buffer_unlock_commit.

This patch removes the trace_recursive_unlock from ring_buffer_event_discard
since it would be the wrong place to do so.

[Impact: prevent breakage in trace recursive testing ]

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index aa40ae92233..a6997670cc4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1667,7 +1667,6 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
 void ring_buffer_event_discard(struct ring_buffer_event *event)
 {
 	rb_event_discard(event);
-	trace_recursive_unlock();
 }
 EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
 
-- 
cgit v1.2.3


From aa18efb2a2f07e1cf062039848e9d369bb358724 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 16:16:11 -0400
Subject: tracing: use recursive counter over irq level

Althought using the irq level (hardirq_count, softirq_count and in_nmi)
was nice to detect bad recursion right away, but since the counters are
not atomically updated with respect to the interrupts, the function tracer
might trigger the test from an interrupt handler before the hardirq_count
is updated. This will trigger a false warning.

This patch converts the recursive detection to a simple counter.
If the depth is greater than 16 then the recursive detection will trigger.
16 is more than enough for any nested interrupts.

[ Impact: fix false positive trace recursion detection ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 45 ++++++++++++++++-----------------------------
 1 file changed, 16 insertions(+), 29 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a6997670cc4..7bcfd3e6053 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1481,47 +1481,34 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	return event;
 }
 
-static int trace_irq_level(void)
-{
-	return (hardirq_count() >> HARDIRQ_SHIFT) +
-		(softirq_count() >> + SOFTIRQ_SHIFT) +
-		!!in_nmi();
-}
+#define TRACE_RECURSIVE_DEPTH 16
 
 static int trace_recursive_lock(void)
 {
-	int level;
-
-	level = trace_irq_level();
+	current->trace_recursion++;
 
-	if (unlikely(current->trace_recursion & (1 << level))) {
-		/* Disable all tracing before we do anything else */
-		tracing_off_permanent();
+	if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
+		return 0;
 
-		printk_once(KERN_WARNING "Tracing recursion: "
-			    "HC[%lu]:SC[%lu]:NMI[%lu]\n",
-			    hardirq_count() >> HARDIRQ_SHIFT,
-			    softirq_count() >> SOFTIRQ_SHIFT,
-			    in_nmi());
+	/* Disable all tracing before we do anything else */
+	tracing_off_permanent();
 
-		WARN_ON_ONCE(1);
-		return -1;
-	}
+	printk_once(KERN_WARNING "Tracing recursion: depth[%d]:"
+		    "HC[%lu]:SC[%lu]:NMI[%lu]\n",
+		    current->trace_recursion,
+		    hardirq_count() >> HARDIRQ_SHIFT,
+		    softirq_count() >> SOFTIRQ_SHIFT,
+		    in_nmi());
 
-	current->trace_recursion |= 1 << level;
-
-	return 0;
+	WARN_ON_ONCE(1);
+	return -1;
 }
 
 static void trace_recursive_unlock(void)
 {
-	int level;
-
-	level = trace_irq_level();
-
-	WARN_ON_ONCE(!current->trace_recursion & (1 << level));
+	WARN_ON_ONCE(!current->trace_recursion);
 
-	current->trace_recursion &= ~(1 << level);
+	current->trace_recursion--;
 }
 
 static DEFINE_PER_CPU(int, rb_need_resched);
-- 
cgit v1.2.3


From cb4764a6dbffd9bb3cf759421ae82384071a933d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 18:16:44 -0400
Subject: tracing: use nowakeup version of commit for function event trace
 tests

The startup tests for the event tracer also runs with the function
tracer enabled. The "wakeup" version of the trace commit was used
which can grab spinlocks. If a task was preempted by an NMI
that called a function being traced, it could deadlock due to the
function tracer trying to grab the same lock.

Thanks to Frederic Weisbecker for pointing out where the bug was.

Reported-by: Ingo Molnar <mingo@elte.hu>
Reported-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 98daf5dc74a..672b195f86c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1164,7 +1164,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
 	entry->ip			= ip;
 	entry->parent_ip		= parent_ip;
 
-	trace_current_buffer_unlock_commit(event, flags, pc);
+	trace_nowake_buffer_unlock_commit(event, flags, pc);
 
  out:
 	atomic_dec(&per_cpu(test_event_disable, cpu));
-- 
cgit v1.2.3


From e8082f3f5a17d7a7bfc7dd1050a3f958dc034e9a Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 21 Apr 2009 17:11:46 +0800
Subject: tracing/filters: don't remove old filters when failed to write
 subsys->filter

If writing subsys->filter returns EINVAL or ENOSPC, the original
filters in subsys/ and subsys/events/ will be removed. This is
definitely wrong.

[ Impact: fix filter setting semantics on error condition ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49ED8DD2.2070700@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 672b195f86c..9ea55a7dfde 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -600,7 +600,6 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	err = filter_add_subsystem_pred(system, pred);
 	if (err < 0) {
-		filter_free_subsystem_preds(system);
 		filter_free_pred(pred);
 		return err;
 	}
-- 
cgit v1.2.3


From f66578a7637b87810cbb9041c4e3a77fd2fa4706 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 21 Apr 2009 17:12:11 +0800
Subject: tracing/filters: allow user-input to be integer-like string

Suppose we would like to trace all tasks named '123', but this
will fail:

 # echo 'parent_comm == 123' > events/sched/sched_process_fork/filter
 bash: echo: write error: Invalid argument

Don't guess the type of the filter pred in filter_parse(), but instead
we check it in __filter_add_pred().

[ Impact: extend allowed filter field string values ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49ED8DEB.6000700@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_filter.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e0fcfd2a16d..65418288f95 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -313,6 +313,7 @@ static int __filter_add_pred(struct ftrace_event_call *call,
 {
 	struct ftrace_event_field *field;
 	filter_pred_fn_t fn;
+	unsigned long long val;
 
 	field = find_event_field(call, pred->field_name);
 	if (!field)
@@ -322,14 +323,13 @@ static int __filter_add_pred(struct ftrace_event_call *call,
 	pred->offset = field->offset;
 
 	if (is_string_field(field->type)) {
-		if (!pred->str_len)
-			return -EINVAL;
 		fn = filter_pred_string;
 		pred->str_len = field->size;
 		return filter_add_pred_fn(call, pred, fn);
 	} else {
-		if (pred->str_len)
+		if (strict_strtoull(pred->str_val, 0, &val))
 			return -EINVAL;
+		pred->val = val;
 	}
 
 	switch (field->size) {
@@ -413,12 +413,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 	return 0;
 }
 
+/*
+ * The filter format can be
+ *   - 0, which means remove all filter preds
+ *   - [||/&&] <field> ==/!= <val>
+ */
 int filter_parse(char **pbuf, struct filter_pred *pred)
 {
-	char *tmp, *tok, *val_str = NULL;
+	char *tok, *val_str = NULL;
 	int tok_n = 0;
 
-	/* field ==/!= number, or/and field ==/!= number, number */
 	while ((tok = strsep(pbuf, " \n"))) {
 		if (tok_n == 0) {
 			if (!strcmp(tok, "0")) {
@@ -478,19 +482,13 @@ int filter_parse(char **pbuf, struct filter_pred *pred)
 		return -EINVAL;
 	}
 
+	strcpy(pred->str_val, val_str);
+	pred->str_len = strlen(val_str);
+
 	pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
 	if (!pred->field_name)
 		return -ENOMEM;
 
-	pred->str_len = 0;
-	pred->val = simple_strtoull(val_str, &tmp, 0);
-	if (tmp == val_str) {
-		strncpy(pred->str_val, val_str, MAX_FILTER_STR_VAL);
-		pred->str_len = strlen(val_str);
-		pred->str_val[pred->str_len] = '\0';
-	} else if (*tmp != '\0')
-		return -EINVAL;
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From 3554228d4289098a8fe5cfd87512ec32a19bbe5a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 21 Apr 2009 09:41:26 -0400
Subject: ring-buffer: only warn on wrap if buffer is bigger than two pages

On boot up, to save memory, ftrace allocates the minimum buffer
which is two pages. Ftrace also goes through a series of tests
(when configured) on boot up. These tests can fill up a page within
a single interrupt.

The ring buffer also has a WARN_ON when it detects that the buffer was
completely filled within a single commit (other commits are allowed to
be nested).

Combine the small buffer on start up, with the tests that can fill more
than a single page within an interrupt, this can trigger the WARN_ON.

This patch makes the WARN_ON only happen when the ring buffer consists
of more than two pages.

[ Impact: prevent false WARN_ON in ftrace startup tests ]

Reported-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20090421094616.GA14561@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7bcfd3e6053..61dbdf21cd3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1241,7 +1241,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		 * about it.
 		 */
 		if (unlikely(next_page == commit_page)) {
-			WARN_ON_ONCE(1);
+			/* This can easily happen on small ring buffers */
+			WARN_ON_ONCE(buffer->pages > 2);
 			goto out_reset;
 		}
 
-- 
cgit v1.2.3


From 7a4f453b6d7379a7c380825949977c5a838aa012 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 22 Apr 2009 16:53:34 +0800
Subject: tracing/events: make struct trace_entry->type to be int type

struct trace_entry->type is unsigned char, while trace event's id is
int type, thus for a event with id >= 256, it's entry->type is cast
to (id % 256), and then we can't see the trace output of this event.

 # insmod trace-events-sample.ko
 # echo foo_bar > /mnt/tracing/set_event
 # cat /debug/tracing/events/trace-events-sample/foo_bar/id
 256
 # cat /mnt/tracing/trace_pipe
           <...>-3548  [001]   215.091142: Unknown type 0
           <...>-3548  [001]   216.089207: Unknown type 0
           <...>-3548  [001]   217.087271: Unknown type 0
           <...>-3548  [001]   218.085332: Unknown type 0

[ Impact: fix output for trace events with id >= 256 ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <49EEDB0E.5070207@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c        | 4 ++--
 kernel/trace/trace.h        | 2 +-
 kernel/trace/trace_events.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b9a3adce922..b6183bc9eca 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -838,7 +838,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 }
 
 struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
-						    unsigned char type,
+						    int type,
 						    unsigned long len,
 						    unsigned long flags, int pc)
 {
@@ -881,7 +881,7 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
 }
 
 struct ring_buffer_event *
-trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
+trace_current_buffer_lock_reserve(int type, unsigned long len,
 				  unsigned long flags, int pc)
 {
 	return trace_buffer_lock_reserve(&global_trace,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 247948e81b0..7d55bcf50e4 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -422,7 +422,7 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
 struct ring_buffer_event;
 
 struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
-						    unsigned char type,
+						    int type,
 						    unsigned long len,
 						    unsigned long flags,
 						    int pc);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9ea55a7dfde..5d6e879cf87 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -398,7 +398,7 @@ static int trace_write_header(struct trace_seq *s)
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\n",
-				FIELD(unsigned char, type),
+				FIELD(int, type),
 				FIELD(unsigned char, flags),
 				FIELD(unsigned char, preempt_count),
 				FIELD(int, pid),
-- 
cgit v1.2.3


From 9be24414aad047dcf9d8d2a9a929321536c7ebec Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 26 Mar 2009 10:25:24 -0400
Subject: tracing/wakeup: move access to wakeup_cpu into spinlock

The code had the following outside the lock:

        if (next != wakeup_task)
                return;

        pc = preempt_count();

        /* The task we are waiting for is waking up */
        data = wakeup_trace->data[wakeup_cpu];

On initialization, wakeup_task is NULL and wakeup_cpu -1. This code
is not under a lock. If wakeup_task is set on another CPU as that
task is waking up, we can see the wakeup_task before wakeup_cpu is
set. If we read wakeup_cpu while it is still -1 then we will have
a bad data pointer.

This patch moves the reading of wakeup_cpu within the protection of
the spinlock used to protect the writing of wakeup_cpu and wakeup_task.

[ Impact: remove possible race causing invalid pointer dereference ]

Reported-by: Maneesh Soni <maneesh@in.ibm.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_sched_wakeup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index b8b13c5540f..eacb2722517 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -138,9 +138,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 
 	pc = preempt_count();
 
-	/* The task we are waiting for is waking up */
-	data = wakeup_trace->data[wakeup_cpu];
-
 	/* disable local data, not wakeup_cpu data */
 	cpu = raw_smp_processor_id();
 	disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
@@ -154,6 +151,9 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 	if (unlikely(!tracer_enabled || next != wakeup_task))
 		goto out_unlock;
 
+	/* The task we are waiting for is waking up */
+	data = wakeup_trace->data[wakeup_cpu];
+
 	trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
 	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
 
-- 
cgit v1.2.3


From 89ec0dee9eba6275d47be0b878cf5f6d5c2fb6eb Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 26 Mar 2009 11:03:29 -0400
Subject: tracing: increase size of number of possible events

With the new event tracing registration, we must increase the number
of events that can be registered. Currently the type field is only
one byte, which leaves us only 256 possible events.

Since we do not save the CPU number in the tracer anymore (it is determined
by the per cpu ring buffer that is used) we have an extra byte to use.

This patch increases the size of type from 1 byte (256 events) to
2 bytes (65,536 events).

It also adds a WARN_ON_ONCE if we exceed that limit.

[ Impact: allow more than 255 events ]

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 2 +-
 kernel/trace/trace_output.c | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5d6e879cf87..9887131afa0 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -398,7 +398,7 @@ static int trace_write_header(struct trace_seq *s)
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\n",
-				FIELD(int, type),
+				FIELD(unsigned short, type),
 				FIELD(unsigned char, flags),
 				FIELD(unsigned char, preempt_count),
 				FIELD(int, pid),
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 83a8abb9640..06997e75114 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -537,6 +537,8 @@ int register_ftrace_event(struct trace_event *event)
  out:
 	mutex_unlock(&trace_event_mutex);
 
+	WARN_ON_ONCE(next_event_type > FTRACE_MAX_EVENT);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(register_ftrace_event);
-- 
cgit v1.2.3


From 75db37d2f4c0ad9466ead57d467277d097b4105c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 26 Mar 2009 11:43:36 -0400
Subject: tracing: add size checks for exported ftrace internal structures

The events exported by TRACE_EVENT are automated and are guaranteed
to be correct when used.

The internal ftrace structures on the other hand are more manually
exported. These require the ftrace maintainer to make sure they
are up to date.

This patch adds a size check to help flag when a type changes in
an internal ftrace data structure, and the update needs to be reflected
in the export.

If a export is incorrect, then the only harm is that the user space
tools will not know how to correctly read the internal structures of
ftrace.

[ Impact: help prevent inconsistent ftrace format print outs ]

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 3 +++
 kernel/trace/trace_export.c | 4 ++++
 2 files changed, 7 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9887131afa0..b9208158808 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -381,8 +381,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 }
 
+extern char *__bad_type_size(void);
+
 #undef FIELD
 #define FIELD(type, name)						\
+	sizeof(type) != sizeof(field.name) ? __bad_type_size() :	\
 	#type, "common_" #name, offsetof(typeof(field), name),		\
 		sizeof(field.name)
 
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 48fc02fe73a..0cb1a142c74 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -19,8 +19,12 @@
 #undef TRACE_STRUCT
 #define TRACE_STRUCT(args...) args
 
+extern void __bad_type_size(void);
+
 #undef TRACE_FIELD
 #define TRACE_FIELD(type, item, assign)					\
+	if (sizeof(type) != sizeof(field.item))				\
+		__bad_type_size();					\
 	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
 			       "offset:%u;\tsize:%u;\n",		\
 			       (unsigned int)offsetof(typeof(field), item), \
-- 
cgit v1.2.3


From 334d4169a6592d3fcd863bbe822a8f6985ffa9af Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 24 Apr 2009 11:27:05 +0800
Subject: ring_buffer: compressed event header

RB_MAX_SMALL_DATA = 28bytes is too small for most tracers, it wastes
an 'u32' to save the actually length for events which data size > 28.

This fix uses compressed event header and enlarges RB_MAX_SMALL_DATA.

[ Impact: saves about 0%-12.5%(depends on tracer) memory in ring_buffer ]

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <49F13189.3090000@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 83 +++++++++++++++++++++++-----------------------
 1 file changed, 41 insertions(+), 42 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 61dbdf21cd3..9692f100ec1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -28,8 +28,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
 {
 	int ret;
 
-	ret = trace_seq_printf(s, "\ttype        :    2 bits\n");
-	ret = trace_seq_printf(s, "\tlen         :    3 bits\n");
+	ret = trace_seq_printf(s, "# compressed entry header\n");
+	ret = trace_seq_printf(s, "\ttype_len    :    5 bits\n");
 	ret = trace_seq_printf(s, "\ttime_delta  :   27 bits\n");
 	ret = trace_seq_printf(s, "\tarray       :   32 bits\n");
 	ret = trace_seq_printf(s, "\n");
@@ -37,8 +37,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
 			       RINGBUF_TYPE_PADDING);
 	ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
 			       RINGBUF_TYPE_TIME_EXTEND);
-	ret = trace_seq_printf(s, "\tdata        : type == %d\n",
-			       RINGBUF_TYPE_DATA);
+	ret = trace_seq_printf(s, "\tdata max type_len  == %d\n",
+			       RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
 
 	return ret;
 }
@@ -204,7 +204,10 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
 
 #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
 #define RB_ALIGNMENT		4U
-#define RB_MAX_SMALL_DATA	28
+#define RB_MAX_SMALL_DATA	(RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
+
+/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
+#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
 
 enum {
 	RB_LEN_TIME_EXTEND = 8,
@@ -213,17 +216,18 @@ enum {
 
 static inline int rb_null_event(struct ring_buffer_event *event)
 {
-	return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0;
+	return event->type_len == RINGBUF_TYPE_PADDING
+			&& event->time_delta == 0;
 }
 
 static inline int rb_discarded_event(struct ring_buffer_event *event)
 {
-	return event->type == RINGBUF_TYPE_PADDING && event->time_delta;
+	return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
 }
 
 static void rb_event_set_padding(struct ring_buffer_event *event)
 {
-	event->type = RINGBUF_TYPE_PADDING;
+	event->type_len = RINGBUF_TYPE_PADDING;
 	event->time_delta = 0;
 }
 
@@ -232,8 +236,8 @@ rb_event_data_length(struct ring_buffer_event *event)
 {
 	unsigned length;
 
-	if (event->len)
-		length = event->len * RB_ALIGNMENT;
+	if (event->type_len)
+		length = event->type_len * RB_ALIGNMENT;
 	else
 		length = event->array[0];
 	return length + RB_EVNT_HDR_SIZE;
@@ -243,12 +247,12 @@ rb_event_data_length(struct ring_buffer_event *event)
 static unsigned
 rb_event_length(struct ring_buffer_event *event)
 {
-	switch (event->type) {
+	switch (event->type_len) {
 	case RINGBUF_TYPE_PADDING:
 		if (rb_null_event(event))
 			/* undefined */
 			return -1;
-		return rb_event_data_length(event);
+		return  event->array[0] + RB_EVNT_HDR_SIZE;
 
 	case RINGBUF_TYPE_TIME_EXTEND:
 		return RB_LEN_TIME_EXTEND;
@@ -272,7 +276,7 @@ rb_event_length(struct ring_buffer_event *event)
 unsigned ring_buffer_event_length(struct ring_buffer_event *event)
 {
 	unsigned length = rb_event_length(event);
-	if (event->type != RINGBUF_TYPE_DATA)
+	if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 		return length;
 	length -= RB_EVNT_HDR_SIZE;
 	if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
@@ -285,9 +289,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
 static void *
 rb_event_data(struct ring_buffer_event *event)
 {
-	BUG_ON(event->type != RINGBUF_TYPE_DATA);
+	BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
 	/* If length is in len field, then array[0] has the data */
-	if (event->len)
+	if (event->type_len)
 		return (void *)&event->array[0];
 	/* Otherwise length is in array[0] and array[1] has the data */
 	return (void *)&event->array[1];
@@ -988,7 +992,7 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
 		if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
 			return;
 		/* Only count data entries */
-		if (event->type != RINGBUF_TYPE_DATA)
+		if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 			continue;
 		cpu_buffer->overrun++;
 		cpu_buffer->entries--;
@@ -1133,28 +1137,21 @@ static void
 rb_update_event(struct ring_buffer_event *event,
 			 unsigned type, unsigned length)
 {
-	event->type = type;
+	event->type_len = type;
 
 	switch (type) {
 
 	case RINGBUF_TYPE_PADDING:
-		break;
-
 	case RINGBUF_TYPE_TIME_EXTEND:
-		event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT);
-		break;
-
 	case RINGBUF_TYPE_TIME_STAMP:
-		event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);
 		break;
 
-	case RINGBUF_TYPE_DATA:
+	case 0:
 		length -= RB_EVNT_HDR_SIZE;
-		if (length > RB_MAX_SMALL_DATA) {
-			event->len = 0;
+		if (length > RB_MAX_SMALL_DATA)
 			event->array[0] = length;
-		} else
-			event->len = DIV_ROUND_UP(length, RB_ALIGNMENT);
+		else
+			event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
 		break;
 	default:
 		BUG();
@@ -1562,7 +1559,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 	if (length > BUF_PAGE_SIZE)
 		goto out;
 
-	event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
+	event = rb_reserve_next_event(cpu_buffer, 0, length);
 	if (!event)
 		goto out;
 
@@ -1634,7 +1631,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
 static inline void rb_event_discard(struct ring_buffer_event *event)
 {
-	event->type = RINGBUF_TYPE_PADDING;
+	/* array[0] holds the actual length for the discarded event */
+	event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
+	event->type_len = RINGBUF_TYPE_PADDING;
 	/* time delta must be non zero */
 	if (!event->time_delta)
 		event->time_delta = 1;
@@ -1786,8 +1785,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
 		goto out;
 
 	event_length = rb_calculate_event_length(length);
-	event = rb_reserve_next_event(cpu_buffer,
-				      RINGBUF_TYPE_DATA, event_length);
+	event = rb_reserve_next_event(cpu_buffer, 0, event_length);
 	if (!event)
 		goto out;
 
@@ -2035,7 +2033,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 {
 	u64 delta;
 
-	switch (event->type) {
+	switch (event->type_len) {
 	case RINGBUF_TYPE_PADDING:
 		return;
 
@@ -2066,7 +2064,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
 {
 	u64 delta;
 
-	switch (event->type) {
+	switch (event->type_len) {
 	case RINGBUF_TYPE_PADDING:
 		return;
 
@@ -2181,7 +2179,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
 
 	event = rb_reader_event(cpu_buffer);
 
-	if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event))
+	if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX
+			|| rb_discarded_event(event))
 		cpu_buffer->entries--;
 
 	rb_update_read_stamp(cpu_buffer, event);
@@ -2262,7 +2261,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 
 	event = rb_reader_event(cpu_buffer);
 
-	switch (event->type) {
+	switch (event->type_len) {
 	case RINGBUF_TYPE_PADDING:
 		if (rb_null_event(event))
 			RB_WARN_ON(cpu_buffer, 1);
@@ -2334,7 +2333,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 
 	event = rb_iter_head_event(iter);
 
-	switch (event->type) {
+	switch (event->type_len) {
 	case RINGBUF_TYPE_PADDING:
 		if (rb_null_event(event)) {
 			rb_inc_iter(iter);
@@ -2393,7 +2392,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	event = rb_buffer_peek(buffer, cpu, ts);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-	if (event && event->type == RINGBUF_TYPE_PADDING) {
+	if (event && event->type_len == RINGBUF_TYPE_PADDING) {
 		cpu_relax();
 		goto again;
 	}
@@ -2421,7 +2420,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 	event = rb_iter_peek(iter, ts);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-	if (event && event->type == RINGBUF_TYPE_PADDING) {
+	if (event && event->type_len == RINGBUF_TYPE_PADDING) {
 		cpu_relax();
 		goto again;
 	}
@@ -2466,7 +2465,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
  out:
 	preempt_enable();
 
-	if (event && event->type == RINGBUF_TYPE_PADDING) {
+	if (event && event->type_len == RINGBUF_TYPE_PADDING) {
 		cpu_relax();
 		goto again;
 	}
@@ -2559,7 +2558,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
  out:
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-	if (event && event->type == RINGBUF_TYPE_PADDING) {
+	if (event && event->type_len == RINGBUF_TYPE_PADDING) {
 		cpu_relax();
 		goto again;
 	}
@@ -2766,7 +2765,7 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
 		if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
 			return;
 		/* Only count data entries */
-		if (event->type != RINGBUF_TYPE_DATA)
+		if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 			continue;
 		cpu_buffer->entries--;
 	}
-- 
cgit v1.2.3


From 060fa5c83e67901ba47ab484cfcdb32737d630ba Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 24 Apr 2009 12:20:52 -0400
Subject: tracing/events: reuse trace event ids after overflow

With modules being able to add trace events, and the max trace event
counter is 16 bits (65536) we can overflow the counter easily
with a simple while loop adding and removing modules that contain
trace events.

This patch links together the registered trace events and on overflow
searches for available trace event ids. It will still fail if
over 65536 events are registered, but considering that a typical
kernel only has 22000 functions, 65000 events should be sufficient.

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 71 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 60 insertions(+), 11 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 06997e75114..5fc51f0f75f 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -483,6 +483,36 @@ struct trace_event *ftrace_find_event(int type)
 	return NULL;
 }
 
+static LIST_HEAD(ftrace_event_list);
+
+static int trace_search_list(struct list_head **list)
+{
+	struct trace_event *e;
+	int last = __TRACE_LAST_TYPE;
+
+	if (list_empty(&ftrace_event_list)) {
+		*list = &ftrace_event_list;
+		return last + 1;
+	}
+
+	/*
+	 * We used up all possible max events,
+	 * lets see if somebody freed one.
+	 */
+	list_for_each_entry(e, &ftrace_event_list, list) {
+		if (e->type != last + 1)
+			break;
+		last++;
+	}
+
+	/* Did we used up all 65 thousand events??? */
+	if ((last + 1) > FTRACE_MAX_EVENT)
+		return 0;
+
+	*list = &e->list;
+	return last + 1;
+}
+
 /**
  * register_ftrace_event - register output for an event type
  * @event: the event type to register
@@ -505,20 +535,40 @@ int register_ftrace_event(struct trace_event *event)
 
 	mutex_lock(&trace_event_mutex);
 
-	if (!event) {
-		ret = next_event_type++;
+	if (WARN_ON(!event))
 		goto out;
-	}
 
-	if (!event->type)
-		event->type = next_event_type++;
-	else if (event->type > __TRACE_LAST_TYPE) {
+	INIT_LIST_HEAD(&event->list);
+
+	if (!event->type) {
+		struct list_head *list;
+
+		if (next_event_type > FTRACE_MAX_EVENT) {
+
+			event->type = trace_search_list(&list);
+			if (!event->type)
+				goto out;
+
+		} else {
+			
+			event->type = next_event_type++;
+			list = &ftrace_event_list;
+		}
+
+		if (WARN_ON(ftrace_find_event(event->type)))
+			goto out;
+
+		list_add_tail(&event->list, list);
+
+	} else if (event->type > __TRACE_LAST_TYPE) {
 		printk(KERN_WARNING "Need to add type to trace.h\n");
 		WARN_ON(1);
-	}
-
-	if (ftrace_find_event(event->type))
 		goto out;
+	} else {
+		/* Is this event already used */
+		if (ftrace_find_event(event->type))
+			goto out;
+	}
 
 	if (event->trace == NULL)
 		event->trace = trace_nop_print;
@@ -537,8 +587,6 @@ int register_ftrace_event(struct trace_event *event)
  out:
 	mutex_unlock(&trace_event_mutex);
 
-	WARN_ON_ONCE(next_event_type > FTRACE_MAX_EVENT);
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(register_ftrace_event);
@@ -551,6 +599,7 @@ int unregister_ftrace_event(struct trace_event *event)
 {
 	mutex_lock(&trace_event_mutex);
 	hlist_del(&event->node);
+	list_del(&event->list);
 	mutex_unlock(&trace_event_mutex);
 
 	return 0;
-- 
cgit v1.2.3


From 701970b3a83cc639c1ec8fc6f40a7871cb99426f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 24 Apr 2009 23:11:22 -0400
Subject: tracing/events: make modules have their own file_operations structure

For proper module reference counting, the file_operations that modules use
must have the "owner" field set to the module. Unfortunately, the trace events
use share file_operations. The same file_operations are used by all both
kernel core and all modules.

This patch makes the modules allocate their own file_operations and
copies the functions from the core kernel. This allows those file
operations to be owned by the module.

Care is taken to free this code on module unload.

Thanks to Greg KH for reminding me that file_operations must be owned
by the module to have reference counting take place.

[ Impact: fix modular tracepoints / potential crash ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/trace/trace_events.c | 95 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 88 insertions(+), 7 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b9208158808..be4d3a437c1 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -770,7 +770,11 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 }
 
 static int
-event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
+event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
+		 const struct file_operations *id,
+		 const struct file_operations *enable,
+		 const struct file_operations *filter,
+		 const struct file_operations *format)
 {
 	struct dentry *entry;
 	int ret;
@@ -800,11 +804,11 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 
 	if (call->regfunc)
 		entry = trace_create_file("enable", 0644, call->dir, call,
-					  &ftrace_enable_fops);
+					  enable);
 
 	if (call->id)
 		entry = trace_create_file("id", 0444, call->dir, call,
-					  &ftrace_event_id_fops);
+					  id);
 
 	if (call->define_fields) {
 		ret = call->define_fields();
@@ -814,7 +818,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 			return ret;
 		}
 		entry = trace_create_file("filter", 0644, call->dir, call,
-					  &ftrace_event_filter_fops);
+					  filter);
 	}
 
 	/* A trace may not want to export its format */
@@ -822,7 +826,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 		return 0;
 
 	entry = trace_create_file("format", 0444, call->dir, call,
-				  &ftrace_event_format_fops);
+				  format);
 
 	return 0;
 }
@@ -833,8 +837,60 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 	     event++)
 
 #ifdef CONFIG_MODULES
+
+static LIST_HEAD(ftrace_module_file_list);
+
+/*
+ * Modules must own their file_operations to keep up with
+ * reference counting.
+ */
+struct ftrace_module_file_ops {
+	struct list_head		list;
+	struct module			*mod;
+	struct file_operations		id;
+	struct file_operations		enable;
+	struct file_operations		format;
+	struct file_operations		filter;
+};
+
+static struct ftrace_module_file_ops *
+trace_create_file_ops(struct module *mod)
+{
+	struct ftrace_module_file_ops *file_ops;
+
+	/*
+	 * This is a bit of a PITA. To allow for correct reference
+	 * counting, modules must "own" their file_operations.
+	 * To do this, we allocate the file operations that will be
+	 * used in the event directory.
+	 */
+
+	file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL);
+	if (!file_ops)
+		return NULL;
+
+	file_ops->mod = mod;
+
+	file_ops->id = ftrace_event_id_fops;
+	file_ops->id.owner = mod;
+
+	file_ops->enable = ftrace_enable_fops;
+	file_ops->enable.owner = mod;
+
+	file_ops->filter = ftrace_event_filter_fops;
+	file_ops->filter.owner = mod;
+
+	file_ops->format = ftrace_event_format_fops;
+	file_ops->format.owner = mod;
+
+	list_add(&file_ops->list, &ftrace_module_file_list);
+
+	return file_ops;
+}
+
 static void trace_module_add_events(struct module *mod)
 {
+	struct ftrace_module_file_ops *file_ops = NULL;
 	struct ftrace_event_call *call, *start, *end;
 	struct dentry *d_events;
 
@@ -852,14 +908,27 @@ static void trace_module_add_events(struct module *mod)
 		/* The linker may leave blanks */
 		if (!call->name)
 			continue;
+
+		/*
+		 * This module has events, create file ops for this module
+		 * if not already done.
+		 */
+		if (!file_ops) {
+			file_ops = trace_create_file_ops(mod);
+			if (!file_ops)
+				return;
+		}
 		call->mod = mod;
 		list_add(&call->list, &ftrace_events);
-		event_create_dir(call, d_events);
+		event_create_dir(call, d_events,
+				 &file_ops->id, &file_ops->enable,
+				 &file_ops->filter, &file_ops->format);
 	}
 }
 
 static void trace_module_remove_events(struct module *mod)
 {
+	struct ftrace_module_file_ops *file_ops;
 	struct ftrace_event_call *call, *p;
 
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
@@ -874,6 +943,16 @@ static void trace_module_remove_events(struct module *mod)
 			list_del(&call->list);
 		}
 	}
+
+	/* Now free the file_operations */
+	list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
+		if (file_ops->mod == mod)
+			break;
+	}
+	if (&file_ops->list != &ftrace_module_file_list) {
+		list_del(&file_ops->list);
+		kfree(file_ops);
+	}
 }
 
 static int trace_module_notify(struct notifier_block *self,
@@ -954,7 +1033,9 @@ static __init int event_trace_init(void)
 		if (!call->name)
 			continue;
 		list_add(&call->list, &ftrace_events);
-		event_create_dir(call, d_events);
+		event_create_dir(call, d_events, &ftrace_event_id_fops,
+				 &ftrace_enable_fops, &ftrace_event_filter_fops,
+				 &ftrace_event_format_fops);
 	}
 
 	ret = register_module_notifier(&trace_module_nb);
-- 
cgit v1.2.3


From cd891ae0305601bdb4d2e7e85282961c4ff256cd Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 28 Apr 2009 11:39:34 -0400
Subject: tracing: convert ftrace_dump spinlocks to raw

ftrace_dump is used for printing out the contents of the ftrace ring buffer
to the console on failure. Currently it uses a spinlock to synchronize
the output from multiple failures on different CPUs. This spin lock
currently is a normal spinlock and can cause issues with lockdep and
lock tracing.

This patch converts it to raw since it is for error handling only.
The lock is local to the ftrace_dump and is not used by any other
infrastructure.

[ Impact: prevent ftrace_dump from locking up by internal tracing ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b6183bc9eca..5d704a41f83 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4114,7 +4114,8 @@ trace_printk_seq(struct trace_seq *s)
 
 static void __ftrace_dump(bool disable_tracing)
 {
-	static DEFINE_SPINLOCK(ftrace_dump_lock);
+	static raw_spinlock_t ftrace_dump_lock =
+		(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 	/* use static because iter can be a bit big for the stack */
 	static struct trace_iterator iter;
 	unsigned int old_userobj;
@@ -4123,7 +4124,8 @@ static void __ftrace_dump(bool disable_tracing)
 	int cnt = 0, cpu;
 
 	/* only one dump */
-	spin_lock_irqsave(&ftrace_dump_lock, flags);
+	local_irq_save(flags);
+	__raw_spin_lock(&ftrace_dump_lock);
 	if (dump_ran)
 		goto out;
 
@@ -4195,7 +4197,8 @@ static void __ftrace_dump(bool disable_tracing)
 	}
 
  out:
-	spin_unlock_irqrestore(&ftrace_dump_lock, flags);
+	__raw_spin_unlock(&ftrace_dump_lock);
+	local_irq_restore(flags);
 }
 
 /* By default: disable tracing after the dump */
-- 
cgit v1.2.3


From 5beae6efd1004b44c3e257dc96087978e4c763c1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 29 Apr 2009 00:16:21 -0400
Subject: tracing: fix ref count in splice pages

The pages allocated for the splice binary buffer did not initialize
the ref count correctly. This caused pages not to be freed and causes
a drastic memory leak.

Thanks to logdev I was able to trace the tracer to find where the leak
was.

[ Impact: stop memory leak when using splice ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5d704a41f83..9058240c85c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3531,6 +3531,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		if (!ref)
 			break;
 
+		ref->ref = 1;
 		ref->buffer = info->tr->buffer;
 		ref->page = ring_buffer_alloc_read_page(ref->buffer);
 		if (!ref->page) {
-- 
cgit v1.2.3


From 93459c6cb9816c52200993d29dd18cea1daee335 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 29 Apr 2009 00:23:13 -0400
Subject: tracing: only add splice page if entries exist

The splice code allocates a page even when the ring buffer is empty.
It detects the ring buffer being empty when it it fails to copy
anything from the ring buffer into the page.

This patch adds a check to see if there is anything in the ring buffer
before allocating a page.

Thanks to logdev for letting me trace the tracer to find this.

[ Impact: speed up due to removing unnecessary allocation ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9058240c85c..0aeb3b93414 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3508,7 +3508,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		.spd_release	= buffer_spd_release,
 	};
 	struct buffer_ref *ref;
-	int size, i;
+	int entries, size, i;
 	size_t ret;
 
 	if (*ppos & (PAGE_SIZE - 1)) {
@@ -3523,7 +3523,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		len &= PAGE_MASK;
 	}
 
-	for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) {
+	entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
+
+	for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
 		struct page *page;
 		int r;
 
@@ -3564,6 +3566,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		spd.partial[i].private = (unsigned long)ref;
 		spd.nr_pages++;
 		*ppos += PAGE_SIZE;
+
+		entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
 	}
 
 	spd.nr_pages = i;
-- 
cgit v1.2.3


From f2957f1f196b0217644a17c1379855a118a37d72 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 29 Apr 2009 00:26:30 -0400
Subject: tracing: have splice only copy full pages

Splice works with pages, it is much more effecient to use an entire
page than to copy bits over several pages.

Using logdev to trace the internals of the splice mechanism, I was
able to see that splice can be very aggressive. When tracing is
occurring, and the reader caught up to the writer, and the writer
is on the reader page, the reader will copy what is there into the
splice page. Splice may iterate over several pages and if the
writer is still writing to the page, the reader will keep copying
bits to new pages to pass to userspace.

This patch changes it to only pass data to userspace if the page
is full (the writer has left the page). This has a small side effect
that splice can not read a partial page, and must wait for the
page to fill. This should not be an issue. If tracing has stopped,
then a use of "read" will still read all of the page.

[ Impact: better performance for ring buffer splice code ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0aeb3b93414..f5427e0fc98 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3542,7 +3542,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		}
 
 		r = ring_buffer_read_page(ref->buffer, &ref->page,
-					  len, info->cpu, 0);
+					  len, info->cpu, 1);
 		if (r < 0) {
 			ring_buffer_free_read_page(ref->buffer,
 						   ref->page);
-- 
cgit v1.2.3


From 7d7d2b803159d4edeb051b0e5efbc1a8d9ef1c67 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 27 Apr 2009 12:37:49 -0400
Subject: ring-buffer: fix printk output

The warning output in trace_recursive_lock uses %d for a long when
it should be %ld.

[ Impact: fix compile warning ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9692f100ec1..f4cc59040eb 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1491,7 +1491,7 @@ static int trace_recursive_lock(void)
 	/* Disable all tracing before we do anything else */
 	tracing_off_permanent();
 
-	printk_once(KERN_WARNING "Tracing recursion: depth[%d]:"
+	printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
 		    "HC[%lu]:SC[%lu]:NMI[%lu]\n",
 		    current->trace_recursion,
 		    hardirq_count() >> HARDIRQ_SHIFT,
-- 
cgit v1.2.3


From 30e673b230f9d556eb81ef68a7b1a08c8b3b142c Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 28 Apr 2009 03:04:47 -0500
Subject: tracing/filters: move preds into event_filter object

Create a new event_filter object, and move the pred-related members
out of the call and subsystem objects and into the filter object - the
details of the filter implementation don't need to be exposed in the
call and subsystem in any case, and it will also help make the new
parser implementation a little cleaner.

[ Impact: refactor trace-filter code to prepare for new features ]

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1240905887.6416.119.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h               |  10 ++--
 kernel/trace/trace_events.c        |   3 +-
 kernel/trace/trace_events_filter.c | 107 +++++++++++++++++++++++--------------
 3 files changed, 74 insertions(+), 46 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7d55bcf50e4..1fb7d6ccadf 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -731,12 +731,16 @@ struct ftrace_event_field {
 	int			size;
 };
 
+struct event_filter {
+	int			n_preds;
+	struct filter_pred	**preds;
+};
+
 struct event_subsystem {
 	struct list_head	list;
 	const char		*name;
 	struct dentry		*entry;
-	int			n_preds;
-	struct filter_pred	**preds;
+	void			*filter;
 };
 
 struct filter_pred;
@@ -774,7 +778,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 		     struct ring_buffer *buffer,
 		     struct ring_buffer_event *event)
 {
-	if (unlikely(call->n_preds) && !filter_match_preds(call, rec)) {
+	if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) {
 		ring_buffer_discard_commit(buffer, event);
 		return 1;
 	}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index be4d3a437c1..1cd1f37373d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -757,8 +757,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 
 	list_add(&system->list, &event_subsystems);
 
-	system->preds = NULL;
-	system->n_preds = 0;
+	system->filter = NULL;
 
 	entry = debugfs_create_file("filter", 0644, system->entry, system,
 				    &ftrace_subsystem_filter_fops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 65418288f95..1e861eca3d0 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -93,11 +93,12 @@ static int filter_pred_none(struct filter_pred *pred, void *event)
 /* return 1 if event matches, 0 otherwise (discard) */
 int filter_match_preds(struct ftrace_event_call *call, void *rec)
 {
+	struct event_filter *filter = call->filter;
 	int i, matched, and_failed = 0;
 	struct filter_pred *pred;
 
-	for (i = 0; i < call->n_preds; i++) {
-		pred = call->preds[i];
+	for (i = 0; i < filter->n_preds; i++) {
+		pred = filter->preds[i];
 		if (and_failed && !pred->or)
 			continue;
 		matched = pred->fn(pred, rec);
@@ -115,20 +116,20 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec)
 }
 EXPORT_SYMBOL_GPL(filter_match_preds);
 
-static void __filter_print_preds(struct filter_pred **preds, int n_preds,
+static void __filter_print_preds(struct event_filter *filter,
 				 struct trace_seq *s)
 {
-	char *field_name;
 	struct filter_pred *pred;
+	char *field_name;
 	int i;
 
-	if (!n_preds) {
+	if (!filter || !filter->n_preds) {
 		trace_seq_printf(s, "none\n");
 		return;
 	}
 
-	for (i = 0; i < n_preds; i++) {
-		pred = preds[i];
+	for (i = 0; i < filter->n_preds; i++) {
+		pred = filter->preds[i];
 		field_name = pred->field_name;
 		if (i)
 			trace_seq_printf(s, pred->or ? "|| " : "&& ");
@@ -144,7 +145,7 @@ static void __filter_print_preds(struct filter_pred **preds, int n_preds,
 void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s)
 {
 	mutex_lock(&filter_mutex);
-	__filter_print_preds(call->preds, call->n_preds, s);
+	__filter_print_preds(call->filter, s);
 	mutex_unlock(&filter_mutex);
 }
 
@@ -152,7 +153,7 @@ void filter_print_subsystem_preds(struct event_subsystem *system,
 				  struct trace_seq *s)
 {
 	mutex_lock(&filter_mutex);
-	__filter_print_preds(system->preds, system->n_preds, s);
+	__filter_print_preds(system->filter, s);
 	mutex_unlock(&filter_mutex);
 }
 
@@ -200,12 +201,14 @@ static int filter_set_pred(struct filter_pred *dest,
 
 static void __filter_disable_preds(struct ftrace_event_call *call)
 {
+	struct event_filter *filter = call->filter;
 	int i;
 
-	call->n_preds = 0;
+	call->filter_active = 0;
+	filter->n_preds = 0;
 
 	for (i = 0; i < MAX_FILTER_PRED; i++)
-		call->preds[i]->fn = filter_pred_none;
+		filter->preds[i]->fn = filter_pred_none;
 }
 
 void filter_disable_preds(struct ftrace_event_call *call)
@@ -217,32 +220,39 @@ void filter_disable_preds(struct ftrace_event_call *call)
 
 int init_preds(struct ftrace_event_call *call)
 {
+	struct event_filter *filter;
 	struct filter_pred *pred;
 	int i;
 
-	call->n_preds = 0;
-
-	call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
-	if (!call->preds)
+	filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+	if (!call->filter)
 		return -ENOMEM;
 
+	call->filter_active = 0;
+	filter->n_preds = 0;
+
+	filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
+	if (!filter->preds)
+		goto oom;
+
 	for (i = 0; i < MAX_FILTER_PRED; i++) {
 		pred = kzalloc(sizeof(*pred), GFP_KERNEL);
 		if (!pred)
 			goto oom;
 		pred->fn = filter_pred_none;
-		call->preds[i] = pred;
+		filter->preds[i] = pred;
 	}
 
 	return 0;
 
 oom:
 	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (call->preds[i])
-			filter_free_pred(call->preds[i]);
+		if (filter->preds[i])
+			filter_free_pred(filter->preds[i]);
 	}
-	kfree(call->preds);
-	call->preds = NULL;
+	kfree(filter->preds);
+	kfree(call->filter);
+	call->filter = NULL;
 
 	return -ENOMEM;
 }
@@ -250,15 +260,16 @@ EXPORT_SYMBOL_GPL(init_preds);
 
 static void __filter_free_subsystem_preds(struct event_subsystem *system)
 {
+	struct event_filter *filter = system->filter;
 	struct ftrace_event_call *call;
 	int i;
 
-	if (system->n_preds) {
-		for (i = 0; i < system->n_preds; i++)
-			filter_free_pred(system->preds[i]);
-		kfree(system->preds);
-		system->preds = NULL;
-		system->n_preds = 0;
+	if (filter && filter->n_preds) {
+		for (i = 0; i < filter->n_preds; i++)
+			filter_free_pred(filter->preds[i]);
+		kfree(filter->preds);
+		kfree(filter);
+		system->filter = NULL;
 	}
 
 	list_for_each_entry(call, &ftrace_events, list) {
@@ -281,21 +292,23 @@ static int filter_add_pred_fn(struct ftrace_event_call *call,
 			      struct filter_pred *pred,
 			      filter_pred_fn_t fn)
 {
+	struct event_filter *filter = call->filter;
 	int idx, err;
 
-	if (call->n_preds && !pred->compound)
+	if (filter->n_preds && !pred->compound)
 		__filter_disable_preds(call);
 
-	if (call->n_preds == MAX_FILTER_PRED)
+	if (filter->n_preds == MAX_FILTER_PRED)
 		return -ENOSPC;
 
-	idx = call->n_preds;
-	filter_clear_pred(call->preds[idx]);
-	err = filter_set_pred(call->preds[idx], pred, fn);
+	idx = filter->n_preds;
+	filter_clear_pred(filter->preds[idx]);
+	err = filter_set_pred(filter->preds[idx], pred, fn);
 	if (err)
 		return err;
 
-	call->n_preds++;
+	filter->n_preds++;
+	call->filter_active = 1;
 
 	return 0;
 }
@@ -366,29 +379,41 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
 int filter_add_subsystem_pred(struct event_subsystem *system,
 			      struct filter_pred *pred)
 {
+	struct event_filter *filter = system->filter;
 	struct ftrace_event_call *call;
 
 	mutex_lock(&filter_mutex);
 
-	if (system->n_preds && !pred->compound)
+	if (filter && filter->n_preds && !pred->compound) {
 		__filter_free_subsystem_preds(system);
+		filter = NULL;
+	}
 
-	if (!system->n_preds) {
-		system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
+	if (!filter) {
+		system->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+		if (!system->filter) {
+			mutex_unlock(&filter_mutex);
+			return -ENOMEM;
+		}
+		filter = system->filter;
+		filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
 					GFP_KERNEL);
-		if (!system->preds) {
+
+		if (!filter->preds) {
+			kfree(system->filter);
+			system->filter = NULL;
 			mutex_unlock(&filter_mutex);
 			return -ENOMEM;
 		}
 	}
 
-	if (system->n_preds == MAX_FILTER_PRED) {
+	if (filter->n_preds == MAX_FILTER_PRED) {
 		mutex_unlock(&filter_mutex);
 		return -ENOSPC;
 	}
 
-	system->preds[system->n_preds] = pred;
-	system->n_preds++;
+	filter->preds[filter->n_preds] = pred;
+	filter->n_preds++;
 
 	list_for_each_entry(call, &ftrace_events, list) {
 		int err;
@@ -401,8 +426,8 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 
 		err = __filter_add_pred(call, pred);
 		if (err == -ENOMEM) {
-			system->preds[system->n_preds] = NULL;
-			system->n_preds--;
+			filter->preds[filter->n_preds] = NULL;
+			filter->n_preds--;
 			mutex_unlock(&filter_mutex);
 			return err;
 		}
-- 
cgit v1.2.3


From a118e4d1402f1349fe3d953493e4168a300a752d Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 28 Apr 2009 03:04:53 -0500
Subject: tracing/filters: distinguish between signed and unsigned fields

The new filter comparison ops need to be able to distinguish between
signed and unsigned field types, so add an is_signed flag/param to the
event field struct/trace_define_fields().  Also define a simple macro,
is_signed_type() to determine the signedness at compile time, used in the
trace macros.  If the is_signed_type() macro won't work with a specific
type, a new slightly modified version of TRACE_FIELD() called
TRACE_FIELD_SIGN(), allows the signedness to be set explicitly.

[ Impact: extend trace-filter code for new feature ]

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1240905893.6416.120.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h             |  1 +
 kernel/trace/trace_event_types.h |  4 ++--
 kernel/trace/trace_events.c      |  3 ++-
 kernel/trace/trace_export.c      | 29 ++++++++++++++++++++++-------
 4 files changed, 27 insertions(+), 10 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1fb7d6ccadf..866d0108fd2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -729,6 +729,7 @@ struct ftrace_event_field {
 	char			*type;
 	int			offset;
 	int			size;
+	int			is_signed;
 };
 
 struct event_filter {
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index cfcecc4fd86..5e32e375134 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -141,8 +141,8 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
 
 TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
 	TRACE_STRUCT(
-		TRACE_FIELD(ktime_t, state_data.stamp, stamp)
-		TRACE_FIELD(ktime_t, state_data.end, end)
+		TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
+		TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
 		TRACE_FIELD(int, state_data.type, type)
 		TRACE_FIELD(int, state_data.state, state)
 	),
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1cd1f37373d..bbbea747937 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -26,7 +26,7 @@ static DEFINE_MUTEX(event_mutex);
 LIST_HEAD(ftrace_events);
 
 int trace_define_field(struct ftrace_event_call *call, char *type,
-		       char *name, int offset, int size)
+		       char *name, int offset, int size, int is_signed)
 {
 	struct ftrace_event_field *field;
 
@@ -44,6 +44,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
 
 	field->offset = offset;
 	field->size = size;
+	field->is_signed = is_signed;
 	list_add(&field->link, &call->fields);
 
 	return 0;
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 0cb1a142c74..d06cf898dc8 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -50,6 +50,9 @@ extern void __bad_type_size(void);
 	if (!ret)							\
 		return 0;
 
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed)	\
+	TRACE_FIELD(type, item, assign)
 
 #undef TP_RAW_FMT
 #define TP_RAW_FMT(args...) args
@@ -98,6 +101,10 @@ ftrace_format_##call(struct trace_seq *s)				\
 #define TRACE_FIELD(type, item, assign)\
 	entry->item = assign;
 
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed)	\
+	TRACE_FIELD(type, item, assign)
+
 #undef TP_CMD
 #define TP_CMD(cmd...)	cmd
 
@@ -149,7 +156,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 #define TRACE_FIELD(type, item, assign)					\
 	ret = trace_define_field(event_call, #type, #item,		\
 				 offsetof(typeof(field), item),		\
-				 sizeof(field.item));			\
+				 sizeof(field.item), is_signed_type(type));	\
 	if (ret)							\
 		return ret;
 
@@ -157,7 +164,15 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 #define TRACE_FIELD_SPECIAL(type, item, len, cmd)			\
 	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
 				 offsetof(typeof(field), item),		\
-				 sizeof(field.item));			\
+				 sizeof(field.item), 0);		\
+	if (ret)							\
+		return ret;
+
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed)			\
+	ret = trace_define_field(event_call, #type, #item,		\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item), is_signed);	\
 	if (ret)							\
 		return ret;
 
@@ -173,11 +188,11 @@ ftrace_define_fields_##call(void)					\
 	struct args field;						\
 	int ret;							\
 									\
-	__common_field(unsigned char, type);				\
-	__common_field(unsigned char, flags);				\
-	__common_field(unsigned char, preempt_count);			\
-	__common_field(int, pid);					\
-	__common_field(int, tgid);					\
+	__common_field(unsigned char, type, 0);				\
+	__common_field(unsigned char, flags, 0);			\
+	__common_field(unsigned char, preempt_count, 0);		\
+	__common_field(int, pid, 1);					\
+	__common_field(int, tgid, 1);					\
 									\
 	tstruct;							\
 									\
-- 
cgit v1.2.3


From 8b3725621074040d380664964ffbc40610aef8c6 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 28 Apr 2009 03:04:59 -0500
Subject: tracing/filters: a better event parser

Replace the current event parser hack with a better one.  Filters are
no longer specified predicate by predicate, but all at once and can
use parens and any of the following operators:

numeric fields:

==, !=, <, <=, >, >=

string fields:

==, !=

predicates can be combined with the logical operators:

&&, ||

examples:

"common_preempt_count > 4" > filter

"((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter

If there was an error, the erroneous string along with an error
message can be seen by looking at the filter e.g.:

((sig >= 10 && sig < 15) || dsig == 17) && comm != bash
^
parse_error: Field not found

Currently the caret for an error always appears at the beginning of
the filter; a real position should be used, but the error message
should be useful even without it.

To clear a filter, '0' can be written to the filter file.

Filters can also be set or cleared for a complete subsystem by writing
the same filter as would be written to an individual event to the
filter file at the root of the subsytem.  Note however, that if any
event in the subsystem lacks a field specified in the filter being
set, the set will fail and all filters in the subsytem are
automatically cleared.  This change from the previous version was made
because using only the fields that happen to exist for a given event
would most likely result in a meaningless filter.

Because the logical operators are now implemented as predicates, the
maximum number of predicates in a filter was increased from 8 to 16.

[ Impact: add new, extended trace-filter implementation ]

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1240905899.6416.121.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h               |   66 ++-
 kernel/trace/trace_events.c        |   86 ++-
 kernel/trace/trace_events_filter.c | 1020 ++++++++++++++++++++++++++++--------
 3 files changed, 883 insertions(+), 289 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 866d0108fd2..7736fe8c1b7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -735,6 +735,7 @@ struct ftrace_event_field {
 struct event_filter {
 	int			n_preds;
 	struct filter_pred	**preds;
+	char			*filter_string;
 };
 
 struct event_subsystem {
@@ -746,7 +747,8 @@ struct event_subsystem {
 
 struct filter_pred;
 
-typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
+typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
+				 int val1, int val2);
 
 struct filter_pred {
 	filter_pred_fn_t fn;
@@ -756,23 +758,18 @@ struct filter_pred {
 	char *field_name;
 	int offset;
 	int not;
-	int or;
-	int compound;
-	int clear;
+	int op;
+	int pop_n;
 };
 
-extern void filter_free_pred(struct filter_pred *pred);
-extern void filter_print_preds(struct ftrace_event_call *call,
+extern void print_event_filter(struct ftrace_event_call *call,
 			       struct trace_seq *s);
-extern int filter_parse(char **pbuf, struct filter_pred *pred);
-extern int filter_add_pred(struct ftrace_event_call *call,
-			   struct filter_pred *pred);
-extern void filter_disable_preds(struct ftrace_event_call *call);
-extern void filter_free_subsystem_preds(struct event_subsystem *system);
-extern void filter_print_subsystem_preds(struct event_subsystem *system,
+extern int apply_event_filter(struct ftrace_event_call *call,
+			      char *filter_string);
+extern int apply_subsystem_event_filter(struct event_subsystem *system,
+					char *filter_string);
+extern void print_subsystem_event_filter(struct event_subsystem *system,
 					 struct trace_seq *s);
-extern int filter_add_subsystem_pred(struct event_subsystem *system,
-				     struct filter_pred *pred);
 
 static inline int
 filter_check_discard(struct ftrace_event_call *call, void *rec,
@@ -787,6 +784,47 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 	return 0;
 }
 
+#define DEFINE_COMPARISON_PRED(type)					\
+static int filter_pred_##type(struct filter_pred *pred, void *event,	\
+			      int val1, int val2)			\
+{									\
+	type *addr = (type *)(event + pred->offset);			\
+	type val = (type)pred->val;					\
+	int match = 0;							\
+									\
+	switch (pred->op) {						\
+	case OP_LT:							\
+		match = (*addr < val);					\
+		break;							\
+	case OP_LE:							\
+		match = (*addr <= val);					\
+		break;							\
+	case OP_GT:							\
+		match = (*addr > val);					\
+		break;							\
+	case OP_GE:							\
+		match = (*addr >= val);					\
+		break;							\
+	default:							\
+		break;							\
+	}								\
+									\
+	return match;							\
+}
+
+#define DEFINE_EQUALITY_PRED(size)					\
+static int filter_pred_##size(struct filter_pred *pred, void *event,	\
+			      int val1, int val2)			\
+{									\
+	u##size *addr = (u##size *)(event + pred->offset);		\
+	u##size val = (u##size)pred->val;				\
+	int match;							\
+									\
+	match = (val == *addr) ^ pred->not;				\
+									\
+	return match;							\
+}
+
 extern struct list_head ftrace_events;
 
 extern const char *__start___trace_bprintk_fmt[];
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index bbbea747937..f789ca540fe 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -492,7 +492,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	filter_print_preds(call, s);
+	print_event_filter(call, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
@@ -505,40 +505,26 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		   loff_t *ppos)
 {
 	struct ftrace_event_call *call = filp->private_data;
-	char buf[64], *pbuf = buf;
-	struct filter_pred *pred;
+	char *buf;
 	int err;
 
-	if (cnt >= sizeof(buf))
+	if (cnt >= PAGE_SIZE)
 		return -EINVAL;
 
-	if (copy_from_user(&buf, ubuf, cnt))
-		return -EFAULT;
-	buf[cnt] = '\0';
-
-	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
-	if (!pred)
+	buf = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!buf)
 		return -ENOMEM;
 
-	err = filter_parse(&pbuf, pred);
-	if (err < 0) {
-		filter_free_pred(pred);
-		return err;
-	}
-
-	if (pred->clear) {
-		filter_disable_preds(call);
-		filter_free_pred(pred);
-		return cnt;
+	if (copy_from_user(buf, ubuf, cnt)) {
+		free_page((unsigned long) buf);
+		return -EFAULT;
 	}
+	buf[cnt] = '\0';
 
-	err = filter_add_pred(call, pred);
-	if (err < 0) {
-		filter_free_pred(pred);
+	err = apply_event_filter(call, buf);
+	free_page((unsigned long) buf);
+	if (err < 0)
 		return err;
-	}
-
-	filter_free_pred(pred);
 
 	*ppos += cnt;
 
@@ -562,7 +548,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	filter_print_subsystem_preds(system, s);
+	print_subsystem_event_filter(system, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
@@ -575,38 +561,26 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		       loff_t *ppos)
 {
 	struct event_subsystem *system = filp->private_data;
-	char buf[64], *pbuf = buf;
-	struct filter_pred *pred;
+	char *buf;
 	int err;
 
-	if (cnt >= sizeof(buf))
+	if (cnt >= PAGE_SIZE)
 		return -EINVAL;
 
-	if (copy_from_user(&buf, ubuf, cnt))
-		return -EFAULT;
-	buf[cnt] = '\0';
-
-	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
-	if (!pred)
+	buf = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!buf)
 		return -ENOMEM;
 
-	err = filter_parse(&pbuf, pred);
-	if (err < 0) {
-		filter_free_pred(pred);
-		return err;
-	}
-
-	if (pred->clear) {
-		filter_free_subsystem_preds(system);
-		filter_free_pred(pred);
-		return cnt;
+	if (copy_from_user(buf, ubuf, cnt)) {
+		free_page((unsigned long) buf);
+		return -EFAULT;
 	}
+	buf[cnt] = '\0';
 
-	err = filter_add_subsystem_pred(system, pred);
-	if (err < 0) {
-		filter_free_pred(pred);
+	err = apply_subsystem_event_filter(system, buf);
+	free_page((unsigned long) buf);
+	if (err < 0)
 		return err;
-	}
 
 	*ppos += cnt;
 
@@ -760,11 +734,21 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 
 	system->filter = NULL;
 
+	system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
+	if (!system->filter) {
+		pr_warning("Could not allocate filter for subsystem "
+			   "'%s'\n", name);
+		return system->entry;
+	}
+
 	entry = debugfs_create_file("filter", 0644, system->entry, system,
 				    &ftrace_subsystem_filter_fops);
-	if (!entry)
+	if (!entry) {
+		kfree(system->filter);
+		system->filter = NULL;
 		pr_warning("Could not create debugfs "
 			   "'%s/filter' entry\n", name);
+	}
 
 	return system->entry;
 }
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 1e861eca3d0..f49486687ee 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -29,51 +29,130 @@
 
 static DEFINE_MUTEX(filter_mutex);
 
-static int filter_pred_64(struct filter_pred *pred, void *event)
+enum filter_op_ids
 {
-	u64 *addr = (u64 *)(event + pred->offset);
-	u64 val = (u64)pred->val;
-	int match;
-
-	match = (val == *addr) ^ pred->not;
-
-	return match;
-}
-
-static int filter_pred_32(struct filter_pred *pred, void *event)
-{
-	u32 *addr = (u32 *)(event + pred->offset);
-	u32 val = (u32)pred->val;
-	int match;
-
-	match = (val == *addr) ^ pred->not;
-
-	return match;
-}
-
-static int filter_pred_16(struct filter_pred *pred, void *event)
+	OP_OR,
+	OP_AND,
+	OP_NE,
+	OP_EQ,
+	OP_LT,
+	OP_LE,
+	OP_GT,
+	OP_GE,
+	OP_NONE,
+	OP_OPEN_PAREN,
+};
+
+struct filter_op {
+	int id;
+	char *string;
+	int precedence;
+};
+
+static struct filter_op filter_ops[] = {
+	{ OP_OR, "||", 1 },
+	{ OP_AND, "&&", 2 },
+	{ OP_NE, "!=", 4 },
+	{ OP_EQ, "==", 4 },
+	{ OP_LT, "<", 5 },
+	{ OP_LE, "<=", 5 },
+	{ OP_GT, ">", 5 },
+	{ OP_GE, ">=", 5 },
+	{ OP_NONE, "OP_NONE", 0 },
+	{ OP_OPEN_PAREN, "(", 0 },
+};
+
+enum {
+	FILT_ERR_NONE,
+	FILT_ERR_INVALID_OP,
+	FILT_ERR_UNBALANCED_PAREN,
+	FILT_ERR_TOO_MANY_OPERANDS,
+	FILT_ERR_OPERAND_TOO_LONG,
+	FILT_ERR_FIELD_NOT_FOUND,
+	FILT_ERR_ILLEGAL_FIELD_OP,
+	FILT_ERR_ILLEGAL_INTVAL,
+	FILT_ERR_BAD_SUBSYS_FILTER,
+	FILT_ERR_TOO_MANY_PREDS,
+	FILT_ERR_MISSING_FIELD,
+	FILT_ERR_INVALID_FILTER,
+};
+
+static char *err_text[] = {
+	"No error",
+	"Invalid operator",
+	"Unbalanced parens",
+	"Too many operands",
+	"Operand too long",
+	"Field not found",
+	"Illegal operation for field type",
+	"Illegal integer value",
+	"Couldn't find or set field in one of a subsystem's events",
+	"Too many terms in predicate expression",
+	"Missing field name and/or value",
+	"Meaningless filter expression",
+};
+
+struct opstack_op {
+	int op;
+	struct list_head list;
+};
+
+struct postfix_elt {
+	int op;
+	char *operand;
+	struct list_head list;
+};
+
+struct filter_parse_state {
+	struct filter_op *ops;
+	struct list_head opstack;
+	struct list_head postfix;
+	int lasterr;
+	int lasterr_pos;
+
+	struct {
+		char *string;
+		unsigned int cnt;
+		unsigned int tail;
+	} infix;
+
+	struct {
+		char string[MAX_FILTER_STR_VAL];
+		int pos;
+		unsigned int tail;
+	} operand;
+};
+
+DEFINE_COMPARISON_PRED(s64);
+DEFINE_COMPARISON_PRED(u64);
+DEFINE_COMPARISON_PRED(s32);
+DEFINE_COMPARISON_PRED(u32);
+DEFINE_COMPARISON_PRED(s16);
+DEFINE_COMPARISON_PRED(u16);
+DEFINE_COMPARISON_PRED(s8);
+DEFINE_COMPARISON_PRED(u8);
+
+DEFINE_EQUALITY_PRED(64);
+DEFINE_EQUALITY_PRED(32);
+DEFINE_EQUALITY_PRED(16);
+DEFINE_EQUALITY_PRED(8);
+
+static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
+			   void *event __attribute((unused)),
+			   int val1, int val2)
 {
-	u16 *addr = (u16 *)(event + pred->offset);
-	u16 val = (u16)pred->val;
-	int match;
-
-	match = (val == *addr) ^ pred->not;
-
-	return match;
+	return val1 && val2;
 }
 
-static int filter_pred_8(struct filter_pred *pred, void *event)
+static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
+			  void *event __attribute((unused)),
+			  int val1, int val2)
 {
-	u8 *addr = (u8 *)(event + pred->offset);
-	u8 val = (u8)pred->val;
-	int match;
-
-	match = (val == *addr) ^ pred->not;
-
-	return match;
+	return val1 || val2;
 }
 
-static int filter_pred_string(struct filter_pred *pred, void *event)
+static int filter_pred_string(struct filter_pred *pred, void *event,
+			      int val1, int val2)
 {
 	char *addr = (char *)(event + pred->offset);
 	int cmp, match;
@@ -85,7 +164,8 @@ static int filter_pred_string(struct filter_pred *pred, void *event)
 	return match;
 }
 
-static int filter_pred_none(struct filter_pred *pred, void *event)
+static int filter_pred_none(struct filter_pred *pred, void *event,
+			    int val1, int val2)
 {
 	return 0;
 }
@@ -94,66 +174,119 @@ static int filter_pred_none(struct filter_pred *pred, void *event)
 int filter_match_preds(struct ftrace_event_call *call, void *rec)
 {
 	struct event_filter *filter = call->filter;
-	int i, matched, and_failed = 0;
+	int match, top = 0, val1 = 0, val2 = 0;
+	int stack[MAX_FILTER_PRED];
 	struct filter_pred *pred;
+	int i;
 
 	for (i = 0; i < filter->n_preds; i++) {
 		pred = filter->preds[i];
-		if (and_failed && !pred->or)
+		if (!pred->pop_n) {
+			match = pred->fn(pred, rec, val1, val2);
+			stack[top++] = match;
 			continue;
-		matched = pred->fn(pred, rec);
-		if (!matched && !pred->or) {
-			and_failed = 1;
-			continue;
-		} else if (matched && pred->or)
-			return 1;
+		}
+		if (pred->pop_n > top) {
+			WARN_ON_ONCE(1);
+			return 0;
+		}
+		val1 = stack[--top];
+		val2 = stack[--top];
+		match = pred->fn(pred, rec, val1, val2);
+		stack[top++] = match;
 	}
 
-	if (and_failed)
-		return 0;
-
-	return 1;
+	return stack[--top];
 }
 EXPORT_SYMBOL_GPL(filter_match_preds);
 
-static void __filter_print_preds(struct event_filter *filter,
-				 struct trace_seq *s)
+static void parse_error(struct filter_parse_state *ps, int err, int pos)
 {
-	struct filter_pred *pred;
-	char *field_name;
-	int i;
+	ps->lasterr = err;
+	ps->lasterr_pos = pos;
+}
 
-	if (!filter || !filter->n_preds) {
-		trace_seq_printf(s, "none\n");
+static void remove_filter_string(struct event_filter *filter)
+{
+	kfree(filter->filter_string);
+	filter->filter_string = NULL;
+}
+
+static int replace_filter_string(struct event_filter *filter,
+				 char *filter_string)
+{
+	kfree(filter->filter_string);
+	filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
+	if (!filter->filter_string)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int append_filter_string(struct event_filter *filter,
+				char *string)
+{
+	int newlen;
+	char *new_filter_string;
+
+	BUG_ON(!filter->filter_string);
+	newlen = strlen(filter->filter_string) + strlen(string) + 1;
+	new_filter_string = kmalloc(newlen, GFP_KERNEL);
+	if (!new_filter_string)
+		return -ENOMEM;
+
+	strcpy(new_filter_string, filter->filter_string);
+	strcat(new_filter_string, string);
+	kfree(filter->filter_string);
+	filter->filter_string = new_filter_string;
+
+	return 0;
+}
+
+static void append_filter_err(struct filter_parse_state *ps,
+			      struct event_filter *filter)
+{
+	int pos = ps->lasterr_pos;
+	char *buf, *pbuf;
+
+	buf = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!buf)
 		return;
-	}
 
-	for (i = 0; i < filter->n_preds; i++) {
-		pred = filter->preds[i];
-		field_name = pred->field_name;
-		if (i)
-			trace_seq_printf(s, pred->or ? "|| " : "&& ");
-		trace_seq_printf(s, "%s ", field_name);
-		trace_seq_printf(s, pred->not ? "!= " : "== ");
-		if (pred->str_len)
-			trace_seq_printf(s, "%s\n", pred->str_val);
-		else
-			trace_seq_printf(s, "%llu\n", pred->val);
-	}
+	append_filter_string(filter, "\n");
+	memset(buf, ' ', PAGE_SIZE);
+	if (pos > PAGE_SIZE - 128)
+		pos = 0;
+	buf[pos] = '^';
+	pbuf = &buf[pos] + 1;
+
+	sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]);
+	append_filter_string(filter, buf);
+	free_page((unsigned long) buf);
 }
 
-void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s)
+void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
 {
+	struct event_filter *filter = call->filter;
+
 	mutex_lock(&filter_mutex);
-	__filter_print_preds(call->filter, s);
+	if (filter->filter_string)
+		trace_seq_printf(s, "%s\n", filter->filter_string);
+	else
+		trace_seq_printf(s, "none\n");
 	mutex_unlock(&filter_mutex);
 }
 
-void filter_print_subsystem_preds(struct event_subsystem *system,
+void print_subsystem_event_filter(struct event_subsystem *system,
 				  struct trace_seq *s)
 {
+	struct event_filter *filter = system->filter;
+
 	mutex_lock(&filter_mutex);
-	__filter_print_preds(system->filter, s);
+	if (filter->filter_string)
+		trace_seq_printf(s, "%s\n", filter->filter_string);
+	else
+		trace_seq_printf(s, "none\n");
 	mutex_unlock(&filter_mutex);
 }
 
@@ -170,7 +303,7 @@ find_event_field(struct ftrace_event_call *call, char *name)
 	return NULL;
 }
 
-void filter_free_pred(struct filter_pred *pred)
+static void filter_free_pred(struct filter_pred *pred)
 {
 	if (!pred)
 		return;
@@ -191,15 +324,17 @@ static int filter_set_pred(struct filter_pred *dest,
 			   filter_pred_fn_t fn)
 {
 	*dest = *src;
-	dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
-	if (!dest->field_name)
-		return -ENOMEM;
+	if (src->field_name) {
+		dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
+		if (!dest->field_name)
+			return -ENOMEM;
+	}
 	dest->fn = fn;
 
 	return 0;
 }
 
-static void __filter_disable_preds(struct ftrace_event_call *call)
+static void filter_disable_preds(struct ftrace_event_call *call)
 {
 	struct event_filter *filter = call->filter;
 	int i;
@@ -211,13 +346,6 @@ static void __filter_disable_preds(struct ftrace_event_call *call)
 		filter->preds[i]->fn = filter_pred_none;
 }
 
-void filter_disable_preds(struct ftrace_event_call *call)
-{
-	mutex_lock(&filter_mutex);
-	__filter_disable_preds(call);
-	mutex_unlock(&filter_mutex);
-}
-
 int init_preds(struct ftrace_event_call *call)
 {
 	struct event_filter *filter;
@@ -258,48 +386,43 @@ oom:
 }
 EXPORT_SYMBOL_GPL(init_preds);
 
-static void __filter_free_subsystem_preds(struct event_subsystem *system)
+static void filter_free_subsystem_preds(struct event_subsystem *system)
 {
 	struct event_filter *filter = system->filter;
 	struct ftrace_event_call *call;
 	int i;
 
-	if (filter && filter->n_preds) {
+	if (filter->n_preds) {
 		for (i = 0; i < filter->n_preds; i++)
 			filter_free_pred(filter->preds[i]);
 		kfree(filter->preds);
-		kfree(filter);
-		system->filter = NULL;
+		filter->preds = NULL;
+		filter->n_preds = 0;
 	}
 
 	list_for_each_entry(call, &ftrace_events, list) {
 		if (!call->define_fields)
 			continue;
 
-		if (!strcmp(call->system, system->name))
-			__filter_disable_preds(call);
+		if (!strcmp(call->system, system->name)) {
+			filter_disable_preds(call);
+			remove_filter_string(call->filter);
+		}
 	}
 }
 
-void filter_free_subsystem_preds(struct event_subsystem *system)
-{
-	mutex_lock(&filter_mutex);
-	__filter_free_subsystem_preds(system);
-	mutex_unlock(&filter_mutex);
-}
-
-static int filter_add_pred_fn(struct ftrace_event_call *call,
+static int filter_add_pred_fn(struct filter_parse_state *ps,
+			      struct ftrace_event_call *call,
 			      struct filter_pred *pred,
 			      filter_pred_fn_t fn)
 {
 	struct event_filter *filter = call->filter;
 	int idx, err;
 
-	if (filter->n_preds && !pred->compound)
-		__filter_disable_preds(call);
-
-	if (filter->n_preds == MAX_FILTER_PRED)
+	if (filter->n_preds == MAX_FILTER_PRED) {
+		parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
 		return -ENOSPC;
+	}
 
 	idx = filter->n_preds;
 	filter_clear_pred(filter->preds[idx]);
@@ -321,94 +444,132 @@ static int is_string_field(const char *type)
 	return 0;
 }
 
-static int __filter_add_pred(struct ftrace_event_call *call,
-			     struct filter_pred *pred)
+static int is_legal_op(struct ftrace_event_field *field, int op)
+{
+	if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE))
+		return 0;
+
+	return 1;
+}
+
+static filter_pred_fn_t select_comparison_fn(int op, int field_size,
+					     int field_is_signed)
+{
+	filter_pred_fn_t fn = NULL;
+
+	switch (field_size) {
+	case 8:
+		if (op == OP_EQ || op == OP_NE)
+			fn = filter_pred_64;
+		else if (field_is_signed)
+			fn = filter_pred_s64;
+		else
+			fn = filter_pred_u64;
+		break;
+	case 4:
+		if (op == OP_EQ || op == OP_NE)
+			fn = filter_pred_32;
+		else if (field_is_signed)
+			fn = filter_pred_s32;
+		else
+			fn = filter_pred_u32;
+		break;
+	case 2:
+		if (op == OP_EQ || op == OP_NE)
+			fn = filter_pred_16;
+		else if (field_is_signed)
+			fn = filter_pred_s16;
+		else
+			fn = filter_pred_u16;
+		break;
+	case 1:
+		if (op == OP_EQ || op == OP_NE)
+			fn = filter_pred_8;
+		else if (field_is_signed)
+			fn = filter_pred_s8;
+		else
+			fn = filter_pred_u8;
+		break;
+	}
+
+	return fn;
+}
+
+static int filter_add_pred(struct filter_parse_state *ps,
+			   struct ftrace_event_call *call,
+			   struct filter_pred *pred)
 {
 	struct ftrace_event_field *field;
 	filter_pred_fn_t fn;
 	unsigned long long val;
 
+	pred->fn = filter_pred_none;
+
+	if (pred->op == OP_AND) {
+		pred->pop_n = 2;
+		return filter_add_pred_fn(ps, call, pred, filter_pred_and);
+	} else if (pred->op == OP_OR) {
+		pred->pop_n = 2;
+		return filter_add_pred_fn(ps, call, pred, filter_pred_or);
+	}
+
 	field = find_event_field(call, pred->field_name);
-	if (!field)
+	if (!field) {
+		parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
 		return -EINVAL;
+	}
 
-	pred->fn = filter_pred_none;
 	pred->offset = field->offset;
 
+	if (!is_legal_op(field, pred->op)) {
+		parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0);
+		return -EINVAL;
+	}
+
 	if (is_string_field(field->type)) {
 		fn = filter_pred_string;
 		pred->str_len = field->size;
-		return filter_add_pred_fn(call, pred, fn);
+		if (pred->op == OP_NE)
+			pred->not = 1;
+		return filter_add_pred_fn(ps, call, pred, fn);
 	} else {
-		if (strict_strtoull(pred->str_val, 0, &val))
+		if (strict_strtoull(pred->str_val, 0, &val)) {
+			parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
 			return -EINVAL;
+		}
 		pred->val = val;
 	}
 
-	switch (field->size) {
-	case 8:
-		fn = filter_pred_64;
-		break;
-	case 4:
-		fn = filter_pred_32;
-		break;
-	case 2:
-		fn = filter_pred_16;
-		break;
-	case 1:
-		fn = filter_pred_8;
-		break;
-	default:
+	fn = select_comparison_fn(pred->op, field->size, field->is_signed);
+	if (!fn) {
+		parse_error(ps, FILT_ERR_INVALID_OP, 0);
 		return -EINVAL;
 	}
 
-	return filter_add_pred_fn(call, pred, fn);
-}
-
-int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
-{
-	int err;
-
-	mutex_lock(&filter_mutex);
-	err = __filter_add_pred(call, pred);
-	mutex_unlock(&filter_mutex);
+	if (pred->op == OP_NE)
+		pred->not = 1;
 
-	return err;
+	return filter_add_pred_fn(ps, call, pred, fn);
 }
 
-int filter_add_subsystem_pred(struct event_subsystem *system,
-			      struct filter_pred *pred)
+static int filter_add_subsystem_pred(struct filter_parse_state *ps,
+				     struct event_subsystem *system,
+				     struct filter_pred *pred,
+				     char *filter_string)
 {
 	struct event_filter *filter = system->filter;
 	struct ftrace_event_call *call;
 
-	mutex_lock(&filter_mutex);
-
-	if (filter && filter->n_preds && !pred->compound) {
-		__filter_free_subsystem_preds(system);
-		filter = NULL;
-	}
-
-	if (!filter) {
-		system->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
-		if (!system->filter) {
-			mutex_unlock(&filter_mutex);
-			return -ENOMEM;
-		}
-		filter = system->filter;
+	if (!filter->preds) {
 		filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
 					GFP_KERNEL);
 
-		if (!filter->preds) {
-			kfree(system->filter);
-			system->filter = NULL;
-			mutex_unlock(&filter_mutex);
+		if (!filter->preds)
 			return -ENOMEM;
-		}
 	}
 
 	if (filter->n_preds == MAX_FILTER_PRED) {
-		mutex_unlock(&filter_mutex);
+		parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
 		return -ENOSPC;
 	}
 
@@ -424,97 +585,508 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 		if (strcmp(call->system, system->name))
 			continue;
 
-		err = __filter_add_pred(call, pred);
-		if (err == -ENOMEM) {
-			filter->preds[filter->n_preds] = NULL;
-			filter->n_preds--;
-			mutex_unlock(&filter_mutex);
+		err = filter_add_pred(ps, call, pred);
+		if (err) {
+			filter_free_subsystem_preds(system);
+			parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
 			return err;
 		}
+		replace_filter_string(call->filter, filter_string);
 	}
 
-	mutex_unlock(&filter_mutex);
+	return 0;
+}
+
+static void parse_init(struct filter_parse_state *ps,
+		       struct filter_op *ops,
+		       char *infix_string)
+{
+	memset(ps, '\0', sizeof(*ps));
+
+	ps->infix.string = infix_string;
+	ps->infix.cnt = strlen(infix_string);
+	ps->ops = ops;
+
+	INIT_LIST_HEAD(&ps->opstack);
+	INIT_LIST_HEAD(&ps->postfix);
+}
+
+static char infix_next(struct filter_parse_state *ps)
+{
+	ps->infix.cnt--;
+
+	return ps->infix.string[ps->infix.tail++];
+}
+
+static char infix_peek(struct filter_parse_state *ps)
+{
+	if (ps->infix.tail == strlen(ps->infix.string))
+		return 0;
+
+	return ps->infix.string[ps->infix.tail];
+}
+
+static void infix_advance(struct filter_parse_state *ps)
+{
+	ps->infix.cnt--;
+	ps->infix.tail++;
+}
+
+static inline int is_precedence_lower(struct filter_parse_state *ps,
+				      int a, int b)
+{
+	return ps->ops[a].precedence < ps->ops[b].precedence;
+}
+
+static inline int is_op_char(struct filter_parse_state *ps, char c)
+{
+	int i;
+
+	for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+		if (ps->ops[i].string[0] == c)
+			return 1;
+	}
 
 	return 0;
 }
 
-/*
- * The filter format can be
- *   - 0, which means remove all filter preds
- *   - [||/&&] <field> ==/!= <val>
- */
-int filter_parse(char **pbuf, struct filter_pred *pred)
-{
-	char *tok, *val_str = NULL;
-	int tok_n = 0;
-
-	while ((tok = strsep(pbuf, " \n"))) {
-		if (tok_n == 0) {
-			if (!strcmp(tok, "0")) {
-				pred->clear = 1;
-				return 0;
-			} else if (!strcmp(tok, "&&")) {
-				pred->or = 0;
-				pred->compound = 1;
-			} else if (!strcmp(tok, "||")) {
-				pred->or = 1;
-				pred->compound = 1;
-			} else
-				pred->field_name = tok;
-			tok_n = 1;
-			continue;
+static int infix_get_op(struct filter_parse_state *ps, char firstc)
+{
+	char nextc = infix_peek(ps);
+	char opstr[3];
+	int i;
+
+	opstr[0] = firstc;
+	opstr[1] = nextc;
+	opstr[2] = '\0';
+
+	for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+		if (!strcmp(opstr, ps->ops[i].string)) {
+			infix_advance(ps);
+			return ps->ops[i].id;
 		}
-		if (tok_n == 1) {
-			if (!pred->field_name)
-				pred->field_name = tok;
-			else if (!strcmp(tok, "!="))
-				pred->not = 1;
-			else if (!strcmp(tok, "=="))
-				pred->not = 0;
-			else {
-				pred->field_name = NULL;
+	}
+
+	opstr[1] = '\0';
+
+	for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+		if (!strcmp(opstr, ps->ops[i].string))
+			return ps->ops[i].id;
+	}
+
+	return OP_NONE;
+}
+
+static inline void clear_operand_string(struct filter_parse_state *ps)
+{
+	memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL);
+	ps->operand.tail = 0;
+}
+
+static inline int append_operand_char(struct filter_parse_state *ps, char c)
+{
+	if (ps->operand.tail == MAX_FILTER_STR_VAL)
+		return -EINVAL;
+
+	ps->operand.string[ps->operand.tail++] = c;
+
+	return 0;
+}
+
+static int filter_opstack_push(struct filter_parse_state *ps, int op)
+{
+	struct opstack_op *opstack_op;
+
+	opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL);
+	if (!opstack_op)
+		return -ENOMEM;
+
+	opstack_op->op = op;
+	list_add(&opstack_op->list, &ps->opstack);
+
+	return 0;
+}
+
+static int filter_opstack_empty(struct filter_parse_state *ps)
+{
+	return list_empty(&ps->opstack);
+}
+
+static int filter_opstack_top(struct filter_parse_state *ps)
+{
+	struct opstack_op *opstack_op;
+
+	if (filter_opstack_empty(ps))
+		return OP_NONE;
+
+	opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
+
+	return opstack_op->op;
+}
+
+static int filter_opstack_pop(struct filter_parse_state *ps)
+{
+	struct opstack_op *opstack_op;
+	int op;
+
+	if (filter_opstack_empty(ps))
+		return OP_NONE;
+
+	opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
+	op = opstack_op->op;
+	list_del(&opstack_op->list);
+
+	kfree(opstack_op);
+
+	return op;
+}
+
+static void filter_opstack_clear(struct filter_parse_state *ps)
+{
+	while (!filter_opstack_empty(ps))
+		filter_opstack_pop(ps);
+}
+
+static char *curr_operand(struct filter_parse_state *ps)
+{
+	return ps->operand.string;
+}
+
+static int postfix_append_operand(struct filter_parse_state *ps, char *operand)
+{
+	struct postfix_elt *elt;
+
+	elt = kmalloc(sizeof(*elt), GFP_KERNEL);
+	if (!elt)
+		return -ENOMEM;
+
+	elt->op = OP_NONE;
+	elt->operand = kstrdup(operand, GFP_KERNEL);
+	if (!elt->operand) {
+		kfree(elt);
+		return -ENOMEM;
+	}
+
+	list_add_tail(&elt->list, &ps->postfix);
+
+	return 0;
+}
+
+static int postfix_append_op(struct filter_parse_state *ps, int op)
+{
+	struct postfix_elt *elt;
+
+	elt = kmalloc(sizeof(*elt), GFP_KERNEL);
+	if (!elt)
+		return -ENOMEM;
+
+	elt->op = op;
+	elt->operand = NULL;
+
+	list_add_tail(&elt->list, &ps->postfix);
+
+	return 0;
+}
+
+static void postfix_clear(struct filter_parse_state *ps)
+{
+	struct postfix_elt *elt;
+
+	while (!list_empty(&ps->postfix)) {
+		elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
+		kfree(elt->operand);
+		list_del(&elt->list);
+	}
+}
+
+static int filter_parse(struct filter_parse_state *ps)
+{
+	int op, top_op;
+	char ch;
+
+	while ((ch = infix_next(ps))) {
+		if (isspace(ch))
+			continue;
+
+		if (is_op_char(ps, ch)) {
+			op = infix_get_op(ps, ch);
+			if (op == OP_NONE) {
+				parse_error(ps, FILT_ERR_INVALID_OP, 0);
 				return -EINVAL;
 			}
-			tok_n = 2;
+
+			if (strlen(curr_operand(ps))) {
+				postfix_append_operand(ps, curr_operand(ps));
+				clear_operand_string(ps);
+			}
+
+			while (!filter_opstack_empty(ps)) {
+				top_op = filter_opstack_top(ps);
+				if (!is_precedence_lower(ps, top_op, op)) {
+					top_op = filter_opstack_pop(ps);
+					postfix_append_op(ps, top_op);
+					continue;
+				}
+				break;
+			}
+
+			filter_opstack_push(ps, op);
 			continue;
 		}
-		if (tok_n == 2) {
-			if (pred->compound) {
-				if (!strcmp(tok, "!="))
-					pred->not = 1;
-				else if (!strcmp(tok, "=="))
-					pred->not = 0;
-				else {
-					pred->field_name = NULL;
-					return -EINVAL;
-				}
-			} else {
-				val_str = tok;
-				break; /* done */
+
+		if (ch == '(') {
+			filter_opstack_push(ps, OP_OPEN_PAREN);
+			continue;
+		}
+
+		if (ch == ')') {
+			if (strlen(curr_operand(ps))) {
+				postfix_append_operand(ps, curr_operand(ps));
+				clear_operand_string(ps);
+			}
+
+			top_op = filter_opstack_pop(ps);
+			while (top_op != OP_NONE) {
+				if (top_op == OP_OPEN_PAREN)
+					break;
+				postfix_append_op(ps, top_op);
+				top_op = filter_opstack_pop(ps);
+			}
+			if (top_op == OP_NONE) {
+				parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
+				return -EINVAL;
 			}
-			tok_n = 3;
 			continue;
 		}
-		if (tok_n == 3) {
-			val_str = tok;
-			break; /* done */
+		if (append_operand_char(ps, ch)) {
+			parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0);
+			return -EINVAL;
+		}
+	}
+
+	if (strlen(curr_operand(ps)))
+		postfix_append_operand(ps, curr_operand(ps));
+
+	while (!filter_opstack_empty(ps)) {
+		top_op = filter_opstack_pop(ps);
+		if (top_op == OP_NONE)
+			break;
+		if (top_op == OP_OPEN_PAREN) {
+			parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
+			return -EINVAL;
+		}
+		postfix_append_op(ps, top_op);
+	}
+
+	return 0;
+}
+
+static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
+{
+	struct filter_pred *pred;
+
+	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+	if (!pred)
+		return NULL;
+
+	pred->field_name = kstrdup(operand1, GFP_KERNEL);
+	if (!pred->field_name) {
+		kfree(pred);
+		return NULL;
+	}
+
+	strcpy(pred->str_val, operand2);
+	pred->str_len = strlen(operand2);
+
+	pred->op = op;
+
+	return pred;
+}
+
+static struct filter_pred *create_logical_pred(int op)
+{
+	struct filter_pred *pred;
+
+	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+	if (!pred)
+		return NULL;
+
+	pred->op = op;
+
+	return pred;
+}
+
+static int check_preds(struct filter_parse_state *ps)
+{
+	int n_normal_preds = 0, n_logical_preds = 0;
+	struct postfix_elt *elt;
+
+	list_for_each_entry(elt, &ps->postfix, list) {
+		if (elt->op == OP_NONE)
+			continue;
+
+		if (elt->op == OP_AND || elt->op == OP_OR) {
+			n_logical_preds++;
+			continue;
 		}
+		n_normal_preds++;
 	}
 
-	if (!val_str || !strlen(val_str)
-	    || strlen(val_str) >= MAX_FILTER_STR_VAL) {
-		pred->field_name = NULL;
+	if (!n_normal_preds || n_logical_preds >= n_normal_preds) {
+		parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
 		return -EINVAL;
 	}
 
-	strcpy(pred->str_val, val_str);
-	pred->str_len = strlen(val_str);
+	return 0;
+}
 
-	pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
-	if (!pred->field_name)
-		return -ENOMEM;
+static int replace_preds(struct event_subsystem *system,
+			 struct ftrace_event_call *call,
+			 struct filter_parse_state *ps,
+			 char *filter_string)
+{
+	char *operand1 = NULL, *operand2 = NULL;
+	struct filter_pred *pred;
+	struct postfix_elt *elt;
+	int err;
+
+	err = check_preds(ps);
+	if (err)
+		return err;
+
+	list_for_each_entry(elt, &ps->postfix, list) {
+		if (elt->op == OP_NONE) {
+			if (!operand1)
+				operand1 = elt->operand;
+			else if (!operand2)
+				operand2 = elt->operand;
+			else {
+				parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
+				return -EINVAL;
+			}
+			continue;
+		}
+
+		if (elt->op == OP_AND || elt->op == OP_OR) {
+			pred = create_logical_pred(elt->op);
+			if (call) {
+				err = filter_add_pred(ps, call, pred);
+				filter_free_pred(pred);
+			} else
+				err = filter_add_subsystem_pred(ps, system,
+							pred, filter_string);
+			if (err)
+				return err;
+
+			operand1 = operand2 = NULL;
+			continue;
+		}
+
+		if (!operand1 || !operand2) {
+			parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
+			return -EINVAL;
+		}
+
+		pred = create_pred(elt->op, operand1, operand2);
+		if (call) {
+			err = filter_add_pred(ps, call, pred);
+			filter_free_pred(pred);
+		} else
+			err = filter_add_subsystem_pred(ps, system, pred,
+							filter_string);
+		if (err)
+			return err;
+
+		operand1 = operand2 = NULL;
+	}
 
 	return 0;
 }
 
+int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
+{
+	int err;
+
+	struct filter_parse_state *ps;
+
+	mutex_lock(&filter_mutex);
+
+	if (!strcmp(strstrip(filter_string), "0")) {
+		filter_disable_preds(call);
+		remove_filter_string(call->filter);
+		mutex_unlock(&filter_mutex);
+		return 0;
+	}
+
+	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+	if (!ps)
+		return -ENOMEM;
+
+	filter_disable_preds(call);
+	replace_filter_string(call->filter, filter_string);
+
+	parse_init(ps, filter_ops, filter_string);
+	err = filter_parse(ps);
+	if (err) {
+		append_filter_err(ps, call->filter);
+		goto out;
+	}
+
+	err = replace_preds(NULL, call, ps, filter_string);
+	if (err)
+		append_filter_err(ps, call->filter);
+
+out:
+	filter_opstack_clear(ps);
+	postfix_clear(ps);
+	kfree(ps);
+
+	mutex_unlock(&filter_mutex);
+
+	return err;
+}
+
+int apply_subsystem_event_filter(struct event_subsystem *system,
+				 char *filter_string)
+{
+	int err;
+
+	struct filter_parse_state *ps;
+
+	mutex_lock(&filter_mutex);
+
+	if (!strcmp(strstrip(filter_string), "0")) {
+		filter_free_subsystem_preds(system);
+		remove_filter_string(system->filter);
+		mutex_unlock(&filter_mutex);
+		return 0;
+	}
+
+	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+	if (!ps)
+		return -ENOMEM;
+
+	filter_free_subsystem_preds(system);
+	replace_filter_string(system->filter, filter_string);
+
+	parse_init(ps, filter_ops, filter_string);
+	err = filter_parse(ps);
+	if (err) {
+		append_filter_err(ps, system->filter);
+		goto out;
+	}
+
+	err = replace_preds(system, NULL, ps, filter_string);
+	if (err)
+		append_filter_err(ps, system->filter);
+
+out:
+	filter_opstack_clear(ps);
+	postfix_clear(ps);
+	kfree(ps);
+
+	mutex_unlock(&filter_mutex);
+
+	return err;
+}
 
-- 
cgit v1.2.3


From d6ce96dabe2c4409fd009ec14250a1fdbab4b133 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 5 May 2009 01:15:24 -0400
Subject: ring-buffer: export symbols

I'm adding a module to do a series of tests on the ring buffer as well
as benchmarks. This module needs to have more of the ring buffer API
exported. There's nothing wrong with reading the ring buffer from a
module.

[ Impact: allow modules to read pages from the ring buffer ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f4cc59040eb..3e86da9b2a0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2802,6 +2802,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
 
 	return bpage;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
 
 /**
  * ring_buffer_free_read_page - free an allocated read page
@@ -2814,6 +2815,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
 {
 	free_page((unsigned long)data);
 }
+EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
 
 /**
  * ring_buffer_read_page - extract a page from the ring buffer
@@ -2959,6 +2961,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
  out:
 	return ret;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_read_page);
 
 static ssize_t
 rb_simple_read(struct file *filp, char __user *ubuf,
-- 
cgit v1.2.3


From f0d2c681ac0a85142fc8abe65fc33fcad35cb9b7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 29 Apr 2009 13:43:37 -0400
Subject: ring-buffer: add counters for commit overrun and nmi dropped entries

The WARN_ON in the ring buffer when a commit is preempted and the
buffer is filled by preceding writes can happen in normal operations.
The WARN_ON makes it look like a bug, not to mention, because
it does not stop tracing and calls printk which can also recurse, this
is prone to deadlock (the WARN_ON is not in a position to recurse).

This patch removes the WARN_ON and replaces it with a counter that
can be retrieved by a tracer. This counter is called commit_overrun.

While at it, I added a nmi_dropped counter to count any time an NMI entry
is dropped because the NMI could not take the spinlock.

[ Impact: prevent deadlock by printing normal case warning ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 52 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3e86da9b2a0..26e1359fe19 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -402,6 +402,8 @@ struct ring_buffer_per_cpu {
 	struct buffer_page		*tail_page;	/* write to tail */
 	struct buffer_page		*commit_page;	/* committed pages */
 	struct buffer_page		*reader_page;
+	unsigned long			nmi_dropped;
+	unsigned long			commit_overrun;
 	unsigned long			overrun;
 	unsigned long			entries;
 	u64				write_stamp;
@@ -1216,8 +1218,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		 * simply fail.
 		 */
 		if (unlikely(in_nmi())) {
-			if (!__raw_spin_trylock(&cpu_buffer->lock))
+			if (!__raw_spin_trylock(&cpu_buffer->lock)) {
+				cpu_buffer->nmi_dropped++;
 				goto out_reset;
+			}
 		} else
 			__raw_spin_lock(&cpu_buffer->lock);
 
@@ -1238,8 +1242,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		 * about it.
 		 */
 		if (unlikely(next_page == commit_page)) {
-			/* This can easily happen on small ring buffers */
-			WARN_ON_ONCE(buffer->pages > 2);
+			cpu_buffer->commit_overrun++;
 			goto out_reset;
 		}
 
@@ -1925,6 +1928,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
 
+/**
+ * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long ret;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return 0;
+
+	cpu_buffer = buffer->buffers[cpu];
+	ret = cpu_buffer->nmi_dropped;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
+
+/**
+ * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long
+ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long ret;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return 0;
+
+	cpu_buffer = buffer->buffers[cpu];
+	ret = cpu_buffer->commit_overrun;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
+
 /**
  * ring_buffer_entries - get the number of entries in a buffer
  * @buffer: The ring buffer
@@ -2595,6 +2639,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	local_set(&cpu_buffer->reader_page->page->commit, 0);
 	cpu_buffer->reader_page->read = 0;
 
+	cpu_buffer->nmi_dropped = 0;
+	cpu_buffer->commit_overrun = 0;
 	cpu_buffer->overrun = 0;
 	cpu_buffer->entries = 0;
 
-- 
cgit v1.2.3


From c8d771835e18c938dae8690611d65fe98ad30f58 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 29 Apr 2009 18:03:45 -0400
Subject: tracing: export stats of ring buffers to userspace

This patch adds stats to the ftrace ring buffers:

 # cat /debugfs/tracing/per_cpu/cpu0/stats
 entries: 42360
 overrun: 30509326
 commit overrun: 0
 nmi dropped: 0

Where entries are the total number of data entries in the buffer.

overrun is the number of entries not consumed and were overwritten by
the writer.

commit overrun is the number of entries dropped due to nested writers
wrapping the buffer before the initial writer finished the commit.

nmi dropped is the number of entries dropped due to the ring buffer
lock being held when an nmi was going to write to the ring buffer.
Note, this field will be meaningless and will go away when the ring
buffer becomes lockless.

[ Impact: let userspace know what is happening in the ring buffers ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f5427e0fc98..74df029056b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3595,6 +3595,45 @@ static const struct file_operations tracing_buffers_fops = {
 	.llseek		= no_llseek,
 };
 
+static ssize_t
+tracing_stats_read(struct file *filp, char __user *ubuf,
+		   size_t count, loff_t *ppos)
+{
+	unsigned long cpu = (unsigned long)filp->private_data;
+	struct trace_array *tr = &global_trace;
+	struct trace_seq *s;
+	unsigned long cnt;
+
+	s = kmalloc(sizeof(*s), GFP_ATOMIC);
+	if (!s)
+		return ENOMEM;
+
+	trace_seq_init(s);
+
+	cnt = ring_buffer_entries_cpu(tr->buffer, cpu);
+	trace_seq_printf(s, "entries: %ld\n", cnt);
+
+	cnt = ring_buffer_overrun_cpu(tr->buffer, cpu);
+	trace_seq_printf(s, "overrun: %ld\n", cnt);
+
+	cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
+	trace_seq_printf(s, "commit overrun: %ld\n", cnt);
+
+	cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
+	trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
+
+	count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
+
+	kfree(s);
+
+	return count;
+}
+
+static const struct file_operations tracing_stats_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_stats_read,
+};
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 int __weak ftrace_arch_read_dyn_info(char *buf, int size)
@@ -3708,6 +3747,9 @@ static void tracing_init_debugfs_percpu(long cpu)
 
 	trace_create_file("trace_pipe_raw", 0444, d_cpu,
 			(void *) cpu, &tracing_buffers_fops);
+
+	trace_create_file("stats", 0444, d_cpu,
+			(void *) cpu, &tracing_stats_fops);
 }
 
 #ifdef CONFIG_FTRACE_SELFTEST
-- 
cgit v1.2.3


From e4906eff9e6fbd2d311abcbcc53d5a531773c982 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 30 Apr 2009 20:49:44 -0400
Subject: ring-buffer: convert cpu buffer entries to local_t

The entries counter in cpu buffer is not atomic. It can be updated by
other interrupts or from another CPU (readers).

But making entries into "atomic_t" causes an atomic operation that can
hurt performance. Instead we convert it to a local_t that will increment
a counter with a local CPU atomic operation (if the arch supports it).

Instead of fighting with readers and overwrites that decrement the counter,
I added a "read" counter. Every time a reader reads an entry it is
incremented.

We already have a overrun counter and with that, the entries counter and
the read counter, we can calculate the total number of entries in the
buffer with:

  (entries - overrun) - read

As long as the total number of entries in the ring buffer is less than
the word size, this will work. But since the entries counter was previously
a long, this is no different than what we had before.

Thanks to Andrew Morton for pointing out in the first version that
atomic_t does not replace unsigned long. I switched to atomic_long_t
even though it is signed. A negative count is most likely a bug.

[ Impact: keep accurate count of cpu buffer entries ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 26e1359fe19..c792ea893b0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -405,7 +405,8 @@ struct ring_buffer_per_cpu {
 	unsigned long			nmi_dropped;
 	unsigned long			commit_overrun;
 	unsigned long			overrun;
-	unsigned long			entries;
+	unsigned long			read;
+	local_t				entries;
 	u64				write_stamp;
 	u64				read_stamp;
 	atomic_t			record_disabled;
@@ -997,7 +998,6 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
 		if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 			continue;
 		cpu_buffer->overrun++;
-		cpu_buffer->entries--;
 	}
 }
 
@@ -1588,7 +1588,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
 static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
 		      struct ring_buffer_event *event)
 {
-	cpu_buffer->entries++;
+	local_inc(&cpu_buffer->entries);
 
 	/* Only process further if we own the commit */
 	if (!rb_is_commit(cpu_buffer, event))
@@ -1722,7 +1722,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 	 * The commit is still visible by the reader, so we
 	 * must increment entries.
 	 */
-	cpu_buffer->entries++;
+	local_inc(&cpu_buffer->entries);
  out:
 	/*
 	 * If a write came in and pushed the tail page
@@ -1902,7 +1902,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
 		return 0;
 
 	cpu_buffer = buffer->buffers[cpu];
-	ret = cpu_buffer->entries;
+	ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun)
+		- cpu_buffer->read;
 
 	return ret;
 }
@@ -1985,7 +1986,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
 	/* if you care about this being correct, lock the buffer */
 	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
-		entries += cpu_buffer->entries;
+		entries += (local_read(&cpu_buffer->entries) -
+			    cpu_buffer->overrun) - cpu_buffer->read;
 	}
 
 	return entries;
@@ -2225,7 +2227,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
 
 	if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX
 			|| rb_discarded_event(event))
-		cpu_buffer->entries--;
+		cpu_buffer->read++;
 
 	rb_update_read_stamp(cpu_buffer, event);
 
@@ -2642,7 +2644,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->nmi_dropped = 0;
 	cpu_buffer->commit_overrun = 0;
 	cpu_buffer->overrun = 0;
-	cpu_buffer->entries = 0;
+	cpu_buffer->read = 0;
+	local_set(&cpu_buffer->entries, 0);
 
 	cpu_buffer->write_stamp = 0;
 	cpu_buffer->read_stamp = 0;
@@ -2813,7 +2816,7 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
 		/* Only count data entries */
 		if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 			continue;
-		cpu_buffer->entries--;
+		cpu_buffer->read++;
 	}
 	__raw_spin_unlock(&cpu_buffer->lock);
 }
-- 
cgit v1.2.3


From 778c55d44eb4f5f658915ed631d68ed9d1ac3ad1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 1 May 2009 18:44:45 -0400
Subject: ring-buffer: record page entries in buffer page descriptor

Currently, when the ring buffer writer overflows the buffer and must
write over non consumed data, we increment the overrun counter by
reading the entries on the page we are about to overwrite. This reads
the entries one by one.

This is not very effecient. This patch adds another entry counter
into each buffer page descriptor that keeps track of the number of
entries on the page. Now on overwrite, the overrun counter simply
needs to add the number of entries that is on the page it is about
to overwrite.

[ Impact: speed up of ring buffer in overwrite mode ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 39 +++++++++++++--------------------------
 1 file changed, 13 insertions(+), 26 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c792ea893b0..342eacc4baa 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -321,9 +321,10 @@ struct buffer_data_page {
 };
 
 struct buffer_page {
+	struct list_head list;		/* list of buffer pages */
 	local_t		 write;		/* index for next write */
 	unsigned	 read;		/* index for next read */
-	struct list_head list;		/* list of free pages */
+	local_t		 entries;	/* entries on this page */
 	struct buffer_data_page *page;	/* Actual data page */
 };
 
@@ -977,30 +978,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
 	return rb_page_commit(cpu_buffer->head_page);
 }
 
-/*
- * When the tail hits the head and the buffer is in overwrite mode,
- * the head jumps to the next page and all content on the previous
- * page is discarded. But before doing so, we update the overrun
- * variable of the buffer.
- */
-static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
-{
-	struct ring_buffer_event *event;
-	unsigned long head;
-
-	for (head = 0; head < rb_head_size(cpu_buffer);
-	     head += rb_event_length(event)) {
-
-		event = __rb_page_index(cpu_buffer->head_page, head);
-		if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
-			return;
-		/* Only count data entries */
-		if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
-			continue;
-		cpu_buffer->overrun++;
-	}
-}
-
 static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
 			       struct buffer_page **bpage)
 {
@@ -1253,7 +1230,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 			/* tail_page has not moved yet? */
 			if (tail_page == cpu_buffer->tail_page) {
 				/* count overflows */
-				rb_update_overflow(cpu_buffer);
+				cpu_buffer->overrun +=
+					local_read(&head_page->entries);
 
 				rb_inc_page(cpu_buffer, &head_page);
 				cpu_buffer->head_page = head_page;
@@ -1268,6 +1246,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		 */
 		if (tail_page == cpu_buffer->tail_page) {
 			local_set(&next_page->write, 0);
+			local_set(&next_page->entries, 0);
 			local_set(&next_page->page->commit, 0);
 			cpu_buffer->tail_page = next_page;
 
@@ -1313,6 +1292,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	event = __rb_page_index(tail_page, tail);
 	rb_update_event(event, type, length);
 
+	/* The passed in type is zero for DATA */
+	if (likely(!type))
+		local_inc(&tail_page->entries);
+
 	/*
 	 * If this is a commit and the tail is zero, then update
 	 * this page's time stamp.
@@ -2183,6 +2166,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->reader_page->list.prev = reader->list.prev;
 
 	local_set(&cpu_buffer->reader_page->write, 0);
+	local_set(&cpu_buffer->reader_page->entries, 0);
 	local_set(&cpu_buffer->reader_page->page->commit, 0);
 
 	/* Make the reader page now replace the head */
@@ -2629,6 +2613,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->head_page
 		= list_entry(cpu_buffer->pages.next, struct buffer_page, list);
 	local_set(&cpu_buffer->head_page->write, 0);
+	local_set(&cpu_buffer->head_page->entries, 0);
 	local_set(&cpu_buffer->head_page->page->commit, 0);
 
 	cpu_buffer->head_page->read = 0;
@@ -2638,6 +2623,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 
 	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
 	local_set(&cpu_buffer->reader_page->write, 0);
+	local_set(&cpu_buffer->reader_page->entries, 0);
 	local_set(&cpu_buffer->reader_page->page->commit, 0);
 	cpu_buffer->reader_page->read = 0;
 
@@ -2996,6 +2982,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		bpage = reader->page;
 		reader->page = *data_page;
 		local_set(&reader->write, 0);
+		local_set(&reader->entries, 0);
 		reader->read = 0;
 		*data_page = bpage;
 
-- 
cgit v1.2.3


From afbab76a62b69ea6197e19727d4b8a8aef8deb25 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 1 May 2009 19:40:05 -0400
Subject: ring-buffer: have read page swap increment counter with page entries

In the swap page ring buffer code that is used by the ftrace splice code,
we scan the page to increment the counter of entries read.

With the number of entries already in the page we simply need to add it.

[ Impact: speed up reading page from ring buffer ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 28 +++-------------------------
 1 file changed, 3 insertions(+), 25 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 342eacc4baa..9e42a742a3f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2785,28 +2785,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
 
-static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
-			      struct buffer_data_page *bpage,
-			      unsigned int offset)
-{
-	struct ring_buffer_event *event;
-	unsigned long head;
-
-	__raw_spin_lock(&cpu_buffer->lock);
-	for (head = offset; head < local_read(&bpage->commit);
-	     head += rb_event_length(event)) {
-
-		event = __rb_data_page_index(bpage, head);
-		if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
-			return;
-		/* Only count data entries */
-		if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
-			continue;
-		cpu_buffer->read++;
-	}
-	__raw_spin_unlock(&cpu_buffer->lock);
-}
-
 /**
  * ring_buffer_alloc_read_page - allocate a page to read from buffer
  * @buffer: the buffer to allocate for.
@@ -2977,6 +2955,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		/* we copied everything to the beginning */
 		read = 0;
 	} else {
+		/* update the entry counter */
+		cpu_buffer->read += local_read(&reader->entries);
+
 		/* swap the pages */
 		rb_init_page(bpage);
 		bpage = reader->page;
@@ -2985,9 +2966,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		local_set(&reader->entries, 0);
 		reader->read = 0;
 		*data_page = bpage;
-
-		/* update the entry counter */
-		rb_remove_entries(cpu_buffer, bpage, read);
 	}
 	ret = read;
 
-- 
cgit v1.2.3


From 41ede23eded40832c955d98d4b71bc244809abb3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 1 May 2009 20:26:54 -0400
Subject: ring-buffer: disable writers when resetting buffers

As a precaution, it is best to disable writing to the ring buffers
when reseting them.

[ Impact: prevent weird things if write happens during reset ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9e42a742a3f..7876df00695 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2650,6 +2650,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return;
 
+	atomic_inc(&cpu_buffer->record_disabled);
+
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
 	__raw_spin_lock(&cpu_buffer->lock);
@@ -2659,6 +2661,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 	__raw_spin_unlock(&cpu_buffer->lock);
 
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	atomic_dec(&cpu_buffer->record_disabled);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
 
-- 
cgit v1.2.3


From 94487d6d53af5acae10cf9fd52f74498994d46b1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 5 May 2009 19:22:53 -0400
Subject: tracing: use proper export symbol for tracing api

When adding the EXPORT_SYMBOL to some of the tracing API, I accidently
used EXPORT_SYMBOL instead of EXPORT_SYMBOL_GPL. This patch fixes
that mistake.

[ Impact: export the tracing code only for GPL modules ]

Reported-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 74df029056b..4164a344e72 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -887,21 +887,21 @@ trace_current_buffer_lock_reserve(int type, unsigned long len,
 	return trace_buffer_lock_reserve(&global_trace,
 					 type, len, flags, pc);
 }
-EXPORT_SYMBOL(trace_current_buffer_lock_reserve);
+EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
 
 void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
 {
 	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
 }
-EXPORT_SYMBOL(trace_current_buffer_unlock_commit);
+EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
 
 void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
 {
 	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
 }
-EXPORT_SYMBOL(trace_nowake_buffer_unlock_commit);
+EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
 
 void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
 {
-- 
cgit v1.2.3


From aa20ae8444fc6c318272c643f856d8d8ad3e198d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 5 May 2009 21:16:11 -0400
Subject: ring-buffer: move big if statement down

In the hot path of the ring buffer "__rb_reserve_next" there's a big
if statement that does not even return back to the work flow.

	code;

	if (cross to next page) {

		[ lots of code ]

		return;
	}

	more code;

The condition is even the unlikely path, although we do not denote it
with an unlikely because gcc is fine with it. The condition is true when
the write crosses a page boundary, and we need to start at a new page.

Having this if statement makes it hard to read, but calling another
function to do the work is also not appropriate, because we are using a lot
of variables that were set before the if statement, and we do not want to
send them as parameters.

This patch changes it to a goto:

	code;

	if (cross to next page)
		goto next_page;

	more code;

	return;

next_page:

	[ lots of code]

This makes the code easier to understand, and a bit more obvious.

The output from gcc is practically identical. For some reason, gcc decided
to use different registers when I switched it to a goto. But other than that,
the logic is the same.

[ Impact: easier to read code ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 218 +++++++++++++++++++++++----------------------
 1 file changed, 111 insertions(+), 107 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7876df00695..424129eb20a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1159,6 +1159,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		  unsigned type, unsigned long length, u64 *ts)
 {
 	struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
+	struct buffer_page *next_page;
 	unsigned long tail, write;
 	struct ring_buffer *buffer = cpu_buffer->buffer;
 	struct ring_buffer_event *event;
@@ -1173,137 +1174,140 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	tail = write - length;
 
 	/* See if we shot pass the end of this buffer page */
-	if (write > BUF_PAGE_SIZE) {
-		struct buffer_page *next_page = tail_page;
+	if (write > BUF_PAGE_SIZE)
+		goto next_page;
 
-		local_irq_save(flags);
-		/*
-		 * Since the write to the buffer is still not
-		 * fully lockless, we must be careful with NMIs.
-		 * The locks in the writers are taken when a write
-		 * crosses to a new page. The locks protect against
-		 * races with the readers (this will soon be fixed
-		 * with a lockless solution).
-		 *
-		 * Because we can not protect against NMIs, and we
-		 * want to keep traces reentrant, we need to manage
-		 * what happens when we are in an NMI.
-		 *
-		 * NMIs can happen after we take the lock.
-		 * If we are in an NMI, only take the lock
-		 * if it is not already taken. Otherwise
-		 * simply fail.
-		 */
-		if (unlikely(in_nmi())) {
-			if (!__raw_spin_trylock(&cpu_buffer->lock)) {
-				cpu_buffer->nmi_dropped++;
-				goto out_reset;
-			}
-		} else
-			__raw_spin_lock(&cpu_buffer->lock);
-
-		lock_taken = true;
+	/* We reserved something on the buffer */
 
-		rb_inc_page(cpu_buffer, &next_page);
+	if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
+		return NULL;
 
-		head_page = cpu_buffer->head_page;
-		reader_page = cpu_buffer->reader_page;
+	event = __rb_page_index(tail_page, tail);
+	rb_update_event(event, type, length);
 
-		/* we grabbed the lock before incrementing */
-		if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
-			goto out_reset;
+	/* The passed in type is zero for DATA */
+	if (likely(!type))
+		local_inc(&tail_page->entries);
 
-		/*
-		 * If for some reason, we had an interrupt storm that made
-		 * it all the way around the buffer, bail, and warn
-		 * about it.
-		 */
-		if (unlikely(next_page == commit_page)) {
-			cpu_buffer->commit_overrun++;
-			goto out_reset;
-		}
+	/*
+	 * If this is a commit and the tail is zero, then update
+	 * this page's time stamp.
+	 */
+	if (!tail && rb_is_commit(cpu_buffer, event))
+		cpu_buffer->commit_page->page->time_stamp = *ts;
 
-		if (next_page == head_page) {
-			if (!(buffer->flags & RB_FL_OVERWRITE))
-				goto out_reset;
+	return event;
 
-			/* tail_page has not moved yet? */
-			if (tail_page == cpu_buffer->tail_page) {
-				/* count overflows */
-				cpu_buffer->overrun +=
-					local_read(&head_page->entries);
+ next_page:
 
-				rb_inc_page(cpu_buffer, &head_page);
-				cpu_buffer->head_page = head_page;
-				cpu_buffer->head_page->read = 0;
-			}
-		}
+	next_page = tail_page;
 
-		/*
-		 * If the tail page is still the same as what we think
-		 * it is, then it is up to us to update the tail
-		 * pointer.
-		 */
-		if (tail_page == cpu_buffer->tail_page) {
-			local_set(&next_page->write, 0);
-			local_set(&next_page->entries, 0);
-			local_set(&next_page->page->commit, 0);
-			cpu_buffer->tail_page = next_page;
-
-			/* reread the time stamp */
-			*ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu);
-			cpu_buffer->tail_page->page->time_stamp = *ts;
+	local_irq_save(flags);
+	/*
+	 * Since the write to the buffer is still not
+	 * fully lockless, we must be careful with NMIs.
+	 * The locks in the writers are taken when a write
+	 * crosses to a new page. The locks protect against
+	 * races with the readers (this will soon be fixed
+	 * with a lockless solution).
+	 *
+	 * Because we can not protect against NMIs, and we
+	 * want to keep traces reentrant, we need to manage
+	 * what happens when we are in an NMI.
+	 *
+	 * NMIs can happen after we take the lock.
+	 * If we are in an NMI, only take the lock
+	 * if it is not already taken. Otherwise
+	 * simply fail.
+	 */
+	if (unlikely(in_nmi())) {
+		if (!__raw_spin_trylock(&cpu_buffer->lock)) {
+			cpu_buffer->nmi_dropped++;
+			goto out_reset;
 		}
+	} else
+		__raw_spin_lock(&cpu_buffer->lock);
 
-		/*
-		 * The actual tail page has moved forward.
-		 */
-		if (tail < BUF_PAGE_SIZE) {
-			/* Mark the rest of the page with padding */
-			event = __rb_page_index(tail_page, tail);
-			rb_event_set_padding(event);
-		}
+	lock_taken = true;
 
-		if (tail <= BUF_PAGE_SIZE)
-			/* Set the write back to the previous setting */
-			local_set(&tail_page->write, tail);
+	rb_inc_page(cpu_buffer, &next_page);
 
-		/*
-		 * If this was a commit entry that failed,
-		 * increment that too
-		 */
-		if (tail_page == cpu_buffer->commit_page &&
-		    tail == rb_commit_index(cpu_buffer)) {
-			rb_set_commit_to_write(cpu_buffer);
-		}
+	head_page = cpu_buffer->head_page;
+	reader_page = cpu_buffer->reader_page;
 
-		__raw_spin_unlock(&cpu_buffer->lock);
-		local_irq_restore(flags);
+	/* we grabbed the lock before incrementing */
+	if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
+		goto out_reset;
 
-		/* fail and let the caller try again */
-		return ERR_PTR(-EAGAIN);
+	/*
+	 * If for some reason, we had an interrupt storm that made
+	 * it all the way around the buffer, bail, and warn
+	 * about it.
+	 */
+	if (unlikely(next_page == commit_page)) {
+		cpu_buffer->commit_overrun++;
+		goto out_reset;
 	}
 
-	/* We reserved something on the buffer */
+	if (next_page == head_page) {
+		if (!(buffer->flags & RB_FL_OVERWRITE))
+			goto out_reset;
 
-	if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
-		return NULL;
+		/* tail_page has not moved yet? */
+		if (tail_page == cpu_buffer->tail_page) {
+			/* count overflows */
+			cpu_buffer->overrun +=
+				local_read(&head_page->entries);
 
-	event = __rb_page_index(tail_page, tail);
-	rb_update_event(event, type, length);
+			rb_inc_page(cpu_buffer, &head_page);
+			cpu_buffer->head_page = head_page;
+			cpu_buffer->head_page->read = 0;
+		}
+	}
 
-	/* The passed in type is zero for DATA */
-	if (likely(!type))
-		local_inc(&tail_page->entries);
+	/*
+	 * If the tail page is still the same as what we think
+	 * it is, then it is up to us to update the tail
+	 * pointer.
+	 */
+	if (tail_page == cpu_buffer->tail_page) {
+		local_set(&next_page->write, 0);
+		local_set(&next_page->entries, 0);
+		local_set(&next_page->page->commit, 0);
+		cpu_buffer->tail_page = next_page;
+
+		/* reread the time stamp */
+		*ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu);
+		cpu_buffer->tail_page->page->time_stamp = *ts;
+	}
 
 	/*
-	 * If this is a commit and the tail is zero, then update
-	 * this page's time stamp.
+	 * The actual tail page has moved forward.
 	 */
-	if (!tail && rb_is_commit(cpu_buffer, event))
-		cpu_buffer->commit_page->page->time_stamp = *ts;
+	if (tail < BUF_PAGE_SIZE) {
+		/* Mark the rest of the page with padding */
+		event = __rb_page_index(tail_page, tail);
+		rb_event_set_padding(event);
+	}
 
-	return event;
+	if (tail <= BUF_PAGE_SIZE)
+		/* Set the write back to the previous setting */
+		local_set(&tail_page->write, tail);
+
+	/*
+	 * If this was a commit entry that failed,
+	 * increment that too
+	 */
+	if (tail_page == cpu_buffer->commit_page &&
+	    tail == rb_commit_index(cpu_buffer)) {
+		rb_set_commit_to_write(cpu_buffer);
+	}
+
+	__raw_spin_unlock(&cpu_buffer->lock);
+	local_irq_restore(flags);
+
+	/* fail and let the caller try again */
+	return ERR_PTR(-EAGAIN);
 
  out_reset:
 	/* reset write */
-- 
cgit v1.2.3


From 5092dbc96f3acdac5433b27c06860352dc6d23b9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 5 May 2009 22:47:18 -0400
Subject: ring-buffer: add benchmark and tester

This patch adds code that can benchmark the ring buffer as well as
test it. This code can be compiled into the kernel (not recommended)
or as a module.

A separate ring buffer is used to not interfer with other users, like
ftrace. It creates a producer and a consumer (option to disable creation
of the consumer) and will run for 10 seconds, then sleep for 10 seconds
and then repeat.

While running, the producer will write 10 byte loads into the ring
buffer with just putting in the current CPU number. The reader will
continually try to read the buffer. The reader will alternate from reading
the buffer via event by event, or by full pages.

The output is a pr_info, thus it will fill up the syslogs.

  Starting ring buffer hammer
  End ring buffer hammer
  Time:     9000349 (usecs)
  Overruns: 12578640
  Read:     5358440  (by events)
  Entries:  0
  Total:    17937080
  Missed:   0
  Hit:      17937080
  Entries per millisec: 1993
  501 ns per entry
  Sleeping for 10 secs
  Starting ring buffer hammer
  End ring buffer hammer
  Time:     9936350 (usecs)
  Overruns: 0
  Read:     28146644  (by pages)
  Entries:  74
  Total:    28146718
  Missed:   0
  Hit:      28146718
  Entries per millisec: 2832
  353 ns per entry
  Sleeping for 10 secs

Time:      is the time the test ran
Overruns:  the number of events that were overwritten and not read
Read:      the number of events read (either by pages or events)
Entries:   the number of entries left in the buffer
                 (the by pages will only read full pages)
Total:     Entries + Read + Overruns
Missed:    the number of entries that failed to write
Hit:       the number of entries that were written

The above example shows that it takes ~353 nanosecs per entry when
there is a reader, reading by pages (and no overruns)

The event by event reader slowed the producer down to 501 nanosecs.

[ Impact: see how changes to the ring buffer affect stability and performance ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig                 |  16 ++
 kernel/trace/Makefile                |   1 +
 kernel/trace/ring_buffer_benchmark.c | 379 +++++++++++++++++++++++++++++++++++
 3 files changed, 396 insertions(+)
 create mode 100644 kernel/trace/ring_buffer_benchmark.c

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 450d3c2cfbd..50f62a296e1 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -471,6 +471,22 @@ config MMIOTRACE_TEST
 
 	  Say N, unless you absolutely know what you are doing.
 
+config RING_BUFFER_BENCHMARK
+	tristate "Ring buffer benchmark stress tester"
+	depends on RING_BUFFER
+	help
+	  This option creates a test to stress the ring buffer and bench mark it.
+	  It creates its own ring buffer such that it will not interfer with
+	  any other users of the ring buffer (such as ftrace). It then creates
+	  a producer and consumer that will run for 10 seconds and sleep for
+	  10 seconds. Each interval it will print out the number of events
+	  it recorded and give a rough estimate of how long each iteration took.
+
+	  It does not disable interrupts or raise its priority, so it may be
+	  affected by processes that are running.
+
+	  If unsure, say N
+
 endif # FTRACE
 
 endif # TRACING_SUPPORT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index fb9d7f96489..7c34cbfff96 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -17,6 +17,7 @@ endif
 
 obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
+obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
 
 obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_TRACING) += trace_clock.o
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
new file mode 100644
index 00000000000..747244acb8f
--- /dev/null
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -0,0 +1,379 @@
+/*
+ * ring buffer tester and benchmark
+ *
+ * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/completion.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/time.h>
+
+struct rb_page {
+	u64		ts;
+	local_t		commit;
+	char		data[4080];
+};
+
+/* run time and sleep time in seconds */
+#define RUN_TIME	10
+#define SLEEP_TIME	10
+
+/* number of events for writer to wake up the reader */
+static int wakeup_interval = 100;
+
+static int reader_finish;
+static struct completion read_start;
+static struct completion read_done;
+
+static struct ring_buffer *buffer;
+static struct task_struct *producer;
+static struct task_struct *consumer;
+static unsigned long read;
+
+static int disable_reader;
+module_param(disable_reader, uint, 0644);
+MODULE_PARM_DESC(disable_reader, "only run producer");
+
+static int read_events;
+
+static int kill_test;
+
+#define KILL_TEST()				\
+	do {					\
+		if (!kill_test) {		\
+			kill_test = 1;		\
+			WARN_ON(1);		\
+		}				\
+	} while (0)
+
+enum event_status {
+	EVENT_FOUND,
+	EVENT_DROPPED,
+};
+
+static enum event_status read_event(int cpu)
+{
+	struct ring_buffer_event *event;
+	int *entry;
+	u64 ts;
+
+	event = ring_buffer_consume(buffer, cpu, &ts);
+	if (!event)
+		return EVENT_DROPPED;
+
+	entry = ring_buffer_event_data(event);
+	if (*entry != cpu) {
+		KILL_TEST();
+		return EVENT_DROPPED;
+	}
+
+	read++;
+	return EVENT_FOUND;
+}
+
+static enum event_status read_page(int cpu)
+{
+	struct ring_buffer_event *event;
+	struct rb_page *rpage;
+	unsigned long commit;
+	void *bpage;
+	int *entry;
+	int ret;
+	int inc;
+	int i;
+
+	bpage = ring_buffer_alloc_read_page(buffer);
+	ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
+	if (ret >= 0) {
+		rpage = bpage;
+		commit = local_read(&rpage->commit);
+		for (i = 0; i < commit && !kill_test; i += inc) {
+
+			if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
+				KILL_TEST();
+				break;
+			}
+
+			inc = -1;
+			event = (void *)&rpage->data[i];
+			switch (event->type_len) {
+			case RINGBUF_TYPE_PADDING:
+				/* We don't expect any padding */
+				KILL_TEST();
+				break;
+			case RINGBUF_TYPE_TIME_EXTEND:
+				inc = 8;
+				break;
+			case 0:
+				entry = ring_buffer_event_data(event);
+				if (*entry != cpu) {
+					KILL_TEST();
+					break;
+				}
+				read++;
+				if (!event->array[0]) {
+					KILL_TEST();
+					break;
+				}
+				inc = event->array[0];
+				break;
+			default:
+				entry = ring_buffer_event_data(event);
+				if (*entry != cpu) {
+					KILL_TEST();
+					break;
+				}
+				read++;
+				inc = ((event->type_len + 1) * 4);
+			}
+			if (kill_test)
+				break;
+
+			if (inc <= 0) {
+				KILL_TEST();
+				break;
+			}
+		}
+	}
+	ring_buffer_free_read_page(buffer, bpage);
+
+	if (ret < 0)
+		return EVENT_DROPPED;
+	return EVENT_FOUND;
+}
+
+static void ring_buffer_consumer(void)
+{
+	/* toggle between reading pages and events */
+	read_events ^= 1;
+
+	read = 0;
+	while (!reader_finish && !kill_test) {
+		int found;
+
+		do {
+			int cpu;
+
+			found = 0;
+			for_each_online_cpu(cpu) {
+				enum event_status stat;
+
+				if (read_events)
+					stat = read_event(cpu);
+				else
+					stat = read_page(cpu);
+
+				if (kill_test)
+					break;
+				if (stat == EVENT_FOUND)
+					found = 1;
+			}
+		} while (found && !kill_test);
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (reader_finish)
+			break;
+
+		schedule();
+		__set_current_state(TASK_RUNNING);
+	}
+	reader_finish = 0;
+	complete(&read_done);
+}
+
+static void ring_buffer_producer(void)
+{
+	struct timeval start_tv;
+	struct timeval end_tv;
+	unsigned long long time;
+	unsigned long long entries;
+	unsigned long long overruns;
+	unsigned long missed = 0;
+	unsigned long hit = 0;
+	unsigned long avg;
+	int cnt = 0;
+
+	/*
+	 * Hammer the buffer for 10 secs (this may
+	 * make the system stall)
+	 */
+	pr_info("Starting ring buffer hammer\n");
+	do_gettimeofday(&start_tv);
+	do {
+		struct ring_buffer_event *event;
+		int *entry;
+
+		event = ring_buffer_lock_reserve(buffer, 10);
+		if (!event) {
+			missed++;
+		} else {
+			hit++;
+			entry = ring_buffer_event_data(event);
+			*entry = smp_processor_id();
+			ring_buffer_unlock_commit(buffer, event);
+		}
+		do_gettimeofday(&end_tv);
+
+		if (consumer && !(++cnt % wakeup_interval))
+			wake_up_process(consumer);
+
+	} while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
+	pr_info("End ring buffer hammer\n");
+
+	if (consumer) {
+		/* Init both completions here to avoid races */
+		init_completion(&read_start);
+		init_completion(&read_done);
+		/* the completions must be visible before the finish var */
+		smp_wmb();
+		reader_finish = 1;
+		/* finish var visible before waking up the consumer */
+		smp_wmb();
+		wake_up_process(consumer);
+		wait_for_completion(&read_done);
+	}
+
+	time = end_tv.tv_sec - start_tv.tv_sec;
+	time *= 1000000;
+	time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
+
+	entries = ring_buffer_entries(buffer);
+	overruns = ring_buffer_overruns(buffer);
+
+	if (kill_test)
+		pr_info("ERROR!\n");
+	pr_info("Time:     %lld (usecs)\n", time);
+	pr_info("Overruns: %lld\n", overruns);
+	if (disable_reader)
+		pr_info("Read:     (reader disabled)\n");
+	else
+		pr_info("Read:     %ld  (by %s)\n", read,
+			read_events ? "events" : "pages");
+	pr_info("Entries:  %lld\n", entries);
+	pr_info("Total:    %lld\n", entries + overruns + read);
+	pr_info("Missed:   %ld\n", missed);
+	pr_info("Hit:      %ld\n", hit);
+
+	do_div(time, 1000);
+	if (time)
+		hit /= (long)time;
+	else
+		pr_info("TIME IS ZERO??\n");
+
+	pr_info("Entries per millisec: %ld\n", hit);
+
+	if (hit) {
+		avg = 1000000 / hit;
+		pr_info("%ld ns per entry\n", avg);
+	}
+}
+
+static void wait_to_die(void)
+{
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+}
+
+static int ring_buffer_consumer_thread(void *arg)
+{
+	while (!kthread_should_stop() && !kill_test) {
+		complete(&read_start);
+
+		ring_buffer_consumer();
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop() || kill_test)
+			break;
+
+		schedule();
+		__set_current_state(TASK_RUNNING);
+	}
+	__set_current_state(TASK_RUNNING);
+
+	if (kill_test)
+		wait_to_die();
+
+	return 0;
+}
+
+static int ring_buffer_producer_thread(void *arg)
+{
+	init_completion(&read_start);
+
+	while (!kthread_should_stop() && !kill_test) {
+		ring_buffer_reset(buffer);
+
+		if (consumer) {
+			smp_wmb();
+			wake_up_process(consumer);
+			wait_for_completion(&read_start);
+		}
+
+		ring_buffer_producer();
+
+		pr_info("Sleeping for 10 secs\n");
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(HZ * SLEEP_TIME);
+		__set_current_state(TASK_RUNNING);
+	}
+
+	if (kill_test)
+		wait_to_die();
+
+	return 0;
+}
+
+static int __init ring_buffer_benchmark_init(void)
+{
+	int ret;
+
+	/* make a one meg buffer in overwite mode */
+	buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE);
+	if (!buffer)
+		return -ENOMEM;
+
+	if (!disable_reader) {
+		consumer = kthread_create(ring_buffer_consumer_thread,
+					  NULL, "rb_consumer");
+		ret = PTR_ERR(consumer);
+		if (IS_ERR(consumer))
+			goto out_fail;
+	}
+
+	producer = kthread_run(ring_buffer_producer_thread,
+			       NULL, "rb_producer");
+	ret = PTR_ERR(producer);
+
+	if (IS_ERR(producer))
+		goto out_kill;
+
+	return 0;
+
+ out_kill:
+	if (consumer)
+		kthread_stop(consumer);
+
+ out_fail:
+	ring_buffer_free(buffer);
+	return ret;
+}
+
+static void __exit ring_buffer_benchmark_exit(void)
+{
+	kthread_stop(producer);
+	if (consumer)
+		kthread_stop(consumer);
+	ring_buffer_free(buffer);
+}
+
+module_init(ring_buffer_benchmark_init);
+module_exit(ring_buffer_benchmark_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("ring_buffer_benchmark");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 2df75e415709ad12862028916c772c1f377f6a7c Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 6 May 2009 10:33:04 +0800
Subject: tracing/events: fix memory leak when unloading module

When unloading a module, memory allocated by init_preds() and
trace_define_field() is not freed.

[ Impact: fix memory leak ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4A00F6E0.3040503@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c        | 18 ++++++++++++++++++
 kernel/trace/trace_events_filter.c | 22 +++++++++++++++-------
 2 files changed, 33 insertions(+), 7 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f789ca540fe..f251a150e75 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -60,6 +60,22 @@ err:
 }
 EXPORT_SYMBOL_GPL(trace_define_field);
 
+#ifdef CONFIG_MODULES
+
+static void trace_destroy_fields(struct ftrace_event_call *call)
+{
+	struct ftrace_event_field *field, *next;
+
+	list_for_each_entry_safe(field, next, &call->fields, link) {
+		list_del(&field->link);
+		kfree(field->type);
+		kfree(field->name);
+		kfree(field);
+	}
+}
+
+#endif /* CONFIG_MODULES */
+
 static void ftrace_clear_events(void)
 {
 	struct ftrace_event_call *call;
@@ -925,6 +941,8 @@ static void trace_module_remove_events(struct module *mod)
 				unregister_ftrace_event(call->event);
 			debugfs_remove_recursive(call->dir);
 			list_del(&call->list);
+			trace_destroy_fields(call);
+			destroy_preds(call);
 		}
 	}
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f49486687ee..ce07b818671 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -346,6 +346,20 @@ static void filter_disable_preds(struct ftrace_event_call *call)
 		filter->preds[i]->fn = filter_pred_none;
 }
 
+void destroy_preds(struct ftrace_event_call *call)
+{
+	struct event_filter *filter = call->filter;
+	int i;
+
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		if (filter->preds[i])
+			filter_free_pred(filter->preds[i]);
+	}
+	kfree(filter->preds);
+	kfree(filter);
+	call->filter = NULL;
+}
+
 int init_preds(struct ftrace_event_call *call)
 {
 	struct event_filter *filter;
@@ -374,13 +388,7 @@ int init_preds(struct ftrace_event_call *call)
 	return 0;
 
 oom:
-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (filter->preds[i])
-			filter_free_pred(filter->preds[i]);
-	}
-	kfree(filter->preds);
-	kfree(call->filter);
-	call->filter = NULL;
+	destroy_preds(call);
 
 	return -ENOMEM;
 }
-- 
cgit v1.2.3


From 20c8928abe70e204bd077ab6cfe23002d7788983 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 6 May 2009 10:33:45 +0800
Subject: tracing/events: fix concurrent access to ftrace_events list

A module will add/remove its trace events when it gets loaded/unloaded, so
the ftrace_events list is not "const", and concurrent access needs to be
protected.

This patch thus fixes races between loading/unloding modules and read
'available_events' or read/write 'set_event', etc.

Below shows how to reproduce the race:

 # for ((; ;)) { cat /mnt/tracing/available_events; } > /dev/null &
 # for ((; ;)) { insmod trace-events-sample.ko; rmmod sample; } &

After a while:

BUG: unable to handle kernel paging request at 0010011c
IP: [<c1080f27>] t_next+0x1b/0x2d
...
Call Trace:
 [<c10c90e6>] ? seq_read+0x217/0x30d
 [<c10c8ecf>] ? seq_read+0x0/0x30d
 [<c10b4c19>] ? vfs_read+0x8f/0x136
 [<c10b4fc3>] ? sys_read+0x40/0x65
 [<c1002a68>] ? sysenter_do_call+0x12/0x36

[ Impact: fix races when concurrent accessing ftrace_events list ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4A00F709.3080800@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h               |  1 +
 kernel/trace/trace_event_profile.c | 19 ++++++++++++++-----
 kernel/trace/trace_events.c        | 20 +++++++++++---------
 kernel/trace/trace_events_filter.c | 10 +++++++---
 4 files changed, 33 insertions(+), 17 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7736fe8c1b7..777c6c3a0cd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -825,6 +825,7 @@ static int filter_pred_##size(struct filter_pred *pred, void *event,	\
 	return match;							\
 }
 
+extern struct mutex event_mutex;
 extern struct list_head ftrace_events;
 
 extern const char *__start___trace_bprintk_fmt[];
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 7bf2ad65eee..5b5895afecf 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -10,21 +10,30 @@
 int ftrace_profile_enable(int event_id)
 {
 	struct ftrace_event_call *event;
+	int ret = -EINVAL;
 
+	mutex_lock(&event_mutex);
 	list_for_each_entry(event, &ftrace_events, list) {
-		if (event->id == event_id)
-			return event->profile_enable(event);
+		if (event->id == event_id) {
+			ret = event->profile_enable(event);
+			break;
+		}
 	}
+	mutex_unlock(&event_mutex);
 
-	return -EINVAL;
+	return ret;
 }
 
 void ftrace_profile_disable(int event_id)
 {
 	struct ftrace_event_call *event;
 
+	mutex_lock(&event_mutex);
 	list_for_each_entry(event, &ftrace_events, list) {
-		if (event->id == event_id)
-			return event->profile_disable(event);
+		if (event->id == event_id) {
+			event->profile_disable(event);
+			break;
+		}
 	}
+	mutex_unlock(&event_mutex);
 }
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f251a150e75..8d579ff2361 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -21,7 +21,7 @@
 
 #define TRACE_SYSTEM "TRACE_SYSTEM"
 
-static DEFINE_MUTEX(event_mutex);
+DEFINE_MUTEX(event_mutex);
 
 LIST_HEAD(ftrace_events);
 
@@ -80,6 +80,7 @@ static void ftrace_clear_events(void)
 {
 	struct ftrace_event_call *call;
 
+	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
 
 		if (call->enabled) {
@@ -87,6 +88,7 @@ static void ftrace_clear_events(void)
 			call->unregfunc();
 		}
 	}
+	mutex_unlock(&event_mutex);
 }
 
 static void ftrace_event_enable_disable(struct ftrace_event_call *call,
@@ -274,6 +276,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void *t_start(struct seq_file *m, loff_t *pos)
 {
+	mutex_lock(&event_mutex);
+	if (*pos == 0)
+		m->private = ftrace_events.next;
 	return t_next(m, NULL, pos);
 }
 
@@ -303,6 +308,9 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void *s_start(struct seq_file *m, loff_t *pos)
 {
+	mutex_lock(&event_mutex);
+	if (*pos == 0)
+		m->private = ftrace_events.next;
 	return s_next(m, NULL, pos);
 }
 
@@ -319,12 +327,12 @@ static int t_show(struct seq_file *m, void *v)
 
 static void t_stop(struct seq_file *m, void *p)
 {
+	mutex_unlock(&event_mutex);
 }
 
 static int
 ftrace_event_seq_open(struct inode *inode, struct file *file)
 {
-	int ret;
 	const struct seq_operations *seq_ops;
 
 	if ((file->f_mode & FMODE_WRITE) &&
@@ -332,13 +340,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
 		ftrace_clear_events();
 
 	seq_ops = inode->i_private;
-	ret = seq_open(file, seq_ops);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-
-		m->private = ftrace_events.next;
-	}
-	return ret;
+	return seq_open(file, seq_ops);
 }
 
 static ssize_t
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index ce07b818671..7ac69108527 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -408,6 +408,7 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
 		filter->n_preds = 0;
 	}
 
+	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
 		if (!call->define_fields)
 			continue;
@@ -417,6 +418,7 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
 			remove_filter_string(call->filter);
 		}
 	}
+	mutex_unlock(&event_mutex);
 }
 
 static int filter_add_pred_fn(struct filter_parse_state *ps,
@@ -567,6 +569,7 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
 {
 	struct event_filter *filter = system->filter;
 	struct ftrace_event_call *call;
+	int err = 0;
 
 	if (!filter->preds) {
 		filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
@@ -584,8 +587,8 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
 	filter->preds[filter->n_preds] = pred;
 	filter->n_preds++;
 
+	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
-		int err;
 
 		if (!call->define_fields)
 			continue;
@@ -597,12 +600,13 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
 		if (err) {
 			filter_free_subsystem_preds(system);
 			parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
-			return err;
+			break;
 		}
 		replace_filter_string(call->filter, filter_string);
 	}
+	mutex_unlock(&event_mutex);
 
-	return 0;
+	return err;
 }
 
 static void parse_init(struct filter_parse_state *ps,
-- 
cgit v1.2.3


From a42aaa3bbce85ac487ad4fad5db99e8e91b7aac1 Mon Sep 17 00:00:00 2001
From: "Alan D. Brunelle" <Alan.Brunelle@hp.com>
Date: Mon, 4 May 2009 16:27:26 -0400
Subject: blktrace: correct remap names

This attempts to clarify names utilized during block I/O remap
operations (partition, volume manager). It correctly matches up the
/from/ information for both device & sector. This takes in the concept
from Kosaki Motohiro and extends it to include better naming for the
"device_from" field.

[ Impact: cleanup ]

Signed-off-by: Alan D. Brunelle <alan.brunelle@hp.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <49FF4FAE.3000301@hp.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c32062bd10b..f8d46d6f5d3 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -830,8 +830,8 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
  * @q:		queue the io is for
  * @bio:	the source bio
  * @dev:	target device
- * @from:	source sector
  * @to:		target sector
+ * @from:	source sector
  *
  * Description:
  *     Device mapper or raid target sometimes need to split a bio because
@@ -839,7 +839,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
  *
  **/
 static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-				       dev_t dev, sector_t from, sector_t to)
+				       dev_t dev, sector_t to, sector_t from)
 {
 	struct blk_trace *bt = q->blk_trace;
 	struct blk_io_trace_remap r;
@@ -847,9 +847,9 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
 	if (likely(!bt))
 		return;
 
-	r.device = cpu_to_be32(dev);
-	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
-	r.sector = cpu_to_be64(to);
+	r.device_from = cpu_to_be32(dev);
+	r.device_to   = cpu_to_be32(bio->bi_bdev->bd_dev);
+	r.sector_from = cpu_to_be64(from);
 
 	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
 			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
@@ -1028,11 +1028,11 @@ static void get_pdu_remap(const struct trace_entry *ent,
 			  struct blk_io_trace_remap *r)
 {
 	const struct blk_io_trace_remap *__r = pdu_start(ent);
-	__u64 sector = __r->sector;
+	__u64 sector_from = __r->sector_from;
 
-	r->device = be32_to_cpu(__r->device);
 	r->device_from = be32_to_cpu(__r->device_from);
-	r->sector = be64_to_cpu(sector);
+	r->device_to   = be32_to_cpu(__r->device_to);
+	r->sector_from = be64_to_cpu(sector_from);
 }
 
 typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
@@ -1148,13 +1148,13 @@ static int blk_log_with_error(struct trace_seq *s,
 
 static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
 {
-	struct blk_io_trace_remap r = { .device = 0, };
+	struct blk_io_trace_remap r = { .device_from = 0, };
 
 	get_pdu_remap(ent, &r);
 	return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
-			       t_sector(ent),
-			       t_sec(ent), MAJOR(r.device), MINOR(r.device),
-			       (unsigned long long)r.sector);
+				t_sector(ent), t_sec(ent),
+				MAJOR(r.device_from), MINOR(r.device_from),
+				(unsigned long long)r.sector_from);
 }
 
 static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
-- 
cgit v1.2.3


From 22a7c31a9659deaddafbbcec6562d44141e84474 Mon Sep 17 00:00:00 2001
From: "Alan D. Brunelle" <Alan.Brunelle@hp.com>
Date: Mon, 4 May 2009 16:35:08 -0400
Subject: blktrace: from-sector redundant in trace_block_remap

Remove redundant from-sector parameter: it's /always/ the bio's sector
passed in.

[ Impact: cleanup ]

Signed-off-by: Alan D. Brunelle <alan.brunelle@hp.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <49FF517C.7000503@hp.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index f8d46d6f5d3..e099f8cc1d1 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -830,7 +830,6 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
  * @q:		queue the io is for
  * @bio:	the source bio
  * @dev:	target device
- * @to:		target sector
  * @from:	source sector
  *
  * Description:
@@ -839,7 +838,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
  *
  **/
 static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-				       dev_t dev, sector_t to, sector_t from)
+				       dev_t dev, sector_t from)
 {
 	struct blk_trace *bt = q->blk_trace;
 	struct blk_io_trace_remap r;
@@ -851,8 +850,9 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
 	r.device_to   = cpu_to_be32(bio->bi_bdev->bd_dev);
 	r.sector_from = cpu_to_be64(from);
 
-	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
-			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
+			BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE),
+			sizeof(r), &r);
 }
 
 /**
-- 
cgit v1.2.3


From 48dd0fed90e2b1f1ba87401439b85942181c6df3 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Wed, 6 May 2009 15:45:45 +0530
Subject: tracing: trace_output.c, fix false positive compiler warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This compiler warning:

  CC      kernel/trace/trace_output.o
 kernel/trace/trace_output.c: In function ‘register_ftrace_event’:
 kernel/trace/trace_output.c:544: warning: ‘list’ may be used uninitialized in this function

Is wrong as 'list' is always initialized - but GCC (4.3.2) does not
recognize this relationship properly.

Work around the warning by initializing the variable to NULL.

[ Impact: fix false positive compiler warning ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 5fc51f0f75f..8bd9a2c1a46 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -541,7 +541,7 @@ int register_ftrace_event(struct trace_event *event)
 	INIT_LIST_HEAD(&event->list);
 
 	if (!event->type) {
-		struct list_head *list;
+		struct list_head *list = NULL;
 
 		if (next_event_type > FTRACE_MAX_EVENT) {
 
-- 
cgit v1.2.3


From 8e7abf1c62941ebb7a1416cbc62392c8a0902625 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 10:26:45 -0400
Subject: ring-buffer: remove unneeded conditional in rb_reserve_next

The code in __rb_reserve_next checks on page overflow if it is the
original commiter and then resets the page back to the original
setting.  Although this is fine, and the code is correct, it is
a bit fragil. Some experimental work I did breaks it easily.

The better and more robust solution is to have all commiters that
overflow the page, simply subtract what they added.

[ Impact: more robust ring buffer account management ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 424129eb20a..03ed52b67db 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1290,9 +1290,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		rb_event_set_padding(event);
 	}
 
-	if (tail <= BUF_PAGE_SIZE)
-		/* Set the write back to the previous setting */
-		local_set(&tail_page->write, tail);
+	/* Set the write back to the previous setting */
+	local_sub(length, &tail_page->write);
 
 	/*
 	 * If this was a commit entry that failed,
@@ -1311,8 +1310,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
  out_reset:
 	/* reset write */
-	if (tail <= BUF_PAGE_SIZE)
-		local_set(&tail_page->write, tail);
+	local_sub(length, &tail_page->write);
 
 	if (likely(lock_taken))
 		__raw_spin_unlock(&cpu_buffer->lock);
-- 
cgit v1.2.3


From 00c81a58c5b4e0de14ee33bfbc3d71c90f69f9ea Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 12:40:51 -0400
Subject: ring-buffer: check for failed allocation in ring buffer benchmark

The result of the allocation of the ring buffer read page in the
ring buffer bench mark does not check the return to see if a page
was actually allocated. This patch fixes that.

[ Impact: avoid NULL dereference ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer_benchmark.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 747244acb8f..dcd75e9e49f 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -84,6 +84,9 @@ static enum event_status read_page(int cpu)
 	int i;
 
 	bpage = ring_buffer_alloc_read_page(buffer);
+	if (!bpage)
+		return EVENT_DROPPED;
+
 	ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
 	if (ret >= 0) {
 		rpage = bpage;
-- 
cgit v1.2.3


From 6634ff26cce2da04e5c2a5481bcb8888e7d01786 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 15:30:07 -0400
Subject: ring-buffer: make moving the tail page a separate function

Ingo Molnar thought the code would be cleaner if we used a function call
instead of a goto for moving the tail page. After implementing this,
it seems that gcc still inlines the result and the output is pretty much
the same. Since this is considered a cleaner approach, might as well
implement it.

[ Impact: code clean up ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 89 +++++++++++++++++++++++++---------------------
 1 file changed, 49 insertions(+), 40 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 03ed52b67db..3ae5ccf2c0f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1154,51 +1154,18 @@ static unsigned rb_calculate_event_length(unsigned length)
 	return length;
 }
 
+
 static struct ring_buffer_event *
-__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
-		  unsigned type, unsigned long length, u64 *ts)
+rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
+	     unsigned long length, unsigned long tail,
+	     struct buffer_page *commit_page,
+	     struct buffer_page *tail_page, u64 *ts)
 {
-	struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
-	struct buffer_page *next_page;
-	unsigned long tail, write;
+	struct buffer_page *next_page, *head_page, *reader_page;
 	struct ring_buffer *buffer = cpu_buffer->buffer;
 	struct ring_buffer_event *event;
-	unsigned long flags;
 	bool lock_taken = false;
-
-	commit_page = cpu_buffer->commit_page;
-	/* we just need to protect against interrupts */
-	barrier();
-	tail_page = cpu_buffer->tail_page;
-	write = local_add_return(length, &tail_page->write);
-	tail = write - length;
-
-	/* See if we shot pass the end of this buffer page */
-	if (write > BUF_PAGE_SIZE)
-		goto next_page;
-
-	/* We reserved something on the buffer */
-
-	if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
-		return NULL;
-
-	event = __rb_page_index(tail_page, tail);
-	rb_update_event(event, type, length);
-
-	/* The passed in type is zero for DATA */
-	if (likely(!type))
-		local_inc(&tail_page->entries);
-
-	/*
-	 * If this is a commit and the tail is zero, then update
-	 * this page's time stamp.
-	 */
-	if (!tail && rb_is_commit(cpu_buffer, event))
-		cpu_buffer->commit_page->page->time_stamp = *ts;
-
-	return event;
-
- next_page:
+	unsigned long flags;
 
 	next_page = tail_page;
 
@@ -1318,6 +1285,48 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	return NULL;
 }
 
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+		  unsigned type, unsigned long length, u64 *ts)
+{
+	struct buffer_page *tail_page, *commit_page;
+	struct ring_buffer_event *event;
+	unsigned long tail, write;
+
+	commit_page = cpu_buffer->commit_page;
+	/* we just need to protect against interrupts */
+	barrier();
+	tail_page = cpu_buffer->tail_page;
+	write = local_add_return(length, &tail_page->write);
+	tail = write - length;
+
+	/* See if we shot pass the end of this buffer page */
+	if (write > BUF_PAGE_SIZE)
+		return rb_move_tail(cpu_buffer, length, tail,
+				    commit_page, tail_page, ts);
+
+	/* We reserved something on the buffer */
+
+	if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
+		return NULL;
+
+	event = __rb_page_index(tail_page, tail);
+	rb_update_event(event, type, length);
+
+	/* The passed in type is zero for DATA */
+	if (likely(!type))
+		local_inc(&tail_page->entries);
+
+	/*
+	 * If this is a commit and the tail is zero, then update
+	 * this page's time stamp.
+	 */
+	if (!tail && rb_is_commit(cpu_buffer, event))
+		cpu_buffer->commit_page->page->time_stamp = *ts;
+
+	return event;
+}
+
 static int
 rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		  u64 *ts, u64 *delta)
-- 
cgit v1.2.3


From 3e07a4f680adc66dfa175aa5021aedf340251b12 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 18:36:59 -0400
Subject: ring-buffer: change test to be more latency friendly

The ring buffer benchmark/test runs a producer for 10 seconds.
This is done with preemption and interrupts enabled. But if the kernel
is not compiled with CONFIG_PREEMPT, it basically stops everything
but interrupts for 10 seconds.

Although this is just a test and is not for production, this attribute
can be quite annoying. It can also spawn badness elsewhere.

This patch solves the issues by calling "cond_resched" when the system
is not compiled with CONFIG_PREEMPT. It also keeps track of the time
spent to call cond_resched such that it does not go against the
time calculations. That is, if the task schedules away, the time scheduled
out is removed from the test data. Note, this only works for non PREEMPT
because we do not know when the task is scheduled out if we have PREEMPT
enabled.

[ Impact: prevent test from stopping the world for 10 seconds ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer_benchmark.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index dcd75e9e49f..a26fc67b63b 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -185,6 +185,35 @@ static void ring_buffer_consumer(void)
 	complete(&read_done);
 }
 
+/*
+ * If we are a non preempt kernel, the 10 second run will
+ * stop everything while it runs. Instead, we will call cond_resched
+ * and also add any time that was lost by a rescedule.
+ */
+#ifdef CONFIG_PREEMPT
+static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv)
+{
+}
+#else
+static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv)
+{
+	struct timeval tv;
+
+	cond_resched();
+	do_gettimeofday(&tv);
+	if (tv.tv_usec < end_tv->tv_usec) {
+		tv.tv_usec += 1000000;
+		tv.tv_sec--;
+	}
+	start_tv->tv_sec += tv.tv_sec - end_tv->tv_sec;
+	start_tv->tv_usec += tv.tv_usec - end_tv->tv_usec;
+	if (start_tv->tv_usec > 1000000) {
+		start_tv->tv_usec -= 1000000;
+		start_tv->tv_sec++;
+	}
+}
+#endif
+
 static void ring_buffer_producer(void)
 {
 	struct timeval start_tv;
@@ -221,6 +250,8 @@ static void ring_buffer_producer(void)
 		if (consumer && !(++cnt % wakeup_interval))
 			wake_up_process(consumer);
 
+		sched_if_needed(&start_tv, &end_tv);
+
 	} while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
 	pr_info("End ring buffer hammer\n");
 
-- 
cgit v1.2.3


From 9456f0fa6d3cb944d3b9fc31c9a244e0362c26ea Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 21:54:09 -0400
Subject: tracing: reset ring buffer when removing modules with events

Li Zefan found that there's a race using the event ids of events and
modules. When a module is loaded, an event id is incremented. We only
have 16 bits for event ids (65536) and there is a possible (but highly
unlikely) race that we could load and unload a module that registers
events so many times that the event id counter overflows.

When it overflows, it then restarts and goes looking for available
ids. An id is available if it was added by a module and released.

The race is if you have one module add an id, and then is removed.
Another module loaded can use that same event id. But if the old module
still had events in the ring buffer, the new module's call back would
get bogus data.  At best (and most likely) the output would just be
garbage. But if the module for some reason used pointers (not recommended)
then this could potentially crash.

The safest thing to do is just reset the ring buffer if a module that
registered events is removed.

[ Impact: prevent unpredictable results of event id overflows ]

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <49FEAFD0.30106@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 10 ++++++++++
 kernel/trace/trace.h        |  2 ++
 kernel/trace/trace_events.c |  9 +++++++++
 3 files changed, 21 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4164a344e72..dd40d232034 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -639,6 +639,16 @@ void tracing_reset_online_cpus(struct trace_array *tr)
 		tracing_reset(tr, cpu);
 }
 
+void tracing_reset_current(int cpu)
+{
+	tracing_reset(&global_trace, cpu);
+}
+
+void tracing_reset_current_online_cpus(void)
+{
+	tracing_reset_online_cpus(&global_trace);
+}
+
 #define SAVED_CMDLINES 128
 #define NO_CMDLINE_MAP UINT_MAX
 static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 777c6c3a0cd..ba25793ffe6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -409,6 +409,8 @@ int tracing_is_enabled(void);
 void trace_wake_up(void);
 void tracing_reset(struct trace_array *tr, int cpu);
 void tracing_reset_online_cpus(struct trace_array *tr);
+void tracing_reset_current(int cpu);
+void tracing_reset_current_online_cpus(void);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 struct dentry *trace_create_file(const char *name,
 				 mode_t mode,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 8d579ff2361..6d2c842a024 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -932,9 +932,11 @@ static void trace_module_remove_events(struct module *mod)
 {
 	struct ftrace_module_file_ops *file_ops;
 	struct ftrace_event_call *call, *p;
+	bool found = false;
 
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
 		if (call->mod == mod) {
+			found = true;
 			if (call->enabled) {
 				call->enabled = 0;
 				call->unregfunc();
@@ -957,6 +959,13 @@ static void trace_module_remove_events(struct module *mod)
 		list_del(&file_ops->list);
 		kfree(file_ops);
 	}
+
+	/*
+	 * It is safest to reset the ring buffer if the module being unloaded
+	 * registered any events.
+	 */
+	if (found)
+		tracing_reset_current_online_cpus();
 }
 
 static int trace_module_notify(struct notifier_block *self,
-- 
cgit v1.2.3


From 8ae79a138e88aceeeb07077bff2883245fb7c218 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 22:52:15 -0400
Subject: tracing: add hierarchical enabling of events

With the current event directory, you can only enable individual events.
The file debugfs/tracing/set_event is used to be able to enable or
disable several events at once. But that can still be awkward.

This patch adds hierarchical enabling of events. That is, each directory
in debugfs/tracing/events has an "enable" file. This file can enable
or disable all events within the directory and below.

 # echo 1 > /debugfs/tracing/events/enable

will enable all events.

 # echo 1 > /debugfs/tracing/events/sched/enable

will enable all events in the sched subsystem.

 # echo 1 > /debugfs/tracing/events/enable
 # echo 0 > /debugfs/tracing/events/irq/enable

will enable all events, but then disable just the irq subsystem events.

When reading one of these enable files, there are four results:

 0 - all events this file affects are disabled
 1 - all events this file affects are enabled
 X - there is a mixture of events enabled and disabled
 ? - this file does not affect any event

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 140 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6d2c842a024..87feb0117ce 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -400,6 +400,133 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 }
 
+static ssize_t
+system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+		   loff_t *ppos)
+{
+	const char *system = filp->private_data;
+	struct ftrace_event_call *call;
+	char buf[2];
+	int set = -1;
+	int all = 0;
+	int ret;
+
+	if (system[0] == '*')
+		all = 1;
+
+	mutex_lock(&event_mutex);
+	list_for_each_entry(call, &ftrace_events, list) {
+		if (!call->name || !call->regfunc)
+			continue;
+
+		if (!all && strcmp(call->system, system) != 0)
+			continue;
+
+		/*
+		 * We need to find out if all the events are set
+		 * or if all events or cleared, or if we have
+		 * a mixture.
+		 */
+		if (call->enabled) {
+			switch (set) {
+			case -1:
+				set = 1;
+				break;
+			case 0:
+				set = 2;
+				break;
+			}
+		} else {
+			switch (set) {
+			case -1:
+				set = 0;
+				break;
+			case 1:
+				set = 2;
+				break;
+			}
+		}
+		/*
+		 * If we have a mixture, no need to look further.
+		 */
+		if (set == 2)
+			break;
+	}
+	mutex_unlock(&event_mutex);
+
+	buf[1] = '\n';
+	switch (set) {
+	case 0:
+		buf[0] = '0';
+		break;
+	case 1:
+		buf[0] = '1';
+		break;
+	case 2:
+		buf[0] = 'X';
+		break;
+	default:
+		buf[0] = '?';
+	}
+
+	ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+
+	return ret;
+}
+
+static ssize_t
+system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
+		    loff_t *ppos)
+{
+	const char *system = filp->private_data;
+	unsigned long val;
+	char *command;
+	char buf[64];
+	ssize_t ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	ret = tracing_update_buffers();
+	if (ret < 0)
+		return ret;
+
+	switch (val) {
+	case 0:
+	case 1:
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	command = kstrdup(system, GFP_KERNEL);
+	if (!command)
+		return -ENOMEM;
+
+	ret = ftrace_set_clr_event(command, val);
+	if (ret)
+		goto out_free;
+
+	ret = cnt;
+
+ out_free:
+	kfree(command);
+
+	*ppos += cnt;
+
+	return ret;
+}
+
 extern char *__bad_type_size(void);
 
 #undef FIELD
@@ -686,6 +813,12 @@ static const struct file_operations ftrace_subsystem_filter_fops = {
 	.write = subsystem_filter_write,
 };
 
+static const struct file_operations ftrace_system_enable_fops = {
+	.open = tracing_open_generic,
+	.read = system_enable_read,
+	.write = system_enable_write,
+};
+
 static const struct file_operations ftrace_show_header_fops = {
 	.open = tracing_open_generic,
 	.read = show_header,
@@ -768,6 +901,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 			   "'%s/filter' entry\n", name);
 	}
 
+	entry = trace_create_file("enable", 0644, system->entry,
+				  (void *)system->name,
+				  &ftrace_system_enable_fops);
+
 	return system->entry;
 }
 
@@ -1041,6 +1178,9 @@ static __init int event_trace_init(void)
 			  ring_buffer_print_entry_header,
 			  &ftrace_show_header_fops);
 
+	trace_create_file("enable", 0644, d_events,
+			  "*:*", &ftrace_system_enable_fops);
+
 	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
 		/* The linker may leave blanks */
 		if (!call->name)
-- 
cgit v1.2.3


From e8808c1019b048a43686dbd25c188a035842c2e2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 3 May 2009 02:48:52 +0200
Subject: tracing/filters: support for filters of dynamic sized arrays

Currently the filtering infrastructure supports well the
numeric types and fixed sized array types.

But the recently added __string() field uses a specific
indirect offset mechanism which requires a specific
predicate. Until now it wasn't supported.

This patch adds this support and implies very few changes,
only a new predicate is needed, the management of this specific
field can be done through the usual string helpers in the
filtering infrastructure.

[ Impact: support all kinds of strings in the tracing filters ]

Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Zhaolei <zhaolei@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_events_filter.c | 44 +++++++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 7ac69108527..01c76eb3e16 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -151,6 +151,7 @@ static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
 	return val1 || val2;
 }
 
+/* Filter predicate for fixed sized arrays of characters */
 static int filter_pred_string(struct filter_pred *pred, void *event,
 			      int val1, int val2)
 {
@@ -164,6 +165,30 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
 	return match;
 }
 
+/*
+ * Filter predicate for dynamic sized arrays of characters.
+ * These are implemented through a list of strings at the end
+ * of the entry.
+ * Also each of these strings have a field in the entry which
+ * contains its offset from the beginning of the entry.
+ * We have then first to get this field, dereference it
+ * and add it to the address of the entry, and at last we have
+ * the address of the string.
+ */
+static int filter_pred_strloc(struct filter_pred *pred, void *event,
+			      int val1, int val2)
+{
+	int str_loc = *(int *)(event + pred->offset);
+	char *addr = (char *)(event + str_loc);
+	int cmp, match;
+
+	cmp = strncmp(addr, pred->str_val, pred->str_len);
+
+	match = (!cmp) ^ pred->not;
+
+	return match;
+}
+
 static int filter_pred_none(struct filter_pred *pred, void *event,
 			    int val1, int val2)
 {
@@ -446,10 +471,18 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
 	return 0;
 }
 
+enum {
+	FILTER_STATIC_STRING = 1,
+	FILTER_DYN_STRING
+};
+
 static int is_string_field(const char *type)
 {
 	if (strchr(type, '[') && strstr(type, "char"))
-		return 1;
+		return FILTER_STATIC_STRING;
+
+	if (!strcmp(type, "__str_loc"))
+		return FILTER_DYN_STRING;
 
 	return 0;
 }
@@ -512,6 +545,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
 	struct ftrace_event_field *field;
 	filter_pred_fn_t fn;
 	unsigned long long val;
+	int string_type;
 
 	pred->fn = filter_pred_none;
 
@@ -536,8 +570,12 @@ static int filter_add_pred(struct filter_parse_state *ps,
 		return -EINVAL;
 	}
 
-	if (is_string_field(field->type)) {
-		fn = filter_pred_string;
+	string_type = is_string_field(field->type);
+	if (string_type) {
+		if (string_type == FILTER_STATIC_STRING)
+			fn = filter_pred_string;
+		else
+			fn = filter_pred_strloc;
 		pred->str_len = field->size;
 		if (pred->op == OP_NE)
 			pred->not = 1;
-- 
cgit v1.2.3


From 5928c3cc0ffcb6894bbab6be591b7ae1786b2d87 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 3 May 2009 03:03:57 +0200
Subject: tracing/filters: support for operator reserved characters in strings

When we set a filter for an event, such as:

echo "name == my_lock_name" > \
	/debug/tracing/events/lockdep/lock_acquired/filter

then the following order of token type is parsed:

- space
- operator
- parentheses
- operand

Because the operators and parentheses have a higher precedence
than the operand characters, which is normal, then we can't
use any string containing such special characters:

()=<>!&|

To get this support and also avoid ambiguous intepretation from
the parser or the human, we can do it using double quotes so that
we keep the usual languages habits.

Then after this patch you can still declare string condition like
before:

echo name == myname

But if you want to compare against a string containing an operator
character, you can use double quotes:

echo 'name == "&myname"'

Don't forget to include the whole expression into single quotes or
the double ones will be eaten by echo.

[ Impact: support strings with special characters for tracing filters ]

Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Zhaolei <zhaolei@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_events_filter.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 01c76eb3e16..8c62e5bdff0 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -851,10 +851,19 @@ static void postfix_clear(struct filter_parse_state *ps)
 
 static int filter_parse(struct filter_parse_state *ps)
 {
+	int in_string = 0;
 	int op, top_op;
 	char ch;
 
 	while ((ch = infix_next(ps))) {
+		if (ch == '"') {
+			in_string ^= 1;
+			continue;
+		}
+
+		if (in_string)
+			goto parse_operand;
+
 		if (isspace(ch))
 			continue;
 
@@ -908,6 +917,7 @@ static int filter_parse(struct filter_parse_state *ps)
 			}
 			continue;
 		}
+parse_operand:
 		if (append_operand_char(ps, ch)) {
 			parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0);
 			return -EINVAL;
-- 
cgit v1.2.3


From d94fc523f3c35bd8013f04827e94756cbc0212f4 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 7 May 2009 15:11:15 +0800
Subject: tracing/events: fix concurrent access to ftrace_events list, fix

In filter_add_subsystem_pred() we should release event_mutex before
calling filter_free_subsystem_preds(), since both functions hold
event_mutex.

[ Impact: fix deadlock when writing invalid pred into subsystem filter ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: tzanussi@gmail.com
Cc: a.p.zijlstra@chello.nl
Cc: fweisbec@gmail.com
Cc: rostedt@goodmis.org
LKML-Reference: <4A028993.7020509@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_filter.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8c62e5bdff0..85ad6a8939a 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -636,14 +636,15 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
 
 		err = filter_add_pred(ps, call, pred);
 		if (err) {
+			mutex_unlock(&event_mutex);
 			filter_free_subsystem_preds(system);
 			parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
-			break;
+			goto out;
 		}
 		replace_filter_string(call->filter, filter_string);
 	}
 	mutex_unlock(&event_mutex);
-
+out:
 	return err;
 }
 
-- 
cgit v1.2.3


From 29c8000ee7da3a6756d26143991e573eaaf2a9f6 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 May 2009 11:13:42 -0400
Subject: ring-buffer: remove complex calculations in ring-buffer-test

Ingo Molnar thought that the code to calculate the time in cond_resched
is a bit too ugly and is not needed. This patch removes it and replaces
it with a simple call to cond_resched. I kept the comment that explains
the reason for the cond_resched.

[ Impact: remove ugly code ]

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer_benchmark.c | 37 +++++++-----------------------------
 1 file changed, 7 insertions(+), 30 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index a26fc67b63b..f4ceb453c7d 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -185,35 +185,6 @@ static void ring_buffer_consumer(void)
 	complete(&read_done);
 }
 
-/*
- * If we are a non preempt kernel, the 10 second run will
- * stop everything while it runs. Instead, we will call cond_resched
- * and also add any time that was lost by a rescedule.
- */
-#ifdef CONFIG_PREEMPT
-static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv)
-{
-}
-#else
-static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv)
-{
-	struct timeval tv;
-
-	cond_resched();
-	do_gettimeofday(&tv);
-	if (tv.tv_usec < end_tv->tv_usec) {
-		tv.tv_usec += 1000000;
-		tv.tv_sec--;
-	}
-	start_tv->tv_sec += tv.tv_sec - end_tv->tv_sec;
-	start_tv->tv_usec += tv.tv_usec - end_tv->tv_usec;
-	if (start_tv->tv_usec > 1000000) {
-		start_tv->tv_usec -= 1000000;
-		start_tv->tv_sec++;
-	}
-}
-#endif
-
 static void ring_buffer_producer(void)
 {
 	struct timeval start_tv;
@@ -250,7 +221,13 @@ static void ring_buffer_producer(void)
 		if (consumer && !(++cnt % wakeup_interval))
 			wake_up_process(consumer);
 
-		sched_if_needed(&start_tv, &end_tv);
+		/*
+		 * If we are a non preempt kernel, the 10 second run will
+		 * stop everything while it runs. Instead, we will call
+		 * cond_resched and also add any time that was lost by a
+		 * rescedule.
+		 */
+		cond_resched();
 
 	} while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
 	pr_info("End ring buffer hammer\n");
-- 
cgit v1.2.3


From d6bf81ef0f7474434c2a049e8bf3c9146a14dd96 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 May 2009 11:49:35 -0400
Subject: tracing: append ":*" to internal setting of system events

The system enabling of events uses the same code as the set_event file.
It passes in the name of the system to the parser and that will enable
all the events that has that system as a name.

The problem is that it will also enable events with the same name as the
system.

If you have system name foo, and system name bar, but within the system
bar, there exists an event called foo. By setting the system name foo,
you will also be enabling the event foo in the system bar. This is not
an expected result.

The solution is to pass in "foo:*", which will only enable the system
foo and not events called foo.

[ Impact: prevent accidental enabling of events with same name as a system ]

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 87feb0117ce..8d0fae3af59 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -509,9 +509,11 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		return -EINVAL;
 	}
 
-	command = kstrdup(system, GFP_KERNEL);
+	/* +3 for the ":*\0" */
+	command = kmalloc(strlen(system)+3, GFP_KERNEL);
 	if (!command)
 		return -ENOMEM;
+	sprintf(command, "%s:*", system);
 
 	ret = ftrace_set_clr_event(command, val);
 	if (ret)
@@ -1179,7 +1181,7 @@ static __init int event_trace_init(void)
 			  &ftrace_show_header_fops);
 
 	trace_create_file("enable", 0644, d_events,
-			  "*:*", &ftrace_system_enable_fops);
+			  "*", &ftrace_system_enable_fops);
 
 	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
 		/* The linker may leave blanks */
-- 
cgit v1.2.3


From 65b77242043f74bca6a0d733c0e48ef03a8c9893 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 May 2009 12:49:27 -0400
Subject: tracing: have menu default enabled when kernel debug is configured

Tracing can be very helpful to debug the kernel. When DEBUG_KERNEL is
enabled it is nice to enable the trace menu as well.

This patch only make the tracing menu enabled by default, it does not
make any of the tracers enabled. And the menu is only enabled by
default if DEBUG_KERNEL is enabled.

[ Impact: show tracing options to those debugging the kernel ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 50f62a296e1..f61be301578 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -79,6 +79,7 @@ if TRACING_SUPPORT
 
 menuconfig FTRACE
 	bool "Tracers"
+	default y if DEBUG_KERNEL
 	help
 	 Enable the kernel tracing infrastructure.
 
-- 
cgit v1.2.3


From 0574ea421b90e0e45a72c447dd3c2c79ffd8c153 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 May 2009 14:20:28 -0400
Subject: ring-buffer: only periodically call cond_resched to
 ring-buffer-benchmark

Calling cond_resched at every iteration of the loop adds a bit of
overhead to the benchmark.

This patch does two things.

1) only calls cond-resched when CONFIG_PREEMPT is not enabled
2) only calls cond-resched after so many traces has been performed.

[ Impact: less overhead to the ring-buffer-benchmark ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer_benchmark.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index f4ceb453c7d..a7c048bb446 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -218,16 +218,23 @@ static void ring_buffer_producer(void)
 		}
 		do_gettimeofday(&end_tv);
 
-		if (consumer && !(++cnt % wakeup_interval))
+		cnt++;
+		if (consumer && !(cnt % wakeup_interval))
 			wake_up_process(consumer);
 
+#ifndef CONFIG_PREEMPT
 		/*
 		 * If we are a non preempt kernel, the 10 second run will
 		 * stop everything while it runs. Instead, we will call
 		 * cond_resched and also add any time that was lost by a
 		 * rescedule.
+		 *
+		 * Do a cond resched at the same frequency we would wake up
+		 * the reader.
 		 */
-		cond_resched();
+		if (cnt % wakeup_interval)
+			cond_resched();
+#endif
 
 	} while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
 	pr_info("End ring buffer hammer\n");
-- 
cgit v1.2.3


From 7da3046d6ce6ea97494020081c509b642b7016af Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 May 2009 19:52:20 -0400
Subject: ring-buffer: add total count in ring-buffer-benchmark

It is nice to see the overhead of the benchmark test when tracing is
disabled. That is, we turn off the ring buffer just to see what the
cost of running the loop that calls into the ring buffer is.

Currently, if no entries wer made, we get 0. This is not informative.
This patch changes it to check if we had any "missed" (non recorded)
events. If so, a total count is also reported.

[ Impact: evaluate the over head of the ring buffer benchmark test ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer_benchmark.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index a7c048bb446..a21aa7b3d05 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -285,6 +285,17 @@ static void ring_buffer_producer(void)
 		avg = 1000000 / hit;
 		pr_info("%ld ns per entry\n", avg);
 	}
+
+
+	if (missed) {
+		if (time)
+			missed /= (long)time;
+
+		pr_info("Total iterations per millisec: %ld\n", hit + missed);
+
+		avg = 1000000 / (hit + missed);
+		pr_info("%ld ns per entry\n", avg);
+	}
 }
 
 static void wait_to_die(void)
-- 
cgit v1.2.3


From 74f4fd21664148b8c454cc07bfe74e4dd51cf07b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 May 2009 19:58:55 -0400
Subject: ring-buffer: change WARN_ON from checking preempt_count to
 preemptible

There's a WARN_ON in the ring buffer code that makes sure preemption
is disabled. It checks "!preempt_count()". But when CONFIG_PREEMPT is not
enabled, preempt_count() is always zero, and this will trigger the warning.

[ Impact: prevent false warning on non preemptible kernels ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3ae5ccf2c0f..361170609bd 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1688,7 +1688,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 	 * committed yet. Thus we can assume that preemption
 	 * is still disabled.
 	 */
-	RB_WARN_ON(buffer, !preempt_count());
+	RB_WARN_ON(buffer, preemptible());
 
 	cpu = smp_processor_id();
 	cpu_buffer = buffer->buffers[cpu];
-- 
cgit v1.2.3


From 8f31bfe538ebafac187d2d4465a92e1d9ee6d8c2 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 8 May 2009 10:31:42 +0800
Subject: tracing/events: clean up for ftrace_set_clr_event()

Add a helper function __ftrace_set_clr_event(), and replace some
ftrace_set_clr_event() calls with this helper, thus we don't need any
kstrdup() or kmalloc().

As a side effect, this patch fixes an issue in self tests code, which is
similar to the one fixed in commit d6bf81ef0f7474434c2a049e8bf3c9146a14dd96
("tracing: append ":*" to internal setting of system events")

It's a small issue and won't cause any bug in fact, but we should do things
right anyway.

[ Impact: prevent spurious event-enabling in tracing self-tests ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4A03998E.3020503@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 126 ++++++++++++++++----------------------------
 1 file changed, 46 insertions(+), 80 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 8d0fae3af59..45f1099386b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -111,11 +111,44 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 	}
 }
 
-static int ftrace_set_clr_event(char *buf, int set)
+/*
+ * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
+ */
+static int __ftrace_set_clr_event(const char *match, const char *sub,
+				  const char *event, int set)
 {
 	struct ftrace_event_call *call;
+	int ret;
+
+	mutex_lock(&event_mutex);
+	list_for_each_entry(call, &ftrace_events, list) {
+
+		if (!call->name || !call->regfunc)
+			continue;
+
+		if (match &&
+		    strcmp(match, call->name) != 0 &&
+		    strcmp(match, call->system) != 0)
+			continue;
+
+		if (sub && strcmp(sub, call->system) != 0)
+			continue;
+
+		if (event && strcmp(event, call->name) != 0)
+			continue;
+
+		ftrace_event_enable_disable(call, set);
+
+		ret = 0;
+	}
+	mutex_unlock(&event_mutex);
+
+	return ret;
+}
+
+static int ftrace_set_clr_event(char *buf, int set)
+{
 	char *event = NULL, *sub = NULL, *match;
-	int ret = -EINVAL;
 
 	/*
 	 * The buf format can be <subsystem>:<event-name>
@@ -141,30 +174,7 @@ static int ftrace_set_clr_event(char *buf, int set)
 			event = NULL;
 	}
 
-	mutex_lock(&event_mutex);
-	list_for_each_entry(call, &ftrace_events, list) {
-
-		if (!call->name || !call->regfunc)
-			continue;
-
-		if (match &&
-		    strcmp(match, call->name) != 0 &&
-		    strcmp(match, call->system) != 0)
-			continue;
-
-		if (sub && strcmp(sub, call->system) != 0)
-			continue;
-
-		if (event && strcmp(event, call->name) != 0)
-			continue;
-
-		ftrace_event_enable_disable(call, set);
-
-		ret = 0;
-	}
-	mutex_unlock(&event_mutex);
-
-	return ret;
+	return __ftrace_set_clr_event(match, sub, event, set);
 }
 
 /* 128 should be much more than enough */
@@ -408,18 +418,14 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 	struct ftrace_event_call *call;
 	char buf[2];
 	int set = -1;
-	int all = 0;
 	int ret;
 
-	if (system[0] == '*')
-		all = 1;
-
 	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
 		if (!call->name || !call->regfunc)
 			continue;
 
-		if (!all && strcmp(call->system, system) != 0)
+		if (system && strcmp(call->system, system) != 0)
 			continue;
 
 		/*
@@ -480,7 +486,6 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 {
 	const char *system = filp->private_data;
 	unsigned long val;
-	char *command;
 	char buf[64];
 	ssize_t ret;
 
@@ -500,30 +505,16 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	if (ret < 0)
 		return ret;
 
-	switch (val) {
-	case 0:
-	case 1:
-		break;
-
-	default:
+	if (val != 0 && val != 1)
 		return -EINVAL;
-	}
 
-	/* +3 for the ":*\0" */
-	command = kmalloc(strlen(system)+3, GFP_KERNEL);
-	if (!command)
-		return -ENOMEM;
-	sprintf(command, "%s:*", system);
-
-	ret = ftrace_set_clr_event(command, val);
+	ret = __ftrace_set_clr_event(NULL, system, NULL, val);
 	if (ret)
-		goto out_free;
+		goto out;
 
 	ret = cnt;
 
- out_free:
-	kfree(command);
-
+out:
 	*ppos += cnt;
 
 	return ret;
@@ -1181,7 +1172,7 @@ static __init int event_trace_init(void)
 			  &ftrace_show_header_fops);
 
 	trace_create_file("enable", 0644, d_events,
-			  "*", &ftrace_system_enable_fops);
+			  NULL, &ftrace_system_enable_fops);
 
 	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
 		/* The linker may leave blanks */
@@ -1259,7 +1250,6 @@ static __init void event_trace_self_tests(void)
 {
 	struct ftrace_event_call *call;
 	struct event_subsystem *system;
-	char *sysname;
 	int ret;
 
 	pr_info("Running tests on trace events:\n");
@@ -1305,14 +1295,7 @@ static __init void event_trace_self_tests(void)
 
 		pr_info("Testing event system %s: ", system->name);
 
-		/* ftrace_set_clr_event can modify the name passed in. */
-		sysname = kstrdup(system->name, GFP_KERNEL);
-		if (WARN_ON(!sysname)) {
-			pr_warning("Can't allocate memory, giving up!\n");
-			return;
-		}
-		ret = ftrace_set_clr_event(sysname, 1);
-		kfree(sysname);
+		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1);
 		if (WARN_ON_ONCE(ret)) {
 			pr_warning("error enabling system %s\n",
 				   system->name);
@@ -1321,14 +1304,7 @@ static __init void event_trace_self_tests(void)
 
 		event_test_stuff();
 
-		sysname = kstrdup(system->name, GFP_KERNEL);
-		if (WARN_ON(!sysname)) {
-			pr_warning("Can't allocate memory, giving up!\n");
-			return;
-		}
-		ret = ftrace_set_clr_event(sysname, 0);
-		kfree(sysname);
-
+		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
 		if (WARN_ON_ONCE(ret))
 			pr_warning("error disabling system %s\n",
 				   system->name);
@@ -1341,15 +1317,8 @@ static __init void event_trace_self_tests(void)
 	pr_info("Running tests on all trace events:\n");
 	pr_info("Testing all events: ");
 
-	sysname = kmalloc(4, GFP_KERNEL);
-	if (WARN_ON(!sysname)) {
-		pr_warning("Can't allocate memory, giving up!\n");
-		return;
-	}
-	memcpy(sysname, "*:*", 4);
-	ret = ftrace_set_clr_event(sysname, 1);
+	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1);
 	if (WARN_ON_ONCE(ret)) {
-		kfree(sysname);
 		pr_warning("error enabling all events\n");
 		return;
 	}
@@ -1357,10 +1326,7 @@ static __init void event_trace_self_tests(void)
 	event_test_stuff();
 
 	/* reset sysname */
-	memcpy(sysname, "*:*", 4);
-	ret = ftrace_set_clr_event(sysname, 0);
-	kfree(sysname);
-
+	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warning("error disabling all events\n");
 		return;
-- 
cgit v1.2.3


From c142b15dc56ee6d55cb97a062e3c8e9c61e384c0 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 8 May 2009 10:32:05 +0800
Subject: tracing/events: simplify system_enable_read()

A smarter way to figure out the output of an enable file.

[ Impact: clean up ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4A0399A5.2080603@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 40 ++++++----------------------------------
 1 file changed, 6 insertions(+), 34 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 45f1099386b..df394bc6d54 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -414,10 +414,11 @@ static ssize_t
 system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 		   loff_t *ppos)
 {
+	const char set_to_char[4] = { '?', '0', '1', 'X' };
 	const char *system = filp->private_data;
 	struct ftrace_event_call *call;
 	char buf[2];
-	int set = -1;
+	int set = 0;
 	int ret;
 
 	mutex_lock(&event_mutex);
@@ -433,47 +434,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 		 * or if all events or cleared, or if we have
 		 * a mixture.
 		 */
-		if (call->enabled) {
-			switch (set) {
-			case -1:
-				set = 1;
-				break;
-			case 0:
-				set = 2;
-				break;
-			}
-		} else {
-			switch (set) {
-			case -1:
-				set = 0;
-				break;
-			case 1:
-				set = 2;
-				break;
-			}
-		}
+		set |= (1 << !!call->enabled);
+
 		/*
 		 * If we have a mixture, no need to look further.
 		 */
-		if (set == 2)
+		if (set == 3)
 			break;
 	}
 	mutex_unlock(&event_mutex);
 
+	buf[0] = set_to_char[set];
 	buf[1] = '\n';
-	switch (set) {
-	case 0:
-		buf[0] = '0';
-		break;
-	case 1:
-		buf[0] = '1';
-		break;
-	case 2:
-		buf[0] = 'X';
-		break;
-	default:
-		buf[0] = '?';
-	}
 
 	ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
 
-- 
cgit v1.2.3


From 29f93943d1916d1a3faa3f10f4a06994347ac990 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 8 May 2009 16:06:47 -0400
Subject: tracing: initialize return value for __ftrace_set_clr_event

Commit 8f31bfe538ebafac187d2d4465a92e1d9ee6d8c2
tracing/events: clean up for ftrace_set_clr_event()

Moved out the code for ftrace_set_clr_event into a helper funciton but
did not initialize the return value. As a result, we do not warn about
a typo in the echoing of events in set_event.

This patch restores the old warning:

 # echo foobar > set_event
-bash: echo: write error: Invalid argument

[ Impact: restore warning of invalid entries to set_event ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index df394bc6d54..2eecb87e42d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -118,7 +118,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
 				  const char *event, int set)
 {
 	struct ftrace_event_call *call;
-	int ret;
+	int ret = -EINVAL;
 
 	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
-- 
cgit v1.2.3


From 4671c79408a3f8a5a6a45e39c4c164dada3a5678 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 8 May 2009 16:27:41 -0400
Subject: tracing: add trace_set_clr_event to export event enabling function

Other parts of the kernel may need to be able to enable or disable
specific events. Especially parts that create trace events.

[ Impact: allow enabling of trace events by those that create the event ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 2eecb87e42d..0eec0c55dd8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -177,6 +177,23 @@ static int ftrace_set_clr_event(char *buf, int set)
 	return __ftrace_set_clr_event(match, sub, event, set);
 }
 
+/**
+ * trace_set_clr_event - enable or disable an event
+ * @system: system name to match (NULL for any system)
+ * @event: event name to match (NULL for all events, within system)
+ * @set: 1 to enable, 0 to disable
+ *
+ * This is a way for other parts of the kernel to enable or disable
+ * event recording.
+ *
+ * Returns 0 on success, -EINVAL if the parameters do not match any
+ * registered events.
+ */
+int trace_set_clr_event(const char *system, const char *event, int set)
+{
+	return __ftrace_set_clr_event(NULL, system, event, set);
+}
+
 /* 128 should be much more than enough */
 #define EVENT_BUF_SIZE		127
 
-- 
cgit v1.2.3


From 049862579333cc6cd9e6edfd6987cd0addfd8c59 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 11 May 2009 14:33:23 +0800
Subject: blktrace: pdu_buf of pc events should be unsigned

I got this:
  8,0    1   305.417782332  2037  I   R 32 (ffffff9e 10 00 ...) [bash]

It should be:
  8,0    1   305.417782332  2037  I   R 32 (9e 10 00 ...) [bash]

[ Impact: fix output of pc events ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4A07C6B3.9080802@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e099f8cc1d1..05b4747fd87 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1065,7 +1065,7 @@ static int blk_log_action(struct trace_iterator *iter, const char *act)
 
 static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
 {
-	const char *pdu_buf;
+	const unsigned char *pdu_buf;
 	int pdu_len;
 	int i, end, ret;
 
-- 
cgit v1.2.3


From 5a772b2b3c68e7e0b503c5a48469113bb0634314 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 8 May 2009 10:56:33 -0400
Subject: ring-buffer: replace constants with time macros in
 ring-buffer-benchmark

The use of numeric constants is discouraged. It is cleaner and more
descriptive to use macros for constant time conversions.

This patch also removes an extra new line.

[ Impact: more descriptive time conversions ]

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer_benchmark.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index a21aa7b3d05..7d3aef93c49 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -253,7 +253,7 @@ static void ring_buffer_producer(void)
 	}
 
 	time = end_tv.tv_sec - start_tv.tv_sec;
-	time *= 1000000;
+	time *= USEC_PER_SEC;
 	time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
 
 	entries = ring_buffer_entries(buffer);
@@ -273,7 +273,8 @@ static void ring_buffer_producer(void)
 	pr_info("Missed:   %ld\n", missed);
 	pr_info("Hit:      %ld\n", hit);
 
-	do_div(time, 1000);
+	/* Convert time from usecs to millisecs */
+	do_div(time, USEC_PER_MSEC);
 	if (time)
 		hit /= (long)time;
 	else
@@ -282,18 +283,19 @@ static void ring_buffer_producer(void)
 	pr_info("Entries per millisec: %ld\n", hit);
 
 	if (hit) {
-		avg = 1000000 / hit;
+		/* Calculate the average time in nanosecs */
+		avg = NSEC_PER_MSEC / hit;
 		pr_info("%ld ns per entry\n", avg);
 	}
 
-
 	if (missed) {
 		if (time)
 			missed /= (long)time;
 
 		pr_info("Total iterations per millisec: %ld\n", hit + missed);
 
-		avg = 1000000 / (hit + missed);
+		/* Caculate the average time in nanosecs */
+		avg = NSEC_PER_MSEC / (hit + missed);
 		pr_info("%ld ns per entry\n", avg);
 	}
 }
-- 
cgit v1.2.3


From d988ff94c1074c4c914235c8591bcceafb585ecf Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 8 May 2009 11:03:57 -0400
Subject: ring-buffer: check for divide by zero in ring-buffer-benchmark

Although we check if "missed" is not zero, we divide by hit + missed,
and the addition can possible overflow and become a divide by zero.

This patch checks for this case, and will report it when it happens
then modify "hit" to make the calculation be non zero.

[ Impact: prevent possible divide by zero in ring-buffer-benchmark ]

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer_benchmark.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 7d3aef93c49..8d68e149a8b 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -294,6 +294,12 @@ static void ring_buffer_producer(void)
 
 		pr_info("Total iterations per millisec: %ld\n", hit + missed);
 
+		/* it is possible that hit + missed will overflow and be zero */
+		if (!(hit + missed)) {
+			pr_info("hit + missed overflowed and totalled zero!\n");
+			hit--; /* make it non zero */
+		}
+
 		/* Caculate the average time in nanosecs */
 		avg = NSEC_PER_MSEC / (hit + missed);
 		pr_info("%ld ns per entry\n", avg);
-- 
cgit v1.2.3


From 1cd8d7358948909ab80b254eb14bcebc555ad417 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 11 May 2009 14:08:09 -0400
Subject: ring-buffer: remove type parameter from rb_reserve_next_event

The rb_reserve_next_event is only called for the data type (type = 0).
There is no reason to pass in the type to the function.

Before:
   text    data     bss     dec     hex filename
  16554      24      12   16590    40ce kernel/trace/ring_buffer.o

After:
   text    data     bss     dec     hex filename
  16538      24      12   16574    40be kernel/trace/ring_buffer.o

[ Impact: cleaner, smaller and slightly more efficient code ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 361170609bd..fe40f6c3507 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1389,7 +1389,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 
 static struct ring_buffer_event *
 rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
-		      unsigned type, unsigned long length)
+		      unsigned long length)
 {
 	struct ring_buffer_event *event;
 	u64 ts, delta;
@@ -1448,7 +1448,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 		/* Non commits have zero deltas */
 		delta = 0;
 
-	event = __rb_reserve_next(cpu_buffer, type, length, &ts);
+	event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
 	if (PTR_ERR(event) == -EAGAIN)
 		goto again;
 
@@ -1556,7 +1556,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 	if (length > BUF_PAGE_SIZE)
 		goto out;
 
-	event = rb_reserve_next_event(cpu_buffer, 0, length);
+	event = rb_reserve_next_event(cpu_buffer, length);
 	if (!event)
 		goto out;
 
@@ -1782,7 +1782,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
 		goto out;
 
 	event_length = rb_calculate_event_length(length);
-	event = rb_reserve_next_event(cpu_buffer, 0, event_length);
+	event = rb_reserve_next_event(cpu_buffer, event_length);
 	if (!event)
 		goto out;
 
-- 
cgit v1.2.3


From be957c447f7233a67904a1b11eb3ab61e702bf4d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 11 May 2009 14:42:53 -0400
Subject: ring-buffer: move calculation of event length

The event length is calculated and passed in to rb_reserve_next_event
in two different locations. Having rb_reserve_next_event do the
calculations directly makes only one location to do the change and
causes the calculation to be inlined by gcc.

Before:
   text    data     bss     dec     hex filename
  16538      24      12   16574    40be kernel/trace/ring_buffer.o

After:
   text    data     bss     dec     hex filename
  16490      24      12   16526    408e kernel/trace/ring_buffer.o

[ Impact: smaller more efficient code ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index fe40f6c3507..493cba46abc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -367,6 +367,9 @@ static inline int test_time_stamp(u64 delta)
 
 #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
 
+/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
+#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
+
 int ring_buffer_print_page_header(struct trace_seq *s)
 {
 	struct buffer_data_page field;
@@ -1396,6 +1399,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	int commit = 0;
 	int nr_loops = 0;
 
+	length = rb_calculate_event_length(length);
  again:
 	/*
 	 * We allow for interrupts to reenter here and do a trace.
@@ -1552,8 +1556,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 	if (atomic_read(&cpu_buffer->record_disabled))
 		goto out;
 
-	length = rb_calculate_event_length(length);
-	if (length > BUF_PAGE_SIZE)
+	if (length > BUF_MAX_DATA_SIZE)
 		goto out;
 
 	event = rb_reserve_next_event(cpu_buffer, length);
@@ -1758,7 +1761,6 @@ int ring_buffer_write(struct ring_buffer *buffer,
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_event *event;
-	unsigned long event_length;
 	void *body;
 	int ret = -EBUSY;
 	int cpu, resched;
@@ -1781,8 +1783,10 @@ int ring_buffer_write(struct ring_buffer *buffer,
 	if (atomic_read(&cpu_buffer->record_disabled))
 		goto out;
 
-	event_length = rb_calculate_event_length(length);
-	event = rb_reserve_next_event(cpu_buffer, event_length);
+	if (length > BUF_MAX_DATA_SIZE)
+		goto out;
+
+	event = rb_reserve_next_event(cpu_buffer, length);
 	if (!event)
 		goto out;
 
-- 
cgit v1.2.3


From 0f0c85fc80adbbd2265d89867d743f929d516805 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 11 May 2009 16:08:00 -0400
Subject: ring-buffer: small optimizations

Doing some small changes in the fast path of the ring buffer recording
saves over 3% in the ring-buffer-benchmark test.

[ Impact: a little faster ring buffer recording ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 493cba46abc..f452de2ce49 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1000,7 +1000,7 @@ rb_event_index(struct ring_buffer_event *event)
 	return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
 }
 
-static int
+static inline int
 rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
 	     struct ring_buffer_event *event)
 {
@@ -1423,9 +1423,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	 * also be made. But only the entry that did the actual
 	 * commit will be something other than zero.
 	 */
-	if (cpu_buffer->tail_page == cpu_buffer->commit_page &&
-	    rb_page_write(cpu_buffer->tail_page) ==
-	    rb_commit_index(cpu_buffer)) {
+	if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
+		   rb_page_write(cpu_buffer->tail_page) ==
+		   rb_commit_index(cpu_buffer))) {
 
 		delta = ts - cpu_buffer->write_stamp;
 
@@ -1436,7 +1436,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 		if (unlikely(ts < cpu_buffer->write_stamp))
 			delta = 0;
 
-		if (test_time_stamp(delta)) {
+		else if (unlikely(test_time_stamp(delta))) {
 
 			commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
 
@@ -1470,7 +1470,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	 * If the timestamp was commited, make the commit our entry
 	 * now so that we will update it when needed.
 	 */
-	if (commit)
+	if (unlikely(commit))
 		rb_set_commit_event(cpu_buffer, event);
 	else if (!rb_is_commit(cpu_buffer, event))
 		delta = 0;
-- 
cgit v1.2.3


From 88eb0125362f2ab272cbaf84252cf101ddc2dec9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 11 May 2009 16:28:23 -0400
Subject: ring-buffer: use internal time stamp function

The ring_buffer_time_stamp that is exported adds a little more overhead
than is needed for using it internally. This patch adds an internal
timestamp function that can be inlined (a single line function)
and used internally for the ring buffer.

[ Impact: a little less overhead to the ring buffer ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f452de2ce49..a9e645a5bc1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -454,13 +454,18 @@ struct ring_buffer_iter {
 /* Up this if you want to test the TIME_EXTENTS and normalization */
 #define DEBUG_SHIFT 0
 
+static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu)
+{
+	/* shift to debug/test normalization and TIME_EXTENTS */
+	return buffer->clock() << DEBUG_SHIFT;
+}
+
 u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
 {
 	u64 time;
 
 	preempt_disable_notrace();
-	/* shift to debug/test normalization and TIME_EXTENTS */
-	time = buffer->clock() << DEBUG_SHIFT;
+	time = rb_time_stamp(buffer, cpu);
 	preempt_enable_no_resched_notrace();
 
 	return time;
@@ -1247,7 +1252,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 		cpu_buffer->tail_page = next_page;
 
 		/* reread the time stamp */
-		*ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu);
+		*ts = rb_time_stamp(buffer, cpu_buffer->cpu);
 		cpu_buffer->tail_page->page->time_stamp = *ts;
 	}
 
@@ -1413,7 +1418,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
 		return NULL;
 
-	ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
+	ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
 
 	/*
 	 * Only the first commit can update the timestamp.
-- 
cgit v1.2.3


From 168b6b1d0594c7866caa73b12f3b8d91075695f2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 11 May 2009 22:11:05 -0400
Subject: ring-buffer: move code around to remove some branches

This is a bit of micro-optimizations. But since the ring buffer is used
in tracing every function call, it is an extreme hot path. Every nanosecond
counts.

This change shows over 5% improvement in the ring-buffer-benchmark.

[ Impact: more efficient code ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a9e645a5bc1..16b24d49604 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1400,7 +1400,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 		      unsigned long length)
 {
 	struct ring_buffer_event *event;
-	u64 ts, delta;
+	u64 ts, delta = 0;
 	int commit = 0;
 	int nr_loops = 0;
 
@@ -1431,20 +1431,21 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
 		   rb_page_write(cpu_buffer->tail_page) ==
 		   rb_commit_index(cpu_buffer))) {
+		u64 diff;
 
-		delta = ts - cpu_buffer->write_stamp;
+		diff = ts - cpu_buffer->write_stamp;
 
-		/* make sure this delta is calculated here */
+		/* make sure this diff is calculated here */
 		barrier();
 
 		/* Did the write stamp get updated already? */
 		if (unlikely(ts < cpu_buffer->write_stamp))
-			delta = 0;
+			goto get_event;
 
-		else if (unlikely(test_time_stamp(delta))) {
+		delta = diff;
+		if (unlikely(test_time_stamp(delta))) {
 
 			commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
-
 			if (commit == -EBUSY)
 				return NULL;
 
@@ -1453,12 +1454,11 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 
 			RB_WARN_ON(cpu_buffer, commit < 0);
 		}
-	} else
-		/* Non commits have zero deltas */
-		delta = 0;
+	}
 
+ get_event:
 	event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
-	if (PTR_ERR(event) == -EAGAIN)
+	if (unlikely(PTR_ERR(event) == -EAGAIN))
 		goto again;
 
 	if (!event) {
-- 
cgit v1.2.3


From 1ec7c4849c214fc78b023230264399836ea3b245 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 14 May 2009 23:40:06 -0400
Subject: tracing: stop stack trace on first empty entry

The stack tracer stores eight entries in the ring buffer when an event
traces the stack. The output outputs all eight entries regardless of
how many entries were recorded.

This patch breaks out of the loop when a null entry is discovered.

[ Impact: only print the stack that is recorded ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8bd9a2c1a46..489c0e8ada0 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -898,6 +898,8 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
 	trace_assign_type(field, iter->ent);
 
 	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+		if (!field->caller[i])
+			break;
 		if (i) {
 			if (!trace_seq_puts(s, " <= "))
 				goto partial;
-- 
cgit v1.2.3


From 8cd995b6deedf98b7694ed32a786ee7f793d1eec Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 15 May 2009 11:07:27 +0800
Subject: tracing/filters: add missing unlock in a failure path

[ Impact: fix deadlock in a rare case we fail to allocate memory ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A0CDC6F.7070200@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 85ad6a8939a..22c29984fe0 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1079,9 +1079,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
 		return 0;
 	}
 
+	err = -ENOMEM;
 	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
 	if (!ps)
-		return -ENOMEM;
+		goto out_unlock;
 
 	filter_disable_preds(call);
 	replace_filter_string(call->filter, filter_string);
@@ -1101,7 +1102,7 @@ out:
 	filter_opstack_clear(ps);
 	postfix_clear(ps);
 	kfree(ps);
-
+out_unlock:
 	mutex_unlock(&filter_mutex);
 
 	return err;
@@ -1123,9 +1124,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
 		return 0;
 	}
 
+	err = -ENOMEM;
 	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
 	if (!ps)
-		return -ENOMEM;
+		goto out_unlock;
 
 	filter_free_subsystem_preds(system);
 	replace_filter_string(system->filter, filter_string);
@@ -1145,7 +1147,7 @@ out:
 	filter_opstack_clear(ps);
 	postfix_clear(ps);
 	kfree(ps);
-
+out_unlock:
 	mutex_unlock(&filter_mutex);
 
 	return err;
-- 
cgit v1.2.3


From 5872144f64b34a5942f6b4acedc90b02de72c58b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 15 May 2009 11:07:56 +0800
Subject: tracing/filters: fix off-by-one bug

We should leave the last slot for the ending '\0'.

[ Impact: fix possible crash when the length of an operand is 128 ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A0CDC8C.30602@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 22c29984fe0..a7430b16d24 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -736,7 +736,7 @@ static inline void clear_operand_string(struct filter_parse_state *ps)
 
 static inline int append_operand_char(struct filter_parse_state *ps, char c)
 {
-	if (ps->operand.tail == MAX_FILTER_STR_VAL)
+	if (ps->operand.tail == MAX_FILTER_STR_VAL - 1)
 		return -EINVAL;
 
 	ps->operand.string[ps->operand.tail++] = c;
-- 
cgit v1.2.3


From 24ed0c4bfc7d2d7507bb9d50f7f3bbdcd85d76dd Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Sun, 17 May 2009 15:31:38 +0800
Subject: tracing: fix check for return value of register_module_notifier

return zero should be correct, so fix it.

[ Impact: eliminate incorrect syslog message ]

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: rostedt@goodmis.org
LKML-Reference: <1242545498-7285-1-git-send-email-tom.leiming@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5b606f45b6c..140699a9a8a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2826,7 +2826,7 @@ void __init ftrace_init(void)
 				  __stop_mcount_loc);
 
 	ret = register_module_notifier(&ftrace_module_nb);
-	if (!ret)
+	if (ret)
 		pr_warning("Failed to register trace ftrace module notifier\n");
 
 	return;
-- 
cgit v1.2.3


From fd51d251e4cdb21f68e9dbc4336514d64a105a79 Mon Sep 17 00:00:00 2001
From: Stefan Raspl <raspl@linux.vnet.ibm.com>
Date: Tue, 19 May 2009 09:59:08 +0200
Subject: blktrace: remove debugfs entries on bad path

debugfs directory entries for devices are not removed on some
of the failure pathes in do_blk_trace_setup().
One way to reproduce is to start blktrace on multiple devices
with insufficient Vmalloc space: Devices will fail with
a message like this:

	BLKTRACESETUP(2) /dev/sdu failed: 5/Input/output error

If so, the respective entries in debugfs
(e.g. /sys/kernel/debug/block/sdu) will remain and subsequent
attempts to start blktrace on the respective devices will not
succeed due to existing directories.

[ Impact: fix /debug/tracing file cleanup corner case ]

Signed-off-by: Stefan Raspl <stefan.raspl@linux.vnet.ibm.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: schwidefsky@de.ibm.com
Cc: heiko.carstens@de.ibm.com
LKML-Reference: <4A1266CC.5040801@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 05b4747fd87..e3abf55bc8e 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -262,6 +262,7 @@ static void blk_trace_free(struct blk_trace *bt)
 {
 	debugfs_remove(bt->msg_file);
 	debugfs_remove(bt->dropped_file);
+	debugfs_remove(bt->dir);
 	relay_close(bt->rchan);
 	free_percpu(bt->sequence);
 	free_percpu(bt->msg_data);
-- 
cgit v1.2.3


From 5537937696c55530447c20aa27daccb8d0d29b33 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Mon, 18 May 2009 23:04:46 +0800
Subject: ftrace: fix check for return value of register_module_notifier in
 event_trace_init

register_module_notifier() returns zero in the success case.
So fix the inverted fail case check in trace events modules
handler.

[ Impact: fix spurious warning on ftrace initialization]

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0eec0c55dd8..9e91c4ad7c8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1174,7 +1174,7 @@ static __init int event_trace_init(void)
 	}
 
 	ret = register_module_notifier(&trace_module_nb);
-	if (!ret)
+	if (ret)
 		pr_warning("Failed to register trace events module notifier\n");
 
 	return 0;
-- 
cgit v1.2.3


From 4f5359685af6de7dca101393dc606620adbe963f Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Mon, 18 May 2009 19:35:34 +0800
Subject: tracing: add trace_event_read_lock()

I found that there is nothing to protect event_hash in
ftrace_find_event(). Rcu protects the event hashlist
but not the event itself while we use it after its extraction
through ftrace_find_event().

This lack of a proper locking in this spot opens a race
window between any event dereferencing and module removal.

Eg:

--Task A--

print_trace_line(trace) {
  event = find_ftrace_event(trace)

--Task B--

trace_module_remove_events(mod) {
  list_trace_events_module(ev, mod) {
    unregister_ftrace_event(ev->event) {
      hlist_del(ev->event->node)
        list_del(....)
    }
  }
}
|--> module removed, the event has been dropped

--Task A--

  event->print(trace); // Dereferencing freed memory

If the event retrieved belongs to a module and this module
is concurrently removed, we may end up dereferencing a data
from a freed module.

RCU could solve this, but it would add latency to the kernel and
forbid tracers output callbacks to call any sleepable code.
So this fix converts 'trace_event_mutex' to a read/write semaphore,
and adds trace_event_read_lock() to protect ftrace_find_event().

[ Impact: fix possible freed memory dereference in ftrace ]

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4A114806.7090302@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace.c        |  8 ++++++++
 kernel/trace/trace_output.c | 25 ++++++++++++++++++-------
 kernel/trace/trace_output.h |  2 ++
 3 files changed, 28 insertions(+), 7 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dd40d232034..02d32baa23a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1569,12 +1569,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 		p = s_next(m, p, &l);
 	}
 
+	trace_event_read_lock();
 	return p;
 }
 
 static void s_stop(struct seq_file *m, void *p)
 {
 	atomic_dec(&trace_record_cmdline_disabled);
+	trace_event_read_unlock();
 }
 
 static void print_lat_help_header(struct seq_file *m)
@@ -1817,6 +1819,7 @@ static int trace_empty(struct trace_iterator *iter)
 	return 1;
 }
 
+/*  Called with trace_event_read_lock() held. */
 static enum print_line_t print_trace_line(struct trace_iterator *iter)
 {
 	enum print_line_t ret;
@@ -3008,6 +3011,7 @@ waitagain:
 	       offsetof(struct trace_iterator, seq));
 	iter->pos = -1;
 
+	trace_event_read_lock();
 	while (find_next_entry_inc(iter) != NULL) {
 		enum print_line_t ret;
 		int len = iter->seq.len;
@@ -3024,6 +3028,7 @@ waitagain:
 		if (iter->seq.len >= cnt)
 			break;
 	}
+	trace_event_read_unlock();
 
 	/* Now copy what we have to the user */
 	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
@@ -3146,6 +3151,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		goto out_err;
 	}
 
+	trace_event_read_lock();
+
 	/* Fill as many pages as possible. */
 	for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
 		pages[i] = alloc_page(GFP_KERNEL);
@@ -3168,6 +3175,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		trace_seq_init(&iter->seq);
 	}
 
+	trace_event_read_unlock();
 	mutex_unlock(&iter->mutex);
 
 	spd.nr_pages = i;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 489c0e8ada0..7136420603a 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,7 +14,7 @@
 /* must be a power of 2 */
 #define EVENT_HASHSIZE	128
 
-static DEFINE_MUTEX(trace_event_mutex);
+static DECLARE_RWSEM(trace_event_mutex);
 static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
 static int next_event_type = __TRACE_LAST_TYPE + 1;
@@ -466,6 +466,7 @@ static int task_state_char(unsigned long state)
  * @type: the type of event to look for
  *
  * Returns an event of type @type otherwise NULL
+ * Called with trace_event_read_lock() held.
  */
 struct trace_event *ftrace_find_event(int type)
 {
@@ -475,7 +476,7 @@ struct trace_event *ftrace_find_event(int type)
 
 	key = type & (EVENT_HASHSIZE - 1);
 
-	hlist_for_each_entry_rcu(event, n, &event_hash[key], node) {
+	hlist_for_each_entry(event, n, &event_hash[key], node) {
 		if (event->type == type)
 			return event;
 	}
@@ -513,6 +514,16 @@ static int trace_search_list(struct list_head **list)
 	return last + 1;
 }
 
+void trace_event_read_lock(void)
+{
+	down_read(&trace_event_mutex);
+}
+
+void trace_event_read_unlock(void)
+{
+	up_read(&trace_event_mutex);
+}
+
 /**
  * register_ftrace_event - register output for an event type
  * @event: the event type to register
@@ -533,7 +544,7 @@ int register_ftrace_event(struct trace_event *event)
 	unsigned key;
 	int ret = 0;
 
-	mutex_lock(&trace_event_mutex);
+	down_write(&trace_event_mutex);
 
 	if (WARN_ON(!event))
 		goto out;
@@ -581,11 +592,11 @@ int register_ftrace_event(struct trace_event *event)
 
 	key = event->type & (EVENT_HASHSIZE - 1);
 
-	hlist_add_head_rcu(&event->node, &event_hash[key]);
+	hlist_add_head(&event->node, &event_hash[key]);
 
 	ret = event->type;
  out:
-	mutex_unlock(&trace_event_mutex);
+	up_write(&trace_event_mutex);
 
 	return ret;
 }
@@ -597,10 +608,10 @@ EXPORT_SYMBOL_GPL(register_ftrace_event);
  */
 int unregister_ftrace_event(struct trace_event *event)
 {
-	mutex_lock(&trace_event_mutex);
+	down_write(&trace_event_mutex);
 	hlist_del(&event->node);
 	list_del(&event->list);
-	mutex_unlock(&trace_event_mutex);
+	up_write(&trace_event_mutex);
 
 	return 0;
 }
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 6e220a8e570..ac240e76eb0 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -20,6 +20,8 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
 extern int trace_print_context(struct trace_iterator *iter);
 extern int trace_print_lat_context(struct trace_iterator *iter);
 
+extern void trace_event_read_lock(void);
+extern void trace_event_read_unlock(void);
 extern struct trace_event *ftrace_find_event(int type);
 
 extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
-- 
cgit v1.2.3


From b11c53e12f94a46b50bccc7a1a953d7ca1d54a31 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 25 May 2009 18:11:59 +0800
Subject: ftrace: Add task_comm support for trace_event

If we enable a trace event alone without any tracer running (such as
function tracer, sched switch tracer, etc...) it can't output enough
task command information.

We need to use the tracing_{start/stop}_cmdline_record() helpers
which are designed to keep track of cmdlines for any tasks that
were scheduled during the tracing.

Before this patch:
 # echo 1 > debugfs/tracing/events/sched/sched_switch/enable
 # cat debugfs/tracing/trace
 # tracer: nop
 #
 #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
 #              | |       |          |         |
            <...>-2289  [000] 526276.724790: sched_switch: task bash:2289 [120] ==> sshd:2287 [120]
            <...>-2287  [000] 526276.725231: sched_switch: task sshd:2287 [120] ==> bash:2289 [120]
            <...>-2289  [000] 526276.725452: sched_switch: task bash:2289 [120] ==> sshd:2287 [120]
            <...>-2287  [000] 526276.727181: sched_switch: task sshd:2287 [120] ==> swapper:0 [140]
           <idle>-0     [000] 526277.032734: sched_switch: task swapper:0 [140] ==> events/0:5 [115]
            <...>-5     [000] 526277.032782: sched_switch: task events/0:5 [115] ==> swapper:0 [140]
 ...

After this patch:
 # tracer: nop
 #
 #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
 #              | |       |          |         |
             bash-2269  [000] 527347.989229: sched_switch: task bash:2269 [120] ==> sshd:2267 [120]
             sshd-2267  [000] 527347.990960: sched_switch: task sshd:2267 [120] ==> bash:2269 [120]
             bash-2269  [000] 527347.991143: sched_switch: task bash:2269 [120] ==> sshd:2267 [120]
             sshd-2267  [000] 527347.992959: sched_switch: task sshd:2267 [120] ==> swapper:0 [140]
           <idle>-0     [000] 527348.531989: sched_switch: task swapper:0 [140] ==> events/0:5 [115]
         events/0-5     [000] 527348.532115: sched_switch: task events/0:5 [115] ==> swapper:0 [140]
 ...

Changelog:
v1->v2: Update Kconfig to select CONTEXT_SWITCH_TRACER in
        ENABLE_EVENT_TRACING
v2->v3: v2 can solve problem that was caused by config EVENT_TRACING
        alone, but when CONFIG_FTRACE is off and CONFIG_TRACING is
        selected by other config, compile fail happened again.
        This version solves it.

[ Impact: fix incomplete output of event tracing ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4A14FDFE.2080402@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/Kconfig        | 9 +++++++--
 kernel/trace/trace_events.c | 6 ++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f61be301578..a508b9d2adb 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,6 +49,11 @@ config FTRACE_NMI_ENTER
        default y
 
 config EVENT_TRACING
+	select CONTEXT_SWITCH_TRACER
+	bool
+
+config CONTEXT_SWITCH_TRACER
+	select MARKERS
 	bool
 
 config TRACING
@@ -176,10 +181,10 @@ config SCHED_TRACER
 	  This tracer tracks the latency of the highest priority task
 	  to be scheduled in, starting from the point it has woken up.
 
-config CONTEXT_SWITCH_TRACER
+config ENABLE_CONTEXT_SWITCH_TRACER
 	bool "Trace process context switches"
 	select TRACING
-	select MARKERS
+	select CONTEXT_SWITCH_TRACER
 	help
 	  This tracer gets called from the context switch and records
 	  all switching of tasks.
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9e91c4ad7c8..9b246eb01d5 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -85,6 +85,7 @@ static void ftrace_clear_events(void)
 
 		if (call->enabled) {
 			call->enabled = 0;
+			tracing_stop_cmdline_record();
 			call->unregfunc();
 		}
 	}
@@ -99,12 +100,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 	case 0:
 		if (call->enabled) {
 			call->enabled = 0;
+			tracing_stop_cmdline_record();
 			call->unregfunc();
 		}
 		break;
 	case 1:
 		if (!call->enabled) {
 			call->enabled = 1;
+			tracing_start_cmdline_record();
 			call->regfunc();
 		}
 		break;
@@ -1058,6 +1061,7 @@ static void trace_module_remove_events(struct module *mod)
 			found = true;
 			if (call->enabled) {
 				call->enabled = 0;
+				tracing_stop_cmdline_record();
 				call->unregfunc();
 			}
 			if (call->event)
@@ -1262,11 +1266,13 @@ static __init void event_trace_self_tests(void)
 		}
 
 		call->enabled = 1;
+		tracing_start_cmdline_record();
 		call->regfunc();
 
 		event_test_stuff();
 
 		call->unregfunc();
+		tracing_stop_cmdline_record();
 		call->enabled = 0;
 
 		pr_cont("OK\n");
-- 
cgit v1.2.3


From 0e907c99391362385c8e3af2c43b904dd1fd5d73 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 25 May 2009 18:13:59 +0800
Subject: ftrace: clean up of using ftrace_event_enable_disable()

Always use ftrace_event_enable_disable() to enable/disable an event
so that we can factorize out the event toggling code.

[ Impact: factorize and cleanup event tracing code ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4A14FDFE.2080402@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_events.c | 44 ++++++++++++++------------------------------
 1 file changed, 14 insertions(+), 30 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9b246eb01d5..6c81f9c2142 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -76,26 +76,9 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
 
 #endif /* CONFIG_MODULES */
 
-static void ftrace_clear_events(void)
-{
-	struct ftrace_event_call *call;
-
-	mutex_lock(&event_mutex);
-	list_for_each_entry(call, &ftrace_events, list) {
-
-		if (call->enabled) {
-			call->enabled = 0;
-			tracing_stop_cmdline_record();
-			call->unregfunc();
-		}
-	}
-	mutex_unlock(&event_mutex);
-}
-
 static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 					int enable)
 {
-
 	switch (enable) {
 	case 0:
 		if (call->enabled) {
@@ -114,6 +97,17 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 	}
 }
 
+static void ftrace_clear_events(void)
+{
+	struct ftrace_event_call *call;
+
+	mutex_lock(&event_mutex);
+	list_for_each_entry(call, &ftrace_events, list) {
+		ftrace_event_enable_disable(call, 0);
+	}
+	mutex_unlock(&event_mutex);
+}
+
 /*
  * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
  */
@@ -1059,11 +1053,7 @@ static void trace_module_remove_events(struct module *mod)
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
 		if (call->mod == mod) {
 			found = true;
-			if (call->enabled) {
-				call->enabled = 0;
-				tracing_stop_cmdline_record();
-				call->unregfunc();
-			}
+			ftrace_event_enable_disable(call, 0);
 			if (call->event)
 				unregister_ftrace_event(call->event);
 			debugfs_remove_recursive(call->dir);
@@ -1265,15 +1255,9 @@ static __init void event_trace_self_tests(void)
 			continue;
 		}
 
-		call->enabled = 1;
-		tracing_start_cmdline_record();
-		call->regfunc();
-
+		ftrace_event_enable_disable(call, 1);
 		event_test_stuff();
-
-		call->unregfunc();
-		tracing_stop_cmdline_record();
-		call->enabled = 0;
+		ftrace_event_enable_disable(call, 0);
 
 		pr_cont("OK\n");
 	}
-- 
cgit v1.2.3


From be74b73a57645cc253d881ab0c1014eb64b9cf22 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 26 May 2009 20:25:22 +0200
Subject: tracing: add __print_flags for events

Developers have been asking for the ability in the ftrace event tracer
to display names of bits in a flags variable.

Instead of printing out c2, it would be easier to read FOO|BAR|GOO,
assuming that FOO is bit 1, BAR is bit 6 and GOO is bit 7.

Some examples where this would be useful are the state flags in a context
switch, kmalloc flags, and even permision flags in accessing files.

[
  v2 changes include:

  Frederic Weisbecker's idea of using a mask instead of bits,
  thus we can output GFP_KERNEL instead of GPF_WAIT|GFP_IO|GFP_FS.

  Li Zefan's idea of allowing the caller of __print_flags to add their
  own delimiter (or no delimiter) where we can get for file permissions
  rwx instead of r|w|x.
]

[
  v3 changes:

   Christoph Hellwig's idea of using an array instead of va_args.
]

[ Impact: better displaying of flags in trace output ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_output.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 7136420603a..a4840c260c8 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -15,6 +15,9 @@
 #define EVENT_HASHSIZE	128
 
 static DECLARE_RWSEM(trace_event_mutex);
+
+DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
+
 static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
 static int next_event_type = __TRACE_LAST_TYPE + 1;
@@ -212,6 +215,42 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
 	return 0;
 }
 
+const char *
+ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
+		       unsigned long flags,
+		       const struct trace_print_flags *flag_array)
+{
+	unsigned long mask;
+	const char *str;
+	int i;
+
+	trace_seq_init(p);
+
+	for (i = 0;  flag_array[i].name && flags; i++) {
+
+		mask = flag_array[i].mask;
+		if ((flags & mask) != mask)
+			continue;
+
+		str = flag_array[i].name;
+		flags &= ~mask;
+		if (p->len && delim)
+			trace_seq_puts(p, delim);
+		trace_seq_puts(p, str);
+	}
+
+	/* check for left over flags */
+	if (flags) {
+		if (p->len && delim)
+			trace_seq_puts(p, delim);
+		trace_seq_printf(p, "0x%lx", flags);
+	}
+
+	trace_seq_putc(p, 0);
+
+	return p->buffer;
+}
+
 #ifdef CONFIG_KRETPROBES
 static inline const char *kretprobed(const char *name)
 {
-- 
cgit v1.2.3


From 0f4fc29dd68dfab9c6ddd5d087d34a5b6818cb00 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 20 May 2009 19:21:47 -0400
Subject: tracing: add __print_symbolic to trace events

This patch adds __print_symbolic which is similar to __print_flags but
works for an enumeration type instead. That is, there is only a one to one
mapping between the values and the symbols. When a match is made, then
it is printed, otherwise the hex value is outputed.

[ Impact: add interface for showing symbol names in events ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_output.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index a4840c260c8..c12d95db2f5 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -251,6 +251,31 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 	return p->buffer;
 }
 
+const char *
+ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
+			 const struct trace_print_flags *symbol_array)
+{
+	int i;
+
+	trace_seq_init(p);
+
+	for (i = 0;  symbol_array[i].name; i++) {
+
+		if (val != symbol_array[i].mask)
+			continue;
+
+		trace_seq_puts(p, symbol_array[i].name);
+		break;
+	}
+
+	if (!p->len)
+		trace_seq_printf(p, "0x%lx", val);
+		
+	trace_seq_putc(p, 0);
+
+	return p->buffer;
+}
+
 #ifdef CONFIG_KRETPROBES
 static inline const char *kretprobed(const char *name)
 {
-- 
cgit v1.2.3


From 5b6045a906f48d37591365c5dcdd6d1d146bfd4a Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 26 May 2009 17:28:02 +0200
Subject: trace: disable preemption before taking raw spinlocks

s390 code uses smp_processor_id() in __raw_spin_lock() code which
reveals that a (raw) spinlock is taken without preemption disabled.
This can potentially deadlock.

To fix this explicitly disable and enable preemption.

BUG: using smp_processor_id() in preemptible [00000000] code: cat/2278
caller is trace_find_cmdline+0x40/0xfc
CPU: 0 Not tainted 2.6.30-rc7-dirty #39
Process cat (pid: 2278, task: 000000003faedb68, ksp: 000000003b33b988)
000000003b33b988 000000003b33bae0 0000000000000002 0000000000000000
       000000003b33bb80 000000003b33baf8 000000003b33baf8 00000000000175d6
       0000000000000001 000000003b33b988 000000003f9b0000 000000000000000b
       000000000000000c 000000003b33bb40 000000003b33bae0 0000000000000000
       0000000000000000 00000000000175d6 000000003b33bae0 000000003b33bb28
Call Trace:
([<00000000000174b2>] show_trace+0x112/0x170)
 [<0000000000017582>] show_stack+0x72/0x100
 [<0000000000441538>] dump_stack+0xc8/0xd8
 [<000000000025c350>] debug_smp_processor_id+0x114/0x130
 [<00000000000bf0e4>] trace_find_cmdline+0x40/0xfc
 [<00000000000c35d4>] trace_print_context+0x58/0xac
 [<00000000000bb676>] print_trace_line+0x416/0x470
 [<00000000000bc8fe>] s_show+0x4e/0x428
 [<000000000013834e>] seq_read+0x36a/0x5d4
 [<0000000000112a78>] vfs_read+0xc8/0x174
 [<0000000000112c58>] SyS_read+0x74/0xc4
 [<000000000002c7ae>] sysc_noemu+0x10/0x16
 [<000002000012436c>] 0x2000012436c
1 lock held by cat/2278:
 #0:  (&p->lock){+.+.+.}, at: [<0000000000138056>] seq_read+0x72/0x5d4

[ Impact: fix preempt-unsafe raw spinlock ]

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 02d32baa23a..a3a8a87d7e9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -808,6 +808,7 @@ void trace_find_cmdline(int pid, char comm[])
 		return;
 	}
 
+	preempt_disable();
 	__raw_spin_lock(&trace_cmdline_lock);
 	map = map_pid_to_cmdline[pid];
 	if (map != NO_CMDLINE_MAP)
@@ -816,6 +817,7 @@ void trace_find_cmdline(int pid, char comm[])
 		strcpy(comm, "<...>");
 
 	__raw_spin_unlock(&trace_cmdline_lock);
+	preempt_enable();
 }
 
 void tracing_record_cmdline(struct task_struct *tsk)
-- 
cgit v1.2.3


From fb39125fd79a25c5002f3b45cf4c80e3fa6b961b Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Fri, 17 Apr 2009 15:15:51 +0800
Subject: ftrace, workqueuetrace: make workqueue tracepoints use TRACE_EVENT
 macro

v3: zhaolei@cn.fujitsu.com: Change TRACE_EVENT definition to new format
    introduced by Steven Rostedt: consolidate trace and trace_event headers
v2: kosaki@jp.fujitsu.com: print the function names instead of addr, and zap
    the work addr
v1: zhaolei@cn.fujitsu.com: Make workqueue tracepoints use TRACE_EVENT macro

TRACE_EVENT is a more generic way to define tracepoints.
Doing so adds these new capabilities to the tracepoints:

  - zero-copy and per-cpu splice() tracing
  - binary tracing without printf overhead
  - structured logging records exposed under /debug/tracing/events
  - trace events embedded in function tracer output and other plugins
  - user-defined, per tracepoint filter expressions

Then, this patch converts DEFINE_TRACE to TRACE_EVENT in workqueue related
tracepoints.

[ Impact: expand workqueue tracer to events tracing ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 984b9175c13..cfe56d31d85 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -6,7 +6,7 @@
  */
 
 
-#include <trace/workqueue.h>
+#include <trace/events/workqueue.h>
 #include <linux/list.h>
 #include <linux/percpu.h>
 #include "trace_stat.h"
-- 
cgit v1.2.3


From 1fdfca9c577aac96a559c1ea68f5c9156f17d636 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 20 Apr 2009 14:58:26 +0800
Subject: trace_workqueue: use list_for_each_entry() instead of
 list_for_each_entry_safe()

No need to use list_for_each_entry_safe() in iteration without deleting
any node, we can use list_for_each_entry() instead.

[ Impact: cleanup ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_workqueue.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index cfe56d31d85..128b64b93f1 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -47,12 +47,11 @@ probe_workqueue_insertion(struct task_struct *wq_thread,
 			  struct work_struct *work)
 {
 	int cpu = cpumask_first(&wq_thread->cpus_allowed);
-	struct cpu_workqueue_stats *node, *next;
+	struct cpu_workqueue_stats *node;
 	unsigned long flags;
 
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
-							list) {
+	list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
 		if (node->pid == wq_thread->pid) {
 			atomic_inc(&node->inserted);
 			goto found;
@@ -69,12 +68,11 @@ probe_workqueue_execution(struct task_struct *wq_thread,
 			  struct work_struct *work)
 {
 	int cpu = cpumask_first(&wq_thread->cpus_allowed);
-	struct cpu_workqueue_stats *node, *next;
+	struct cpu_workqueue_stats *node;
 	unsigned long flags;
 
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
-							list) {
+	list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
 		if (node->pid == wq_thread->pid) {
 			node->executed++;
 			goto found;
-- 
cgit v1.2.3


From b8867164f05791a6b5363bd51c1274e03600886e Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 20 Apr 2009 14:59:36 +0800
Subject: trace_workqueue: remove cpu_workqueue_stats->first_entry

cpu_workqueue_stats->first_entry is useless because we can retrieve the
header of a cpu workqueue using:
if (&cpu_workqueue_stats->list == workqueue_cpu_stat(cpu)->list.next)

[ Impact: cleanup ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_workqueue.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 128b64b93f1..890974aed64 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -16,8 +16,6 @@
 /* A cpu workqueue thread */
 struct cpu_workqueue_stats {
 	struct list_head            list;
-/* Useful to know if we print the cpu headers */
-	bool		            first_entry;
 	int		            cpu;
 	pid_t			    pid;
 /* Can be inserted from interrupt or user context, need to be atomic */
@@ -103,8 +101,6 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
 	cws->pid = wq_thread->pid;
 
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	if (list_empty(&workqueue_cpu_stat(cpu)->list))
-		cws->first_entry = true;
 	list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
 	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 }
-- 
cgit v1.2.3


From f3c4ae26e93d354152196b62797ba86ad86dd0cc Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 20 Apr 2009 15:02:17 +0800
Subject: trace_workqueue: remove blank line between each cpu

The blankline between each cpu's workqueue stat is not necessary, because
the cpu number is enough to part them by eye.
Old style also caused a blankline below headline, and made code complex
by using lock, disableirq and get cpu var.

Old style:
 # CPU  INSERTED  EXECUTED   NAME
 # |      |         |          |

   0   8644       8644       events/0
   0      0          0       cpuset
   ...
   0      1          1       kdmflush

   1  35365      35365       events/1
   ...

New style:
 # CPU  INSERTED  EXECUTED   NAME
 # |      |         |          |

   0   8644       8644       events/0
   0      0          0       cpuset
   ...
   0      1          1       kdmflush
   1  35365      35365       events/1
   ...

[ Impact: provide more readable code ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_workqueue.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 890974aed64..97fcea4acce 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -185,16 +185,9 @@ static void *workqueue_stat_next(void *prev, int idx)
 static int workqueue_stat_show(struct seq_file *s, void *p)
 {
 	struct cpu_workqueue_stats *cws = p;
-	unsigned long flags;
-	int cpu = cws->cpu;
 	struct pid *pid;
 	struct task_struct *tsk;
 
-	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
-		seq_printf(s, "\n");
-	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
 	pid = find_get_pid(cws->pid);
 	if (pid) {
 		tsk = get_pid_task(pid, PIDTYPE_PID);
-- 
cgit v1.2.3


From 0d64f8342de26d02451900b1aad94716fe92c4ab Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 16 May 2009 05:58:49 +0200
Subject: tracing/stat: replace trace_stat_session by stat_session

The "trace" prefix in struct trace_stat_session type is annoying while
reading the trace_stat.c file. It makes the lines longer, and
is not that much useful to explain the sense of this type.

Just keep "struct stat_session" for this type.

[ Impact: make the code a bit more readable ]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_stat.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index fdde3a4a94c..3b6816be825 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -22,7 +22,7 @@ struct trace_stat_list {
 };
 
 /* A stat session is the stats output in one file */
-struct tracer_stat_session {
+struct stat_session {
 	struct list_head	session_list;
 	struct tracer_stat	*ts;
 	struct list_head	stat_list;
@@ -38,7 +38,7 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);
 static struct dentry		*stat_dir;
 
 
-static void reset_stat_session(struct tracer_stat_session *session)
+static void reset_stat_session(struct stat_session *session)
 {
 	struct trace_stat_list *node, *next;
 
@@ -48,7 +48,7 @@ static void reset_stat_session(struct tracer_stat_session *session)
 	INIT_LIST_HEAD(&session->stat_list);
 }
 
-static void destroy_session(struct tracer_stat_session *session)
+static void destroy_session(struct stat_session *session)
 {
 	debugfs_remove(session->file);
 	reset_stat_session(session);
@@ -71,7 +71,7 @@ static int dummy_cmp(void *p1, void *p2)
  * All of these copies and sorting are required on all opening
  * since the stats could have changed between two file sessions.
  */
-static int stat_seq_init(struct tracer_stat_session *session)
+static int stat_seq_init(struct stat_session *session)
 {
 	struct trace_stat_list *iter_entry, *new_entry;
 	struct tracer_stat *ts = session->ts;
@@ -154,7 +154,7 @@ exit_free_list:
 
 static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 {
-	struct tracer_stat_session *session = s->private;
+	struct stat_session *session = s->private;
 
 	/* Prevent from tracer switch or stat_list modification */
 	mutex_lock(&session->stat_mutex);
@@ -168,7 +168,7 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 
 static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
 {
-	struct tracer_stat_session *session = s->private;
+	struct stat_session *session = s->private;
 
 	if (p == SEQ_START_TOKEN)
 		return seq_list_start(&session->stat_list, *pos);
@@ -178,13 +178,13 @@ static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
 
 static void stat_seq_stop(struct seq_file *s, void *p)
 {
-	struct tracer_stat_session *session = s->private;
+	struct stat_session *session = s->private;
 	mutex_unlock(&session->stat_mutex);
 }
 
 static int stat_seq_show(struct seq_file *s, void *v)
 {
-	struct tracer_stat_session *session = s->private;
+	struct stat_session *session = s->private;
 	struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
 
 	if (v == SEQ_START_TOKEN)
@@ -205,7 +205,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
 {
 	int ret;
 
-	struct tracer_stat_session *session = inode->i_private;
+	struct stat_session *session = inode->i_private;
 
 	ret = seq_open(file, &trace_stat_seq_ops);
 	if (!ret) {
@@ -222,7 +222,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
  */
 static int tracing_stat_release(struct inode *i, struct file *f)
 {
-	struct tracer_stat_session *session = i->i_private;
+	struct stat_session *session = i->i_private;
 
 	mutex_lock(&session->stat_mutex);
 	reset_stat_session(session);
@@ -251,7 +251,7 @@ static int tracing_stat_init(void)
 	return 0;
 }
 
-static int init_stat_file(struct tracer_stat_session *session)
+static int init_stat_file(struct stat_session *session)
 {
 	if (!stat_dir && tracing_stat_init())
 		return -ENODEV;
@@ -266,7 +266,7 @@ static int init_stat_file(struct tracer_stat_session *session)
 
 int register_stat_tracer(struct tracer_stat *trace)
 {
-	struct tracer_stat_session *session, *node, *tmp;
+	struct stat_session *session, *node, *tmp;
 	int ret;
 
 	if (!trace)
@@ -286,7 +286,7 @@ int register_stat_tracer(struct tracer_stat *trace)
 	mutex_unlock(&all_stat_sessions_mutex);
 
 	/* Init the session */
-	session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL);
+	session = kmalloc(sizeof(struct stat_session), GFP_KERNEL);
 	if (!session)
 		return -ENOMEM;
 
@@ -312,7 +312,7 @@ int register_stat_tracer(struct tracer_stat *trace)
 
 void unregister_stat_tracer(struct tracer_stat *trace)
 {
-	struct tracer_stat_session *node, *tmp;
+	struct stat_session *node, *tmp;
 
 	mutex_lock(&all_stat_sessions_mutex);
 	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
-- 
cgit v1.2.3


From 8f184f27300f66f6dcc8296c2dae7a1fbe8429c9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 16 May 2009 06:24:36 +0200
Subject: tracing/stat: replace linked list by an rbtree for sorting

When the stat tracing framework prepares the entries from a tracer
to output them to the user, it starts by computing a linear sort
through a linked list to give the entries ordered by relevance
to the user.

This is quite ugly and causes a small latency when we begin to
read the file.

This patch changes that by turning the linked list into a red-black
tree. Athough the whole iteration using the start and next tracer
callbacks while opening the file remain the same, it is now much
more fast and scalable.

The rbtree guarantees O(log(n)) insertions whereas a linked
list with linear sorting brought us a O(n) despair. Now the
(visible) latency has disapeared.

[ Impact: kill the latency while starting to read a stat tracer file ]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_stat.c | 140 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 100 insertions(+), 40 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 3b6816be825..0bd0fc82da5 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -1,7 +1,7 @@
 /*
  * Infrastructure for statistic tracing (histogram output).
  *
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ * Copyright (C) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
  *
  * Based on the code from trace_branch.c which is
  * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
@@ -10,14 +10,19 @@
 
 
 #include <linux/list.h>
+#include <linux/rbtree.h>
 #include <linux/debugfs.h>
 #include "trace_stat.h"
 #include "trace.h"
 
 
-/* List of stat entries from a tracer */
-struct trace_stat_list {
-	struct list_head	list;
+/*
+ * List of stat red-black nodes from a tracer
+ * We use a such tree to sort quickly the stat
+ * entries from the tracer.
+ */
+struct stat_node {
+	struct rb_node		node;
 	void			*stat;
 };
 
@@ -25,7 +30,7 @@ struct trace_stat_list {
 struct stat_session {
 	struct list_head	session_list;
 	struct tracer_stat	*ts;
-	struct list_head	stat_list;
+	struct rb_root		stat_root;
 	struct mutex		stat_mutex;
 	struct dentry		*file;
 };
@@ -37,15 +42,45 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);
 /* The root directory for all stat files */
 static struct dentry		*stat_dir;
 
+/*
+ * Iterate through the rbtree using a post order traversal path
+ * to release the next node.
+ * It won't necessary release one at each iteration
+ * but it will at least advance closer to the next one
+ * to be released.
+ */
+static struct rb_node *release_next(struct rb_node *node)
+{
+	struct stat_node *snode;
+	struct rb_node *parent = rb_parent(node);
+
+	if (node->rb_left)
+		return node->rb_left;
+	else if (node->rb_right)
+		return node->rb_right;
+	else {
+		if (!parent)
+			return NULL;
+		if (parent->rb_left == node)
+			parent->rb_left = NULL;
+		else
+			parent->rb_right = NULL;
+
+		snode = container_of(node, struct stat_node, node);
+		kfree(snode);
+
+		return parent;
+	}
+}
 
 static void reset_stat_session(struct stat_session *session)
 {
-	struct trace_stat_list *node, *next;
+	struct rb_node *node = session->stat_root.rb_node;
 
-	list_for_each_entry_safe(node, next, &session->stat_list, list)
-		kfree(node);
+	while (node)
+		node = release_next(node);
 
-	INIT_LIST_HEAD(&session->stat_list);
+	session->stat_root = RB_ROOT;
 }
 
 static void destroy_session(struct stat_session *session)
@@ -56,6 +91,35 @@ static void destroy_session(struct stat_session *session)
 	kfree(session);
 }
 
+typedef int (*cmp_stat_t)(void *, void *);
+
+static void
+insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	/*
+	 * Figure out where to put new node
+	 * This is a descendent sorting
+	 */
+	while (*new) {
+		struct stat_node *this;
+		int result;
+
+		this = container_of(*new, struct stat_node, node);
+		result = cmp(data->stat, this->stat);
+
+		parent = *new;
+		if (result >= 0)
+			new = &((*new)->rb_left);
+		else
+			new = &((*new)->rb_right);
+	}
+
+	rb_link_node(&data->node, parent, new);
+	rb_insert_color(&data->node, root);
+}
+
 /*
  * For tracers that don't provide a stat_cmp callback.
  * This one will force an immediate insertion on tail of
@@ -73,8 +137,9 @@ static int dummy_cmp(void *p1, void *p2)
  */
 static int stat_seq_init(struct stat_session *session)
 {
-	struct trace_stat_list *iter_entry, *new_entry;
 	struct tracer_stat *ts = session->ts;
+	struct stat_node *new_entry;
+	struct rb_root *root;
 	void *stat;
 	int ret = 0;
 	int i;
@@ -93,15 +158,13 @@ static int stat_seq_init(struct stat_session *session)
 	 * The first entry. Actually this is the second, but the first
 	 * one (the stat_list head) is pointless.
 	 */
-	new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
+	new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL);
 	if (!new_entry) {
 		ret = -ENOMEM;
 		goto exit;
 	}
-
-	INIT_LIST_HEAD(&new_entry->list);
-
-	list_add(&new_entry->list, &session->stat_list);
+	root = &session->stat_root;
+	insert_stat(root, new_entry, dummy_cmp);
 
 	new_entry->stat = stat;
 
@@ -116,31 +179,17 @@ static int stat_seq_init(struct stat_session *session)
 		if (!stat)
 			break;
 
-		new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
+		new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL);
 		if (!new_entry) {
 			ret = -ENOMEM;
 			goto exit_free_list;
 		}
 
-		INIT_LIST_HEAD(&new_entry->list);
 		new_entry->stat = stat;
 
-		list_for_each_entry_reverse(iter_entry, &session->stat_list,
-				list) {
-
-			/* Insertion with a descendent sorting */
-			if (ts->stat_cmp(iter_entry->stat,
-					new_entry->stat) >= 0) {
-
-				list_add(&new_entry->list, &iter_entry->list);
-				break;
-			}
-		}
-
-		/* The current larger value */
-		if (list_empty(&new_entry->list))
-			list_add(&new_entry->list, &session->stat_list);
+		insert_stat(root, new_entry, ts->stat_cmp);
 	}
+
 exit:
 	mutex_unlock(&session->stat_mutex);
 	return ret;
@@ -155,25 +204,38 @@ exit_free_list:
 static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 {
 	struct stat_session *session = s->private;
+	struct rb_node *node;
+	int i;
 
 	/* Prevent from tracer switch or stat_list modification */
 	mutex_lock(&session->stat_mutex);
 
 	/* If we are in the beginning of the file, print the headers */
-	if (!*pos && session->ts->stat_headers)
+	if (!*pos && session->ts->stat_headers) {
+		(*pos)++;
 		return SEQ_START_TOKEN;
+	}
 
-	return seq_list_start(&session->stat_list, *pos);
+	node = rb_first(&session->stat_root);
+	for (i = 0; node && i < *pos; i++)
+		node = rb_next(node);
+
+	(*pos)++;
+
+	return node;
 }
 
 static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
 {
 	struct stat_session *session = s->private;
+	struct rb_node *node = p;
+
+	(*pos)++;
 
 	if (p == SEQ_START_TOKEN)
-		return seq_list_start(&session->stat_list, *pos);
+		return rb_first(&session->stat_root);
 
-	return seq_list_next(p, &session->stat_list, pos);
+	return rb_next(node);
 }
 
 static void stat_seq_stop(struct seq_file *s, void *p)
@@ -185,7 +247,7 @@ static void stat_seq_stop(struct seq_file *s, void *p)
 static int stat_seq_show(struct seq_file *s, void *v)
 {
 	struct stat_session *session = s->private;
-	struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
+	struct stat_node *l = container_of(v, struct stat_node, node);
 
 	if (v == SEQ_START_TOKEN)
 		return session->ts->stat_headers(s);
@@ -286,15 +348,13 @@ int register_stat_tracer(struct tracer_stat *trace)
 	mutex_unlock(&all_stat_sessions_mutex);
 
 	/* Init the session */
-	session = kmalloc(sizeof(struct stat_session), GFP_KERNEL);
+	session = kzalloc(sizeof(*session), GFP_KERNEL);
 	if (!session)
 		return -ENOMEM;
 
 	session->ts = trace;
 	INIT_LIST_HEAD(&session->session_list);
-	INIT_LIST_HEAD(&session->stat_list);
 	mutex_init(&session->stat_mutex);
-	session->file = NULL;
 
 	ret = init_stat_file(session);
 	if (ret) {
-- 
cgit v1.2.3


From b3dd7ba7d862707800c7ac45068f14ade2b65155 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 27 May 2009 11:04:26 +0800
Subject: tracing/stat: change dummpy_cmp() to return -1

Currently the output of trace_stat/workqueues is totally reversed:

 # cat /debug/tracing/trace_stat/workqueues
    ...
    1       17       17      210       37   `-blk_unplug_work+0x0/0x57
    1     3779     3779      181       11   |-cfq_kick_queue+0x0/0x2f
    1     3796     3796                     kblockd/1:120
    ...

The correct output should be:

    1     3796     3796                     kblockd/1:120
    1     3779     3779      181       11   |-cfq_kick_queue+0x0/0x2f
    1       17       17      210       37   `-blk_unplug_work+0x0/0x57

It's caused by "tracing/stat: replace linked list by an rbtree for
sorting"
(53059c9b67a62a3dc8c80204d3da42b9267ea5a0).

dummpy_cmp() should return -1, so rb_node will always be inserted as
right-most node in the rbtree, thus we sort the output in ascending
order.

[ Impact: fix the output of trace_stat/workqueues ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_stat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 0bd0fc82da5..5816d1aebcc 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -127,7 +127,7 @@ insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp)
  */
 static int dummy_cmp(void *p1, void *p2)
 {
-	return 1;
+	return -1;
 }
 
 /*
-- 
cgit v1.2.3


From e16228069083a2f6b94383ac5739aea7a0f38ce4 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 27 May 2009 11:04:48 +0800
Subject: tracing/stat: remember to free root node

When closing a trace_stat file, we destroy the rbtree constructed during
file open, but there is memory leak that the root node is not freed.

[ Impact: fix memory leak when closing a trace_stat file ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_stat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 5816d1aebcc..8030ec98dba 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -60,8 +60,8 @@ static struct rb_node *release_next(struct rb_node *node)
 		return node->rb_right;
 	else {
 		if (!parent)
-			return NULL;
-		if (parent->rb_left == node)
+			;
+		else if (parent->rb_left == node)
 			parent->rb_left = NULL;
 		else
 			parent->rb_right = NULL;
-- 
cgit v1.2.3


From dbd3fbdfeecfad4e71139db05d72560c3583e2a9 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 27 May 2009 11:42:46 +0800
Subject: tracing/stat: do some cleanups

- remove duplicate code in stat_seq_init()
- update comments to reflect the change from stat list to stat rbtree

[ Impact: clean up ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_stat.c | 54 ++++++++++++++++++-----------------------------
 1 file changed, 21 insertions(+), 33 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 8030ec98dba..17f20ebdad2 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -93,10 +93,15 @@ static void destroy_session(struct stat_session *session)
 
 typedef int (*cmp_stat_t)(void *, void *);
 
-static void
-insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp)
+static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp)
 {
 	struct rb_node **new = &(root->rb_node), *parent = NULL;
+	struct stat_node *data;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+	data->stat = stat;
 
 	/*
 	 * Figure out where to put new node
@@ -118,12 +123,13 @@ insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp)
 
 	rb_link_node(&data->node, parent, new);
 	rb_insert_color(&data->node, root);
+	return 0;
 }
 
 /*
  * For tracers that don't provide a stat_cmp callback.
- * This one will force an immediate insertion on tail of
- * the list.
+ * This one will force an insertion as right-most node
+ * in the rbtree.
  */
 static int dummy_cmp(void *p1, void *p2)
 {
@@ -131,15 +137,14 @@ static int dummy_cmp(void *p1, void *p2)
 }
 
 /*
- * Initialize the stat list at each trace_stat file opening.
+ * Initialize the stat rbtree at each trace_stat file opening.
  * All of these copies and sorting are required on all opening
  * since the stats could have changed between two file sessions.
  */
 static int stat_seq_init(struct stat_session *session)
 {
 	struct tracer_stat *ts = session->ts;
-	struct stat_node *new_entry;
-	struct rb_root *root;
+	struct rb_root *root = &session->stat_root;
 	void *stat;
 	int ret = 0;
 	int i;
@@ -154,23 +159,12 @@ static int stat_seq_init(struct stat_session *session)
 	if (!stat)
 		goto exit;
 
-	/*
-	 * The first entry. Actually this is the second, but the first
-	 * one (the stat_list head) is pointless.
-	 */
-	new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL);
-	if (!new_entry) {
-		ret = -ENOMEM;
+	ret = insert_stat(root, stat, ts->stat_cmp);
+	if (ret)
 		goto exit;
-	}
-	root = &session->stat_root;
-	insert_stat(root, new_entry, dummy_cmp);
-
-	new_entry->stat = stat;
 
 	/*
-	 * Iterate over the tracer stat entries and store them in a sorted
-	 * list.
+	 * Iterate over the tracer stat entries and store them in an rbtree.
 	 */
 	for (i = 1; ; i++) {
 		stat = ts->stat_next(stat, i);
@@ -179,22 +173,16 @@ static int stat_seq_init(struct stat_session *session)
 		if (!stat)
 			break;
 
-		new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL);
-		if (!new_entry) {
-			ret = -ENOMEM;
-			goto exit_free_list;
-		}
-
-		new_entry->stat = stat;
-
-		insert_stat(root, new_entry, ts->stat_cmp);
+		ret = insert_stat(root, stat, ts->stat_cmp);
+		if (ret)
+			goto exit_free_rbtree;
 	}
 
 exit:
 	mutex_unlock(&session->stat_mutex);
 	return ret;
 
-exit_free_list:
+exit_free_rbtree:
 	reset_stat_session(session);
 	mutex_unlock(&session->stat_mutex);
 	return ret;
@@ -207,7 +195,7 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 	struct rb_node *node;
 	int i;
 
-	/* Prevent from tracer switch or stat_list modification */
+	/* Prevent from tracer switch or rbtree modification */
 	mutex_lock(&session->stat_mutex);
 
 	/* If we are in the beginning of the file, print the headers */
@@ -280,7 +268,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
 }
 
 /*
- * Avoid consuming memory with our now useless list.
+ * Avoid consuming memory with our now useless rbtree.
  */
 static int tracing_stat_release(struct inode *i, struct file *f)
 {
-- 
cgit v1.2.3


From 43bd1236234cacbc18d1476a9b57e7a306efddf5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 30 May 2009 04:25:30 +0200
Subject: tracing/stat: remove unappropriate safe walk on list

register_stat_tracer() uses list_for_each_entry_safe
to check whether a tracer is already present in the list.
But we don't delete anything from the list here, so
we don't need the safe version

[ Impact: cleanup list use is stat tracing ]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_stat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 17f20ebdad2..c00643733f4 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -316,7 +316,7 @@ static int init_stat_file(struct stat_session *session)
 
 int register_stat_tracer(struct tracer_stat *trace)
 {
-	struct stat_session *session, *node, *tmp;
+	struct stat_session *session, *node;
 	int ret;
 
 	if (!trace)
@@ -327,7 +327,7 @@ int register_stat_tracer(struct tracer_stat *trace)
 
 	/* Already registered? */
 	mutex_lock(&all_stat_sessions_mutex);
-	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
+	list_for_each_entry(node, &all_stat_sessions, session_list) {
 		if (node->ts == trace) {
 			mutex_unlock(&all_stat_sessions_mutex);
 			return -EINVAL;
-- 
cgit v1.2.3


From 2af15d6a44b871ad4c2a651302374cde8f335480 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 28 May 2009 13:37:24 -0400
Subject: ftrace: add kernel command line function filtering

When using ftrace=function on the command line to trace functions
on boot up, one can not filter out functions that are commonly called.

This patch adds two new ftrace command line commands.

  ftrace_notrace=function-list
  ftrace_filter=function-list

Where function-list is a comma separated list of functions to filter.
The ftrace_notrace will make the functions listed not be included
in the function tracing, and ftrace_filter will only trace the functions
listed.

These two act the same as the debugfs/tracing/set_ftrace_notrace and
debugfs/tracing/set_ftrace_filter respectively.

The simple glob expressions that are allowed by the filter files can also
be used by the command line interface.

	ftrace_notrace=rcu*,*lock,*spin*

Will not trace any function that starts with rcu, ends with lock, or has
the word spin in it.

Note, if the self tests are enabled, they may interfere with the filtering
set by the command lines.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 140699a9a8a..2074e5b7766 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -32,6 +32,7 @@
 #include <trace/events/sched.h>
 
 #include <asm/ftrace.h>
+#include <asm/setup.h>
 
 #include "trace_output.h"
 #include "trace_stat.h"
@@ -2369,6 +2370,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)
 	ftrace_set_regex(buf, len, reset, 0);
 }
 
+/*
+ * command line interface to allow users to set filters on boot up.
+ */
+#define FTRACE_FILTER_SIZE		COMMAND_LINE_SIZE
+static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
+static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
+
+static int __init set_ftrace_notrace(char *str)
+{
+	strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+	return 1;
+}
+__setup("ftrace_notrace=", set_ftrace_notrace);
+
+static int __init set_ftrace_filter(char *str)
+{
+	strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+	return 1;
+}
+__setup("ftrace_filter=", set_ftrace_filter);
+
+static void __init set_ftrace_early_filter(char *buf, int enable)
+{
+	char *func;
+
+	while (buf) {
+		func = strsep(&buf, ",");
+		ftrace_set_regex(func, strlen(func), 0, enable);
+	}
+}
+
+static void __init set_ftrace_early_filters(void)
+{
+	if (ftrace_filter_buf[0])
+		set_ftrace_early_filter(ftrace_filter_buf, 1);
+	if (ftrace_notrace_buf[0])
+		set_ftrace_early_filter(ftrace_notrace_buf, 0);
+}
+
 static int
 ftrace_regex_release(struct inode *inode, struct file *file, int enable)
 {
@@ -2829,6 +2869,8 @@ void __init ftrace_init(void)
 	if (ret)
 		pr_warning("Failed to register trace ftrace module notifier\n");
 
+	set_ftrace_early_filters();
+
 	return;
  failed:
 	ftrace_disabled = 1;
-- 
cgit v1.2.3


From 5e0a093910876882f91f1d4b8a1635a099e6c7ba Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 28 May 2009 15:50:13 -0400
Subject: tracing: fix config options to not show when automatically selected

There are two options that are selected by all tracers, but we want
to have those options available when no tracer is selected. These are

 The event tracer and sched switch tracer.

The are enabled by all tracers, but if a tracer is not selected we want
the options to appear. All tracers including them select TRACING.
Thus what we would like to do is:

  config EVENT_TRACER
	bool "prompt"
	depends on TRACING
	select TRACING

But that gives us a bug in the kbuild system since we just created a
circular dependency. We only want the prompt to show when TRACING is off.

This patch adds GENERIC_TRACER that all tracers will select instead of
TRACING. The two options (sched switch and event tracer) will select
TRACING directly and depend on !GENERIC_TRACER. This solves the cicular
dependency.

[ Impact: hide options that are selected by default ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 43 ++++++++++++++++++++++++++++---------------
 1 file changed, 28 insertions(+), 15 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a508b9d2adb..6e55cc3ac49 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -56,6 +56,13 @@ config CONTEXT_SWITCH_TRACER
 	select MARKERS
 	bool
 
+# All tracer options should select GENERIC_TRACER. For those options that are
+# enabled by all tracers (context switch and event tracer) they select TRACING.
+# This allows those options to appear when no other tracer is selected. But the
+# options do not appear when something else selects it. We need the two options
+# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
+# hidding of the automatic options options.
+
 config TRACING
 	bool
 	select DEBUG_FS
@@ -66,6 +73,10 @@ config TRACING
 	select BINARY_PRINTF
 	select EVENT_TRACING
 
+config GENERIC_TRACER
+	bool
+	select TRACING
+
 #
 # Minimum requirements an architecture has to meet for us to
 # be able to offer generic tracing facilities:
@@ -95,7 +106,7 @@ config FUNCTION_TRACER
 	depends on HAVE_FUNCTION_TRACER
 	select FRAME_POINTER
 	select KALLSYMS
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	help
 	  Enable the kernel to trace every kernel function. This is done
@@ -126,7 +137,7 @@ config IRQSOFF_TRACER
 	depends on TRACE_IRQFLAGS_SUPPORT
 	depends on GENERIC_TIME
 	select TRACE_IRQFLAGS
-	select TRACING
+	select GENERIC_TRACER
 	select TRACER_MAX_TRACE
 	help
 	  This option measures the time spent in irqs-off critical
@@ -147,7 +158,7 @@ config PREEMPT_TRACER
 	default n
 	depends on GENERIC_TIME
 	depends on PREEMPT
-	select TRACING
+	select GENERIC_TRACER
 	select TRACER_MAX_TRACE
 	help
 	  This option measures the time spent in preemption off critical
@@ -166,7 +177,7 @@ config PREEMPT_TRACER
 config SYSPROF_TRACER
 	bool "Sysprof Tracer"
 	depends on X86
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	help
 	  This tracer provides the trace needed by the 'Sysprof' userspace
@@ -174,7 +185,7 @@ config SYSPROF_TRACER
 
 config SCHED_TRACER
 	bool "Scheduling Latency Tracer"
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	select TRACER_MAX_TRACE
 	help
@@ -183,6 +194,7 @@ config SCHED_TRACER
 
 config ENABLE_CONTEXT_SWITCH_TRACER
 	bool "Trace process context switches"
+	depends on !GENERIC_TRACER
 	select TRACING
 	select CONTEXT_SWITCH_TRACER
 	help
@@ -191,6 +203,7 @@ config ENABLE_CONTEXT_SWITCH_TRACER
 
 config ENABLE_EVENT_TRACING
 	bool "Trace various events in the kernel"
+	depends on !GENERIC_TRACER
 	select TRACING
 	help
 	  This tracer hooks to various trace points in the kernel
@@ -204,14 +217,14 @@ config ENABLE_EVENT_TRACING
 config FTRACE_SYSCALLS
 	bool "Trace syscalls"
 	depends on HAVE_FTRACE_SYSCALLS
-	select TRACING
+	select GENERIC_TRACER
 	select KALLSYMS
 	help
 	  Basic tracer to catch the syscall entry and exit events.
 
 config BOOT_TRACER
 	bool "Trace boot initcalls"
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	help
 	  This tracer helps developers to optimize boot times: it records
@@ -228,7 +241,7 @@ config BOOT_TRACER
 
 config TRACE_BRANCH_PROFILING
 	bool
-	select TRACING
+	select GENERIC_TRACER
 
 choice
 	prompt "Branch Profiling"
@@ -308,7 +321,7 @@ config BRANCH_TRACER
 config POWER_TRACER
 	bool "Trace power consumption behavior"
 	depends on X86
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  This tracer helps developers to analyze and optimize the kernels
 	  power management decisions, specifically the C-state and P-state
@@ -342,14 +355,14 @@ config STACK_TRACER
 config HW_BRANCH_TRACER
 	depends on HAVE_HW_BRANCH_TRACER
 	bool "Trace hw branches"
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  This tracer records all branches on the system in a circular
 	  buffer giving access to the last N branches for each cpu.
 
 config KMEMTRACE
 	bool "Trace SLAB allocations"
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  kmemtrace provides tracing for slab allocator functions, such as
 	  kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
@@ -369,7 +382,7 @@ config KMEMTRACE
 
 config WORKQUEUE_TRACER
 	bool "Trace workqueues"
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  The workqueue tracer provides some statistical informations
           about each cpu workqueue thread such as the number of the
@@ -385,7 +398,7 @@ config BLK_DEV_IO_TRACE
 	select RELAY
 	select DEBUG_FS
 	select TRACEPOINTS
-	select TRACING
+	select GENERIC_TRACER
 	select STACKTRACE
 	help
 	  Say Y here if you want to be able to trace the block layer actions
@@ -446,7 +459,7 @@ config FTRACE_SELFTEST
 
 config FTRACE_STARTUP_TEST
 	bool "Perform a startup test on ftrace"
-	depends on TRACING
+	depends on GENERIC_TRACER
 	select FTRACE_SELFTEST
 	help
 	  This option performs a series of startup tests on ftrace. On bootup
@@ -457,7 +470,7 @@ config FTRACE_STARTUP_TEST
 config MMIOTRACE
 	bool "Memory mapped IO tracing"
 	depends on HAVE_MMIOTRACE_SUPPORT && PCI
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  Mmiotrace traces Memory Mapped I/O access and is meant for
 	  debugging and reverse engineering. It is called from the ioremap
-- 
cgit v1.2.3


From 897f17a65389a26509bd0c79a9812d1c9ea8ea6f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 28 May 2009 16:31:21 -0400
Subject: tracing: combine the default tracers into one config

Both event tracer and sched switch plugin are selected by default
by all generic tracers. But if no generic tracer is enabled, their options
appear. But ether one of them will select the other, thus it only
makes sense to have the default tracers be selected by one option.

[ Impact: clean up kconfig menu ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 6e55cc3ac49..4a13e5a01ce 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -192,27 +192,14 @@ config SCHED_TRACER
 	  This tracer tracks the latency of the highest priority task
 	  to be scheduled in, starting from the point it has woken up.
 
-config ENABLE_CONTEXT_SWITCH_TRACER
-	bool "Trace process context switches"
-	depends on !GENERIC_TRACER
-	select TRACING
-	select CONTEXT_SWITCH_TRACER
-	help
-	  This tracer gets called from the context switch and records
-	  all switching of tasks.
-
-config ENABLE_EVENT_TRACING
-	bool "Trace various events in the kernel"
+config ENABLE_DEFAULT_TRACERS
+	bool "Trace process context switches and events"
 	depends on !GENERIC_TRACER
 	select TRACING
 	help
 	  This tracer hooks to various trace points in the kernel
 	  allowing the user to pick and choose which trace point they
-	  want to trace.
-
-	  Note, all tracers enable event tracing. This option is
-	  only a convenience to enable event tracing when no other
-	  tracers are selected.
+	  want to trace. It also includes the sched_switch tracer plugin.
 
 config FTRACE_SYSCALLS
 	bool "Trace syscalls"
-- 
cgit v1.2.3


From 7fcb7c472f455d1711eb5a7633204dba8800a6d6 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 1 Jun 2009 15:35:46 +0800
Subject: tracing/events: introduce __dynamic_array()

__string() is limited:

  - it's a char array, but we may want to define array with other types
  - a source string should be available, but we may just know the string size

We introduce __dynamic_array() to break those limitations, and __string()
becomes a wrapper of it. As a side effect, now __get_str() can be used
in TP_fast_assign but not only TP_print.

Take XFS for example, we have the string length in the dirent, but the
string itself is not NULL-terminated, so __dynamic_array() can be used:

TRACE_EVENT(xfs_dir2,
	TP_PROTO(struct xfs_da_args *args),
	TP_ARGS(args),

	TP_STRUCT__entry(
		__field(int, namelen)
		__dynamic_array(char, name, args->namelen + 1)
		...
	),

	TP_fast_assign(
		char *name = __get_str(name);

		if (args->namelen)
			memcpy(name, args->name, args->namelen);
		name[args->namelen] = '\0';

		__entry->namelen = args->namelen;
	),

	TP_printk("name %.*s namelen %d",
		  __entry->namelen ? __get_str(name) : NULL
		  __entry->namelen)
);

[ Impact: allow defining dynamic size arrays ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A2384D2.3080403@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index a7430b16d24..db6e54bdb59 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -478,12 +478,12 @@ enum {
 
 static int is_string_field(const char *type)
 {
+	if (strstr(type, "__data_loc") && strstr(type, "char"))
+		return FILTER_DYN_STRING;
+
 	if (strchr(type, '[') && strstr(type, "char"))
 		return FILTER_STATIC_STRING;
 
-	if (!strcmp(type, "__str_loc"))
-		return FILTER_DYN_STRING;
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From ec081ddc3d90aab35bc0de19a358b964978837cf Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 1 Jun 2009 15:53:35 +0100
Subject: tracing: add exports to use __print_symbolic and __print_flags from a
 module

A patch to allow the use of __print_symbolic and __print_flags
from a module. This allows the current GFS2 tracing patch to
build.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
LKML-Reference: <1243868015.29604.542.camel@localhost.localdomain>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c12d95db2f5..0fe3b223f7e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -17,6 +17,7 @@
 static DECLARE_RWSEM(trace_event_mutex);
 
 DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
+EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
 
 static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
@@ -250,6 +251,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 
 	return p->buffer;
 }
+EXPORT_SYMBOL(ftrace_print_flags_seq);
 
 const char *
 ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
@@ -275,6 +277,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
 
 	return p->buffer;
 }
+EXPORT_SYMBOL(ftrace_print_symbols_seq);
 
 #ifdef CONFIG_KRETPROBES
 static inline const char *kretprobed(const char *name)
-- 
cgit v1.2.3


From 112f38a7e36e9d688b389507136bf3af3e6d159b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 1 Jun 2009 15:16:05 -0400
Subject: tracing: make trace pipe recognize latency format flag

The trace_pipe did not recognize the latency format flag and would produce
different output than the trace file. The problem was partly due that
the trace flags in the iterator was not set as well as the trace_pipe
zeros out part of the iterator (including the flags) to be able to use
the same routines as the trace file. trace_flags of the iterator should
not cause any problems when not zeroed out by for trace_pipe.

Reported-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a3a8a87d7e9..cae34c69752 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2826,6 +2826,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 	/* trace pipe does not show start of buffer */
 	cpumask_setall(iter->started);
 
+	if (trace_flags & TRACE_ITER_LATENCY_FMT)
+		iter->iter_flags |= TRACE_FILE_LAT_FMT;
+
 	iter->cpu_file = cpu_file;
 	iter->tr = &global_trace;
 	mutex_init(&iter->mutex);
-- 
cgit v1.2.3


From 0f6ce3de4ef6ff940308087c49760d068851c1a7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 1 Jun 2009 21:51:28 -0400
Subject: ftrace: do not profile functions when disabled

A race was found that if one were to enable and disable the function
profiler repeatedly, then the system can panic. This was because a profiled
function may be preempted just before disabling interrupts. While
the profiler is disabled and then reenabled, the preempted function
could start again, and access the hash as it is being initialized.

This just adds a check in the irq disabled part to check if the profiler
is enabled, and if it is not then it will just exit.

When the system is disabled, the profile_enabled variable is cleared
before calling the unregistering of the function profiler. This
unregistering calls stop machine which also acts as a synchronize schedule.

[ Impact: fix panic in enabling/disabling function profiler ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2074e5b7766..d6973dfadb3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -599,7 +599,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)
 	local_irq_save(flags);
 
 	stat = &__get_cpu_var(ftrace_profile_stats);
-	if (!stat->hash)
+	if (!stat->hash || !ftrace_profile_enabled)
 		goto out;
 
 	rec = ftrace_find_profiled_func(stat, ip);
@@ -630,7 +630,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
 
 	local_irq_save(flags);
 	stat = &__get_cpu_var(ftrace_profile_stats);
-	if (!stat->hash)
+	if (!stat->hash || !ftrace_profile_enabled)
 		goto out;
 
 	calltime = trace->rettime - trace->calltime;
@@ -724,6 +724,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 			ftrace_profile_enabled = 1;
 		} else {
 			ftrace_profile_enabled = 0;
+			/*
+			 * unregister_ftrace_profiler calls stop_machine
+			 * so this acts like an synchronize_sched.
+			 */
 			unregister_ftrace_profiler();
 		}
 	}
-- 
cgit v1.2.3


From 179c498ae2998461fe436437a74dc29036fc7dcc Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Jun 2009 12:03:19 -0400
Subject: function-graph: only allocate init tasks if it was not already done

When the function graph tracer is enabled, it calls the initialization
needed for the init tasks that would be called on all created tasks.

The problem is that this is called every time the function graph tracer
is enabled, and the ret_stack is allocated for the idle tasks each time.
Thus, the old ret_stack is lost and a memory leak is created.

This is also dangerous because if an interrupt happened on another CPU
with the init task and the ret_stack is replaced, we then lose all the
return pointers for the interrupt, and a crash would take place.

[ Impact: fix memory leak and possible crash due to race ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f1ed080406c..ebff62ef40b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2643,8 +2643,10 @@ static int start_graph_tracing(void)
 		return -ENOMEM;
 
 	/* The cpu_boot init_task->ret_stack will never be freed */
-	for_each_online_cpu(cpu)
-		ftrace_graph_init_task(idle_task(cpu));
+	for_each_online_cpu(cpu) {
+		if (!idle_task(cpu)->ret_stack)
+			ftrace_graph_init_task(idle_task(cpu));
+	}
 
 	do {
 		ret = alloc_retstack_tasklist(ret_stack_list);
-- 
cgit v1.2.3


From 82310a3272d5a2a7652f5649ad8a55f58c8f74d9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Jun 2009 12:26:07 -0400
Subject: function-graph: enable the stack after initialization of other
 variables

The function graph tracer checks if the task_struct has ret_stack defined
to know if it is OK or not to use it. The initialization is done for
all tasks by one process, but the idle tasks use the same initialization
used by new tasks.

If an interrupt happens on an idle task that just had the ret_stack
created, but before the rest of the initialization took place, then
we can corrupt the return address of the functions.

This patch moves the setting of the task_struct's ret_stack to after
the other variables have been initialized.

[ Impact: prevent kernel panic on idle task when starting function graph ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c                | 9 +++++++--
 kernel/trace/trace_functions_graph.c | 6 ++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ebff62ef40b..20e066065eb 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2739,15 +2739,20 @@ void unregister_ftrace_graph(void)
 void ftrace_graph_init_task(struct task_struct *t)
 {
 	if (atomic_read(&ftrace_graph_active)) {
-		t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
+		struct ftrace_ret_stack *ret_stack;
+
+		ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
 				* sizeof(struct ftrace_ret_stack),
 				GFP_KERNEL);
-		if (!t->ret_stack)
+		if (!ret_stack)
 			return;
 		t->curr_ret_stack = -1;
 		atomic_set(&t->tracing_graph_pause, 0);
 		atomic_set(&t->trace_overrun, 0);
 		t->ftrace_timestamp = 0;
+		/* make curr_ret_stack visable before we add the ret_stack */
+		smp_wmb();
+		t->ret_stack = ret_stack;
 	} else
 		t->ret_stack = NULL;
 }
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d28687e7b3a..baeb5fe3610 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -65,6 +65,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
 	if (!current->ret_stack)
 		return -EBUSY;
 
+	/*
+	 * We must make sure the ret_stack is tested before we read
+	 * anything else.
+	 */
+	smp_rmb();
+
 	/* The return trace stack is full */
 	if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
 		atomic_inc(&current->trace_overrun);
-- 
cgit v1.2.3


From 26c01624a2a40f8a4ddf6449b65c9b1c418d0e72 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Jun 2009 14:01:19 -0400
Subject: function-graph: add memory barriers for accessing task's ret_stack

The code that handles the tasks ret_stack allocation for every task
assumes that only an interrupt can cause issues (even though interrupts
are disabled).

In reality, the code is allocating the ret_stack for tasks that may be
running on other CPUs and there are not efficient memory barriers to
handle this case.

[ Impact: prevent crash due to using of uninitialized ret_stack variables ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 20e066065eb..1664d3f33d3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2580,12 +2580,12 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
 		}
 
 		if (t->ret_stack == NULL) {
-			t->curr_ret_stack = -1;
-			/* Make sure IRQs see the -1 first: */
-			barrier();
-			t->ret_stack = ret_stack_list[start++];
 			atomic_set(&t->tracing_graph_pause, 0);
 			atomic_set(&t->trace_overrun, 0);
+			t->curr_ret_stack = -1;
+			/* Make sure the tasks see the -1 first: */
+			smp_wmb();
+			t->ret_stack = ret_stack_list[start++];
 		}
 	} while_each_thread(g, t);
 
-- 
cgit v1.2.3


From 84047e360af0394ac5861d433f26bbcf30f77dd1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Jun 2009 16:51:55 -0400
Subject: function-graph: always initialize task ret_stack

On creating a new task while running the function graph tracer, if
we fail to allocate the ret_stack, and then fail the fork, the
code will free the parent ret_stack. This is because the child
duplicated the parent and currently points to the parent's ret_stack.

This patch always initializes the task's ret_stack to NULL.

[ Impact: prevent crash of parent on low memory during fork ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1664d3f33d3..bb081f37cac 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2738,6 +2738,9 @@ void unregister_ftrace_graph(void)
 /* Allocate a return stack for newly created task */
 void ftrace_graph_init_task(struct task_struct *t)
 {
+	/* Make sure we do not use the parent ret_stack */
+	t->ret_stack = NULL;
+
 	if (atomic_read(&ftrace_graph_active)) {
 		struct ftrace_ret_stack *ret_stack;
 
@@ -2753,8 +2756,7 @@ void ftrace_graph_init_task(struct task_struct *t)
 		/* make curr_ret_stack visable before we add the ret_stack */
 		smp_wmb();
 		t->ret_stack = ret_stack;
-	} else
-		t->ret_stack = NULL;
+	}
 }
 
 void ftrace_graph_exit_task(struct task_struct *t)
-- 
cgit v1.2.3


From a2023556409cf7fec5d67a26f7fcfa57c5a4086d Mon Sep 17 00:00:00 2001
From: Tim Bird <tim.bird@am.sony.com>
Date: Tue, 2 Jun 2009 17:06:54 -0700
Subject: ring-buffer: fix bug in ring_buffer_discard_commit

There's a bug in ring_buffer_discard_commit.  The wrong
pointer is being compared in order to check if the event
can be freed from the buffer rather than discarded
(i.e. marked as PAD).

I noticed this when I was working on duration filtering.
The bug is not deadly - it just results in lots of wasted
space in the buffer.  All filtered events are left in
the buffer and marked as discarded, rather than being
removed from the buffer to make space for other events.

Unfortunately, when I fixed this bug, I got errors doing a
filtered function trace.  Multiple TIME_EXTEND
events pile up in the buffer, and trigger the
following loop overage warning in rb_iter_peek():

again:
	...
	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
		return NULL;

I'm not sure what the best way is to fix this. I don't
know if I should extend the loop threshhold, or if I should
make the test more complex (ignore TIME_EXTEND
events), or just get rid of this loop check completely.

Note that if I implement a workaround for this, then I
see another problem from rb_advance_iter().  I haven't
tracked that one down yet.

In general, it seems like the case of removing filtered
events has not been working properly, and so some assumptions
about buffer invariant conditions need to be revisited.

Here's the patch for the simple fix:

Compare correct pointer for checking if an event can be
freed rather than left as discarded in the buffer.

Signed-off-by: Tim Bird <tim.bird@am.sony.com>
LKML-Reference: <4A25BE9E.5090909@am.sony.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 16b24d49604..94530236869 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1708,7 +1708,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 
 	bpage = cpu_buffer->tail_page;
 
-	if (bpage == (void *)addr && rb_page_write(bpage) == old_index) {
+	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
 		/*
 		 * This is on the tail page. It is possible that
 		 * a write could come in and move the tail page
-- 
cgit v1.2.3


From edd813bffc62a980bb4fb9b1243f31c1cce78da3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Jun 2009 23:00:53 -0400
Subject: ring-buffer: try to discard unneeded timestamps

There are times that a race may happen that we add a timestamp in a
nested write. This timestamp would just contain a zero delta and serves
no purpose.

Now that we have a way to discard events, this patch will try to discard
the timestamp instead of just wasting the space in the ring buffer.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 67 ++++++++++++++++++++++++++++------------------
 1 file changed, 41 insertions(+), 26 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 94530236869..50926601a28 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1335,6 +1335,38 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	return event;
 }
 
+static inline int
+rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
+		  struct ring_buffer_event *event)
+{
+	unsigned long new_index, old_index;
+	struct buffer_page *bpage;
+	unsigned long index;
+	unsigned long addr;
+
+	new_index = rb_event_index(event);
+	old_index = new_index + rb_event_length(event);
+	addr = (unsigned long)event;
+	addr &= PAGE_MASK;
+
+	bpage = cpu_buffer->tail_page;
+
+	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
+		/*
+		 * This is on the tail page. It is possible that
+		 * a write could come in and move the tail page
+		 * and write to the next page. That is fine
+		 * because we just shorten what is on this page.
+		 */
+		index = local_cmpxchg(&bpage->write, old_index, new_index);
+		if (index == old_index)
+			return 1;
+	}
+
+	/* could not discard */
+	return 0;
+}
+
 static int
 rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		  u64 *ts, u64 *delta)
@@ -1384,10 +1416,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		/* let the caller know this was the commit */
 		ret = 1;
 	} else {
-		/* Darn, this is just wasted space */
-		event->time_delta = 0;
-		event->array[0] = 0;
-		ret = 0;
+		/* Try to discard the event */
+		if (!rb_try_to_discard(cpu_buffer, event)) {
+			/* Darn, this is just wasted space */
+			event->time_delta = 0;
+			event->array[0] = 0;
+			ret = 0;
+		}
 	}
 
 	*delta = 0;
@@ -1682,10 +1717,6 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 				struct ring_buffer_event *event)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long new_index, old_index;
-	struct buffer_page *bpage;
-	unsigned long index;
-	unsigned long addr;
 	int cpu;
 
 	/* The event is discarded regardless */
@@ -1701,24 +1732,8 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 	cpu = smp_processor_id();
 	cpu_buffer = buffer->buffers[cpu];
 
-	new_index = rb_event_index(event);
-	old_index = new_index + rb_event_length(event);
-	addr = (unsigned long)event;
-	addr &= PAGE_MASK;
-
-	bpage = cpu_buffer->tail_page;
-
-	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
-		/*
-		 * This is on the tail page. It is possible that
-		 * a write could come in and move the tail page
-		 * and write to the next page. That is fine
-		 * because we just shorten what is on this page.
-		 */
-		index = local_cmpxchg(&bpage->write, old_index, new_index);
-		if (index == old_index)
-			goto out;
-	}
+	if (!rb_try_to_discard(cpu_buffer, event))
+		goto out;
 
 	/*
 	 * The commit is still visible by the reader, so we
-- 
cgit v1.2.3


From ea05b57cc19234d8de9887c8a32c2e58e84b56ba Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 3 Jun 2009 09:30:10 -0400
Subject: ring-buffer: discard timestamps that are at the start of the buffer

Every buffer page in the ring buffer includes its own time stamp.
When an event is recorded to the ring buffer with a delta time greater
than what can be held in the event header, a time stamp event is created.

If the the create timestamp falls over to the next buffer page, it is
redundant because the buffer page holds a full time stamp. This patch
will try to discard the time stamp when it falls to the start of the
next page.

This change also fixes a issues with disarding events. If most events are
discarded, timestamps will start to creep into the ring buffer. If we
do not discard the timestamps then they can fill up the ring buffer over
time and waste space.

This change will keep time stamps from filling up over another page. If
something is recorded in the buffer page, and the rest is filtered, then
the time stamps can only fill up to the end of the page.

[ Impact: prevent time stamps from filling ring buffer ]

Reported-by: Tim Bird <tim.bird@am.sony.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 50926601a28..7102d7a2fad 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -370,6 +370,9 @@ static inline int test_time_stamp(u64 delta)
 /* Max payload is BUF_PAGE_SIZE - header (8bytes) */
 #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
 
+/* Max number of timestamps that can fit on a page */
+#define RB_TIMESTAMPS_PER_PAGE	(BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
+
 int ring_buffer_print_page_header(struct trace_seq *s)
 {
 	struct buffer_data_page field;
@@ -1409,8 +1412,12 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 			event->array[0] = *delta >> TS_SHIFT;
 		} else {
 			cpu_buffer->commit_page->page->time_stamp = *ts;
-			event->time_delta = 0;
-			event->array[0] = 0;
+			/* try to discard, since we do not need this */
+			if (!rb_try_to_discard(cpu_buffer, event)) {
+				/* nope, just zero it */
+				event->time_delta = 0;
+				event->array[0] = 0;
+			}
 		}
 		cpu_buffer->write_stamp = *ts;
 		/* let the caller know this was the commit */
@@ -2268,8 +2275,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
 	 * Check if we are at the end of the buffer.
 	 */
 	if (iter->head >= rb_page_size(iter->head_page)) {
-		if (RB_WARN_ON(buffer,
-			       iter->head_page == cpu_buffer->commit_page))
+		/* discarded commits can make the page empty */
+		if (iter->head_page == cpu_buffer->commit_page)
 			return;
 		rb_inc_iter(iter);
 		return;
@@ -2312,12 +2319,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	/*
 	 * We repeat when a timestamp is encountered. It is possible
 	 * to get multiple timestamps from an interrupt entering just
-	 * as one timestamp is about to be written. The max times
-	 * that this can happen is the number of nested interrupts we
-	 * can have.  Nesting 10 deep of interrupts is clearly
-	 * an anomaly.
+	 * as one timestamp is about to be written, or from discarded
+	 * commits. The most that we can have is the number on a single page.
 	 */
-	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+	if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
 		return NULL;
 
 	reader = rb_get_reader_page(cpu_buffer);
@@ -2383,14 +2388,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 
  again:
 	/*
-	 * We repeat when a timestamp is encountered. It is possible
-	 * to get multiple timestamps from an interrupt entering just
-	 * as one timestamp is about to be written. The max times
-	 * that this can happen is the number of nested interrupts we
-	 * can have. Nesting 10 deep of interrupts is clearly
-	 * an anomaly.
+	 * We repeat when a timestamp is encountered.
+	 * We can get multiple timestamps by nested interrupts or also
+	 * if filtering is on (discarding commits). Since discarding
+	 * commits can be frequent we can get a lot of timestamps.
+	 * But we limit them by not adding timestamps if they begin
+	 * at the start of a page.
 	 */
-	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+	if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
 		return NULL;
 
 	if (rb_per_cpu_empty(cpu_buffer))
-- 
cgit v1.2.3


From 083a63b48e4dd0a6a2d44216720076dc81ebb255 Mon Sep 17 00:00:00 2001
From: walimis <walimisdev@gmail.com>
Date: Wed, 3 Jun 2009 16:01:28 +0800
Subject: tracing/trace_stack: fix the number of entries in the header

The last entry in the stack_dump_trace is ULONG_MAX, which is not
a valid entry, but max_stack_trace.nr_entries has accounted for it.
So when printing the header, we should decrease it by one.
Before fix, print as following, for example:

	Depth    Size   Location    (53 entries)	<--- should be 52
	-----    ----   --------
  0)     3264     108   update_wall_time+0x4d5/0x9a0
  ...
 51)       80      80   syscall_call+0x7/0xb
 ^^^
   it's correct.

Signed-off-by: walimis <walimisdev@gmail.com>
LKML-Reference: <1244016090-7814-1-git-send-email-walimisdev@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_stack.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 1796f00524e..2d7aebd71db 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -265,7 +265,7 @@ static int t_show(struct seq_file *m, void *v)
 		seq_printf(m, "        Depth    Size   Location"
 			   "    (%d entries)\n"
 			   "        -----    ----   --------\n",
-			   max_stack_trace.nr_entries);
+			   max_stack_trace.nr_entries - 1);
 
 		if (!stack_tracer_enabled && !max_stack_size)
 			print_disabled(m);
-- 
cgit v1.2.3


From f11b3f4e2932bfdcfc458ab8d1ece62724ceabfc Mon Sep 17 00:00:00 2001
From: walimis <walimisdev@gmail.com>
Date: Wed, 3 Jun 2009 16:01:29 +0800
Subject: tracing/events: fix output format of kernel stack

According to "events/ftrace/kernel_stack/format", output format of
kernel stack should use "=>" instead of "<=".

The second problem is that we shouldn't skip the first entry in the stack,
although it seems to be duplicated when used in the "function" tracer,
but events also use it. If we skip the first one, we will drop the topmost
entry of the stack.

The last problem is that if the last entry is ULONG_MAX(0xffffffff), we should
drop it, otherwise it will print a NULL name line.

before fix:

      sh-1072  [000]   26.957239: sched_process_fork: parent sh:1072 child sh:1073
      sh-1072  [000]   26.957262:
 <= syscall_call
 <=
      sh-1072  [000]   26.957744: sched_switch: task sh:1072 [120] (R) ==> sh:1073 [120]
      sh-1072  [000]   26.957752:
 <= preempt_schedule
 <= wake_up_new_task
 <= do_fork
 <= sys_clone
 <= syscall_call
 <=

After fix:

      sh-1075  [000]    39.791848: sched_process_fork: parent sh:1075  child sh:1076
      sh-1075  [000]    39.791871:
 => sys_clone
 => syscall_call
      sh-1075  [000]    39.792713: sched_switch: task sh:1075 [120] (R) ==> sh:1076 [120]
      sh-1075  [000]    39.792722:
 => schedule
 => preempt_schedule
 => wake_up_new_task
 => do_fork
 => sys_clone
 => syscall_call

Signed-off-by: walimis <walimisdev@gmail.com>
LKML-Reference: <1244016090-7814-2-git-send-email-walimisdev@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0fe3b223f7e..64596a57160 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -975,16 +975,16 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
 
 	trace_assign_type(field, iter->ent);
 
+	if (!trace_seq_puts(s, "\n"))
+		goto partial;
 	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
-		if (!field->caller[i])
+		if (!field->caller[i] || (field->caller[i] == ULONG_MAX))
 			break;
-		if (i) {
-			if (!trace_seq_puts(s, " <= "))
-				goto partial;
+		if (!trace_seq_puts(s, " => "))
+			goto partial;
 
-			if (!seq_print_ip_sym(s, field->caller[i], flags))
-				goto partial;
-		}
+		if (!seq_print_ip_sym(s, field->caller[i], flags))
+			goto partial;
 		if (!trace_seq_puts(s, "\n"))
 			goto partial;
 	}
-- 
cgit v1.2.3


From 048dc50c5e7eada19ebabbad70b7966d14283d41 Mon Sep 17 00:00:00 2001
From: walimis <walimisdev@gmail.com>
Date: Wed, 3 Jun 2009 16:01:30 +0800
Subject: tracing/events: fix output format of user stack

According to "events/ftrace/user_stack/format", fix the output of
user stack.

before fix:

  sh-1073  [000]    31.137561:  <b7f274fe> <-  <0804e33c> <-  <080835c1>

after fix:

  sh-1072  [000]    37.039329:
 =>  <b7f8a4fe>
 =>  <0804e33c>
 =>  <080835c1>

Signed-off-by: walimis <walimisdev@gmail.com>
LKML-Reference: <1244016090-7814-3-git-send-email-walimisdev@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 64596a57160..8dadbbbd2d5 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -389,17 +389,20 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
 
 		if (ip == ULONG_MAX || !ret)
 			break;
-		if (i && ret)
-			ret = trace_seq_puts(s, " <- ");
+		if (ret)
+			ret = trace_seq_puts(s, " => ");
 		if (!ip) {
 			if (ret)
 				ret = trace_seq_puts(s, "??");
+			if (ret)
+				ret = trace_seq_puts(s, "\n");
 			continue;
 		}
 		if (!ret)
 			break;
 		if (ret)
 			ret = seq_print_user_ip(s, mm, ip, sym_flags);
+		ret = trace_seq_puts(s, "\n");
 	}
 
 	if (mm)
@@ -1012,10 +1015,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
 
 	trace_assign_type(field, iter->ent);
 
-	if (!seq_print_userip_objs(field, s, flags))
+	if (!trace_seq_putc(s, '\n'))
 		goto partial;
 
-	if (!trace_seq_putc(s, '\n'))
+	if (!seq_print_userip_objs(field, s, flags))
 		goto partial;
 
 	return TRACE_TYPE_HANDLED;
-- 
cgit v1.2.3


From 56d8bd3f0b98972312cad683947ec90b21011199 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 3 Jun 2009 14:52:03 +0100
Subject: tracing: fix multiple use of __print_flags and __print_symbolic

Here is an updated patch to include the extra call to
trace_seq_init() as requested. This is vs. the latest
-tip tree and fixes the use of multiple __print_flags
and __print_symbolic in a single tracer. Also tested
to ensure its working now:

mount.gfs2-2534  [000]   235.850587: gfs2_glock_queue: 8.7 glock 1:2 dequeue PR
mount.gfs2-2534  [000]   235.850591: gfs2_demote_rq: 8.7 glock 1:0 demote EX to NL flags:DI
mount.gfs2-2534  [000]   235.850591: gfs2_glock_queue: 8.7 glock 1:0 dequeue EX
glock_workqueue-2529  [000]   235.850666: gfs2_glock_state_change: 8.7 glock 1:0 state EX => NL tgt:NL dmt:NL flags:lDpI
glock_workqueue-2529  [000]   235.850672: gfs2_glock_put: 8.7 glock 1:0 state NL => IV flags:I

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
LKML-Reference: <1244037123.29604.603.camel@localhost.localdomain>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8dadbbbd2d5..8afeea412e7 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -223,10 +223,9 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 {
 	unsigned long mask;
 	const char *str;
+	const char *ret = p->buffer + p->len;
 	int i;
 
-	trace_seq_init(p);
-
 	for (i = 0;  flag_array[i].name && flags; i++) {
 
 		mask = flag_array[i].mask;
@@ -249,7 +248,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 
 	trace_seq_putc(p, 0);
 
-	return p->buffer;
+	return ret;
 }
 EXPORT_SYMBOL(ftrace_print_flags_seq);
 
@@ -258,8 +257,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
 			 const struct trace_print_flags *symbol_array)
 {
 	int i;
-
-	trace_seq_init(p);
+	const char *ret = p->buffer + p->len;
 
 	for (i = 0;  symbol_array[i].name; i++) {
 
@@ -275,7 +273,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
 		
 	trace_seq_putc(p, 0);
 
-	return p->buffer;
+	return ret;
 }
 EXPORT_SYMBOL(ftrace_print_symbols_seq);
 
-- 
cgit v1.2.3


From 563af16c30ede41eda2d614195d88e07f7c7103d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 3 Jun 2009 11:10:44 -0400
Subject: tracing: add annotation to what type of stack trace is recorded

The current method of printing out a stack trace is to add a new line
and print out the trace:

    yum-updatesd-3120  [002]   573.691303:
 => do_softirq
 => irq_exit
 => smp_apic_timer_interrupt
 => apic_timer_interrupt

This looks a bit awkward, and if we have both stack and user stack traces
running, it would be nice to have a title to tell them apart, although
it is easy to tell by the output.

This patch adds an annotation to the start of the stack traces:

            init-1     [003]   929.304979: <stack trace>
 => user_path_at
 => vfs_fstatat
 => vfs_stat
 => sys_newstat
 => system_call_fastpath

             cat-3459  [002]  1016.824040: <user stack trace>
 =>  <0000003aae6c0250>
 =>  <00007ffff4b06ae4>
 =>  <69636172742f6775>

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8afeea412e7..425725c1622 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -976,7 +976,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
 
 	trace_assign_type(field, iter->ent);
 
-	if (!trace_seq_puts(s, "\n"))
+	if (!trace_seq_puts(s, "<stack trace>\n"))
 		goto partial;
 	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
 		if (!field->caller[i] || (field->caller[i] == ULONG_MAX))
@@ -1013,7 +1013,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
 
 	trace_assign_type(field, iter->ent);
 
-	if (!trace_seq_putc(s, '\n'))
+	if (!trace_seq_puts(s, "<user stack trace>\n"))
 		goto partial;
 
 	if (!seq_print_userip_objs(field, s, flags))
-- 
cgit v1.2.3


From 1f8a6a10fb9437eac3f516ea4324a19087872f30 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 8 Jun 2009 18:18:39 +0200
Subject: ring-buffer: pass in lockdep class key for reader_lock

On Sun, 7 Jun 2009, Ingo Molnar wrote:
> Testing tracer sched_switch: <6>Starting ring buffer hammer
> PASSED
> Testing tracer sysprof: PASSED
> Testing tracer function: PASSED
> Testing tracer irqsoff:
> =============================================
> PASSED
> Testing tracer preemptoff: PASSED
> Testing tracer preemptirqsoff: [ INFO: possible recursive locking detected ]
> PASSED
> Testing tracer branch: 2.6.30-rc8-tip-01972-ge5b9078-dirty #5760
> ---------------------------------------------
> rb_consumer/431 is trying to acquire lock:
>  (&cpu_buffer->reader_lock){......}, at: [<c109eef7>] ring_buffer_reset_cpu+0x37/0x70
>
> but task is already holding lock:
>  (&cpu_buffer->reader_lock){......}, at: [<c10a019e>] ring_buffer_consume+0x7e/0xc0
>
> other info that might help us debug this:
> 1 lock held by rb_consumer/431:
>  #0:  (&cpu_buffer->reader_lock){......}, at: [<c10a019e>] ring_buffer_consume+0x7e/0xc0

The ring buffer is a generic structure, and can be used outside of
ftrace. If ftrace traces within the use of the ring buffer, it can produce
false positives with lockdep.

This patch passes in a static lock key into the allocation of the ring
buffer, so that different ring buffers will have their own lock class.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1244477919.13761.9042.camel@twins>

[ store key in ring buffer descriptor ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7102d7a2fad..22878b0d370 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -426,6 +426,8 @@ struct ring_buffer {
 	atomic_t			record_disabled;
 	cpumask_var_t			cpumask;
 
+	struct lock_class_key		*reader_lock_key;
+
 	struct mutex			mutex;
 
 	struct ring_buffer_per_cpu	**buffers;
@@ -565,6 +567,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 	cpu_buffer->cpu = cpu;
 	cpu_buffer->buffer = buffer;
 	spin_lock_init(&cpu_buffer->reader_lock);
+	lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
 	cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 	INIT_LIST_HEAD(&cpu_buffer->pages);
 
@@ -635,7 +638,8 @@ static int rb_cpu_notify(struct notifier_block *self,
  * when the buffer wraps. If this flag is not set, the buffer will
  * drop data when the tail hits the head.
  */
-struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
+struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
+					struct lock_class_key *key)
 {
 	struct ring_buffer *buffer;
 	int bsize;
@@ -658,6 +662,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 	buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
 	buffer->flags = flags;
 	buffer->clock = trace_clock_local;
+	buffer->reader_lock_key = key;
 
 	/* need at least two pages */
 	if (buffer->pages == 1)
@@ -715,7 +720,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 	kfree(buffer);
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(ring_buffer_alloc);
+EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
 
 /**
  * ring_buffer_free - free a ring buffer.
-- 
cgit v1.2.3


From f57a8a1911342265e7acdc190333c4e9235a6632 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 5 Jun 2009 14:11:30 -0400
Subject: ring-buffer: fix ret in rb_add_time_stamp

The update of ret got mistakenly added to the if statement of
rb_try_to_discard. The variable ret should be 1 on commit and zero
otherwise.

[ Impact: fix compiler warning and real bug ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 22878b0d370..2e642b2b725 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1433,8 +1433,8 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 			/* Darn, this is just wasted space */
 			event->time_delta = 0;
 			event->array[0] = 0;
-			ret = 0;
 		}
+		ret = 0;
 	}
 
 	*delta = 0;
-- 
cgit v1.2.3


From 55782138e47d9baf2f7d3a7af9e7cf42adf72c56 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 9 Jun 2009 13:43:05 +0800
Subject: tracing/events: convert block trace points to TRACE_EVENT()

TRACE_EVENT is a more generic way to define tracepoints. Doing so adds
these new capabilities to this tracepoint:

  - zero-copy and per-cpu splice() tracing
  - binary tracing without printf overhead
  - structured logging records exposed under /debug/tracing/events
  - trace events embedded in function tracer output and other plugins
  - user-defined, per tracepoint filter expressions
  ...

Cons:

  - no dev_t info for the output of plug, unplug_timer and unplug_io events.
    no dev_t info for getrq and sleeprq events if bio == NULL.
    no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL.

    This is mainly because we can't get the deivce from a request queue.
    But this may change in the future.

  - A packet command is converted to a string in TP_assign, not TP_print.
    While blktrace do the convertion just before output.

    Since pc requests should be rather rare, this is not a big issue.

  - In blktrace, an event can have 2 different print formats, but a TRACE_EVENT
    has a unique format, which means we have some unused data in a trace entry.

    The overhead is minimized by using __dynamic_array() instead of __array().

I've benchmarked the ioctl blktrace vs the splice based TRACE_EVENT tracing:

      dd                   dd + ioctl blktrace       dd + TRACE_EVENT (splice)
1     7.36s, 42.7 MB/s     7.50s, 42.0 MB/s          7.41s, 42.5 MB/s
2     7.43s, 42.3 MB/s     7.48s, 42.1 MB/s          7.43s, 42.4 MB/s
3     7.38s, 42.6 MB/s     7.45s, 42.2 MB/s          7.41s, 42.5 MB/s

So the overhead of tracing is very small, and no regression when using
those trace events vs blktrace.

And the binary output of TRACE_EVENT is much smaller than blktrace:

 # ls -l -h
 -rw-r--r-- 1 root root 8.8M 06-09 13:24 sda.blktrace.0
 -rw-r--r-- 1 root root 195K 06-09 13:24 sda.blktrace.1
 -rw-r--r-- 1 root root 2.7M 06-09 13:25 trace_splice.out

Following are some comparisons between TRACE_EVENT and blktrace:

plug:
  kjournald-480   [000]   303.084981: block_plug: [kjournald]
  kjournald-480   [000]   303.084981:   8,0    P   N [kjournald]

unplug_io:
  kblockd/0-118   [000]   300.052973: block_unplug_io: [kblockd/0] 1
  kblockd/0-118   [000]   300.052974:   8,0    U   N [kblockd/0] 1

remap:
  kjournald-480   [000]   303.085042: block_remap: 8,0 W 102736992 + 8 <- (8,8) 33384
  kjournald-480   [000]   303.085043:   8,0    A   W 102736992 + 8 <- (8,8) 33384

bio_backmerge:
  kjournald-480   [000]   303.085086: block_bio_backmerge: 8,0 W 102737032 + 8 [kjournald]
  kjournald-480   [000]   303.085086:   8,0    M   W 102737032 + 8 [kjournald]

getrq:
  kjournald-480   [000]   303.084974: block_getrq: 8,0 W 102736984 + 8 [kjournald]
  kjournald-480   [000]   303.084975:   8,0    G   W 102736984 + 8 [kjournald]

  bash-2066  [001]  1072.953770:   8,0    G   N [bash]
  bash-2066  [001]  1072.953773: block_getrq: 0,0 N 0 + 0 [bash]

rq_complete:
  konsole-2065  [001]   300.053184: block_rq_complete: 8,0 W () 103669040 + 16 [0]
  konsole-2065  [001]   300.053191:   8,0    C   W 103669040 + 16 [0]

  ksoftirqd/1-7   [001]  1072.953811:   8,0    C   N (5a 00 08 00 00 00 00 00 24 00) [0]
  ksoftirqd/1-7   [001]  1072.953813: block_rq_complete: 0,0 N (5a 00 08 00 00 00 00 00 24 00) 0 + 0 [0]

rq_insert:
  kjournald-480   [000]   303.084985: block_rq_insert: 8,0 W 0 () 102736984 + 8 [kjournald]
  kjournald-480   [000]   303.084986:   8,0    I   W 102736984 + 8 [kjournald]

Changelog from v2 -> v3:

- use the newly introduced __dynamic_array().

Changelog from v1 -> v2:

- use __string() instead of __array() to minimize the memory required
  to store hex dump of rq->cmd().

- support large pc requests.

- add missing blk_fill_rwbs_rq() in block_rq_requeue TRACE_EVENT.

- some cleanups.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A2DF669.5070905@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Makefile   |  5 +++-
 kernel/trace/blktrace.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 81 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 06b85850fab..844164dca90 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -45,7 +45,10 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
 obj-$(CONFIG_POWER_TRACER) += trace_power.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
-obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
+obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
+ifeq ($(CONFIG_BLOCK),y)
+obj-$(CONFIG_EVENT_TRACING) += blktrace.o
+endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events.o
 obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e3abf55bc8e..7bd6a9893c2 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,10 +23,14 @@
 #include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/time.h>
-#include <trace/block.h>
 #include <linux/uaccess.h>
+
+#include <trace/events/block.h>
+
 #include "trace_output.h"
 
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+
 static unsigned int blktrace_seq __read_mostly = 1;
 
 static struct trace_array *blk_tr;
@@ -1658,3 +1662,75 @@ int blk_trace_init_sysfs(struct device *dev)
 	return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
 }
 
+#endif /* CONFIG_BLK_DEV_IO_TRACE */
+
+#ifdef CONFIG_EVENT_TRACING
+
+void blk_dump_cmd(char *buf, struct request *rq)
+{
+	int i, end;
+	int len = rq->cmd_len;
+	unsigned char *cmd = rq->cmd;
+
+	if (!blk_pc_request(rq)) {
+		buf[0] = '\0';
+		return;
+	}
+
+	for (end = len - 1; end >= 0; end--)
+		if (cmd[end])
+			break;
+	end++;
+
+	for (i = 0; i < len; i++) {
+		buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
+		if (i == end && end != len - 1) {
+			sprintf(buf, " ..");
+			break;
+		}
+	}
+}
+
+void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
+{
+	int i = 0;
+
+	if (rw & WRITE)
+		rwbs[i++] = 'W';
+	else if (rw & 1 << BIO_RW_DISCARD)
+		rwbs[i++] = 'D';
+	else if (bytes)
+		rwbs[i++] = 'R';
+	else
+		rwbs[i++] = 'N';
+
+	if (rw & 1 << BIO_RW_AHEAD)
+		rwbs[i++] = 'A';
+	if (rw & 1 << BIO_RW_BARRIER)
+		rwbs[i++] = 'B';
+	if (rw & 1 << BIO_RW_SYNCIO)
+		rwbs[i++] = 'S';
+	if (rw & 1 << BIO_RW_META)
+		rwbs[i++] = 'M';
+
+	rwbs[i] = '\0';
+}
+
+void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
+{
+	int rw = rq->cmd_flags & 0x03;
+	int bytes;
+
+	if (blk_discard_rq(rq))
+		rw |= (1 << BIO_RW_DISCARD);
+
+	if (blk_pc_request(rq))
+		bytes = rq->data_len;
+	else
+		bytes = rq->hard_nr_sectors << 9;
+
+	blk_fill_rwbs(rwbs, rw, bytes);
+}
+
+#endif /* CONFIG_EVENT_TRACING */
+
-- 
cgit v1.2.3


From 725c624a58a10ef90a2ff889e122158fabf36147 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 8 Jun 2009 19:09:45 -0400
Subject: tracing: add trace_seq_vprint interface

The code to update the print formats for events requires a vprintf
format in the trace_seq. This patch adds that interface.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 425725c1622..c05aff465dc 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -100,6 +100,38 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 }
 EXPORT_SYMBOL_GPL(trace_seq_printf);
 
+/**
+ * trace_seq_vprintf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+{
+	int len = (PAGE_SIZE - 1) - s->len;
+	int ret;
+
+	if (!len)
+		return 0;
+
+	ret = vsnprintf(s->buffer + s->len, len, fmt, args);
+
+	/* If we can't write it all, don't bother writing anything */
+	if (ret >= len)
+		return 0;
+
+	s->len += ret;
+
+	return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_vprintf);
+
 int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
 {
 	int len = (PAGE_SIZE - 1) - s->len;
-- 
cgit v1.2.3


From 110bf2b764eb6026b868d84499263cb24b1bcc8d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 9 Jun 2009 17:29:07 -0400
Subject: tracing: add protection around module events unload

When reading the trace buffer, there is a race that when a module
is unloaded it removes events that is stilled referenced in the buffers.
This patch adds the protection around the unloading of the events
from modules and the reading of the trace buffers.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c |  4 +++-
 kernel/trace/trace_output.c | 15 ++++++++++++---
 kernel/trace/trace_output.h |  4 ++++
 3 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6c81f9c2142..aa08be69a1b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1050,12 +1050,13 @@ static void trace_module_remove_events(struct module *mod)
 	struct ftrace_event_call *call, *p;
 	bool found = false;
 
+	down_write(&trace_event_mutex);
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
 		if (call->mod == mod) {
 			found = true;
 			ftrace_event_enable_disable(call, 0);
 			if (call->event)
-				unregister_ftrace_event(call->event);
+				__unregister_ftrace_event(call->event);
 			debugfs_remove_recursive(call->dir);
 			list_del(&call->list);
 			trace_destroy_fields(call);
@@ -1079,6 +1080,7 @@ static void trace_module_remove_events(struct module *mod)
 	 */
 	if (found)
 		tracing_reset_current_online_cpus();
+	up_write(&trace_event_mutex);
 }
 
 static int trace_module_notify(struct notifier_block *self,
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c05aff465dc..7938f3ae93e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,7 +14,7 @@
 /* must be a power of 2 */
 #define EVENT_HASHSIZE	128
 
-static DECLARE_RWSEM(trace_event_mutex);
+DECLARE_RWSEM(trace_event_mutex);
 
 DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
 EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
@@ -702,6 +702,16 @@ int register_ftrace_event(struct trace_event *event)
 }
 EXPORT_SYMBOL_GPL(register_ftrace_event);
 
+/*
+ * Used by module code with the trace_event_mutex held for write.
+ */
+int __unregister_ftrace_event(struct trace_event *event)
+{
+	hlist_del(&event->node);
+	list_del(&event->list);
+	return 0;
+}
+
 /**
  * unregister_ftrace_event - remove a no longer used event
  * @event: the event to remove
@@ -709,8 +719,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_event);
 int unregister_ftrace_event(struct trace_event *event)
 {
 	down_write(&trace_event_mutex);
-	hlist_del(&event->node);
-	list_del(&event->list);
+	__unregister_ftrace_event(event);
 	up_write(&trace_event_mutex);
 
 	return 0;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index ac240e76eb0..d38bec4a9c3 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -27,6 +27,10 @@ extern struct trace_event *ftrace_find_event(int type);
 extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
 					 int flags);
 
+/* used by module unregistering */
+extern int __unregister_ftrace_event(struct trace_event *event);
+extern struct rw_semaphore trace_event_mutex;
+
 #define MAX_MEMHEX_BYTES	8
 #define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1)
 
-- 
cgit v1.2.3