aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile12
-rw-r--r--kernel/audit_tree.c139
-rw-r--r--kernel/auditfilter.c14
-rw-r--r--kernel/cgroup.c24
-rw-r--r--kernel/cgroup_freezer.c70
-rw-r--r--kernel/cpu.c5
-rw-r--r--kernel/cpuset.c33
-rw-r--r--kernel/exec_domain.c33
-rw-r--r--kernel/exit.c19
-rw-r--r--kernel/fork.c18
-rw-r--r--kernel/freezer.c20
-rw-r--r--kernel/futex.c11
-rw-r--r--kernel/hrtimer.c232
-rw-r--r--kernel/irq/chip.c3
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/manage.c68
-rw-r--r--kernel/irq/migration.c11
-rw-r--r--kernel/irq/proc.c4
-rw-r--r--kernel/kallsyms.c17
-rw-r--r--kernel/kprobes.c23
-rw-r--r--kernel/kthread.c3
-rw-r--r--kernel/lockdep.c21
-rw-r--r--kernel/marker.c192
-rw-r--r--kernel/module.c350
-rw-r--r--kernel/panic.c18
-rw-r--r--kernel/params.c276
-rw-r--r--kernel/posix-cpu-timers.c7
-rw-r--r--kernel/posix-timers.c10
-rw-r--r--kernel/power/Kconfig2
-rw-r--r--kernel/power/disk.c15
-rw-r--r--kernel/power/main.c7
-rw-r--r--kernel/power/power.h2
-rw-r--r--kernel/power/swap.c14
-rw-r--r--kernel/printk.c39
-rw-r--r--kernel/profile.c8
-rw-r--r--kernel/ptrace.c4
-rw-r--r--kernel/rcupdate.c19
-rw-r--r--kernel/relay.c9
-rw-r--r--kernel/resource.c8
-rw-r--r--kernel/rtmutex.c3
-rw-r--r--kernel/sched.c115
-rw-r--r--kernel/sched_debug.c48
-rw-r--r--kernel/sched_fair.c308
-rw-r--r--kernel/sched_features.h3
-rw-r--r--kernel/sched_idletask.c5
-rw-r--r--kernel/sched_rt.c5
-rw-r--r--kernel/sched_stats.h26
-rw-r--r--kernel/signal.c5
-rw-r--r--kernel/smp.c18
-rw-r--r--kernel/softirq.c7
-rw-r--r--kernel/stop_machine.c123
-rw-r--r--kernel/sys.c10
-rw-r--r--kernel/sys_ni.c2
-rw-r--r--kernel/sysctl.c32
-rw-r--r--kernel/time.c18
-rw-r--r--kernel/time/ntp.c3
-rw-r--r--kernel/time/tick-sched.c39
-rw-r--r--kernel/time/timer_list.c8
-rw-r--r--kernel/timer.c129
-rw-r--r--kernel/trace/Kconfig132
-rw-r--r--kernel/trace/Makefile15
-rw-r--r--kernel/trace/ftrace.c1177
-rw-r--r--kernel/trace/ring_buffer.c509
-rw-r--r--kernel/trace/trace.c924
-rw-r--r--kernel/trace/trace.h241
-rw-r--r--kernel/trace/trace_boot.c166
-rw-r--r--kernel/trace/trace_branch.c342
-rw-r--r--kernel/trace/trace_bts.c276
-rw-r--r--kernel/trace/trace_functions.c20
-rw-r--r--kernel/trace/trace_functions_graph.c400
-rw-r--r--kernel/trace/trace_irqsoff.c65
-rw-r--r--kernel/trace/trace_mmiotrace.c41
-rw-r--r--kernel/trace/trace_nop.c65
-rw-r--r--kernel/trace/trace_power.c179
-rw-r--r--kernel/trace/trace_sched_switch.c106
-rw-r--r--kernel/trace/trace_sched_wakeup.c74
-rw-r--r--kernel/trace/trace_selftest.c191
-rw-r--r--kernel/trace/trace_stack.c36
-rw-r--r--kernel/trace/trace_sysprof.c19
-rw-r--r--kernel/tracepoint.c291
-rw-r--r--kernel/workqueue.c52
81 files changed, 5645 insertions, 2345 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 305f11dbef2..703cf3b7389 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,9 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
notifier.o ksysfs.o pm_qos_params.o sched_clock.o
-CFLAGS_REMOVE_sched.o = -mno-spe
-
-ifdef CONFIG_FTRACE
+ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
CFLAGS_REMOVE_lockdep.o = -pg
CFLAGS_REMOVE_lockdep_proc.o = -pg
@@ -21,7 +19,11 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_sched_clock.o = -pg
-CFLAGS_REMOVE_sched.o = -mno-spe -pg
+CFLAGS_REMOVE_sched.o = -pg
+endif
+ifdef CONFIG_FUNCTION_GRAPH_TRACER
+CFLAGS_REMOVE_extable.o = -pg # For __kernel_text_address()
+CFLAGS_REMOVE_module.o = -pg # For __module_text_address()
endif
obj-$(CONFIG_FREEZER) += freezer.o
@@ -88,7 +90,7 @@ obj-$(CONFIG_MARKERS) += marker.o
obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
-obj-$(CONFIG_FTRACE) += trace/
+obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index f7921a2ecf1..8b509441f49 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -24,6 +24,7 @@ struct audit_chunk {
struct list_head trees; /* with root here */
int dead;
int count;
+ atomic_long_t refs;
struct rcu_head head;
struct node {
struct list_head list;
@@ -56,7 +57,8 @@ static LIST_HEAD(prune_list);
* tree is refcounted; one reference for "some rules on rules_list refer to
* it", one for each chunk with pointer to it.
*
- * chunk is refcounted by embedded inotify_watch.
+ * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount
+ * of watch contributes 1 to .refs).
*
* node.index allows to get from node.list to containing chunk.
* MSB of that sucker is stolen to mark taggings that we might have to
@@ -121,6 +123,7 @@ static struct audit_chunk *alloc_chunk(int count)
INIT_LIST_HEAD(&chunk->hash);
INIT_LIST_HEAD(&chunk->trees);
chunk->count = count;
+ atomic_long_set(&chunk->refs, 1);
for (i = 0; i < count; i++) {
INIT_LIST_HEAD(&chunk->owners[i].list);
chunk->owners[i].index = i;
@@ -129,9 +132,8 @@ static struct audit_chunk *alloc_chunk(int count)
return chunk;
}
-static void __free_chunk(struct rcu_head *rcu)
+static void free_chunk(struct audit_chunk *chunk)
{
- struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
int i;
for (i = 0; i < chunk->count; i++) {
@@ -141,14 +143,16 @@ static void __free_chunk(struct rcu_head *rcu)
kfree(chunk);
}
-static inline void free_chunk(struct audit_chunk *chunk)
+void audit_put_chunk(struct audit_chunk *chunk)
{
- call_rcu(&chunk->head, __free_chunk);
+ if (atomic_long_dec_and_test(&chunk->refs))
+ free_chunk(chunk);
}
-void audit_put_chunk(struct audit_chunk *chunk)
+static void __put_chunk(struct rcu_head *rcu)
{
- put_inotify_watch(&chunk->watch);
+ struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
+ audit_put_chunk(chunk);
}
enum {HASH_SIZE = 128};
@@ -176,7 +180,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
list_for_each_entry_rcu(p, list, hash) {
if (p->watch.inode == inode) {
- get_inotify_watch(&p->watch);
+ atomic_long_inc(&p->refs);
return p;
}
}
@@ -194,17 +198,49 @@ int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
/* tagging and untagging inodes with trees */
-static void untag_chunk(struct audit_chunk *chunk, struct node *p)
+static struct audit_chunk *find_chunk(struct node *p)
+{
+ int index = p->index & ~(1U<<31);
+ p -= index;
+ return container_of(p, struct audit_chunk, owners[0]);
+}
+
+static void untag_chunk(struct node *p)
{
+ struct audit_chunk *chunk = find_chunk(p);
struct audit_chunk *new;
struct audit_tree *owner;
int size = chunk->count - 1;
int i, j;
+ if (!pin_inotify_watch(&chunk->watch)) {
+ /*
+ * Filesystem is shutting down; all watches are getting
+ * evicted, just take it off the node list for this
+ * tree and let the eviction logics take care of the
+ * rest.
+ */
+ owner = p->owner;
+ if (owner->root == chunk) {
+ list_del_init(&owner->same_root);
+ owner->root = NULL;
+ }
+ list_del_init(&p->list);
+ p->owner = NULL;
+ put_tree(owner);
+ return;
+ }
+
+ spin_unlock(&hash_lock);
+
+ /*
+ * pin_inotify_watch() succeeded, so the watch won't go away
+ * from under us.
+ */
mutex_lock(&chunk->watch.inode->inotify_mutex);
if (chunk->dead) {
mutex_unlock(&chunk->watch.inode->inotify_mutex);
- return;
+ goto out;
}
owner = p->owner;
@@ -221,7 +257,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
inotify_evict_watch(&chunk->watch);
mutex_unlock(&chunk->watch.inode->inotify_mutex);
put_inotify_watch(&chunk->watch);
- return;
+ goto out;
}
new = alloc_chunk(size);
@@ -263,7 +299,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
inotify_evict_watch(&chunk->watch);
mutex_unlock(&chunk->watch.inode->inotify_mutex);
put_inotify_watch(&chunk->watch);
- return;
+ goto out;
Fallback:
// do the best we can
@@ -277,6 +313,9 @@ Fallback:
put_tree(owner);
spin_unlock(&hash_lock);
mutex_unlock(&chunk->watch.inode->inotify_mutex);
+out:
+ unpin_inotify_watch(&chunk->watch);
+ spin_lock(&hash_lock);
}
static int create_chunk(struct inode *inode, struct audit_tree *tree)
@@ -387,13 +426,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
return 0;
}
-static struct audit_chunk *find_chunk(struct node *p)
-{
- int index = p->index & ~(1U<<31);
- p -= index;
- return container_of(p, struct audit_chunk, owners[0]);
-}
-
static void kill_rules(struct audit_tree *tree)
{
struct audit_krule *rule, *next;
@@ -431,17 +463,10 @@ static void prune_one(struct audit_tree *victim)
spin_lock(&hash_lock);
while (!list_empty(&victim->chunks)) {
struct node *p;
- struct audit_chunk *chunk;
p = list_entry(victim->chunks.next, struct node, list);
- chunk = find_chunk(p);
- get_inotify_watch(&chunk->watch);
- spin_unlock(&hash_lock);
-
- untag_chunk(chunk, p);
- put_inotify_watch(&chunk->watch);
- spin_lock(&hash_lock);
+ untag_chunk(p);
}
spin_unlock(&hash_lock);
put_tree(victim);
@@ -469,7 +494,6 @@ static void trim_marked(struct audit_tree *tree)
while (!list_empty(&tree->chunks)) {
struct node *node;
- struct audit_chunk *chunk;
node = list_entry(tree->chunks.next, struct node, list);
@@ -477,14 +501,7 @@ static void trim_marked(struct audit_tree *tree)
if (!(node->index & (1U<<31)))
break;
- chunk = find_chunk(node);
- get_inotify_watch(&chunk->watch);
- spin_unlock(&hash_lock);
-
- untag_chunk(chunk, node);
-
- put_inotify_watch(&chunk->watch);
- spin_lock(&hash_lock);
+ untag_chunk(node);
}
if (!tree->root && !tree->goner) {
tree->goner = 1;
@@ -532,7 +549,7 @@ void audit_trim_trees(void)
list_add(&cursor, &tree_list);
while (cursor.next != &tree_list) {
struct audit_tree *tree;
- struct nameidata nd;
+ struct path path;
struct vfsmount *root_mnt;
struct node *node;
struct list_head list;
@@ -544,12 +561,12 @@ void audit_trim_trees(void)
list_add(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
- err = path_lookup(tree->pathname, 0, &nd);
+ err = kern_path(tree->pathname, 0, &path);
if (err)
goto skip_it;
- root_mnt = collect_mounts(nd.path.mnt, nd.path.dentry);
- path_put(&nd.path);
+ root_mnt = collect_mounts(path.mnt, path.dentry);
+ path_put(&path);
if (!root_mnt)
goto skip_it;
@@ -580,19 +597,19 @@ skip_it:
}
static int is_under(struct vfsmount *mnt, struct dentry *dentry,
- struct nameidata *nd)
+ struct path *path)
{
- if (mnt != nd->path.mnt) {
+ if (mnt != path->mnt) {
for (;;) {
if (mnt->mnt_parent == mnt)
return 0;
- if (mnt->mnt_parent == nd->path.mnt)
+ if (mnt->mnt_parent == path->mnt)
break;
mnt = mnt->mnt_parent;
}
dentry = mnt->mnt_mountpoint;
}
- return is_subdir(dentry, nd->path.dentry);
+ return is_subdir(dentry, path->dentry);
}
int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
@@ -618,7 +635,7 @@ void audit_put_tree(struct audit_tree *tree)
int audit_add_tree_rule(struct audit_krule *rule)
{
struct audit_tree *seed = rule->tree, *tree;
- struct nameidata nd;
+ struct path path;
struct vfsmount *mnt, *p;
struct list_head list;
int err;
@@ -637,11 +654,11 @@ int audit_add_tree_rule(struct audit_krule *rule)
/* do not set rule->tree yet */
mutex_unlock(&audit_filter_mutex);
- err = path_lookup(tree->pathname, 0, &nd);
+ err = kern_path(tree->pathname, 0, &path);
if (err)
goto Err;
- mnt = collect_mounts(nd.path.mnt, nd.path.dentry);
- path_put(&nd.path);
+ mnt = collect_mounts(path.mnt, path.dentry);
+ path_put(&path);
if (!mnt) {
err = -ENOMEM;
goto Err;
@@ -690,29 +707,29 @@ int audit_tag_tree(char *old, char *new)
{
struct list_head cursor, barrier;
int failed = 0;
- struct nameidata nd;
+ struct path path;
struct vfsmount *tagged;
struct list_head list;
struct vfsmount *mnt;
struct dentry *dentry;
int err;
- err = path_lookup(new, 0, &nd);
+ err = kern_path(new, 0, &path);
if (err)
return err;
- tagged = collect_mounts(nd.path.mnt, nd.path.dentry);
- path_put(&nd.path);
+ tagged = collect_mounts(path.mnt, path.dentry);
+ path_put(&path);
if (!tagged)
return -ENOMEM;
- err = path_lookup(old, 0, &nd);
+ err = kern_path(old, 0, &path);
if (err) {
drop_collected_mounts(tagged);
return err;
}
- mnt = mntget(nd.path.mnt);
- dentry = dget(nd.path.dentry);
- path_put(&nd.path);
+ mnt = mntget(path.mnt);
+ dentry = dget(path.dentry);
+ path_put(&path);
if (dentry == tagged->mnt_root && dentry == mnt->mnt_root)
follow_up(&mnt, &dentry);
@@ -733,7 +750,7 @@ int audit_tag_tree(char *old, char *new)
list_add(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
- err = path_lookup(tree->pathname, 0, &nd);
+ err = kern_path(tree->pathname, 0, &path);
if (err) {
put_tree(tree);
mutex_lock(&audit_filter_mutex);
@@ -741,15 +758,15 @@ int audit_tag_tree(char *old, char *new)
}
spin_lock(&vfsmount_lock);
- if (!is_under(mnt, dentry, &nd)) {
+ if (!is_under(mnt, dentry, &path)) {
spin_unlock(&vfsmount_lock);
- path_put(&nd.path);
+ path_put(&path);
put_tree(tree);
mutex_lock(&audit_filter_mutex);
continue;
}
spin_unlock(&vfsmount_lock);
- path_put(&nd.path);
+ path_put(&path);
list_for_each_entry(p, &list, mnt_list) {
failed = tag_chunk(p->mnt_root->d_inode, tree);
@@ -878,7 +895,7 @@ static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
static void destroy_watch(struct inotify_watch *watch)
{
struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
- free_chunk(chunk);
+ call_rcu(&chunk->head, __put_chunk);
}
static const struct inotify_operations rtree_inotify_ops = {
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index b7d354e2b0e..9fd85a4640a 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1094,8 +1094,8 @@ static void audit_inotify_unregister(struct list_head *in_list)
list_for_each_entry_safe(p, n, in_list, ilist) {
list_del(&p->ilist);
inotify_rm_watch(audit_ih, &p->wdata);
- /* the put matching the get in audit_do_del_rule() */
- put_inotify_watch(&p->wdata);
+ /* the unpin matching the pin in audit_do_del_rule() */
+ unpin_inotify_watch(&p->wdata);
}
}
@@ -1389,9 +1389,13 @@ static inline int audit_del_rule(struct audit_entry *entry,
/* Put parent on the inotify un-registration
* list. Grab a reference before releasing
* audit_filter_mutex, to be released in
- * audit_inotify_unregister(). */
- list_add(&parent->ilist, &inotify_list);
- get_inotify_watch(&parent->wdata);
+ * audit_inotify_unregister().
+ * If filesystem is going away, just leave
+ * the sucker alone, eviction will take
+ * care of it.
+ */
+ if (pin_inotify_watch(&parent->wdata))
+ list_add(&parent->ilist, &inotify_list);
}
}
}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 046c1609606..fe00b3b983a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2039,10 +2039,13 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
struct cgroup *cgrp;
struct cgroup_iter it;
struct task_struct *tsk;
+
/*
- * Validate dentry by checking the superblock operations
+ * Validate dentry by checking the superblock operations,
+ * and make sure it's a directory.
*/
- if (dentry->d_sb->s_op != &cgroup_ops)
+ if (dentry->d_sb->s_op != &cgroup_ops ||
+ !S_ISDIR(dentry->d_inode->i_mode))
goto err;
ret = 0;
@@ -2104,7 +2107,7 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
down_read(&cgrp->pids_mutex);
if (pid) {
int end = cgrp->pids_length;
- int i;
+
while (index < end) {
int mid = (index + end) / 2;
if (cgrp->tasks_pids[mid] == pid) {
@@ -2472,10 +2475,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
-
- parent = cgrp->parent;
- root = cgrp->root;
- sb = root->sb;
+ mutex_unlock(&cgroup_mutex);
/*
* Call pre_destroy handlers of subsys. Notify subsystems
@@ -2483,7 +2483,14 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
*/
cgroup_call_pre_destroy(cgrp);
- if (cgroup_has_css_refs(cgrp)) {
+ mutex_lock(&cgroup_mutex);
+ parent = cgrp->parent;
+ root = cgrp->root;
+ sb = root->sb;
+
+ if (atomic_read(&cgrp->count)
+ || !list_empty(&cgrp->children)
+ || cgroup_has_css_refs(cgrp)) {
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
@@ -2497,7 +2504,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
list_del(&cgrp->sibling);
spin_lock(&cgrp->dentry->d_lock);
d = dget(cgrp->dentry);
- cgrp->dentry = NULL;
spin_unlock(&d->d_lock);
cgroup_d_remove_dir(d);
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e9505695449..fb249e2bcad 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -162,9 +162,13 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
struct task_struct *task)
{
struct freezer *freezer;
- int retval;
- /* Anything frozen can't move or be moved to/from */
+ /*
+ * Anything frozen can't move or be moved to/from.
+ *
+ * Since orig_freezer->state == FROZEN means that @task has been
+ * frozen, so it's sufficient to check the latter condition.
+ */
if (is_task_frozen_enough(task))
return -EBUSY;
@@ -173,25 +177,31 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
if (freezer->state == CGROUP_FROZEN)
return -EBUSY;
- retval = 0;
- task_lock(task);
- freezer = task_freezer(task);
- if (freezer->state == CGROUP_FROZEN)
- retval = -EBUSY;
- task_unlock(task);
- return retval;
+ return 0;
}
static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
{
struct freezer *freezer;
- task_lock(task);
+ /*
+ * No lock is needed, since the task isn't on tasklist yet,
+ * so it can't be moved to another cgroup, which means the
+ * freezer won't be removed and will be valid during this
+ * function call.
+ */
freezer = task_freezer(task);
- task_unlock(task);
- BUG_ON(freezer->state == CGROUP_FROZEN);
+ /*
+ * The root cgroup is non-freezable, so we can skip the
+ * following check.
+ */
+ if (!freezer->css.cgroup->parent)
+ return;
+
spin_lock_irq(&freezer->lock);
+ BUG_ON(freezer->state == CGROUP_FROZEN);
+
/* Locking avoids race with FREEZING -> THAWED transitions. */
if (freezer->state == CGROUP_FREEZING)
freeze_task(task, true);
@@ -276,25 +286,18 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
return num_cant_freeze_now ? -EBUSY : 0;
}
-static int unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
{
struct cgroup_iter it;
struct task_struct *task;
cgroup_iter_start(cgroup, &it);
while ((task = cgroup_iter_next(cgroup, &it))) {
- int do_wake;
-
- task_lock(task);
- do_wake = __thaw_process(task);
- task_unlock(task);
- if (do_wake)
- wake_up_process(task);
+ thaw_process(task);
}
cgroup_iter_end(cgroup, &it);
- freezer->state = CGROUP_THAWED;
- return 0;
+ freezer->state = CGROUP_THAWED;
}
static int freezer_change_state(struct cgroup *cgroup,
@@ -304,27 +307,22 @@ static int freezer_change_state(struct cgroup *cgroup,
int retval = 0;
freezer = cgroup_freezer(cgroup);
+
spin_lock_irq(&freezer->lock);
+
update_freezer_state(cgroup, freezer);
if (goal_state == freezer->state)
goto out;
- switch (freezer->state) {
+
+ switch (goal_state) {
case CGROUP_THAWED:
- retval = try_to_freeze_cgroup(cgroup, freezer);
+ unfreeze_cgroup(cgroup, freezer);
break;
- case CGROUP_FREEZING:
- if (goal_state == CGROUP_FROZEN) {
- /* Userspace is retrying after
- * "/bin/echo FROZEN > freezer.state" returned -EBUSY */
- retval = try_to_freeze_cgroup(cgroup, freezer);
- break;
- }
- /* state == FREEZING and goal_state == THAWED, so unfreeze */
case CGROUP_FROZEN:
- retval = unfreeze_cgroup(cgroup, freezer);
+ retval = try_to_freeze_cgroup(cgroup, freezer);
break;
default:
- break;
+ BUG();
}
out:
spin_unlock_irq(&freezer->lock);
@@ -344,7 +342,7 @@ static int freezer_write(struct cgroup *cgroup,
else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
goal_state = CGROUP_FROZEN;
else
- return -EIO;
+ return -EINVAL;
if (!cgroup_lock_live_group(cgroup))
return -ENODEV;
@@ -363,6 +361,8 @@ static struct cftype files[] = {
static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
{
+ if (!cgroup->parent)
+ return 0;
return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 86d49045dae..8ea32e8d68b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -462,7 +462,7 @@ out:
* It must be called by the arch code on the new cpu, before the new cpu
* enables interrupts and before the "boot" cpu returns from __cpu_up().
*/
-void notify_cpu_starting(unsigned int cpu)
+void __cpuinit notify_cpu_starting(unsigned int cpu)
{
unsigned long val = CPU_STARTING;
@@ -499,3 +499,6 @@ const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
#endif
};
EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
+
+const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
+EXPORT_SYMBOL(cpu_all_bits);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3e00526f52e..96c0ba13b8c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -36,6 +36,7 @@
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
+#include <linux/memory.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/namei.h>
@@ -584,10 +585,9 @@ static int generate_sched_domains(cpumask_t **domains,
int i, j, k; /* indices for partition finding loops */
cpumask_t *doms; /* resulting partition; i.e. sched domains */
struct sched_domain_attr *dattr; /* attributes for custom domains */
- int ndoms; /* number of sched domains in result */
+ int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] cpumask_t slot */
- ndoms = 0;
doms = NULL;
dattr = NULL;
csa = NULL;
@@ -674,10 +674,8 @@ restart:
* Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
*/
doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
- if (!doms) {
- ndoms = 0;
+ if (!doms)
goto done;
- }
/*
* The rest of the code, including the scheduler, can deal with
@@ -732,6 +730,13 @@ restart:
done:
kfree(csa);
+ /*
+ * Fallback to the default domain if kmalloc() failed.
+ * See comments in partition_sched_domains().
+ */
+ if (doms == NULL)
+ ndoms = 1;
+
*domains = doms;
*attributes = dattr;
return ndoms;
@@ -2011,12 +2016,23 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
* Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
* See also the previous routine cpuset_track_online_cpus().
*/
-void cpuset_track_online_nodes(void)
+static int cpuset_track_online_nodes(struct notifier_block *self,
+ unsigned long action, void *arg)
{
cgroup_lock();
- top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
- scan_for_empty_cpusets(&top_cpuset);
+ switch (action) {
+ case MEM_ONLINE:
+ top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+ break;
+ case MEM_OFFLINE:
+ top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+ scan_for_empty_cpusets(&top_cpuset);
+ break;
+ default:
+ break;
+ }
cgroup_unlock();
+ return NOTIFY_OK;
}
#endif
@@ -2032,6 +2048,7 @@ void __init cpuset_init_smp(void)
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
hotcpu_notifier(cpuset_track_online_cpus, 0);
+ hotplug_memory_notifier(cpuset_track_online_nodes, 10);
}
/**
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 0d407e88673..0511716e942 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -12,7 +12,9 @@
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/personality.h>
+#include <linux/proc_fs.h>
#include <linux/sched.h>
+#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/sysctl.h>
#include <linux/types.h>
@@ -173,20 +175,39 @@ __set_personality(u_long personality)
return 0;
}
-int
-get_exec_domain_list(char *page)
+#ifdef CONFIG_PROC_FS
+static int execdomains_proc_show(struct seq_file *m, void *v)
{
struct exec_domain *ep;
- int len = 0;
read_lock(&exec_domains_lock);
- for (ep = exec_domains; ep && len < PAGE_SIZE - 80; ep = ep->next)
- len += sprintf(page + len, "%d-%d\t%-16s\t[%s]\n",
+ for (ep = exec_domains; ep; ep = ep->next)
+ seq_printf(m, "%d-%d\t%-16s\t[%s]\n",
ep->pers_low, ep->pers_high, ep->name,
module_name(ep->module));
read_unlock(&exec_domains_lock);
- return (len);
+ return 0;
+}
+
+static int execdomains_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, execdomains_proc_show, NULL);
+}
+
+static const struct file_operations execdomains_proc_fops = {
+ .open = execdomains_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init proc_execdomains_init(void)
+{
+ proc_create("execdomains", 0, NULL, &execdomains_proc_fops);
+ return 0;
}
+module_init(proc_execdomains_init);
+#endif
asmlinkage long
sys_personality(u_long personality)
diff --git a/kernel/exit.c b/kernel/exit.c
index 80137a5d946..e5ae36ebe8a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -40,7 +40,6 @@
#include <linux/cn_proc.h>
#include <linux/mutex.h>
#include <linux/futex.h>
-#include <linux/compat.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
@@ -54,6 +53,10 @@
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
+DEFINE_TRACE(sched_process_free);
+DEFINE_TRACE(sched_process_exit);
+DEFINE_TRACE(sched_process_wait);
+
static void exit_mm(struct task_struct * tsk);
static inline int task_detached(struct task_struct *p)
@@ -141,6 +144,11 @@ static void __exit_signal(struct task_struct *tsk)
if (sig) {
flush_sigqueue(&sig->shared_pending);
taskstats_tgid_free(sig);
+ /*
+ * Make sure ->signal can't go away under rq->lock,
+ * see account_group_exec_runtime().
+ */
+ task_rq_unlock_wait(tsk);
__cleanup_signal(sig);
}
}
@@ -1054,14 +1062,6 @@ NORET_TYPE void do_exit(long code)
exit_itimers(tsk->signal);
}
acct_collect(code, group_dead);
-#ifdef CONFIG_FUTEX
- if (unlikely(tsk->robust_list))
- exit_robust_list(tsk);
-#ifdef CONFIG_COMPAT
- if (unlikely(tsk->compat_robust_list))
- compat_exit_robust_list(tsk);
-#endif
-#endif
if (group_dead)
tty_audit_exit();
if (unlikely(tsk->audit_context))
@@ -1127,7 +1127,6 @@ NORET_TYPE void do_exit(long code)
preempt_disable();
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
-
schedule();
BUG();
/* Avoid "noreturn function does return". */
diff --git a/kernel/fork.c b/kernel/fork.c
index 4d093552dd6..5f82a999c03 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,12 +40,14 @@
#include <linux/jiffies.h>
#include <linux/tracehook.h>
#include <linux/futex.h>
+#include <linux/compat.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/memcontrol.h>
+#include <linux/ftrace.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/acct.h>
@@ -79,6 +81,8 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
+DEFINE_TRACE(sched_process_fork);
+
int nr_processes(void)
{
int cpu;
@@ -136,6 +140,7 @@ void free_task(struct task_struct *tsk)
prop_local_destroy_single(&tsk->dirties);
free_thread_info(tsk->stack);
rt_mutex_debug_task_free(tsk);
+ ftrace_graph_exit_task(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -519,6 +524,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
struct completion *vfork_done = tsk->vfork_done;
+ /* Get rid of any futexes when releasing the mm */
+#ifdef CONFIG_FUTEX
+ if (unlikely(tsk->robust_list))
+ exit_robust_list(tsk);
+#ifdef CONFIG_COMPAT
+ if (unlikely(tsk->compat_robust_list))
+ compat_exit_robust_list(tsk);
+#endif
+#endif
+
/* Get rid of any cached register state */
deactivate_mm(tsk, mm);
@@ -1018,6 +1033,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->prev_utime = cputime_zero;
p->prev_stime = cputime_zero;
+ p->default_timer_slack_ns = current->timer_slack_ns;
+
#ifdef CONFIG_DETECT_SOFTLOCKUP
p->last_switch_count = 0;
p->last_switch_timestamp = 0;
@@ -1254,6 +1271,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
total_forks++;
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
+ ftrace_graph_init_task(p);
proc_fork_connector(p);
cgroup_post_fork(p);
return p;
diff --git a/kernel/freezer.c b/kernel/freezer.c
index ba6248b323e..2f4936cf708 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -121,16 +121,7 @@ void cancel_freezing(struct task_struct *p)
}
}
-/*
- * Wake up a frozen process
- *
- * task_lock() is needed to prevent the race with refrigerator() which may
- * occur if the freezing of tasks fails. Namely, without the lock, if the
- * freezing of tasks failed, thaw_tasks() might have run before a task in
- * refrigerator() could call frozen_process(), in which case the task would be
- * frozen and no one would thaw it.
- */
-int __thaw_process(struct task_struct *p)
+static int __thaw_process(struct task_struct *p)
{
if (frozen(p)) {
p->flags &= ~PF_FROZEN;
@@ -140,6 +131,15 @@ int __thaw_process(struct task_struct *p)
return 0;
}
+/*
+ * Wake up a frozen process
+ *
+ * task_lock() is needed to prevent the race with refrigerator() which may
+ * occur if the freezing of tasks fails. Namely, without the lock, if the
+ * freezing of tasks failed, thaw_tasks() might have run before a task in
+ * refrigerator() could call frozen_process(), in which case the task would be
+ * frozen and no one would thaw it.
+ */
int thaw_process(struct task_struct *p)
{
task_lock(p);
diff --git a/kernel/futex.c b/kernel/futex.c
index 7d1136e97c1..8af10027514 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1296,13 +1296,16 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
if (!abs_time)
schedule();
else {
+ unsigned long slack;
+ slack = current->timer_slack_ns;
+ if (rt_task(current))
+ slack = 0;
hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS);
hrtimer_init_sleeper(&t, current);
- t.timer.expires = *abs_time;
+ hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
- hrtimer_start(&t.timer, t.timer.expires,
- HRTIMER_MODE_ABS);
+ hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
if (!hrtimer_active(&t.timer))
t.task = NULL;
@@ -1404,7 +1407,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
HRTIMER_MODE_ABS);
hrtimer_init_sleeper(to, current);
- to->timer.expires = *time;
+ hrtimer_set_expires(&to->timer, *time);
}
q.pi_state = NULL;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 95978f48e03..47e63349d1b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -517,7 +517,7 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
if (!base->first)
continue;
timer = rb_entry(base->first, struct hrtimer, node);
- expires = ktime_sub(timer->expires, base->offset);
+ expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
if (expires.tv64 < cpu_base->expires_next.tv64)
cpu_base->expires_next = expires;
}
@@ -539,10 +539,10 @@ static int hrtimer_reprogram(struct hrtimer *timer,
struct hrtimer_clock_base *base)
{
ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
- ktime_t expires = ktime_sub(timer->expires, base->offset);
+ ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
int res;
- WARN_ON_ONCE(timer->expires.tv64 < 0);
+ WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
/*
* When the callback is running, we do not reprogram the clock event
@@ -664,14 +664,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
/* Timer is expired, act upon the callback mode */
switch(timer->cb_mode) {
- case HRTIMER_CB_IRQSAFE_NO_RESTART:
- debug_hrtimer_deactivate(timer);
- /*
- * We can call the callback from here. No restart
- * happens, so no danger of recursion
- */
- BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
- return 1;
case HRTIMER_CB_IRQSAFE_PERCPU:
case HRTIMER_CB_IRQSAFE_UNLOCKED:
/*
@@ -683,7 +675,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
*/
debug_hrtimer_deactivate(timer);
return 1;
- case HRTIMER_CB_IRQSAFE:
case HRTIMER_CB_SOFTIRQ:
/*
* Move everything else into the softirq pending list !
@@ -795,7 +786,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
u64 orun = 1;
ktime_t delta;
- delta = ktime_sub(now, timer->expires);
+ delta = ktime_sub(now, hrtimer_get_expires(timer));
if (delta.tv64 < 0)
return 0;
@@ -807,8 +798,8 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
s64 incr = ktime_to_ns(interval);
orun = ktime_divns(delta, incr);
- timer->expires = ktime_add_ns(timer->expires, incr * orun);
- if (timer->expires.tv64 > now.tv64)
+ hrtimer_add_expires_ns(timer, incr * orun);
+ if (hrtimer_get_expires_tv64(timer) > now.tv64)
return orun;
/*
* This (and the ktime_add() below) is the
@@ -816,7 +807,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
*/
orun++;
}
- timer->expires = ktime_add_safe(timer->expires, interval);
+ hrtimer_add_expires(timer, interval);
return orun;
}
@@ -848,7 +839,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
* We dont care about collisions. Nodes with
* the same expiry time stay together.
*/
- if (timer->expires.tv64 < entry->expires.tv64) {
+ if (hrtimer_get_expires_tv64(timer) <
+ hrtimer_get_expires_tv64(entry)) {
link = &(*link)->rb_left;
} else {
link = &(*link)->rb_right;
@@ -945,9 +937,10 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
}
/**
- * hrtimer_start - (re)start an relative timer on the current CPU
+ * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
* @timer: the timer to be added
* @tim: expiry time
+ * @delta_ns: "slack" range for the timer
* @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
*
* Returns:
@@ -955,7 +948,8 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
* 1 when the timer was active
*/
int
-hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
+ const enum hrtimer_mode mode)
{
struct hrtimer_clock_base *base, *new_base;
unsigned long flags;
@@ -983,7 +977,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
#endif
}
- timer->expires = tim;
+ hrtimer_set_expires_range_ns(timer, tim, delta_ns);
timer_stats_hrtimer_set_start_info(timer);
@@ -1016,8 +1010,26 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
return ret;
}
+EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
+
+/**
+ * hrtimer_start - (re)start an hrtimer on the current CPU
+ * @timer: the timer to be added
+ * @tim: expiry time
+ * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+ *
+ * Returns:
+ * 0 on success
+ * 1 when the timer was active
+ */
+int
+hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+{
+ return hrtimer_start_range_ns(timer, tim, 0, mode);
+}
EXPORT_SYMBOL_GPL(hrtimer_start);
+
/**
* hrtimer_try_to_cancel - try to deactivate a timer
* @timer: hrtimer to stop
@@ -1077,7 +1089,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
ktime_t rem;
base = lock_hrtimer_base(timer, &flags);
- rem = ktime_sub(timer->expires, base->get_time());
+ rem = hrtimer_expires_remaining(timer);
unlock_hrtimer_base(timer, &flags);
return rem;
@@ -1109,7 +1121,7 @@ ktime_t hrtimer_get_next_event(void)
continue;
timer = rb_entry(base->first, struct hrtimer, node);
- delta.tv64 = timer->expires.tv64;
+ delta.tv64 = hrtimer_get_expires_tv64(timer);
delta = ktime_sub(delta, base->get_time());
if (delta.tv64 < mindelta.tv64)
mindelta.tv64 = delta.tv64;
@@ -1188,6 +1200,7 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
enum hrtimer_restart (*fn)(struct hrtimer *);
struct hrtimer *timer;
int restart;
+ int emulate_hardirq_ctx = 0;
timer = list_entry(cpu_base->cb_pending.next,
struct hrtimer, cb_entry);
@@ -1196,10 +1209,24 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
timer_stats_account_hrtimer(timer);
fn = timer->function;
+ /*
+ * A timer might have been added to the cb_pending list
+ * when it was migrated during a cpu-offline operation.
+ * Emulate hardirq context for such timers.
+ */
+ if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
+ timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED)
+ emulate_hardirq_ctx = 1;
+
__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
spin_unlock_irq(&cpu_base->lock);
- restart = fn(timer);
+ if (unlikely(emulate_hardirq_ctx)) {
+ local_irq_disable();
+ restart = fn(timer);
+ local_irq_enable();
+ } else
+ restart = fn(timer);
spin_lock_irq(&cpu_base->lock);
@@ -1310,10 +1337,23 @@ void hrtimer_interrupt(struct clock_event_device *dev)
timer = rb_entry(node, struct hrtimer, node);
- if (basenow.tv64 < timer->expires.tv64) {
+ /*
+ * The immediate goal for using the softexpires is
+ * minimizing wakeups, not running timers at the
+ * earliest interrupt after their soft expiration.
+ * This allows us to avoid using a Priority Search
+ * Tree, which can answer a stabbing querry for
+ * overlapping intervals and instead use the simple
+ * BST we already have.
+ * We don't add extra wakeups by delaying timers that
+ * are right-of a not yet expired timer, because that
+ * timer will have to trigger a wakeup anyway.
+ */
+
+ if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
ktime_t expires;
- expires = ktime_sub(timer->expires,
+ expires = ktime_sub(hrtimer_get_expires(timer),
base->offset);
if (expires.tv64 < expires_next.tv64)
expires_next = expires;
@@ -1349,6 +1389,30 @@ void hrtimer_interrupt(struct clock_event_device *dev)
raise_softirq(HRTIMER_SOFTIRQ);
}
+/**
+ * hrtimer_peek_ahead_timers -- run soft-expired timers now
+ *
+ * hrtimer_peek_ahead_timers will peek at the timer queue of
+ * the current cpu and check if there are any timers for which
+ * the soft expires time has passed. If any such timers exist,
+ * they are run immediately and then removed from the timer queue.
+ *
+ */
+void hrtimer_peek_ahead_timers(void)
+{
+ struct tick_device *td;
+ unsigned long flags;
+
+ if (!hrtimer_hres_active())
+ return;
+
+ local_irq_save(flags);
+ td = &__get_cpu_var(tick_cpu_device);
+ if (td && td->evtdev)
+ hrtimer_interrupt(td->evtdev);
+ local_irq_restore(flags);
+}
+
static void run_hrtimer_softirq(struct softirq_action *h)
{
run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
@@ -1414,7 +1478,8 @@ void hrtimer_run_queues(void)
struct hrtimer *timer;
timer = rb_entry(node, struct hrtimer, node);
- if (base->softirq_time.tv64 <= timer->expires.tv64)
+ if (base->softirq_time.tv64 <=
+ hrtimer_get_expires_tv64(timer))
break;
if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
@@ -1462,7 +1527,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
do {
set_current_state(TASK_INTERRUPTIBLE);
- hrtimer_start(&t->timer, t->timer.expires, mode);
+ hrtimer_start_expires(&t->timer, mode);
if (!hrtimer_active(&t->timer))
t->task = NULL;
@@ -1484,7 +1549,7 @@ static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp)
struct timespec rmt;
ktime_t rem;
- rem = ktime_sub(timer->expires, timer->base->get_time());
+ rem = hrtimer_expires_remaining(timer);
if (rem.tv64 <= 0)
return 0;
rmt = ktime_to_timespec(rem);
@@ -1503,7 +1568,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
hrtimer_init_on_stack(&t.timer, restart->nanosleep.index,
HRTIMER_MODE_ABS);
- t.timer.expires.tv64 = restart->nanosleep.expires;
+ hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
if (do_nanosleep(&t, HRTIMER_MODE_ABS))
goto out;
@@ -1528,9 +1593,14 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
struct restart_block *restart;
struct hrtimer_sleeper t;
int ret = 0;
+ unsigned long slack;
+
+ slack = current->timer_slack_ns;
+ if (rt_task(current))
+ slack = 0;
hrtimer_init_on_stack(&t.timer, clockid, mode);
- t.timer.expires = timespec_to_ktime(*rqtp);
+ hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
if (do_nanosleep(&t, mode))
goto out;
@@ -1550,7 +1620,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
restart->fn = hrtimer_nanosleep_restart;
restart->nanosleep.index = t.timer.base->index;
restart->nanosleep.rmtp = rmtp;
- restart->nanosleep.expires = t.timer.expires.tv64;
+ restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
ret = -ERESTART_RESTARTBLOCK;
out:
@@ -1752,3 +1822,103 @@ void __init hrtimers_init(void)
#endif
}
+/**
+ * schedule_hrtimeout_range - sleep until timeout
+ * @expires: timeout value (ktime_t)
+ * @delta: slack in expires timeout (ktime_t)
+ * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ *
+ * Make the current task sleep until the given expiry time has
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * The @delta argument gives the kernel the freedom to schedule the
+ * actual wakeup to a time that is both power and performance friendly.
+ * The kernel give the normal best effort behavior for "@expires+@delta",
+ * but may decide to fire the timer earlier, but no earlier than @expires.
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
+ * pass before the routine returns.
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task.
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Returns 0 when the timer has expired otherwise -EINTR
+ */
+int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
+ const enum hrtimer_mode mode)
+{
+ struct hrtimer_sleeper t;
+
+ /*
+ * Optimize when a zero timeout value is given. It does not
+ * matter whether this is an absolute or a relative time.
+ */
+ if (expires && !expires->tv64) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ /*
+ * A NULL parameter means "inifinte"
+ */
+ if (!expires) {
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ return -EINTR;
+ }
+
+ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode);
+ hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
+
+ hrtimer_init_sleeper(&t, current);
+
+ hrtimer_start_expires(&t.timer, mode);
+ if (!hrtimer_active(&t.timer))
+ t.task = NULL;
+
+ if (likely(t.task))
+ schedule();
+
+ hrtimer_cancel(&t.timer);
+ destroy_hrtimer_on_stack(&t.timer);
+
+ __set_current_state(TASK_RUNNING);
+
+ return !t.task ? 0 : -EINTR;
+}
+EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
+
+/**
+ * schedule_hrtimeout - sleep until timeout
+ * @expires: timeout value (ktime_t)
+ * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ *
+ * Make the current task sleep until the given expiry time has
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
+ * pass before the routine returns.
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task.
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Returns 0 when the timer has expired otherwise -EINTR
+ */
+int __sched schedule_hrtimeout(ktime_t *expires,
+ const enum hrtimer_mode mode)
+{
+ return schedule_hrtimeout_range(expires, 0, mode);
+}
+EXPORT_SYMBOL_GPL(schedule_hrtimeout);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 4895fde4eb9..10b5092e9bf 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -76,6 +76,7 @@ void dynamic_irq_cleanup(unsigned int irq)
desc->chip_data = NULL;
desc->handle_irq = handle_bad_irq;
desc->chip = &no_irq_chip;
+ desc->name = NULL;
spin_unlock_irqrestore(&desc->lock, flags);
}
@@ -127,7 +128,7 @@ int set_irq_type(unsigned int irq, unsigned int type)
return 0;
spin_lock_irqsave(&desc->lock, flags);
- ret = __irq_set_trigger(desc, irq, flags);
+ ret = __irq_set_trigger(desc, irq, type);
spin_unlock_irqrestore(&desc->lock, flags);
return ret;
}
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index c9767e64198..64c1c7253da 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -25,6 +25,8 @@ static inline void unregister_handler_proc(unsigned int irq,
struct irqaction *action) { }
#endif
+extern int irq_select_affinity_usr(unsigned int irq);
+
/*
* Debugging printout:
*/
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c498a1b8c62..801addda3c4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -82,24 +82,27 @@ int irq_can_set_affinity(unsigned int irq)
int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
{
struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
if (!desc->chip->set_affinity)
return -EINVAL;
+ spin_lock_irqsave(&desc->lock, flags);
+
#ifdef CONFIG_GENERIC_PENDING_IRQ
if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
- unsigned long flags;
-
- spin_lock_irqsave(&desc->lock, flags);
desc->affinity = cpumask;
desc->chip->set_affinity(irq, cpumask);
- spin_unlock_irqrestore(&desc->lock, flags);
- } else
- set_pending_irq(irq, cpumask);
+ } else {
+ desc->status |= IRQ_MOVE_PENDING;
+ desc->pending_mask = cpumask;
+ }
#else
desc->affinity = cpumask;
desc->chip->set_affinity(irq, cpumask);
#endif
+ desc->status |= IRQ_AFFINITY_SET;
+ spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
@@ -107,24 +110,59 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
/*
* Generic version of the affinity autoselector.
*/
-int irq_select_affinity(unsigned int irq)
+int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
{
cpumask_t mask;
- struct irq_desc *desc;
if (!irq_can_set_affinity(irq))
return 0;
cpus_and(mask, cpu_online_map, irq_default_affinity);
- desc = irq_to_desc(irq);
+ /*
+ * Preserve an userspace affinity setup, but make sure that
+ * one of the targets is online.
+ */
+ if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
+ if (cpus_intersects(desc->affinity, cpu_online_map))
+ mask = desc->affinity;
+ else
+ desc->status &= ~IRQ_AFFINITY_SET;
+ }
+
desc->affinity = mask;
desc->chip->set_affinity(irq, mask);
return 0;
}
+#else
+static inline int do_irq_select_affinity(unsigned int irq, struct irq_desc *d)
+{
+ return irq_select_affinity(irq);
+}
#endif
+/*
+ * Called when affinity is set via /proc/irq
+ */
+int irq_select_affinity_usr(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ ret = do_irq_select_affinity(irq, desc);
+ spin_unlock_irqrestore(&desc->lock, flags);
+
+ return ret;
+}
+
+#else
+static inline int do_irq_select_affinity(int irq, struct irq_desc *desc)
+{
+ return 0;
+}
#endif
/**
@@ -327,7 +365,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
* IRQF_TRIGGER_* but the PIC does not support multiple
* flow-types?
*/
- pr_warning("No set_type function for IRQ %d (%s)\n", irq,
+ pr_debug("No set_type function for IRQ %d (%s)\n", irq,
chip ? (chip->name ? : "unknown") : "unknown");
return 0;
}
@@ -445,8 +483,12 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
/* Undo nested disables: */
desc->depth = 1;
+ /* Exclude IRQ from balancing if requested */
+ if (new->flags & IRQF_NOBALANCING)
+ desc->status |= IRQ_NO_BALANCING;
+
/* Set default affinity mask once everything is setup */
- irq_select_affinity(irq);
+ do_irq_select_affinity(irq, desc);
} else if ((new->flags & IRQF_TRIGGER_MASK)
&& (new->flags & IRQF_TRIGGER_MASK)
@@ -459,10 +501,6 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
*p = new;
- /* Exclude IRQ from balancing */
- if (new->flags & IRQF_NOBALANCING)
- desc->status |= IRQ_NO_BALANCING;
-
/* Reset broken irq detection when installing new handler */
desc->irq_count = 0;
desc->irqs_unhandled = 0;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 90b920d3f52..9db681d9581 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,17 +1,6 @@
#include <linux/irq.h>
-void set_pending_irq(unsigned int irq, cpumask_t mask)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
-
- spin_lock_irqsave(&desc->lock, flags);
- desc->status |= IRQ_MOVE_PENDING;
- desc->pending_mask = mask;
- spin_unlock_irqrestore(&desc->lock, flags);
-}
-
void move_masked_irq(int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index fac014a81b2..d257e7d6a8a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -62,7 +62,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
if (!cpus_intersects(new_value, cpu_online_map))
/* Special case for empty set - allow the architecture
code to set default SMP affinity. */
- return irq_select_affinity(irq) ? -EINVAL : count;
+ return irq_select_affinity_usr(irq) ? -EINVAL : count;
irq_set_affinity(irq, new_value);
@@ -220,7 +220,7 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action)
}
}
-void register_default_affinity_proc(void)
+static void register_default_affinity_proc(void)
{
#ifdef CONFIG_SMP
proc_create("irq/default_smp_affinity", 0600, NULL,
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 5072cf1685a..7b8b0f21a5b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -304,17 +304,24 @@ int sprint_symbol(char *buffer, unsigned long address)
char *modname;
const char *name;
unsigned long offset, size;
- char namebuf[KSYM_NAME_LEN];
+ int len;
- name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
+ name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
if (!name)
return sprintf(buffer, "0x%lx", address);
+ if (name != buffer)
+ strcpy(buffer, name);
+ len = strlen(buffer);
+ buffer += len;
+
if (modname)
- return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset,
- size, modname);
+ len += sprintf(buffer, "+%#lx/%#lx [%s]",
+ offset, size, modname);
else
- return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size);
+ len += sprintf(buffer, "+%#lx/%#lx", offset, size);
+
+ return len;
}
/* Look up a kernel symbol and print it to the kernel messages. */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 8b57a2597f2..9f8a3f25259 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -72,7 +72,7 @@ static bool kprobe_enabled;
DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
static struct {
- spinlock_t lock ____cacheline_aligned;
+ spinlock_t lock ____cacheline_aligned_in_smp;
} kretprobe_table_locks[KPROBE_TABLE_SIZE];
static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
@@ -613,30 +613,37 @@ static int __kprobes __register_kprobe(struct kprobe *p,
return -EINVAL;
p->addr = addr;
- if (!kernel_text_address((unsigned long) p->addr) ||
- in_kprobes_functions((unsigned long) p->addr))
+ preempt_disable();
+ if (!__kernel_text_address((unsigned long) p->addr) ||
+ in_kprobes_functions((unsigned long) p->addr)) {
+ preempt_enable();
return -EINVAL;
+ }
p->mod_refcounted = 0;
/*
* Check if are we probing a module.
*/
- probed_mod = module_text_address((unsigned long) p->addr);
+ probed_mod = __module_text_address((unsigned long) p->addr);
if (probed_mod) {
- struct module *calling_mod = module_text_address(called_from);
+ struct module *calling_mod;
+ calling_mod = __module_text_address(called_from);
/*
* We must allow modules to probe themself and in this case
* avoid incrementing the module refcount, so as to allow
* unloading of self probing modules.
*/
if (calling_mod && calling_mod != probed_mod) {
- if (unlikely(!try_module_get(probed_mod)))
+ if (unlikely(!try_module_get(probed_mod))) {
+ preempt_enable();
return -EINVAL;
+ }
p->mod_refcounted = 1;
} else
probed_mod = NULL;
}
+ preempt_enable();
p->nmissed = 0;
INIT_LIST_HEAD(&p->list);
@@ -718,6 +725,10 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
struct kprobe *old_p;
if (p->mod_refcounted) {
+ /*
+ * Since we've already incremented refcount,
+ * we don't need to disable preemption.
+ */
mod = module_text_address((unsigned long)p->addr);
if (mod)
module_put(mod);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 8e7a7ce3ed0..4fbc456f393 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -21,6 +21,9 @@ static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
+DEFINE_TRACE(sched_kthread_stop);
+DEFINE_TRACE(sched_kthread_stop_ret);
+
struct kthread_create_info
{
/* Information passed to kthread() from kthreadd. */
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index dbda475b13b..46a404173db 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2169,12 +2169,11 @@ void early_boot_irqs_on(void)
/*
* Hardirqs will be enabled:
*/
-void trace_hardirqs_on_caller(unsigned long a0)
+void trace_hardirqs_on_caller(unsigned long ip)
{
struct task_struct *curr = current;
- unsigned long ip;
- time_hardirqs_on(CALLER_ADDR0, a0);
+ time_hardirqs_on(CALLER_ADDR0, ip);
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
@@ -2188,7 +2187,6 @@ void trace_hardirqs_on_caller(unsigned long a0)
}
/* we'll do an OFF -> ON transition: */
curr->hardirqs_enabled = 1;
- ip = (unsigned long) __builtin_return_address(0);
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
@@ -2224,11 +2222,11 @@ EXPORT_SYMBOL(trace_hardirqs_on);
/*
* Hardirqs were disabled:
*/
-void trace_hardirqs_off_caller(unsigned long a0)
+void trace_hardirqs_off_caller(unsigned long ip)
{
struct task_struct *curr = current;
- time_hardirqs_off(CALLER_ADDR0, a0);
+ time_hardirqs_off(CALLER_ADDR0, ip);
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
@@ -2241,7 +2239,7 @@ void trace_hardirqs_off_caller(unsigned long a0)
* We have done an ON -> OFF transition:
*/
curr->hardirqs_enabled = 0;
- curr->hardirq_disable_ip = _RET_IP_;
+ curr->hardirq_disable_ip = ip;
curr->hardirq_disable_event = ++curr->irq_events;
debug_atomic_inc(&hardirqs_off_events);
} else
@@ -3278,10 +3276,10 @@ void __init lockdep_info(void)
{
printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
- printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
+ printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
- printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
+ printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
@@ -3417,9 +3415,10 @@ retry:
}
printk(" ignoring it.\n");
unlock = 0;
+ } else {
+ if (count != 10)
+ printk(KERN_CONT " locked it.\n");
}
- if (count != 10)
- printk(" locked it.\n");
do_each_thread(g, p) {
/*
diff --git a/kernel/marker.c b/kernel/marker.c
index e9c6b2bc940..ea54f264786 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -43,6 +43,7 @@ static DEFINE_MUTEX(markers_mutex);
*/
#define MARKER_HASH_BITS 6
#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
+static struct hlist_head marker_table[MARKER_TABLE_SIZE];
/*
* Note about RCU :
@@ -64,11 +65,10 @@ struct marker_entry {
void *oldptr;
int rcu_pending;
unsigned char ptype:1;
+ unsigned char format_allocated:1;
char name[0]; /* Contains name'\0'format'\0' */
};
-static struct hlist_head marker_table[MARKER_TABLE_SIZE];
-
/**
* __mark_empty_function - Empty probe callback
* @probe_private: probe private data
@@ -81,7 +81,7 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
* though the function pointer change and the marker enabling are two distinct
* operations that modifies the execution flow of preemptible code.
*/
-void __mark_empty_function(void *probe_private, void *call_private,
+notrace void __mark_empty_function(void *probe_private, void *call_private,
const char *fmt, va_list *args)
{
}
@@ -97,7 +97,8 @@ EXPORT_SYMBOL_GPL(__mark_empty_function);
* need to put a full smp_rmb() in this branch. This is why we do not use
* rcu_dereference() for the pointer read.
*/
-void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
+notrace void marker_probe_cb(const struct marker *mdata,
+ void *call_private, ...)
{
va_list args;
char ptype;
@@ -107,7 +108,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
* sure the teardown of the callbacks can be done correctly when they
* are in modules and they insure RCU read coherency.
*/
- rcu_read_lock_sched();
+ rcu_read_lock_sched_notrace();
ptype = mdata->ptype;
if (likely(!ptype)) {
marker_probe_func *func;
@@ -145,7 +146,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
va_end(args);
}
}
- rcu_read_unlock_sched();
+ rcu_read_unlock_sched_notrace();
}
EXPORT_SYMBOL_GPL(marker_probe_cb);
@@ -157,12 +158,13 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
*
* Should be connected to markers "MARK_NOARGS".
*/
-void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
+static notrace void marker_probe_cb_noarg(const struct marker *mdata,
+ void *call_private, ...)
{
va_list args; /* not initialized */
char ptype;
- rcu_read_lock_sched();
+ rcu_read_lock_sched_notrace();
ptype = mdata->ptype;
if (likely(!ptype)) {
marker_probe_func *func;
@@ -195,9 +197,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
multi[i].func(multi[i].probe_private, call_private,
mdata->format, &args);
}
- rcu_read_unlock_sched();
+ rcu_read_unlock_sched_notrace();
}
-EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
static void free_old_closure(struct rcu_head *head)
{
@@ -416,6 +417,7 @@ static struct marker_entry *add_marker(const char *name, const char *format)
e->single.probe_private = NULL;
e->multi = NULL;
e->ptype = 0;
+ e->format_allocated = 0;
e->refcount = 0;
e->rcu_pending = 0;
hlist_add_head(&e->hlist, head);
@@ -447,6 +449,8 @@ static int remove_marker(const char *name)
if (e->single.func != __mark_empty_function)
return -EBUSY;
hlist_del(&e->hlist);
+ if (e->format_allocated)
+ kfree(e->format);
/* Make sure the call_rcu has been executed */
if (e->rcu_pending)
rcu_barrier_sched();
@@ -457,57 +461,34 @@ static int remove_marker(const char *name)
/*
* Set the mark_entry format to the format found in the element.
*/
-static int marker_set_format(struct marker_entry **entry, const char *format)
+static int marker_set_format(struct marker_entry *entry, const char *format)
{
- struct marker_entry *e;
- size_t name_len = strlen((*entry)->name) + 1;
- size_t format_len = strlen(format) + 1;
-
-
- e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
- GFP_KERNEL);
- if (!e)
+ entry->format = kstrdup(format, GFP_KERNEL);
+ if (!entry->format)
return -ENOMEM;
- memcpy(&e->name[0], (*entry)->name, name_len);
- e->format = &e->name[name_len];
- memcpy(e->format, format, format_len);
- if (strcmp(e->format, MARK_NOARGS) == 0)
- e->call = marker_probe_cb_noarg;
- else
- e->call = marker_probe_cb;
- e->single = (*entry)->single;
- e->multi = (*entry)->multi;
- e->ptype = (*entry)->ptype;
- e->refcount = (*entry)->refcount;
- e->rcu_pending = 0;
- hlist_add_before(&e->hlist, &(*entry)->hlist);
- hlist_del(&(*entry)->hlist);
- /* Make sure the call_rcu has been executed */
- if ((*entry)->rcu_pending)
- rcu_barrier_sched();
- kfree(*entry);
- *entry = e;
+ entry->format_allocated = 1;
+
trace_mark(core_marker_format, "name %s format %s",
- e->name, e->format);
+ entry->name, entry->format);
return 0;
}
/*
* Sets the probe callback corresponding to one marker.
*/
-static int set_marker(struct marker_entry **entry, struct marker *elem,
+static int set_marker(struct marker_entry *entry, struct marker *elem,
int active)
{
- int ret;
- WARN_ON(strcmp((*entry)->name, elem->name) != 0);
+ int ret = 0;
+ WARN_ON(strcmp(entry->name, elem->name) != 0);
- if ((*entry)->format) {
- if (strcmp((*entry)->format, elem->format) != 0) {
+ if (entry->format) {
+ if (strcmp(entry->format, elem->format) != 0) {
printk(KERN_NOTICE
"Format mismatch for probe %s "
"(%s), marker (%s)\n",
- (*entry)->name,
- (*entry)->format,
+ entry->name,
+ entry->format,
elem->format);
return -EPERM;
}
@@ -523,37 +504,67 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
* pass from a "safe" callback (with argument) to an "unsafe"
* callback (does not set arguments).
*/
- elem->call = (*entry)->call;
+ elem->call = entry->call;
/*
* Sanity check :
* We only update the single probe private data when the ptr is
* set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
*/
WARN_ON(elem->single.func != __mark_empty_function
- && elem->single.probe_private
- != (*entry)->single.probe_private &&
- !elem->ptype);
- elem->single.probe_private = (*entry)->single.probe_private;
+ && elem->single.probe_private != entry->single.probe_private
+ && !elem->ptype);
+ elem->single.probe_private = entry->single.probe_private;
/*
* Make sure the private data is valid when we update the
* single probe ptr.
*/
smp_wmb();
- elem->single.func = (*entry)->single.func;
+ elem->single.func = entry->single.func;
/*
* We also make sure that the new probe callbacks array is consistent
* before setting a pointer to it.
*/
- rcu_assign_pointer(elem->multi, (*entry)->multi);
+ rcu_assign_pointer(elem->multi, entry->multi);
/*
* Update the function or multi probe array pointer before setting the
* ptype.
*/
smp_wmb();
- elem->ptype = (*entry)->ptype;
+ elem->ptype = entry->ptype;
+
+ if (elem->tp_name && (active ^ elem->state)) {
+ WARN_ON(!elem->tp_cb);
+ /*
+ * It is ok to directly call the probe registration because type
+ * checking has been done in the __trace_mark_tp() macro.
+ */
+
+ if (active) {
+ /*
+ * try_module_get should always succeed because we hold
+ * lock_module() to get the tp_cb address.
+ */
+ ret = try_module_get(__module_text_address(
+ (unsigned long)elem->tp_cb));
+ BUG_ON(!ret);
+ ret = tracepoint_probe_register_noupdate(
+ elem->tp_name,
+ elem->tp_cb);
+ } else {
+ ret = tracepoint_probe_unregister_noupdate(
+ elem->tp_name,
+ elem->tp_cb);
+ /*
+ * tracepoint_probe_update_all() must be called
+ * before the module containing tp_cb is unloaded.
+ */
+ module_put(__module_text_address(
+ (unsigned long)elem->tp_cb));
+ }
+ }
elem->state = active;
- return 0;
+ return ret;
}
/*
@@ -564,7 +575,24 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
*/
static void disable_marker(struct marker *elem)
{
+ int ret;
+
/* leave "call" as is. It is known statically. */
+ if (elem->tp_name && elem->state) {
+ WARN_ON(!elem->tp_cb);
+ /*
+ * It is ok to directly call the probe registration because type
+ * checking has been done in the __trace_mark_tp() macro.
+ */
+ ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
+ elem->tp_cb);
+ WARN_ON(ret);
+ /*
+ * tracepoint_probe_update_all() must be called
+ * before the module containing tp_cb is unloaded.
+ */
+ module_put(__module_text_address((unsigned long)elem->tp_cb));
+ }
elem->state = 0;
elem->single.func = __mark_empty_function;
/* Update the function before setting the ptype */
@@ -594,8 +622,7 @@ void marker_update_probe_range(struct marker *begin,
for (iter = begin; iter < end; iter++) {
mark_entry = get_marker(iter->name);
if (mark_entry) {
- set_marker(&mark_entry, iter,
- !!mark_entry->refcount);
+ set_marker(mark_entry, iter, !!mark_entry->refcount);
/*
* ignore error, continue
*/
@@ -629,6 +656,7 @@ static void marker_update_probes(void)
marker_update_probe_range(__start___markers, __stop___markers);
/* Markers in modules. */
module_update_markers();
+ tracepoint_probe_update_all();
}
/**
@@ -657,7 +685,7 @@ int marker_probe_register(const char *name, const char *format,
ret = PTR_ERR(entry);
} else if (format) {
if (!entry->format)
- ret = marker_set_format(&entry, format);
+ ret = marker_set_format(entry, format);
else if (strcmp(entry->format, format))
ret = -EPERM;
}
@@ -676,10 +704,11 @@ int marker_probe_register(const char *name, const char *format,
goto end;
}
mutex_unlock(&markers_mutex);
- marker_update_probes(); /* may update entry */
+ marker_update_probes();
mutex_lock(&markers_mutex);
entry = get_marker(name);
- WARN_ON(!entry);
+ if (!entry)
+ goto end;
if (entry->rcu_pending)
rcu_barrier_sched();
entry->oldptr = old;
@@ -720,7 +749,7 @@ int marker_probe_unregister(const char *name,
rcu_barrier_sched();
old = marker_entry_remove_probe(entry, probe, probe_private);
mutex_unlock(&markers_mutex);
- marker_update_probes(); /* may update entry */
+ marker_update_probes();
mutex_lock(&markers_mutex);
entry = get_marker(name);
if (!entry)
@@ -801,10 +830,11 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
rcu_barrier_sched();
old = marker_entry_remove_probe(entry, NULL, probe_private);
mutex_unlock(&markers_mutex);
- marker_update_probes(); /* may update entry */
+ marker_update_probes();
mutex_lock(&markers_mutex);
entry = get_marker_from_private_data(probe, probe_private);
- WARN_ON(!entry);
+ if (!entry)
+ goto end;
if (entry->rcu_pending)
rcu_barrier_sched();
entry->oldptr = old;
@@ -848,8 +878,6 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
if (!e->ptype) {
if (num == 0 && e->single.func == probe)
return e->single.probe_private;
- else
- break;
} else {
struct marker_probe_closure *closure;
int match = 0;
@@ -861,8 +889,42 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
return closure[i].probe_private;
}
}
+ break;
}
}
return ERR_PTR(-ENOENT);
}
EXPORT_SYMBOL_GPL(marker_get_private_data);
+
+#ifdef CONFIG_MODULES
+
+int marker_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+
+ switch (val) {
+ case MODULE_STATE_COMING:
+ marker_update_probe_range(mod->markers,
+ mod->markers + mod->num_markers);
+ break;
+ case MODULE_STATE_GOING:
+ marker_update_probe_range(mod->markers,
+ mod->markers + mod->num_markers);
+ break;
+ }
+ return 0;
+}
+
+struct notifier_block marker_module_nb = {
+ .notifier_call = marker_module_notify,
+ .priority = 0,
+};
+
+static int init_markers(void)
+{
+ return register_module_notifier(&marker_module_nb);
+}
+__initcall(init_markers);
+
+#endif /* CONFIG_MODULES */
diff --git a/kernel/module.c b/kernel/module.c
index 0d8d21ee792..89bcf7c1327 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -20,11 +20,13 @@
#include <linux/moduleloader.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
+#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/elf.h>
+#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
@@ -42,6 +44,7 @@
#include <linux/string.h>
#include <linux/mutex.h>
#include <linux/unwind.h>
+#include <linux/rculist.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <linux/license.h>
@@ -63,7 +66,7 @@
#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
/* List of modules, protected by module_mutex or preempt_disable
- * (add/delete uses stop_machine). */
+ * (delete uses stop_machine/add uses RCU list operations). */
static DEFINE_MUTEX(module_mutex);
static LIST_HEAD(modules);
@@ -132,6 +135,29 @@ static unsigned int find_sec(Elf_Ehdr *hdr,
return 0;
}
+/* Find a module section, or NULL. */
+static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs,
+ const char *secstrings, const char *name)
+{
+ /* Section 0 has sh_addr 0. */
+ return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr;
+}
+
+/* Find a module section, or NULL. Fill in number of "objects" in section. */
+static void *section_objs(Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ const char *secstrings,
+ const char *name,
+ size_t object_size,
+ unsigned int *num)
+{
+ unsigned int sec = find_sec(hdr, sechdrs, secstrings, name);
+
+ /* Section 0 has sh_addr 0 and sh_size 0. */
+ *num = sechdrs[sec].sh_size / object_size;
+ return (void *)sechdrs[sec].sh_addr;
+}
+
/* Provided by the linker */
extern const struct kernel_symbol __start___ksymtab[];
extern const struct kernel_symbol __stop___ksymtab[];
@@ -218,7 +244,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr,
if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
return true;
- list_for_each_entry(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list) {
struct symsearch arr[] = {
{ mod->syms, mod->syms + mod->num_syms, mod->crcs,
NOT_GPL_ONLY, false },
@@ -1394,17 +1420,6 @@ static void mod_kobject_remove(struct module *mod)
}
/*
- * link the module with the whole machine is stopped with interrupts off
- * - this defends against kallsyms not taking locks
- */
-static int __link_module(void *_mod)
-{
- struct module *mod = _mod;
- list_add(&mod->list, &modules);
- return 0;
-}
-
-/*
* unlink the module with the whole machine is stopped with interrupts off
* - this defends against kallsyms not taking locks
*/
@@ -1789,32 +1804,20 @@ static inline void add_kallsyms(struct module *mod,
}
#endif /* CONFIG_KALLSYMS */
-#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
-static void dynamic_printk_setup(Elf_Shdr *sechdrs, unsigned int verboseindex)
+static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num)
{
- struct mod_debug *debug_info;
- unsigned long pos, end;
- unsigned int num_verbose;
-
- pos = sechdrs[verboseindex].sh_addr;
- num_verbose = sechdrs[verboseindex].sh_size /
- sizeof(struct mod_debug);
- end = pos + (num_verbose * sizeof(struct mod_debug));
+#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
+ unsigned int i;
- for (; pos < end; pos += sizeof(struct mod_debug)) {
- debug_info = (struct mod_debug *)pos;
- register_dynamic_debug_module(debug_info->modname,
- debug_info->type, debug_info->logical_modname,
- debug_info->flag_names, debug_info->hash,
- debug_info->hash2);
+ for (i = 0; i < num; i++) {
+ register_dynamic_debug_module(debug[i].modname,
+ debug[i].type,
+ debug[i].logical_modname,
+ debug[i].flag_names,
+ debug[i].hash, debug[i].hash2);
}
-}
-#else
-static inline void dynamic_printk_setup(Elf_Shdr *sechdrs,
- unsigned int verboseindex)
-{
-}
#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */
+}
static void *module_alloc_update_bounds(unsigned long size)
{
@@ -1843,37 +1846,14 @@ static noinline struct module *load_module(void __user *umod,
unsigned int i;
unsigned int symindex = 0;
unsigned int strindex = 0;
- unsigned int setupindex;
- unsigned int exindex;
- unsigned int exportindex;
- unsigned int modindex;
- unsigned int obsparmindex;
- unsigned int infoindex;
- unsigned int gplindex;
- unsigned int crcindex;
- unsigned int gplcrcindex;
- unsigned int versindex;
- unsigned int pcpuindex;
- unsigned int gplfutureindex;
- unsigned int gplfuturecrcindex;
+ unsigned int modindex, versindex, infoindex, pcpuindex;
unsigned int unwindex = 0;
-#ifdef CONFIG_UNUSED_SYMBOLS
- unsigned int unusedindex;
- unsigned int unusedcrcindex;
- unsigned int unusedgplindex;
- unsigned int unusedgplcrcindex;
-#endif
- unsigned int markersindex;
- unsigned int markersstringsindex;
- unsigned int verboseindex;
- unsigned int tracepointsindex;
- unsigned int tracepointsstringsindex;
- unsigned int mcountindex;
+ unsigned int num_kp, num_mcount;
+ struct kernel_param *kp;
struct module *mod;
long err = 0;
void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
- void *mseg;
- struct exception_table_entry *extable;
+ unsigned long *mseg;
mm_segment_t old_fs;
DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -1937,6 +1917,7 @@ static noinline struct module *load_module(void __user *umod,
err = -ENOEXEC;
goto free_hdr;
}
+ /* This is temporary: point mod into copy of data. */
mod = (void *)sechdrs[modindex].sh_addr;
if (symindex == 0) {
@@ -1946,22 +1927,6 @@ static noinline struct module *load_module(void __user *umod,
goto free_hdr;
}
- /* Optional sections */
- exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
- gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
- gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
- crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
- gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
- gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
-#ifdef CONFIG_UNUSED_SYMBOLS
- unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
- unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
- unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused");
- unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl");
-#endif
- setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
- exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
- obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
@@ -2117,42 +2082,57 @@ static noinline struct module *load_module(void __user *umod,
if (err < 0)
goto cleanup;
- /* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */
- mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms);
- mod->syms = (void *)sechdrs[exportindex].sh_addr;
- if (crcindex)
- mod->crcs = (void *)sechdrs[crcindex].sh_addr;
- mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms);
- mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr;
- if (gplcrcindex)
- mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
- mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
- sizeof(*mod->gpl_future_syms);
- mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
- if (gplfuturecrcindex)
- mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
+ /* Now we've got everything in the final locations, we can
+ * find optional sections. */
+ kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp),
+ &num_kp);
+ mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
+ sizeof(*mod->syms), &mod->num_syms);
+ mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
+ mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl",
+ sizeof(*mod->gpl_syms),
+ &mod->num_gpl_syms);
+ mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl");
+ mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings,
+ "__ksymtab_gpl_future",
+ sizeof(*mod->gpl_future_syms),
+ &mod->num_gpl_future_syms);
+ mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings,
+ "__kcrctab_gpl_future");
#ifdef CONFIG_UNUSED_SYMBOLS
- mod->num_unused_syms = sechdrs[unusedindex].sh_size /
- sizeof(*mod->unused_syms);
- mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size /
- sizeof(*mod->unused_gpl_syms);
- mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr;
- if (unusedcrcindex)
- mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr;
- mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr;
- if (unusedgplcrcindex)
- mod->unused_gpl_crcs
- = (void *)sechdrs[unusedgplcrcindex].sh_addr;
+ mod->unused_syms = section_objs(hdr, sechdrs, secstrings,
+ "__ksymtab_unused",
+ sizeof(*mod->unused_syms),
+ &mod->num_unused_syms);
+ mod->unused_crcs = section_addr(hdr, sechdrs, secstrings,
+ "__kcrctab_unused");
+ mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings,
+ "__ksymtab_unused_gpl",
+ sizeof(*mod->unused_gpl_syms),
+ &mod->num_unused_gpl_syms);
+ mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
+ "__kcrctab_unused_gpl");
+#endif
+
+#ifdef CONFIG_MARKERS
+ mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
+ sizeof(*mod->markers), &mod->num_markers);
+#endif
+#ifdef CONFIG_TRACEPOINTS
+ mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
+ "__tracepoints",
+ sizeof(*mod->tracepoints),
+ &mod->num_tracepoints);
#endif
#ifdef CONFIG_MODVERSIONS
- if ((mod->num_syms && !crcindex)
- || (mod->num_gpl_syms && !gplcrcindex)
- || (mod->num_gpl_future_syms && !gplfuturecrcindex)
+ if ((mod->num_syms && !mod->crcs)
+ || (mod->num_gpl_syms && !mod->gpl_crcs)
+ || (mod->num_gpl_future_syms && !mod->gpl_future_crcs)
#ifdef CONFIG_UNUSED_SYMBOLS
- || (mod->num_unused_syms && !unusedcrcindex)
- || (mod->num_unused_gpl_syms && !unusedgplcrcindex)
+ || (mod->num_unused_syms && !mod->unused_crcs)
+ || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
#endif
) {
printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
@@ -2161,16 +2141,6 @@ static noinline struct module *load_module(void __user *umod,
goto cleanup;
}
#endif
- markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
- markersstringsindex = find_sec(hdr, sechdrs, secstrings,
- "__markers_strings");
- verboseindex = find_sec(hdr, sechdrs, secstrings, "__verbose");
- tracepointsindex = find_sec(hdr, sechdrs, secstrings, "__tracepoints");
- tracepointsstringsindex = find_sec(hdr, sechdrs, secstrings,
- "__tracepoints_strings");
-
- mcountindex = find_sec(hdr, sechdrs, secstrings,
- "__mcount_loc");
/* Now do relocations. */
for (i = 1; i < hdr->e_shnum; i++) {
@@ -2193,28 +2163,16 @@ static noinline struct module *load_module(void __user *umod,
if (err < 0)
goto cleanup;
}
-#ifdef CONFIG_MARKERS
- mod->markers = (void *)sechdrs[markersindex].sh_addr;
- mod->num_markers =
- sechdrs[markersindex].sh_size / sizeof(*mod->markers);
-#endif
-#ifdef CONFIG_TRACEPOINTS
- mod->tracepoints = (void *)sechdrs[tracepointsindex].sh_addr;
- mod->num_tracepoints =
- sechdrs[tracepointsindex].sh_size / sizeof(*mod->tracepoints);
-#endif
-
/* Find duplicate symbols */
err = verify_export_symbols(mod);
-
if (err < 0)
goto cleanup;
/* Set up and sort exception table */
- mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable);
- mod->extable = extable = (void *)sechdrs[exindex].sh_addr;
- sort_extable(extable, extable + mod->num_exentries);
+ mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
+ sizeof(*mod->extable), &mod->num_exentries);
+ sort_extable(mod->extable, mod->extable + mod->num_exentries);
/* Finally, copy percpu area over. */
percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
@@ -2223,20 +2181,18 @@ static noinline struct module *load_module(void __user *umod,
add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
if (!mod->taints) {
-#ifdef CONFIG_MARKERS
- marker_update_probe_range(mod->markers,
- mod->markers + mod->num_markers);
-#endif
- dynamic_printk_setup(sechdrs, verboseindex);
-#ifdef CONFIG_TRACEPOINTS
- tracepoint_update_probe_range(mod->tracepoints,
- mod->tracepoints + mod->num_tracepoints);
-#endif
+ struct mod_debug *debug;
+ unsigned int num_debug;
+
+ debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
+ sizeof(*debug), &num_debug);
+ dynamic_printk_setup(debug, num_debug);
}
/* sechdrs[0].sh_size is always zero */
- mseg = (void *)sechdrs[mcountindex].sh_addr;
- ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size);
+ mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
+ sizeof(*mseg), &num_mcount);
+ ftrace_init_module(mod, mseg, mseg + num_mcount);
err = module_finalize(hdr, sechdrs, mod);
if (err < 0)
@@ -2261,30 +2217,24 @@ static noinline struct module *load_module(void __user *umod,
set_fs(old_fs);
mod->args = args;
- if (obsparmindex)
+ if (section_addr(hdr, sechdrs, secstrings, "__obsparm"))
printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
mod->name);
/* Now sew it into the lists so we can get lockdep and oops
- * info during argument parsing. Noone should access us, since
- * strong_try_module_get() will fail. */
- stop_machine(__link_module, mod, NULL);
-
- /* Size of section 0 is 0, so this works well if no params */
- err = parse_args(mod->name, mod->args,
- (struct kernel_param *)
- sechdrs[setupindex].sh_addr,
- sechdrs[setupindex].sh_size
- / sizeof(struct kernel_param),
- NULL);
+ * info during argument parsing. Noone should access us, since
+ * strong_try_module_get() will fail.
+ * lockdep/oops can run asynchronous, so use the RCU list insertion
+ * function to insert in a way safe to concurrent readers.
+ * The mutex protects against concurrent writers.
+ */
+ list_add_rcu(&mod->list, &modules);
+
+ err = parse_args(mod->name, mod->args, kp, num_kp, NULL);
if (err < 0)
goto unlink;
- err = mod_sysfs_setup(mod,
- (struct kernel_param *)
- sechdrs[setupindex].sh_addr,
- sechdrs[setupindex].sh_size
- / sizeof(struct kernel_param));
+ err = mod_sysfs_setup(mod, kp, num_kp);
if (err < 0)
goto unlink;
add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
@@ -2473,7 +2423,7 @@ const char *module_address_lookup(unsigned long addr,
const char *ret = NULL;
preempt_disable();
- list_for_each_entry(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list) {
if (within(addr, mod->module_init, mod->init_size)
|| within(addr, mod->module_core, mod->core_size)) {
if (modname)
@@ -2496,7 +2446,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
struct module *mod;
preempt_disable();
- list_for_each_entry(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list) {
if (within(addr, mod->module_init, mod->init_size) ||
within(addr, mod->module_core, mod->core_size)) {
const char *sym;
@@ -2520,7 +2470,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
struct module *mod;
preempt_disable();
- list_for_each_entry(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list) {
if (within(addr, mod->module_init, mod->init_size) ||
within(addr, mod->module_core, mod->core_size)) {
const char *sym;
@@ -2547,7 +2497,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
struct module *mod;
preempt_disable();
- list_for_each_entry(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list) {
if (symnum < mod->num_symtab) {
*value = mod->symtab[symnum].st_value;
*type = mod->symtab[symnum].st_info;
@@ -2590,7 +2540,7 @@ unsigned long module_kallsyms_lookup_name(const char *name)
ret = mod_find_symname(mod, colon+1);
*colon = ':';
} else {
- list_for_each_entry(mod, &modules, list)
+ list_for_each_entry_rcu(mod, &modules, list)
if ((ret = mod_find_symname(mod, name)) != 0)
break;
}
@@ -2599,23 +2549,6 @@ unsigned long module_kallsyms_lookup_name(const char *name)
}
#endif /* CONFIG_KALLSYMS */
-/* Called by the /proc file system to return a list of modules. */
-static void *m_start(struct seq_file *m, loff_t *pos)
-{
- mutex_lock(&module_mutex);
- return seq_list_start(&modules, *pos);
-}
-
-static void *m_next(struct seq_file *m, void *p, loff_t *pos)
-{
- return seq_list_next(p, &modules, pos);
-}
-
-static void m_stop(struct seq_file *m, void *p)
-{
- mutex_unlock(&module_mutex);
-}
-
static char *module_flags(struct module *mod, char *buf)
{
int bx = 0;
@@ -2649,6 +2582,24 @@ static char *module_flags(struct module *mod, char *buf)
return buf;
}
+#ifdef CONFIG_PROC_FS
+/* Called by the /proc file system to return a list of modules. */
+static void *m_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&module_mutex);
+ return seq_list_start(&modules, *pos);
+}
+
+static void *m_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &modules, pos);
+}
+
+static void m_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&module_mutex);
+}
+
static int m_show(struct seq_file *m, void *p)
{
struct module *mod = list_entry(p, struct module, list);
@@ -2679,13 +2630,33 @@ static int m_show(struct seq_file *m, void *p)
Where refcount is a number or -, and deps is a comma-separated list
of depends or -.
*/
-const struct seq_operations modules_op = {
+static const struct seq_operations modules_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
.show = m_show
};
+static int modules_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &modules_op);
+}
+
+static const struct file_operations proc_modules_operations = {
+ .open = modules_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init proc_modules_init(void)
+{
+ proc_create("modules", 0, NULL, &proc_modules_operations);
+ return 0;
+}
+module_init(proc_modules_init);
+#endif
+
/* Given an address, look for it in the module exception tables. */
const struct exception_table_entry *search_module_extables(unsigned long addr)
{
@@ -2693,7 +2664,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
struct module *mod;
preempt_disable();
- list_for_each_entry(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list) {
if (mod->num_exentries == 0)
continue;
@@ -2719,7 +2690,7 @@ int is_module_address(unsigned long addr)
preempt_disable();
- list_for_each_entry(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list) {
if (within(addr, mod->module_core, mod->core_size)) {
preempt_enable();
return 1;
@@ -2740,7 +2711,7 @@ struct module *__module_text_address(unsigned long addr)
if (addr < module_addr_min || addr > module_addr_max)
return NULL;
- list_for_each_entry(mod, &modules, list)
+ list_for_each_entry_rcu(mod, &modules, list)
if (within(addr, mod->module_init, mod->init_text_size)
|| within(addr, mod->module_core, mod->core_text_size))
return mod;
@@ -2765,8 +2736,11 @@ void print_modules(void)
char buf[8];
printk("Modules linked in:");
- list_for_each_entry(mod, &modules, list)
+ /* Most callers should already have preempt disabled, but make sure */
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list)
printk(" %s%s", mod->name, module_flags(mod, buf));
+ preempt_enable();
if (last_unloaded_module[0])
printk(" [last unloaded: %s]", last_unloaded_module);
printk("\n");
diff --git a/kernel/panic.c b/kernel/panic.c
index bda561ef3cd..4d5088355bf 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,13 +34,6 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
-static int __init panic_setup(char *str)
-{
- panic_timeout = simple_strtoul(str, NULL, 0);
- return 1;
-}
-__setup("panic=", panic_setup);
-
static long no_blink(long time)
{
return 0;
@@ -174,6 +167,7 @@ static const struct tnt tnts[] = {
* 'M' - System experienced a machine check exception.
* 'B' - System has hit bad_page.
* 'U' - Userspace-defined naughtiness.
+ * 'D' - Kernel has oopsed before
* 'A' - ACPI table overridden.
* 'W' - Taint on warning.
* 'C' - modules from drivers/staging are loaded.
@@ -218,13 +212,6 @@ void add_taint(unsigned flag)
}
EXPORT_SYMBOL(add_taint);
-static int __init pause_on_oops_setup(char *str)
-{
- pause_on_oops = simple_strtoul(str, NULL, 0);
- return 1;
-}
-__setup("pause_on_oops=", pause_on_oops_setup);
-
static void spin_msec(int msecs)
{
int i;
@@ -384,3 +371,6 @@ void __stack_chk_fail(void)
}
EXPORT_SYMBOL(__stack_chk_fail);
#endif
+
+core_param(panic, panic_timeout, int, 0644);
+core_param(pause_on_oops, pause_on_oops, int, 0644);
diff --git a/kernel/params.c b/kernel/params.c
index afc46a23eb6..a1e3025b19a 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -373,6 +373,8 @@ int param_get_string(char *buffer, struct kernel_param *kp)
}
/* sysfs output in /sys/modules/XYZ/parameters/ */
+#define to_module_attr(n) container_of(n, struct module_attribute, attr);
+#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
extern struct kernel_param __start___param[], __stop___param[];
@@ -384,6 +386,7 @@ struct param_attribute
struct module_param_attrs
{
+ unsigned int num;
struct attribute_group grp;
struct param_attribute attrs[0];
};
@@ -434,93 +437,120 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
#ifdef CONFIG_SYSFS
/*
- * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME
- * @mk: struct module_kobject (contains parent kobject)
- * @kparam: array of struct kernel_param, the actual parameter definitions
- * @num_params: number of entries in array
- * @name_skip: offset where the parameter name start in kparam[].name. Needed for built-in "modules"
+ * add_sysfs_param - add a parameter to sysfs
+ * @mk: struct module_kobject
+ * @kparam: the actual parameter definition to add to sysfs
+ * @name: name of parameter
*
- * Create a kobject for a (per-module) group of parameters, and create files
- * in sysfs. A pointer to the param_kobject is returned on success,
- * NULL if there's no parameter to export, or other ERR_PTR(err).
+ * Create a kobject if for a (per-module) parameter if mp NULL, and
+ * create file in sysfs. Returns an error on out of memory. Always cleans up
+ * if there's an error.
*/
-static __modinit struct module_param_attrs *
-param_sysfs_setup(struct module_kobject *mk,
- struct kernel_param *kparam,
- unsigned int num_params,
- unsigned int name_skip)
+static __modinit int add_sysfs_param(struct module_kobject *mk,
+ struct kernel_param *kp,
+ const char *name)
{
- struct module_param_attrs *mp;
- unsigned int valid_attrs = 0;
- unsigned int i, size[2];
- struct param_attribute *pattr;
- struct attribute **gattr;
- int err;
-
- for (i=0; i<num_params; i++) {
- if (kparam[i].perm)
- valid_attrs++;
+ struct module_param_attrs *new;
+ struct attribute **attrs;
+ int err, num;
+
+ /* We don't bother calling this with invisible parameters. */
+ BUG_ON(!kp->perm);
+
+ if (!mk->mp) {
+ num = 0;
+ attrs = NULL;
+ } else {
+ num = mk->mp->num;
+ attrs = mk->mp->grp.attrs;
}
- if (!valid_attrs)
- return NULL;
-
- size[0] = ALIGN(sizeof(*mp) +
- valid_attrs * sizeof(mp->attrs[0]),
- sizeof(mp->grp.attrs[0]));
- size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]);
-
- mp = kzalloc(size[0] + size[1], GFP_KERNEL);
- if (!mp)
- return ERR_PTR(-ENOMEM);
-
- mp->grp.name = "parameters";
- mp->grp.attrs = (void *)mp + size[0];
-
- pattr = &mp->attrs[0];
- gattr = &mp->grp.attrs[0];
- for (i = 0; i < num_params; i++) {
- struct kernel_param *kp = &kparam[i];
- if (kp->perm) {
- pattr->param = kp;
- pattr->mattr.show = param_attr_show;
- pattr->mattr.store = param_attr_store;
- pattr->mattr.attr.name = (char *)&kp->name[name_skip];
- pattr->mattr.attr.mode = kp->perm;
- *(gattr++) = &(pattr++)->mattr.attr;
- }
+ /* Enlarge. */
+ new = krealloc(mk->mp,
+ sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1),
+ GFP_KERNEL);
+ if (!new) {
+ kfree(mk->mp);
+ err = -ENOMEM;
+ goto fail;
}
- *gattr = NULL;
-
- if ((err = sysfs_create_group(&mk->kobj, &mp->grp))) {
- kfree(mp);
- return ERR_PTR(err);
+ attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
+ if (!attrs) {
+ err = -ENOMEM;
+ goto fail_free_new;
}
- return mp;
+
+ /* Sysfs wants everything zeroed. */
+ memset(new, 0, sizeof(*new));
+ memset(&new->attrs[num], 0, sizeof(new->attrs[num]));
+ memset(&attrs[num], 0, sizeof(attrs[num]));
+ new->grp.name = "parameters";
+ new->grp.attrs = attrs;
+
+ /* Tack new one on the end. */
+ new->attrs[num].param = kp;
+ new->attrs[num].mattr.show = param_attr_show;
+ new->attrs[num].mattr.store = param_attr_store;
+ new->attrs[num].mattr.attr.name = (char *)name;
+ new->attrs[num].mattr.attr.mode = kp->perm;
+ new->num = num+1;
+
+ /* Fix up all the pointers, since krealloc can move us */
+ for (num = 0; num < new->num; num++)
+ new->grp.attrs[num] = &new->attrs[num].mattr.attr;
+ new->grp.attrs[num] = NULL;
+
+ mk->mp = new;
+ return 0;
+
+fail_free_new:
+ kfree(new);
+fail:
+ mk->mp = NULL;
+ return err;
}
#ifdef CONFIG_MODULES
+static void free_module_param_attrs(struct module_kobject *mk)
+{
+ kfree(mk->mp->grp.attrs);
+ kfree(mk->mp);
+ mk->mp = NULL;
+}
+
/*
* module_param_sysfs_setup - setup sysfs support for one module
* @mod: module
* @kparam: module parameters (array)
* @num_params: number of module parameters
*
- * Adds sysfs entries for module parameters, and creates a link from
- * /sys/module/[mod->name]/parameters to /sys/parameters/[mod->name]/
+ * Adds sysfs entries for module parameters under
+ * /sys/module/[mod->name]/parameters/
*/
int module_param_sysfs_setup(struct module *mod,
struct kernel_param *kparam,
unsigned int num_params)
{
- struct module_param_attrs *mp;
+ int i, err;
+ bool params = false;
+
+ for (i = 0; i < num_params; i++) {
+ if (kparam[i].perm == 0)
+ continue;
+ err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name);
+ if (err)
+ return err;
+ params = true;
+ }
- mp = param_sysfs_setup(&mod->mkobj, kparam, num_params, 0);
- if (IS_ERR(mp))
- return PTR_ERR(mp);
+ if (!params)
+ return 0;
- mod->param_attrs = mp;
- return 0;
+ /* Create the param group. */
+ err = sysfs_create_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
+ if (err)
+ free_module_param_attrs(&mod->mkobj);
+ return err;
}
/*
@@ -532,43 +562,55 @@ int module_param_sysfs_setup(struct module *mod,
*/
void module_param_sysfs_remove(struct module *mod)
{
- if (mod->param_attrs) {
- sysfs_remove_group(&mod->mkobj.kobj,
- &mod->param_attrs->grp);
+ if (mod->mkobj.mp) {
+ sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
/* We are positive that no one is using any param
* attrs at this point. Deallocate immediately. */
- kfree(mod->param_attrs);
- mod->param_attrs = NULL;
+ free_module_param_attrs(&mod->mkobj);
}
}
#endif
-/*
- * kernel_param_sysfs_setup - wrapper for built-in params support
- */
-static void __init kernel_param_sysfs_setup(const char *name,
- struct kernel_param *kparam,
- unsigned int num_params,
- unsigned int name_skip)
+static void __init kernel_add_sysfs_param(const char *name,
+ struct kernel_param *kparam,
+ unsigned int name_skip)
{
struct module_kobject *mk;
- int ret;
+ struct kobject *kobj;
+ int err;
- mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
- BUG_ON(!mk);
-
- mk->mod = THIS_MODULE;
- mk->kobj.kset = module_kset;
- ret = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name);
- if (ret) {
- kobject_put(&mk->kobj);
- printk(KERN_ERR "Module '%s' failed to be added to sysfs, "
- "error number %d\n", name, ret);
- printk(KERN_ERR "The system will be unstable now.\n");
- return;
+ kobj = kset_find_obj(module_kset, name);
+ if (kobj) {
+ /* We already have one. Remove params so we can add more. */
+ mk = to_module_kobject(kobj);
+ /* We need to remove it before adding parameters. */
+ sysfs_remove_group(&mk->kobj, &mk->mp->grp);
+ } else {
+ mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
+ BUG_ON(!mk);
+
+ mk->mod = THIS_MODULE;
+ mk->kobj.kset = module_kset;
+ err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
+ "%s", name);
+ if (err) {
+ kobject_put(&mk->kobj);
+ printk(KERN_ERR "Module '%s' failed add to sysfs, "
+ "error number %d\n", name, err);
+ printk(KERN_ERR "The system will be unstable now.\n");
+ return;
+ }
+ /* So that exit path is even. */
+ kobject_get(&mk->kobj);
}
- param_sysfs_setup(mk, kparam, num_params, name_skip);
+
+ /* These should not fail at boot. */
+ err = add_sysfs_param(mk, kparam, kparam->name + name_skip);
+ BUG_ON(err);
+ err = sysfs_create_group(&mk->kobj, &mk->mp->grp);
+ BUG_ON(err);
kobject_uevent(&mk->kobj, KOBJ_ADD);
+ kobject_put(&mk->kobj);
}
/*
@@ -579,60 +621,36 @@ static void __init kernel_param_sysfs_setup(const char *name,
* The "module" name (KBUILD_MODNAME) is stored before a dot, the
* "parameter" name is stored behind a dot in kernel_param->name. So,
* extract the "module" name for all built-in kernel_param-eters,
- * and for all who have the same, call kernel_param_sysfs_setup.
+ * and for all who have the same, call kernel_add_sysfs_param.
*/
static void __init param_sysfs_builtin(void)
{
- struct kernel_param *kp, *kp_begin = NULL;
- unsigned int i, name_len, count = 0;
- char modname[MODULE_NAME_LEN + 1] = "";
+ struct kernel_param *kp;
+ unsigned int name_len;
+ char modname[MODULE_NAME_LEN];
- for (i=0; i < __stop___param - __start___param; i++) {
+ for (kp = __start___param; kp < __stop___param; kp++) {
char *dot;
- size_t max_name_len;
- kp = &__start___param[i];
- max_name_len =
- min_t(size_t, MODULE_NAME_LEN, strlen(kp->name));
+ if (kp->perm == 0)
+ continue;
- dot = memchr(kp->name, '.', max_name_len);
+ dot = strchr(kp->name, '.');
if (!dot) {
- DEBUGP("couldn't find period in first %d characters "
- "of %s\n", MODULE_NAME_LEN, kp->name);
- continue;
- }
- name_len = dot - kp->name;
-
- /* new kbuild_modname? */
- if (strlen(modname) != name_len
- || strncmp(modname, kp->name, name_len) != 0) {
- /* add a new kobject for previous kernel_params. */
- if (count)
- kernel_param_sysfs_setup(modname,
- kp_begin,
- count,
- strlen(modname)+1);
-
- strncpy(modname, kp->name, name_len);
- modname[name_len] = '\0';
- count = 0;
- kp_begin = kp;
+ /* This happens for core_param() */
+ strcpy(modname, "kernel");
+ name_len = 0;
+ } else {
+ name_len = dot - kp->name + 1;
+ strlcpy(modname, kp->name, name_len);
}
- count++;
+ kernel_add_sysfs_param(modname, kp, name_len);
}
-
- /* last kernel_params need to be registered as well */
- if (count)
- kernel_param_sysfs_setup(modname, kp_begin, count,
- strlen(modname)+1);
}
/* module-related sysfs stuff */
-#define to_module_attr(n) container_of(n, struct module_attribute, attr);
-#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
-
static ssize_t module_attr_show(struct kobject *kobj,
struct attribute *attr,
char *buf)
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 153dcb2639c..895337b16a2 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1308,9 +1308,10 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
*/
static inline int fastpath_timer_check(struct task_struct *tsk)
{
- struct signal_struct *sig = tsk->signal;
+ struct signal_struct *sig;
- if (unlikely(!sig))
+ /* tsk == current, ensure it is safe to use ->signal/sighand */
+ if (unlikely(tsk->exit_state))
return 0;
if (!task_cputime_zero(&tsk->cputime_expires)) {
@@ -1323,6 +1324,8 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
return 1;
}
+
+ sig = tsk->signal;
if (!task_cputime_zero(&sig->cputime_expires)) {
struct task_cputime group_sample;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index b931d7cedbf..5e79c662294 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -639,7 +639,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
(timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
- remaining = ktime_sub(timer->expires, now);
+ remaining = ktime_sub(hrtimer_get_expires(timer), now);
/* Return 0 only, when the timer is expired and not pending */
if (remaining.tv64 <= 0) {
/*
@@ -733,7 +733,7 @@ common_timer_set(struct k_itimer *timr, int flags,
hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
timr->it.real.timer.function = posix_timer_fn;
- timer->expires = timespec_to_ktime(new_setting->it_value);
+ hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value));
/* Convert interval */
timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
@@ -742,14 +742,12 @@ common_timer_set(struct k_itimer *timr, int flags,
if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
/* Setup correct expiry time for relative timers */
if (mode == HRTIMER_MODE_REL) {
- timer->expires =
- ktime_add_safe(timer->expires,
- timer->base->get_time());
+ hrtimer_add_expires(timer, timer->base->get_time());
}
return 0;
}
- hrtimer_start(timer, timer->expires, mode);
+ hrtimer_start_expires(timer, mode);
return 0;
}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index dcd165f92a8..23bd4daeb96 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -96,7 +96,7 @@ config SUSPEND
config PM_TEST_SUSPEND
bool "Test suspend/resume and wakealarm during bootup"
- depends on SUSPEND && PM_DEBUG && RTC_LIB=y
+ depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
---help---
This option will let you suspend your machine during bootup, and
make it wake up a few seconds later using an RTC wakeup alarm.
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 331f9836383..f77d3819ef5 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,7 +22,6 @@
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
-#include <linux/ftrace.h>
#include "power.h"
@@ -257,7 +256,7 @@ static int create_image(int platform_mode)
int hibernation_snapshot(int platform_mode)
{
- int error, ftrace_save;
+ int error;
/* Free memory before shutting down devices. */
error = swsusp_shrink_memory();
@@ -269,7 +268,6 @@ int hibernation_snapshot(int platform_mode)
goto Close;
suspend_console();
- ftrace_save = __ftrace_enabled_save();
error = device_suspend(PMSG_FREEZE);
if (error)
goto Recover_platform;
@@ -299,7 +297,6 @@ int hibernation_snapshot(int platform_mode)
Resume_devices:
device_resume(in_suspend ?
(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
- __ftrace_enabled_restore(ftrace_save);
resume_console();
Close:
platform_end(platform_mode);
@@ -370,11 +367,10 @@ static int resume_target_kernel(void)
int hibernation_restore(int platform_mode)
{
- int error, ftrace_save;
+ int error;
pm_prepare_console();
suspend_console();
- ftrace_save = __ftrace_enabled_save();
error = device_suspend(PMSG_QUIESCE);
if (error)
goto Finish;
@@ -389,7 +385,6 @@ int hibernation_restore(int platform_mode)
platform_restore_cleanup(platform_mode);
device_resume(PMSG_RECOVER);
Finish:
- __ftrace_enabled_restore(ftrace_save);
resume_console();
pm_restore_console();
return error;
@@ -402,7 +397,7 @@ int hibernation_restore(int platform_mode)
int hibernation_platform_enter(void)
{
- int error, ftrace_save;
+ int error;
if (!hibernation_ops)
return -ENOSYS;
@@ -417,7 +412,6 @@ int hibernation_platform_enter(void)
goto Close;
suspend_console();
- ftrace_save = __ftrace_enabled_save();
error = device_suspend(PMSG_HIBERNATE);
if (error) {
if (hibernation_ops->recover)
@@ -452,7 +446,6 @@ int hibernation_platform_enter(void)
hibernation_ops->finish();
Resume_devices:
device_resume(PMSG_RESTORE);
- __ftrace_enabled_restore(ftrace_save);
resume_console();
Close:
hibernation_ops->end();
@@ -651,7 +644,7 @@ static int software_resume(void)
pr_debug("PM: Preparing processes for restore.\n");
error = prepare_processes();
if (error) {
- swsusp_close();
+ swsusp_close(FMODE_READ);
goto Done;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 19122cf6d82..613f16941b8 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -22,7 +22,6 @@
#include <linux/freezer.h>
#include <linux/vmstat.h>
#include <linux/syscalls.h>
-#include <linux/ftrace.h>
#include "power.h"
@@ -174,7 +173,7 @@ static void suspend_test_finish(const char *label)
* has some performance issues. The stack dump of a WARN_ON
* is more likely to get the right attention than a printk...
*/
- WARN_ON(msec > (TEST_SUSPEND_SECONDS * 1000));
+ WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
}
#else
@@ -317,7 +316,7 @@ static int suspend_enter(suspend_state_t state)
*/
int suspend_devices_and_enter(suspend_state_t state)
{
- int error, ftrace_save;
+ int error;
if (!suspend_ops)
return -ENOSYS;
@@ -328,7 +327,6 @@ int suspend_devices_and_enter(suspend_state_t state)
goto Close;
}
suspend_console();
- ftrace_save = __ftrace_enabled_save();
suspend_test_start();
error = device_suspend(PMSG_SUSPEND);
if (error) {
@@ -360,7 +358,6 @@ int suspend_devices_and_enter(suspend_state_t state)
suspend_test_start();
device_resume(PMSG_RESUME);
suspend_test_finish("resume devices");
- __ftrace_enabled_restore(ftrace_save);
resume_console();
Close:
if (suspend_ops->end)
diff --git a/kernel/power/power.h b/kernel/power/power.h
index acc0c101dbd..46b5ec7a3af 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -153,7 +153,7 @@ extern int swsusp_shrink_memory(void);
extern void swsusp_free(void);
extern int swsusp_read(unsigned int *flags_p);
extern int swsusp_write(unsigned int flags);
-extern void swsusp_close(void);
+extern void swsusp_close(fmode_t);
struct timeval;
/* kernel/power/swsusp.c */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 80ccac849e4..b7713b53d07 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -172,13 +172,13 @@ static int swsusp_swap_check(void) /* This is called before saving image */
return res;
root_swap = res;
- res = blkdev_get(resume_bdev, FMODE_WRITE, O_RDWR);
+ res = blkdev_get(resume_bdev, FMODE_WRITE);
if (res)
return res;
res = set_blocksize(resume_bdev, PAGE_SIZE);
if (res < 0)
- blkdev_put(resume_bdev);
+ blkdev_put(resume_bdev, FMODE_WRITE);
return res;
}
@@ -426,7 +426,7 @@ int swsusp_write(unsigned int flags)
release_swap_writer(&handle);
out:
- swsusp_close();
+ swsusp_close(FMODE_WRITE);
return error;
}
@@ -574,7 +574,7 @@ int swsusp_read(unsigned int *flags_p)
error = load_image(&handle, &snapshot, header->pages - 1);
release_swap_reader(&handle);
- blkdev_put(resume_bdev);
+ blkdev_put(resume_bdev, FMODE_READ);
if (!error)
pr_debug("PM: Image successfully loaded\n");
@@ -609,7 +609,7 @@ int swsusp_check(void)
return -EINVAL;
}
if (error)
- blkdev_put(resume_bdev);
+ blkdev_put(resume_bdev, FMODE_READ);
else
pr_debug("PM: Signature found, resuming\n");
} else {
@@ -626,14 +626,14 @@ int swsusp_check(void)
* swsusp_close - close swap device.
*/
-void swsusp_close(void)
+void swsusp_close(fmode_t mode)
{
if (IS_ERR(resume_bdev)) {
pr_debug("PM: Image device not initialised\n");
return;
}
- blkdev_put(resume_bdev);
+ blkdev_put(resume_bdev, mode); /* move up */
}
static int swsusp_header_init(void)
diff --git a/kernel/printk.c b/kernel/printk.c
index 6341af77eb6..f492f1583d7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -233,45 +233,6 @@ static inline void boot_delay_msec(void)
#endif
/*
- * Return the number of unread characters in the log buffer.
- */
-static int log_buf_get_len(void)
-{
- return logged_chars;
-}
-
-/*
- * Copy a range of characters from the log buffer.
- */
-int log_buf_copy(char *dest, int idx, int len)
-{
- int ret, max;
- bool took_lock = false;
-
- if (!oops_in_progress) {
- spin_lock_irq(&logbuf_lock);
- took_lock = true;
- }
-
- max = log_buf_get_len();
- if (idx < 0 || idx >= max) {
- ret = -1;
- } else {
- if (len > max)
- len = max;
- ret = len;
- idx += (log_end - max);
- while (len-- > 0)
- dest[len] = LOG_BUF(idx + len);
- }
-
- if (took_lock)
- spin_unlock_irq(&logbuf_lock);
-
- return ret;
-}
-
-/*
* Commands to do_syslog:
*
* 0 -- Close the log. Currently a NOP.
diff --git a/kernel/profile.c b/kernel/profile.c
index a9e422df6bf..60adefb59b5 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -102,7 +102,7 @@ int profile_setup(char *str)
__setup("profile=", profile_setup);
-int profile_init(void)
+int __ref profile_init(void)
{
int buffer_bytes;
if (!prof_on)
@@ -351,7 +351,7 @@ out:
put_cpu();
}
-static int __devinit profile_cpu_callback(struct notifier_block *info,
+static int __cpuinit profile_cpu_callback(struct notifier_block *info,
unsigned long action, void *__cpu)
{
int node, cpu = (unsigned long)__cpu;
@@ -544,7 +544,7 @@ static const struct file_operations proc_profile_operations = {
};
#ifdef CONFIG_SMP
-static void __init profile_nop(void *unused)
+static void profile_nop(void *unused)
{
}
@@ -596,7 +596,7 @@ out_cleanup:
#define create_hash_tables() ({ 0; })
#endif
-int create_proc_profile(void)
+int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
{
struct proc_dir_entry *entry;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1e68e4c39e2..4c8bcd7dd8e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -612,7 +612,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
return (copied == sizeof(data)) ? 0 : -EIO;
}
-#if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE
+#if defined CONFIG_COMPAT
#include <linux/compat.h>
int compat_ptrace_request(struct task_struct *child, compat_long_t request,
@@ -709,4 +709,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
unlock_kernel();
return ret;
}
-#endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */
+#endif /* CONFIG_COMPAT */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 467d5940f62..ad63af8b252 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -119,18 +119,19 @@ static void _rcu_barrier(enum rcu_barrier type)
/* Take cpucontrol mutex to protect against CPU hotplug */
mutex_lock(&rcu_barrier_mutex);
init_completion(&rcu_barrier_completion);
- atomic_set(&rcu_barrier_cpu_count, 0);
/*
- * The queueing of callbacks in all CPUs must be atomic with
- * respect to RCU, otherwise one CPU may queue a callback,
- * wait for a grace period, decrement barrier count and call
- * complete(), while other CPUs have not yet queued anything.
- * So, we need to make sure that grace periods cannot complete
- * until all the callbacks are queued.
+ * Initialize rcu_barrier_cpu_count to 1, then invoke
+ * rcu_barrier_func() on each CPU, so that each CPU also has
+ * incremented rcu_barrier_cpu_count. Only then is it safe to
+ * decrement rcu_barrier_cpu_count -- otherwise the first CPU
+ * might complete its grace period before all of the other CPUs
+ * did their increment, causing this function to return too
+ * early.
*/
- rcu_read_lock();
+ atomic_set(&rcu_barrier_cpu_count, 1);
on_each_cpu(rcu_barrier_func, (void *)type, 1);
- rcu_read_unlock();
+ if (atomic_dec_and_test(&rcu_barrier_cpu_count))
+ complete(&rcu_barrier_completion);
wait_for_completion(&rcu_barrier_completion);
mutex_unlock(&rcu_barrier_mutex);
}
diff --git a/kernel/relay.c b/kernel/relay.c
index 8d13a7855c0..32b0befdcb6 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -400,7 +400,7 @@ void relay_reset(struct rchan *chan)
}
mutex_lock(&relay_channels_mutex);
- for_each_online_cpu(i)
+ for_each_possible_cpu(i)
if (chan->buf[i])
__relay_reset(chan->buf[i], 0);
mutex_unlock(&relay_channels_mutex);
@@ -611,10 +611,9 @@ struct rchan *relay_open(const char *base_filename,
return chan;
free_bufs:
- for_each_online_cpu(i) {
- if (!chan->buf[i])
- break;
- relay_close_buf(chan->buf[i]);
+ for_each_possible_cpu(i) {
+ if (chan->buf[i])
+ relay_close_buf(chan->buf[i]);
}
kref_put(&chan->kref, relay_destroy_channel);
diff --git a/kernel/resource.c b/kernel/resource.c
index 4089d12af6e..4337063663e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -17,6 +17,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/device.h>
+#include <linux/pfn.h>
#include <asm/io.h>
@@ -522,7 +523,7 @@ static void __init __reserve_region_with_split(struct resource *root,
{
struct resource *parent = root;
struct resource *conflict;
- struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
+ struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
if (!res)
return;
@@ -571,7 +572,7 @@ static void __init __reserve_region_with_split(struct resource *root,
}
-void reserve_region_with_split(struct resource *root,
+void __init reserve_region_with_split(struct resource *root,
resource_size_t start, resource_size_t end,
const char *name)
{
@@ -849,7 +850,8 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
continue;
if (p->end < addr)
continue;
- if (p->start <= addr && (p->end >= addr + size - 1))
+ if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
+ PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
continue;
printk(KERN_WARNING "resource map sanity check conflict: "
"0x%llx 0x%llx 0x%llx 0x%llx %s\n",
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 6522ae5b14a..69d9cb921ff 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -631,8 +631,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
/* Setup the timer, when timeout != NULL */
if (unlikely(timeout)) {
- hrtimer_start(&timeout->timer, timeout->timer.expires,
- HRTIMER_MODE_ABS);
+ hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
if (!hrtimer_active(&timeout->timer))
timeout->task = NULL;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index d906f72b42d..7729c4bbc8b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,6 +55,7 @@
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
+#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
@@ -117,6 +118,12 @@
*/
#define RUNTIME_INF ((u64)~0ULL)
+DEFINE_TRACE(sched_wait_task);
+DEFINE_TRACE(sched_wakeup);
+DEFINE_TRACE(sched_wakeup_new);
+DEFINE_TRACE(sched_switch);
+DEFINE_TRACE(sched_migrate_task);
+
#ifdef CONFIG_SMP
/*
* Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -227,9 +234,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
- hrtimer_start(&rt_b->rt_period_timer,
- rt_b->rt_period_timer.expires,
- HRTIMER_MODE_ABS);
+ hrtimer_start_expires(&rt_b->rt_period_timer,
+ HRTIMER_MODE_ABS);
}
spin_unlock(&rt_b->rt_runtime_lock);
}
@@ -386,7 +392,6 @@ struct cfs_rq {
u64 exec_clock;
u64 min_vruntime;
- u64 pair_start;
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
@@ -398,9 +403,9 @@ struct cfs_rq {
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
- struct sched_entity *curr, *next;
+ struct sched_entity *curr, *next, *last;
- unsigned long nr_spread_over;
+ unsigned int nr_spread_over;
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -819,6 +824,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
unsigned int sysctl_sched_shares_ratelimit = 250000;
/*
+ * Inject some fuzzyness into changing the per-cpu group shares
+ * this avoids remote rq-locks at the expense of fairness.
+ * default: 4
+ */
+unsigned int sysctl_sched_shares_thresh = 4;
+
+/*
* period over which we measure -rt task cpu usage in us.
* default: 1s
*/
@@ -963,6 +975,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
}
}
+void task_rq_unlock_wait(struct task_struct *p)
+{
+ struct rq *rq = task_rq(p);
+
+ smp_mb(); /* spin-unlock-wait is not a full memory barrier */
+ spin_unlock_wait(&rq->lock);
+}
+
static void __task_rq_unlock(struct rq *rq)
__releases(rq->lock)
{
@@ -1064,7 +1084,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
struct hrtimer *timer = &rq->hrtick_timer;
ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
- timer->expires = time;
+ hrtimer_set_expires(timer, time);
if (rq == this_rq()) {
hrtimer_restart(timer);
@@ -1439,9 +1459,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
+ unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
- if (rq->nr_running)
- rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+ if (nr_running)
+ rq->avg_load_per_task = rq->load.weight / nr_running;
+ else
+ rq->avg_load_per_task = 0;
return rq->avg_load_per_task;
}
@@ -1454,8 +1477,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
* Calculate and set the cpu's group shares.
*/
static void
-__update_group_shares_cpu(struct task_group *tg, int cpu,
- unsigned long sd_shares, unsigned long sd_rq_weight)
+update_group_shares_cpu(struct task_group *tg, int cpu,
+ unsigned long sd_shares, unsigned long sd_rq_weight)
{
int boost = 0;
unsigned long shares;
@@ -1486,19 +1509,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
*
*/
shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+ shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
- /*
- * record the actual number of shares, not the boosted amount.
- */
- tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
- tg->cfs_rq[cpu]->rq_weight = rq_weight;
+ if (abs(shares - tg->se[cpu]->load.weight) >
+ sysctl_sched_shares_thresh) {
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- else if (shares > MAX_SHARES)
- shares = MAX_SHARES;
+ spin_lock_irqsave(&rq->lock, flags);
+ /*
+ * record the actual number of shares, not the boosted amount.
+ */
+ tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+ tg->cfs_rq[cpu]->rq_weight = rq_weight;
- __set_se_shares(tg->se[cpu], shares);
+ __set_se_shares(tg->se[cpu], shares);
+ spin_unlock_irqrestore(&rq->lock, flags);
+ }
}
/*
@@ -1527,14 +1554,8 @@ static int tg_shares_up(struct task_group *tg, void *data)
if (!rq_weight)
rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
- for_each_cpu_mask(i, sd->span) {
- struct rq *rq = cpu_rq(i);
- unsigned long flags;
-
- spin_lock_irqsave(&rq->lock, flags);
- __update_group_shares_cpu(tg, i, shares, rq_weight);
- spin_unlock_irqrestore(&rq->lock, flags);
- }
+ for_each_cpu_mask(i, sd->span)
+ update_group_shares_cpu(tg, i, shares, rq_weight);
return 0;
}
@@ -1801,7 +1822,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
/*
* Buddy candidates are cache hot:
*/
- if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
+ if (sched_feat(CACHE_HOT_BUDDY) &&
+ (&p->se == cfs_rq_of(&p->se)->next ||
+ &p->se == cfs_rq_of(&p->se)->last))
return 1;
if (p->sched_class != &fair_sched_class)
@@ -3339,7 +3362,7 @@ small_imbalance:
} else
this_load_per_task = cpu_avg_load_per_task(this_cpu);
- if (max_load - this_load + 2*busiest_load_per_task >=
+ if (max_load - this_load + busiest_load_per_task >=
busiest_load_per_task * imbn) {
*imbalance = busiest_load_per_task;
return busiest;
@@ -4443,12 +4466,8 @@ need_resched_nonpreemptible:
if (sched_feat(HRTICK))
hrtick_clear(rq);
- /*
- * Do the rq-clock update outside the rq lock:
- */
- local_irq_disable();
+ spin_lock_irq(&rq->lock);
update_rq_clock(rq);
- spin_lock(&rq->lock);
clear_tsk_need_resched(prev);
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -5858,6 +5877,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
+ spin_lock_irqsave(&rq->lock, flags);
+
__sched_fork(idle);
idle->se.exec_start = sched_clock();
@@ -5865,7 +5886,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
idle->cpus_allowed = cpumask_of_cpu(cpu);
__set_task_cpu(idle, cpu);
- spin_lock_irqsave(&rq->lock, flags);
rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
idle->oncpu = 1;
@@ -5882,6 +5902,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
* The idle tasks have their own, simple scheduling class:
*/
idle->sched_class = &idle_sched_class;
+ ftrace_graph_init_task(idle);
}
/*
@@ -6875,15 +6896,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
struct sched_domain *tmp;
/* Remove the sched domains which do not contribute to scheduling. */
- for (tmp = sd; tmp; tmp = tmp->parent) {
+ for (tmp = sd; tmp; ) {
struct sched_domain *parent = tmp->parent;
if (!parent)
break;
+
if (sd_parent_degenerate(tmp, parent)) {
tmp->parent = parent->parent;
if (parent->parent)
parent->parent->child = tmp;
- }
+ } else
+ tmp = tmp->parent;
}
if (sd && sd_degenerate(sd)) {
@@ -7672,6 +7695,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
error:
free_sched_groups(cpu_map, tmpmask);
SCHED_CPUMASK_FREE((void *)allmasks);
+ kfree(rd);
return -ENOMEM;
#endif
}
@@ -7773,13 +7797,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
*
* The passed in 'doms_new' should be kmalloc'd. This routine takes
* ownership of it and will kfree it when done with it. If the caller
- * failed the kmalloc call, then it can pass in doms_new == NULL,
- * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms', it also forces the domains to be rebuilt.
+ * failed the kmalloc call, then it can pass in doms_new == NULL &&
+ * ndoms_new == 1, and partition_sched_domains() will fallback to
+ * the single partition 'fallback_doms', it also forces the domains
+ * to be rebuilt.
*
- * If doms_new==NULL it will be replaced with cpu_online_map.
- * ndoms_new==0 is a special case for destroying existing domains.
- * It will not create the default domain.
+ * If doms_new == NULL it will be replaced with cpu_online_map.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
*
* Call with hotplug lock held
*/
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index ad958c1ec70..26ed8e3d1c1 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -144,7 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
last = __pick_last_entity(cfs_rq);
if (last)
max_vruntime = last->vruntime;
- min_vruntime = rq->cfs.min_vruntime;
+ min_vruntime = cfs_rq->min_vruntime;
rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
spin_unlock_irqrestore(&rq->lock, flags);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
@@ -161,26 +161,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(spread0));
SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
-#ifdef CONFIG_SCHEDSTATS
-#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
-
- P(yld_exp_empty);
- P(yld_act_empty);
- P(yld_both_empty);
- P(yld_count);
- P(sched_switch);
- P(sched_count);
- P(sched_goidle);
-
- P(ttwu_count);
- P(ttwu_local);
-
- P(bkl_count);
-
-#undef P
-#endif
- SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
+ SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
@@ -260,6 +242,25 @@ static void print_cpu(struct seq_file *m, int cpu)
#undef P
#undef PN
+#ifdef CONFIG_SCHEDSTATS
+#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
+
+ P(yld_exp_empty);
+ P(yld_act_empty);
+ P(yld_both_empty);
+ P(yld_count);
+
+ P(sched_switch);
+ P(sched_count);
+ P(sched_goidle);
+
+ P(ttwu_count);
+ P(ttwu_local);
+
+ P(bkl_count);
+
+#undef P
+#endif
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
@@ -319,7 +320,7 @@ static int __init init_sched_debug_procfs(void)
{
struct proc_dir_entry *pe;
- pe = proc_create("sched_debug", 0644, NULL, &sched_debug_fops);
+ pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops);
if (!pe)
return -ENOMEM;
return 0;
@@ -422,10 +423,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
#undef __P
{
+ unsigned int this_cpu = raw_smp_processor_id();
u64 t0, t1;
- t0 = sched_clock();
- t1 = sched_clock();
+ t0 = cpu_clock(this_cpu);
+ t1 = cpu_clock(this_cpu);
SEQ_printf(m, "%-35s:%21Ld\n",
"clock-delta", (long long)(t1-t0));
}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f604dae7131..98345e45b05 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -73,6 +73,8 @@ unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+static const struct sched_class fair_sched_class;
+
/**************************************************************
* CFS operations on generic schedulable entities:
*/
@@ -141,6 +143,49 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
return se->parent;
}
+/* return depth at which a sched entity is present in the hierarchy */
+static inline int depth_se(struct sched_entity *se)
+{
+ int depth = 0;
+
+ for_each_sched_entity(se)
+ depth++;
+
+ return depth;
+}
+
+static void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+ int se_depth, pse_depth;
+
+ /*
+ * preemption test can be made between sibling entities who are in the
+ * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
+ * both tasks until we find their ancestors who are siblings of common
+ * parent.
+ */
+
+ /* First walk up until both entities are at same depth */
+ se_depth = depth_se(*se);
+ pse_depth = depth_se(*pse);
+
+ while (se_depth > pse_depth) {
+ se_depth--;
+ *se = parent_entity(*se);
+ }
+
+ while (pse_depth > se_depth) {
+ pse_depth--;
+ *pse = parent_entity(*pse);
+ }
+
+ while (!is_same_group(*se, *pse)) {
+ *se = parent_entity(*se);
+ *pse = parent_entity(*pse);
+ }
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -191,6 +236,11 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
return NULL;
}
+static inline void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -221,6 +271,27 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
return se->vruntime - cfs_rq->min_vruntime;
}
+static void update_min_vruntime(struct cfs_rq *cfs_rq)
+{
+ u64 vruntime = cfs_rq->min_vruntime;
+
+ if (cfs_rq->curr)
+ vruntime = cfs_rq->curr->vruntime;
+
+ if (cfs_rq->rb_leftmost) {
+ struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
+ struct sched_entity,
+ run_node);
+
+ if (vruntime == cfs_rq->min_vruntime)
+ vruntime = se->vruntime;
+ else
+ vruntime = min_vruntime(vruntime, se->vruntime);
+ }
+
+ cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+}
+
/*
* Enqueue an entity into the rb-tree:
*/
@@ -254,15 +325,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Maintain a cache of leftmost tree entries (it is frequently
* used):
*/
- if (leftmost) {
+ if (leftmost)
cfs_rq->rb_leftmost = &se->run_node;
- /*
- * maintain cfs_rq->min_vruntime to be a monotonic increasing
- * value tracking the leftmost vruntime in the tree.
- */
- cfs_rq->min_vruntime =
- max_vruntime(cfs_rq->min_vruntime, se->vruntime);
- }
rb_link_node(&se->run_node, parent, link);
rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -272,37 +336,25 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (cfs_rq->rb_leftmost == &se->run_node) {
struct rb_node *next_node;
- struct sched_entity *next;
next_node = rb_next(&se->run_node);
cfs_rq->rb_leftmost = next_node;
-
- if (next_node) {
- next = rb_entry(next_node,
- struct sched_entity, run_node);
- cfs_rq->min_vruntime =
- max_vruntime(cfs_rq->min_vruntime,
- next->vruntime);
- }
}
- if (cfs_rq->next == se)
- cfs_rq->next = NULL;
-
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
-static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->rb_leftmost;
-}
-
static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
- return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
+ struct rb_node *left = cfs_rq->rb_leftmost;
+
+ if (!left)
+ return NULL;
+
+ return rb_entry(left, struct sched_entity, run_node);
}
-static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -334,7 +386,7 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
#endif
/*
- * delta *= w / rw
+ * delta *= P[w / rw]
*/
static inline unsigned long
calc_delta_weight(unsigned long delta, struct sched_entity *se)
@@ -348,15 +400,13 @@ calc_delta_weight(unsigned long delta, struct sched_entity *se)
}
/*
- * delta *= rw / w
+ * delta /= w
*/
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
- for_each_sched_entity(se) {
- delta = calc_delta_mine(delta,
- cfs_rq_of(se)->load.weight, &se->load);
- }
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
return delta;
}
@@ -386,26 +436,26 @@ static u64 __sched_period(unsigned long nr_running)
* We calculate the wall-time slice from the period by taking a part
* proportional to the weight.
*
- * s = p*w/rw
+ * s = p*P[w/rw]
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
+ unsigned long nr_running = cfs_rq->nr_running;
+
+ if (unlikely(!se->on_rq))
+ nr_running++;
+
+ return calc_delta_weight(__sched_period(nr_running), se);
}
/*
* We calculate the vruntime slice of a to be inserted task
*
- * vs = s*rw/w = p
+ * vs = s/w
*/
-static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- unsigned long nr_running = cfs_rq->nr_running;
-
- if (!se->on_rq)
- nr_running++;
-
- return __sched_period(nr_running);
+ return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
/*
@@ -424,6 +474,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
schedstat_add(cfs_rq, exec_clock, delta_exec);
delta_exec_weighted = calc_delta_fair(delta_exec, curr);
curr->vruntime += delta_exec_weighted;
+ update_min_vruntime(cfs_rq);
}
static void update_curr(struct cfs_rq *cfs_rq)
@@ -613,13 +664,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
- u64 vruntime;
-
- if (first_fair(cfs_rq)) {
- vruntime = min_vruntime(cfs_rq->min_vruntime,
- __pick_next_entity(cfs_rq)->vruntime);
- } else
- vruntime = cfs_rq->min_vruntime;
+ u64 vruntime = cfs_rq->min_vruntime;
/*
* The 'current' period is already promised to the current tasks,
@@ -628,7 +673,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
* stays open at the end.
*/
if (initial && sched_feat(START_DEBIT))
- vruntime += sched_vslice_add(cfs_rq, se);
+ vruntime += sched_vslice(cfs_rq, se);
if (!initial) {
/* sleeps upto a single latency don't count. */
@@ -671,6 +716,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
__enqueue_entity(cfs_rq, se);
}
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (cfs_rq->last == se)
+ cfs_rq->last = NULL;
+
+ if (cfs_rq->next == se)
+ cfs_rq->next = NULL;
+}
+
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
@@ -693,9 +747,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
#endif
}
+ clear_buddies(cfs_rq, se);
+
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
account_entity_dequeue(cfs_rq, se);
+ update_min_vruntime(cfs_rq);
}
/*
@@ -742,29 +799,18 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
-static struct sched_entity *
-pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- struct rq *rq = rq_of(cfs_rq);
- u64 pair_slice = rq->clock - cfs_rq->pair_start;
-
- if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
- cfs_rq->pair_start = rq->clock;
- return se;
- }
-
- return cfs_rq->next;
-}
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
- struct sched_entity *se = NULL;
+ struct sched_entity *se = __pick_next_entity(cfs_rq);
- if (first_fair(cfs_rq)) {
- se = __pick_next_entity(cfs_rq);
- se = pick_next(cfs_rq, se);
- set_next_entity(cfs_rq, se);
- }
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
+ return cfs_rq->next;
+
+ if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
+ return cfs_rq->last;
return se;
}
@@ -849,11 +895,31 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
hrtick_start(rq, delta);
}
}
+
+/*
+ * called from enqueue/dequeue and updates the hrtick when the
+ * current task is from our class and nr_running is low enough
+ * to matter.
+ */
+static void hrtick_update(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+
+ if (curr->sched_class != &fair_sched_class)
+ return;
+
+ if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
+ hrtick_start_fair(rq, curr);
+}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}
+
+static inline void hrtick_update(struct rq *rq)
+{
+}
#endif
/*
@@ -874,7 +940,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
wakeup = 1;
}
- hrtick_start_fair(rq, rq->curr);
+ hrtick_update(rq);
}
/*
@@ -896,7 +962,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
sleep = 1;
}
- hrtick_start_fair(rq, rq->curr);
+ hrtick_update(rq);
}
/*
@@ -916,6 +982,8 @@ static void yield_task_fair(struct rq *rq)
if (unlikely(cfs_rq->nr_running == 1))
return;
+ clear_buddies(cfs_rq, se);
+
if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
update_rq_clock(rq);
/*
@@ -1002,8 +1070,6 @@ static inline int wake_idle(int cpu, struct task_struct *p)
#ifdef CONFIG_SMP
-static const struct sched_class fair_sched_class;
-
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* effective_load() calculates the load change as seen from the root_task_group
@@ -1104,10 +1170,9 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
return 0;
- if (!sync && sched_feat(SYNC_WAKEUPS) &&
- curr->se.avg_overlap < sysctl_sched_migration_cost &&
- p->se.avg_overlap < sysctl_sched_migration_cost)
- sync = 1;
+ if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
+ p->se.avg_overlap > sysctl_sched_migration_cost))
+ sync = 0;
/*
* If sync wakeup then subtract the (maximum possible)
@@ -1226,33 +1291,88 @@ static unsigned long wakeup_gran(struct sched_entity *se)
* More easily preempt - nice tasks, while not making it harder for
* + nice tasks.
*/
- if (sched_feat(ASYM_GRAN))
- gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
+ if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD)
+ gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
return gran;
}
/*
+ * Should 'se' preempt 'curr'.
+ *
+ * |s1
+ * |s2
+ * |s3
+ * g
+ * |<--->|c
+ *
+ * w(c, s1) = -1
+ * w(c, s2) = 0
+ * w(c, s3) = 1
+ *
+ */
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
+{
+ s64 gran, vdiff = curr->vruntime - se->vruntime;
+
+ if (vdiff <= 0)
+ return -1;
+
+ gran = wakeup_gran(curr);
+ if (vdiff > gran)
+ return 1;
+
+ return 0;
+}
+
+static void set_last_buddy(struct sched_entity *se)
+{
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->last = se;
+}
+
+static void set_next_buddy(struct sched_entity *se)
+{
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->next = se;
+}
+
+/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
{
struct task_struct *curr = rq->curr;
- struct cfs_rq *cfs_rq = task_cfs_rq(curr);
struct sched_entity *se = &curr->se, *pse = &p->se;
- s64 delta_exec;
if (unlikely(rt_prio(p->prio))) {
+ struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+
update_rq_clock(rq);
update_curr(cfs_rq);
resched_task(curr);
return;
}
+ if (unlikely(p->sched_class != &fair_sched_class))
+ return;
+
if (unlikely(se == pse))
return;
- cfs_rq_of(pse)->next = pse;
+ /*
+ * Only set the backward buddy when the current task is still on the
+ * rq. This can happen when a wakeup gets interleaved with schedule on
+ * the ->pre_schedule() or idle_balance() point, either of which can
+ * drop the rq lock.
+ *
+ * Also, during early boot the idle thread is in the fair class, for
+ * obvious reasons its a bad idea to schedule back to the idle thread.
+ */
+ if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
+ set_last_buddy(se);
+ set_next_buddy(pse);
/*
* We can come here with TIF_NEED_RESCHED already set from new task
@@ -1278,9 +1398,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
return;
}
- delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
- if (delta_exec > wakeup_gran(pse))
- resched_task(curr);
+ find_matching_se(&se, &pse);
+
+ while (se) {
+ BUG_ON(!pse);
+
+ if (wakeup_preempt_entity(se, pse) == 1) {
+ resched_task(curr);
+ break;
+ }
+
+ se = parent_entity(se);
+ pse = parent_entity(pse);
+ }
}
static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1294,6 +1424,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
do {
se = pick_next_entity(cfs_rq);
+ set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
@@ -1576,9 +1707,6 @@ static const struct sched_class fair_sched_class = {
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
-#ifdef CONFIG_SMP
- .select_task_rq = select_task_rq_fair,
-#endif /* CONFIG_SMP */
.check_preempt_curr = check_preempt_wakeup,
@@ -1586,6 +1714,8 @@ static const struct sched_class fair_sched_class = {
.put_prev_task = put_prev_task_fair,
#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_fair,
+
.load_balance = load_balance_fair,
.move_one_task = move_one_task_fair,
#endif
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 7c9e8f4a049..da5d93b5d2c 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -5,10 +5,11 @@ SCHED_FEAT(START_DEBIT, 1)
SCHED_FEAT(AFFINE_WAKEUPS, 1)
SCHED_FEAT(CACHE_HOT_BUDDY, 1)
SCHED_FEAT(SYNC_WAKEUPS, 1)
-SCHED_FEAT(HRTICK, 1)
+SCHED_FEAT(HRTICK, 0)
SCHED_FEAT(DOUBLE_TICK, 0)
SCHED_FEAT(ASYM_GRAN, 1)
SCHED_FEAT(LB_BIAS, 1)
SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
SCHED_FEAT(ASYM_EFF_LOAD, 1)
SCHED_FEAT(WAKEUP_OVERLAP, 0)
+SCHED_FEAT(LAST_BUDDY, 1)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index dec4ccabe2f..8a21a2e28c1 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -105,9 +105,6 @@ static const struct sched_class idle_sched_class = {
/* dequeue is not valid, we print a debug message there: */
.dequeue_task = dequeue_task_idle,
-#ifdef CONFIG_SMP
- .select_task_rq = select_task_rq_idle,
-#endif /* CONFIG_SMP */
.check_preempt_curr = check_preempt_curr_idle,
@@ -115,6 +112,8 @@ static const struct sched_class idle_sched_class = {
.put_prev_task = put_prev_task_idle,
#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_idle,
+
.load_balance = load_balance_idle,
.move_one_task = move_one_task_idle,
#endif
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b446dc87494..d9ba9d5f99d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1504,9 +1504,6 @@ static const struct sched_class rt_sched_class = {
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
-#ifdef CONFIG_SMP
- .select_task_rq = select_task_rq_rt,
-#endif /* CONFIG_SMP */
.check_preempt_curr = check_preempt_curr_rt,
@@ -1514,6 +1511,8 @@ static const struct sched_class rt_sched_class = {
.put_prev_task = put_prev_task_rt,
#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_rt,
+
.load_balance = load_balance_rt,
.move_one_task = move_one_task_rt,
.set_cpus_allowed = set_cpus_allowed_rt,
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index b8c156979cf..7dbf72a2b02 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -9,7 +9,7 @@
static int show_schedstat(struct seq_file *seq, void *v)
{
int cpu;
- int mask_len = NR_CPUS/32 * 9;
+ int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
char *mask_str = kmalloc(mask_len, GFP_KERNEL);
if (mask_str == NULL)
@@ -90,13 +90,20 @@ static int schedstat_open(struct inode *inode, struct file *file)
return res;
}
-const struct file_operations proc_schedstat_operations = {
+static const struct file_operations proc_schedstat_operations = {
.open = schedstat_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
+static int __init proc_schedstat_init(void)
+{
+ proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
+ return 0;
+}
+module_init(proc_schedstat_init);
+
/*
* Expects runqueue lock to be held for atomicity of update
*/
@@ -291,9 +298,11 @@ static inline void account_group_user_time(struct task_struct *tsk,
{
struct signal_struct *sig;
- sig = tsk->signal;
- if (unlikely(!sig))
+ /* tsk == current, ensure it is safe to use ->signal */
+ if (unlikely(tsk->exit_state))
return;
+
+ sig = tsk->signal;
if (sig->cputime.totals) {
struct task_cputime *times;
@@ -318,9 +327,11 @@ static inline void account_group_system_time(struct task_struct *tsk,
{
struct signal_struct *sig;
- sig = tsk->signal;
- if (unlikely(!sig))
+ /* tsk == current, ensure it is safe to use ->signal */
+ if (unlikely(tsk->exit_state))
return;
+
+ sig = tsk->signal;
if (sig->cputime.totals) {
struct task_cputime *times;
@@ -346,8 +357,11 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
struct signal_struct *sig;
sig = tsk->signal;
+ /* see __exit_signal()->task_rq_unlock_wait() */
+ barrier();
if (unlikely(!sig))
return;
+
if (sig->cputime.totals) {
struct task_cputime *times;
diff --git a/kernel/signal.c b/kernel/signal.c
index 105217da5c8..e9afe63da24 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -41,6 +41,8 @@
static struct kmem_cache *sigqueue_cachep;
+DEFINE_TRACE(sched_signal_send);
+
static void __user *sig_handler(struct task_struct *t, int sig)
{
return t->sighand->action[sig - 1].sa.sa_handler;
@@ -1144,7 +1146,8 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
struct task_struct * p;
for_each_process(p) {
- if (p->pid > 1 && !same_thread_group(p, current)) {
+ if (task_pid_vnr(p) > 1 &&
+ !same_thread_group(p, current)) {
int err = group_send_sig_info(sig, info, p);
++count;
if (err != -EPERM)
diff --git a/kernel/smp.c b/kernel/smp.c
index f362a855377..75c8dde58c5 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -51,10 +51,6 @@ static void csd_flag_wait(struct call_single_data *data)
{
/* Wait for response */
do {
- /*
- * We need to see the flags store in the IPI handler
- */
- smp_mb();
if (!(data->flags & CSD_FLAG_WAIT))
break;
cpu_relax();
@@ -76,6 +72,11 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
list_add_tail(&data->list, &dst->list);
spin_unlock_irqrestore(&dst->lock, flags);
+ /*
+ * Make the list addition visible before sending the ipi.
+ */
+ smp_mb();
+
if (ipi)
arch_send_call_function_single_ipi(cpu);
@@ -157,7 +158,7 @@ void generic_smp_call_function_single_interrupt(void)
* Need to see other stores to list head for checking whether
* list is empty without holding q->lock
*/
- smp_mb();
+ smp_read_barrier_depends();
while (!list_empty(&q->list)) {
unsigned int data_flags;
@@ -191,7 +192,7 @@ void generic_smp_call_function_single_interrupt(void)
/*
* See comment on outer loop
*/
- smp_mb();
+ smp_read_barrier_depends();
}
}
@@ -370,6 +371,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
list_add_tail_rcu(&data->csd.list, &call_function_queue);
spin_unlock_irqrestore(&call_function_lock, flags);
+ /*
+ * Make the list addition visible before sending the ipi.
+ */
+ smp_mb();
+
/* Send a message to all CPUs in the map */
arch_send_call_function_ipi(mask);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7110daeb9a9..e7c69a720d6 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -269,10 +269,11 @@ void irq_enter(void)
{
int cpu = smp_processor_id();
- if (idle_cpu(cpu) && !in_interrupt())
+ if (idle_cpu(cpu) && !in_interrupt()) {
+ __irq_enter();
tick_check_idle(cpu);
-
- __irq_enter();
+ } else
+ __irq_enter();
}
#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index af3c7cea258..24e8ceacc38 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -37,9 +37,13 @@ struct stop_machine_data {
/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
static unsigned int num_threads;
static atomic_t thread_ack;
-static struct completion finished;
static DEFINE_MUTEX(lock);
+static struct workqueue_struct *stop_machine_wq;
+static struct stop_machine_data active, idle;
+static const cpumask_t *active_cpus;
+static void *stop_machine_work;
+
static void set_state(enum stopmachine_state newstate)
{
/* Reset ack counter. */
@@ -51,21 +55,26 @@ static void set_state(enum stopmachine_state newstate)
/* Last one to ack a state moves to the next state. */
static void ack_state(void)
{
- if (atomic_dec_and_test(&thread_ack)) {
- /* If we're the last one to ack the EXIT, we're finished. */
- if (state == STOPMACHINE_EXIT)
- complete(&finished);
- else
- set_state(state + 1);
- }
+ if (atomic_dec_and_test(&thread_ack))
+ set_state(state + 1);
}
-/* This is the actual thread which stops the CPU. It exits by itself rather
- * than waiting for kthread_stop(), because it's easier for hotplug CPU. */
-static int stop_cpu(struct stop_machine_data *smdata)
+/* This is the actual function which stops the CPU. It runs
+ * in the context of a dedicated stopmachine workqueue. */
+static void stop_cpu(struct work_struct *unused)
{
enum stopmachine_state curstate = STOPMACHINE_NONE;
-
+ struct stop_machine_data *smdata = &idle;
+ int cpu = smp_processor_id();
+ int err;
+
+ if (!active_cpus) {
+ if (cpu == first_cpu(cpu_online_map))
+ smdata = &active;
+ } else {
+ if (cpu_isset(cpu, *active_cpus))
+ smdata = &active;
+ }
/* Simple state machine */
do {
/* Chill out and ensure we re-read stopmachine_state. */
@@ -78,9 +87,11 @@ static int stop_cpu(struct stop_machine_data *smdata)
hard_irq_disable();
break;
case STOPMACHINE_RUN:
- /* |= allows error detection if functions on
- * multiple CPUs. */
- smdata->fnret |= smdata->fn(smdata->data);
+ /* On multiple CPUs only a single error code
+ * is needed to tell that something failed. */
+ err = smdata->fn(smdata->data);
+ if (err)
+ smdata->fnret = err;
break;
default:
break;
@@ -90,7 +101,6 @@ static int stop_cpu(struct stop_machine_data *smdata)
} while (curstate != STOPMACHINE_EXIT);
local_irq_enable();
- do_exit(0);
}
/* Callback for CPUs which aren't supposed to do anything. */
@@ -101,78 +111,35 @@ static int chill(void *unused)
int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
{
- int i, err;
- struct stop_machine_data active, idle;
- struct task_struct **threads;
+ struct work_struct *sm_work;
+ int i, ret;
+ /* Set up initial state. */
+ mutex_lock(&lock);
+ num_threads = num_online_cpus();
+ active_cpus = cpus;
active.fn = fn;
active.data = data;
active.fnret = 0;
idle.fn = chill;
idle.data = NULL;
- /* This could be too big for stack on large machines. */
- threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL);
- if (!threads)
- return -ENOMEM;
-
- /* Set up initial state. */
- mutex_lock(&lock);
- init_completion(&finished);
- num_threads = num_online_cpus();
set_state(STOPMACHINE_PREPARE);
- for_each_online_cpu(i) {
- struct stop_machine_data *smdata = &idle;
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
-
- if (!cpus) {
- if (i == first_cpu(cpu_online_map))
- smdata = &active;
- } else {
- if (cpu_isset(i, *cpus))
- smdata = &active;
- }
-
- threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u",
- i);
- if (IS_ERR(threads[i])) {
- err = PTR_ERR(threads[i]);
- threads[i] = NULL;
- goto kill_threads;
- }
-
- /* Place it onto correct cpu. */
- kthread_bind(threads[i], i);
-
- /* Make it highest prio. */
- if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, &param))
- BUG();
- }
-
- /* We've created all the threads. Wake them all: hold this CPU so one
+ /* Schedule the stop_cpu work on all cpus: hold this CPU so one
* doesn't hit this CPU until we're ready. */
get_cpu();
- for_each_online_cpu(i)
- wake_up_process(threads[i]);
-
+ for_each_online_cpu(i) {
+ sm_work = percpu_ptr(stop_machine_work, i);
+ INIT_WORK(sm_work, stop_cpu);
+ queue_work_on(i, stop_machine_wq, sm_work);
+ }
/* This will release the thread on our CPU. */
put_cpu();
- wait_for_completion(&finished);
- mutex_unlock(&lock);
-
- kfree(threads);
-
- return active.fnret;
-
-kill_threads:
- for_each_online_cpu(i)
- if (threads[i])
- kthread_stop(threads[i]);
+ flush_workqueue(stop_machine_wq);
+ ret = active.fnret;
mutex_unlock(&lock);
-
- kfree(threads);
- return err;
+ return ret;
}
int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
@@ -187,3 +154,11 @@ int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
return ret;
}
EXPORT_SYMBOL_GPL(stop_machine);
+
+static int __init stop_machine_init(void)
+{
+ stop_machine_wq = create_rt_workqueue("kstop");
+ stop_machine_work = alloc_percpu(struct work_struct);
+ return 0;
+}
+core_initcall(stop_machine_init);
diff --git a/kernel/sys.c b/kernel/sys.c
index 53879cdae48..31deba8f7d1 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1716,6 +1716,16 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
case PR_SET_TSC:
error = SET_TSC_CTL(arg2);
break;
+ case PR_GET_TIMERSLACK:
+ error = current->timer_slack_ns;
+ break;
+ case PR_SET_TIMERSLACK:
+ if (arg2 <= 0)
+ current->timer_slack_ns =
+ current->default_timer_slack_ns;
+ else
+ current->timer_slack_ns = arg2;
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a77b27b11b0..e14a2328170 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -31,7 +31,7 @@ cond_syscall(sys_socketpair);
cond_syscall(sys_bind);
cond_syscall(sys_listen);
cond_syscall(sys_accept);
-cond_syscall(sys_paccept);
+cond_syscall(sys_accept4);
cond_syscall(sys_connect);
cond_syscall(sys_getsockname);
cond_syscall(sys_getpeername);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b3cc73931d1..c83f566e940 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -176,6 +176,9 @@ extern struct ctl_table random_table[];
#ifdef CONFIG_INOTIFY_USER
extern struct ctl_table inotify_table[];
#endif
+#ifdef CONFIG_EPOLL
+extern struct ctl_table epoll_table[];
+#endif
#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
int sysctl_legacy_va_layout;
@@ -276,6 +279,16 @@ static struct ctl_table kern_table[] = {
},
{
.ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_shares_thresh",
+ .data = &sysctl_sched_shares_thresh,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
.procname = "sched_child_runs_first",
.data = &sysctl_sched_child_runs_first,
.maxlen = sizeof(unsigned int),
@@ -464,7 +477,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
{
.ctl_name = CTL_UNNUMBERED,
.procname = "ftrace_enabled",
@@ -474,6 +487,16 @@ static struct ctl_table kern_table[] = {
.proc_handler = &ftrace_enable_sysctl,
},
#endif
+#ifdef CONFIG_TRACING
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "ftrace_dump_on_oops",
+ .data = &ftrace_dump_on_oops,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
#ifdef CONFIG_MODULES
{
.ctl_name = KERN_MODPROBE,
@@ -1315,6 +1338,13 @@ static struct ctl_table fs_table[] = {
.child = inotify_table,
},
#endif
+#ifdef CONFIG_EPOLL
+ {
+ .procname = "epoll",
+ .mode = 0555,
+ .child = epoll_table,
+ },
+#endif
#endif
{
.ctl_name = KERN_SETUID_DUMPABLE,
diff --git a/kernel/time.c b/kernel/time.c
index 6a08660b4fa..d63a4336fad 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -669,3 +669,21 @@ EXPORT_SYMBOL(get_jiffies_64);
#endif
EXPORT_SYMBOL(jiffies);
+
+/*
+ * Add two timespec values and do a safety check for overflow.
+ * It's assumed that both values are valid (>= 0)
+ */
+struct timespec timespec_add_safe(const struct timespec lhs,
+ const struct timespec rhs)
+{
+ struct timespec res;
+
+ set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec,
+ lhs.tv_nsec + rhs.tv_nsec);
+
+ if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)
+ res.tv_sec = TIME_T_MAX;
+
+ return res;
+}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 1a20715bfd6..8ff15e5d486 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -142,8 +142,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
time_state = TIME_OOP;
printk(KERN_NOTICE "Clock: "
"inserting leap second 23:59:60 UTC\n");
- leap_timer.expires = ktime_add_ns(leap_timer.expires,
- NSEC_PER_SEC);
+ hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
res = HRTIMER_RESTART;
break;
case TIME_DEL:
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 0581c11fe6c..342fc9ccab4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -300,7 +300,7 @@ void tick_nohz_stop_sched_tick(int inidle)
goto out;
}
- ts->idle_tick = ts->sched_timer.expires;
+ ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
ts->idle_jiffies = last_jiffies;
rcu_enter_nohz();
@@ -380,21 +380,21 @@ ktime_t tick_nohz_get_sleep_length(void)
static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
{
hrtimer_cancel(&ts->sched_timer);
- ts->sched_timer.expires = ts->idle_tick;
+ hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
while (1) {
/* Forward the time to expire in the future */
hrtimer_forward(&ts->sched_timer, now, tick_period);
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
- hrtimer_start(&ts->sched_timer,
- ts->sched_timer.expires,
+ hrtimer_start_expires(&ts->sched_timer,
HRTIMER_MODE_ABS);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
break;
} else {
- if (!tick_program_event(ts->sched_timer.expires, 0))
+ if (!tick_program_event(
+ hrtimer_get_expires(&ts->sched_timer), 0))
break;
}
/* Update jiffies and reread time */
@@ -456,14 +456,16 @@ void tick_nohz_restart_sched_tick(void)
*/
ts->tick_stopped = 0;
ts->idle_exittime = now;
+
tick_nohz_restart(ts, now);
+
local_irq_enable();
}
static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
{
hrtimer_forward(&ts->sched_timer, now, tick_period);
- return tick_program_event(ts->sched_timer.expires, 0);
+ return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
}
/*
@@ -542,7 +544,7 @@ static void tick_nohz_switch_to_nohz(void)
next = tick_init_jiffy_update();
for (;;) {
- ts->sched_timer.expires = next;
+ hrtimer_set_expires(&ts->sched_timer, next);
if (!tick_program_event(next, 0))
break;
next = ktime_add(next, tick_period);
@@ -566,12 +568,26 @@ static void tick_nohz_switch_to_nohz(void)
*/
static void tick_nohz_kick_tick(int cpu)
{
+#if 0
+ /* Switch back to 2.6.27 behaviour */
+
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ ktime_t delta, now;
if (!ts->tick_stopped)
return;
- tick_nohz_restart(ts, ktime_get());
+ /*
+ * Do not touch the tick device, when the next expiry is either
+ * already reached or less/equal than the tick period.
+ */
+ now = ktime_get();
+ delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
+ if (delta.tv64 <= tick_period.tv64)
+ return;
+
+ tick_nohz_restart(ts, now);
+#endif
}
#else
@@ -668,16 +684,15 @@ void tick_setup_sched_timer(void)
ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
/* Get the next period (per cpu) */
- ts->sched_timer.expires = tick_init_jiffy_update();
+ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
offset = ktime_to_ns(tick_period) >> 1;
do_div(offset, num_possible_cpus());
offset *= smp_processor_id();
- ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset);
+ hrtimer_add_expires_ns(&ts->sched_timer, offset);
for (;;) {
hrtimer_forward(&ts->sched_timer, now, tick_period);
- hrtimer_start(&ts->sched_timer, ts->sched_timer.expires,
- HRTIMER_MODE_ABS);
+ hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
break;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index f6426911e35..a999b92a127 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -66,9 +66,11 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
#endif
SEQ_printf(m, "\n");
- SEQ_printf(m, " # expires at %Lu nsecs [in %Ld nsecs]\n",
- (unsigned long long)ktime_to_ns(timer->expires),
- (long long)(ktime_to_ns(timer->expires) - now));
+ SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
+ (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)),
+ (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)),
+ (long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now),
+ (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now));
}
static void
diff --git a/kernel/timer.c b/kernel/timer.c
index 56becf373c5..dbd50fabe4c 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -112,27 +112,8 @@ timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
tbase_get_deferrable(timer->base));
}
-/**
- * __round_jiffies - function to round jiffies to a full second
- * @j: the time in (absolute) jiffies that should be rounded
- * @cpu: the processor number on which the timeout will happen
- *
- * __round_jiffies() rounds an absolute time in the future (in jiffies)
- * up or down to (approximately) full seconds. This is useful for timers
- * for which the exact time they fire does not matter too much, as long as
- * they fire approximately every X seconds.
- *
- * By rounding these timers to whole seconds, all such timers will fire
- * at the same time, rather than at various times spread out. The goal
- * of this is to have the CPU wake up less, which saves power.
- *
- * The exact rounding is skewed for each processor to avoid all
- * processors firing at the exact same time, which could lead
- * to lock contention or spurious cache line bouncing.
- *
- * The return value is the rounded version of the @j parameter.
- */
-unsigned long __round_jiffies(unsigned long j, int cpu)
+static unsigned long round_jiffies_common(unsigned long j, int cpu,
+ bool force_up)
{
int rem;
unsigned long original = j;
@@ -154,8 +135,9 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
* due to delays of the timer irq, long irq off times etc etc) then
* we should round down to the whole second, not up. Use 1/4th second
* as cutoff for this rounding as an extreme upper bound for this.
+ * But never round down if @force_up is set.
*/
- if (rem < HZ/4) /* round down */
+ if (rem < HZ/4 && !force_up) /* round down */
j = j - rem;
else /* round up */
j = j - rem + HZ;
@@ -167,6 +149,31 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
return original;
return j;
}
+
+/**
+ * __round_jiffies - function to round jiffies to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * __round_jiffies() rounds an absolute time in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The exact rounding is skewed for each processor to avoid all
+ * processors firing at the exact same time, which could lead
+ * to lock contention or spurious cache line bouncing.
+ *
+ * The return value is the rounded version of the @j parameter.
+ */
+unsigned long __round_jiffies(unsigned long j, int cpu)
+{
+ return round_jiffies_common(j, cpu, false);
+}
EXPORT_SYMBOL_GPL(__round_jiffies);
/**
@@ -191,13 +198,10 @@ EXPORT_SYMBOL_GPL(__round_jiffies);
*/
unsigned long __round_jiffies_relative(unsigned long j, int cpu)
{
- /*
- * In theory the following code can skip a jiffy in case jiffies
- * increments right between the addition and the later subtraction.
- * However since the entire point of this function is to use approximate
- * timeouts, it's entirely ok to not handle that.
- */
- return __round_jiffies(j + jiffies, cpu) - jiffies;
+ unsigned long j0 = jiffies;
+
+ /* Use j0 because jiffies might change while we run */
+ return round_jiffies_common(j + j0, cpu, false) - j0;
}
EXPORT_SYMBOL_GPL(__round_jiffies_relative);
@@ -218,7 +222,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative);
*/
unsigned long round_jiffies(unsigned long j)
{
- return __round_jiffies(j, raw_smp_processor_id());
+ return round_jiffies_common(j, raw_smp_processor_id(), false);
}
EXPORT_SYMBOL_GPL(round_jiffies);
@@ -243,6 +247,71 @@ unsigned long round_jiffies_relative(unsigned long j)
}
EXPORT_SYMBOL_GPL(round_jiffies_relative);
+/**
+ * __round_jiffies_up - function to round jiffies up to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * This is the same as __round_jiffies() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long __round_jiffies_up(unsigned long j, int cpu)
+{
+ return round_jiffies_common(j, cpu, true);
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_up);
+
+/**
+ * __round_jiffies_up_relative - function to round jiffies up to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * This is the same as __round_jiffies_relative() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
+{
+ unsigned long j0 = jiffies;
+
+ /* Use j0 because jiffies might change while we run */
+ return round_jiffies_common(j + j0, cpu, true) - j0;
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
+
+/**
+ * round_jiffies_up - function to round jiffies up to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ *
+ * This is the same as round_jiffies() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long round_jiffies_up(unsigned long j)
+{
+ return round_jiffies_common(j, raw_smp_processor_id(), true);
+}
+EXPORT_SYMBOL_GPL(round_jiffies_up);
+
+/**
+ * round_jiffies_up_relative - function to round jiffies up to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ *
+ * This is the same as round_jiffies_relative() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long round_jiffies_up_relative(unsigned long j)
+{
+ return __round_jiffies_up_relative(j, raw_smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
+
static inline void set_running_timer(struct tvec_base *base,
struct timer_list *timer)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1cb3e1f616a..8b6b673b4d6 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1,13 +1,26 @@
#
-# Architectures that offer an FTRACE implementation should select HAVE_FTRACE:
+# Architectures that offer an FUNCTION_TRACER implementation should
+# select HAVE_FUNCTION_TRACER:
#
+config USER_STACKTRACE_SUPPORT
+ bool
+
config NOP_TRACER
bool
-config HAVE_FTRACE
+config HAVE_FUNCTION_TRACER
bool
- select NOP_TRACER
+
+config HAVE_FUNCTION_GRAPH_TRACER
+ bool
+
+config HAVE_FUNCTION_TRACE_MCOUNT_TEST
+ bool
+ help
+ This gets selected when the arch tests the function_trace_stop
+ variable at the mcount call site. Otherwise, this variable
+ is tested by the called function.
config HAVE_DYNAMIC_FTRACE
bool
@@ -15,6 +28,9 @@ config HAVE_DYNAMIC_FTRACE
config HAVE_FTRACE_MCOUNT_RECORD
bool
+config HAVE_HW_BRANCH_TRACER
+ bool
+
config TRACER_MAX_TRACE
bool
@@ -25,12 +41,15 @@ config TRACING
bool
select DEBUG_FS
select RING_BUFFER
- select STACKTRACE
+ select STACKTRACE if STACKTRACE_SUPPORT
select TRACEPOINTS
+ select NOP_TRACER
-config FTRACE
+menu "Tracers"
+
+config FUNCTION_TRACER
bool "Kernel Function Tracer"
- depends on HAVE_FTRACE
+ depends on HAVE_FUNCTION_TRACER
depends on DEBUG_KERNEL
select FRAME_POINTER
select TRACING
@@ -44,12 +63,24 @@ config FTRACE
(the bootup default), then the overhead of the instructions is very
small and not measurable even in micro-benchmarks.
+config FUNCTION_GRAPH_TRACER
+ bool "Kernel Function Graph Tracer"
+ depends on HAVE_FUNCTION_GRAPH_TRACER
+ depends on FUNCTION_TRACER
+ help
+ Enable the kernel to trace a function at both its return
+ and its entry.
+ It's first purpose is to trace the duration of functions and
+ draw a call graph for each thread with some informations like
+ the return value.
+ This is done by setting the current return address on the current
+ task structure into a stack of calls.
+
config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
depends on TRACE_IRQFLAGS_SUPPORT
depends on GENERIC_TIME
- depends on HAVE_FTRACE
depends on DEBUG_KERNEL
select TRACE_IRQFLAGS
select TRACING
@@ -73,7 +104,6 @@ config PREEMPT_TRACER
default n
depends on GENERIC_TIME
depends on PREEMPT
- depends on HAVE_FTRACE
depends on DEBUG_KERNEL
select TRACING
select TRACER_MAX_TRACE
@@ -101,7 +131,6 @@ config SYSPROF_TRACER
config SCHED_TRACER
bool "Scheduling Latency Tracer"
- depends on HAVE_FTRACE
depends on DEBUG_KERNEL
select TRACING
select CONTEXT_SWITCH_TRACER
@@ -112,7 +141,6 @@ config SCHED_TRACER
config CONTEXT_SWITCH_TRACER
bool "Trace process context switches"
- depends on HAVE_FTRACE
depends on DEBUG_KERNEL
select TRACING
select MARKERS
@@ -122,9 +150,9 @@ config CONTEXT_SWITCH_TRACER
config BOOT_TRACER
bool "Trace boot initcalls"
- depends on HAVE_FTRACE
depends on DEBUG_KERNEL
select TRACING
+ select CONTEXT_SWITCH_TRACER
help
This tracer helps developers to optimize boot times: it records
the timings of the initcalls and traces key events and the identity
@@ -139,11 +167,75 @@ config BOOT_TRACER
selected, because the self-tests are an initcall as well and that
would invalidate the boot trace. )
+config TRACE_BRANCH_PROFILING
+ bool "Trace likely/unlikely profiler"
+ depends on DEBUG_KERNEL
+ select TRACING
+ help
+ This tracer profiles all the the likely and unlikely macros
+ in the kernel. It will display the results in:
+
+ /debugfs/tracing/profile_annotated_branch
+
+ Note: this will add a significant overhead, only turn this
+ on if you need to profile the system's use of these macros.
+
+ Say N if unsure.
+
+config PROFILE_ALL_BRANCHES
+ bool "Profile all if conditionals"
+ depends on TRACE_BRANCH_PROFILING
+ help
+ This tracer profiles all branch conditions. Every if ()
+ taken in the kernel is recorded whether it hit or miss.
+ The results will be displayed in:
+
+ /debugfs/tracing/profile_branch
+
+ This configuration, when enabled, will impose a great overhead
+ on the system. This should only be enabled when the system
+ is to be analyzed
+
+ Say N if unsure.
+
+config TRACING_BRANCHES
+ bool
+ help
+ Selected by tracers that will trace the likely and unlikely
+ conditions. This prevents the tracers themselves from being
+ profiled. Profiling the tracing infrastructure can only happen
+ when the likelys and unlikelys are not being traced.
+
+config BRANCH_TRACER
+ bool "Trace likely/unlikely instances"
+ depends on TRACE_BRANCH_PROFILING
+ select TRACING_BRANCHES
+ help
+ This traces the events of likely and unlikely condition
+ calls in the kernel. The difference between this and the
+ "Trace likely/unlikely profiler" is that this is not a
+ histogram of the callers, but actually places the calling
+ events into a running trace buffer to see when and where the
+ events happened, as well as their results.
+
+ Say N if unsure.
+
+config POWER_TRACER
+ bool "Trace power consumption behavior"
+ depends on DEBUG_KERNEL
+ depends on X86
+ select TRACING
+ help
+ This tracer helps developers to analyze and optimize the kernels
+ power management decisions, specifically the C-state and P-state
+ behavior.
+
+
config STACK_TRACER
bool "Trace max stack"
- depends on HAVE_FTRACE
+ depends on HAVE_FUNCTION_TRACER
depends on DEBUG_KERNEL
- select FTRACE
+ select FUNCTION_TRACER
select STACKTRACE
help
This special tracer records the maximum stack footprint of the
@@ -158,9 +250,17 @@ config STACK_TRACER
Say N if unsure.
+config BTS_TRACER
+ depends on HAVE_HW_BRANCH_TRACER
+ bool "Trace branches"
+ select TRACING
+ help
+ This tracer records all branches on the system in a circular
+ buffer giving access to the last N branches for each cpu.
+
config DYNAMIC_FTRACE
bool "enable/disable ftrace tracepoints dynamically"
- depends on FTRACE
+ depends on FUNCTION_TRACER
depends on HAVE_DYNAMIC_FTRACE
depends on DEBUG_KERNEL
default y
@@ -170,7 +270,7 @@ config DYNAMIC_FTRACE
with a No-Op instruction) as they are called. A table is
created to dynamically enable them again.
- This way a CONFIG_FTRACE kernel is slightly larger, but otherwise
+ This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise
has native performance as long as no tracing is active.
The changes to the code are done by a kernel thread that
@@ -195,3 +295,5 @@ config FTRACE_STARTUP_TEST
a series of tests are made to verify that the tracer is
functioning properly. It will do tests on all the configured
tracers of ftrace.
+
+endmenu
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index a85dfba88ba..62dc561b667 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,7 +1,7 @@
# Do not instrument the tracer itself:
-ifdef CONFIG_FTRACE
+ifdef CONFIG_FUNCTION_TRACER
ORIG_CFLAGS := $(KBUILD_CFLAGS)
KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
@@ -10,13 +10,18 @@ CFLAGS_trace_selftest_dynamic.o = -pg
obj-y += trace_selftest_dynamic.o
endif
-obj-$(CONFIG_FTRACE) += libftrace.o
+# If unlikely tracing is enabled, do not trace these files
+ifdef CONFIG_TRACING_BRANCHES
+KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
+endif
+
+obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
-obj-$(CONFIG_FTRACE) += trace_functions.o
+obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
@@ -24,5 +29,9 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o
obj-$(CONFIG_STACK_TRACER) += trace_stack.o
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
+obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
+obj-$(CONFIG_BTS_TRACER) += trace_bts.o
+obj-$(CONFIG_POWER_TRACER) += trace_power.o
libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4dda4f60a2a..2e78628443e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -25,17 +25,34 @@
#include <linux/ftrace.h>
#include <linux/sysctl.h>
#include <linux/ctype.h>
-#include <linux/hash.h>
#include <linux/list.h>
#include <asm/ftrace.h>
#include "trace.h"
+#define FTRACE_WARN_ON(cond) \
+ do { \
+ if (WARN_ON(cond)) \
+ ftrace_kill(); \
+ } while (0)
+
+#define FTRACE_WARN_ON_ONCE(cond) \
+ do { \
+ if (WARN_ON_ONCE(cond)) \
+ ftrace_kill(); \
+ } while (0)
+
/* ftrace_enabled is a method to turn ftrace on or off */
int ftrace_enabled __read_mostly;
static int last_ftrace_enabled;
+/* ftrace_pid_trace >= 0 will only trace threads with this pid */
+static int ftrace_pid_trace = -1;
+
+/* Quick disabling of function tracer. */
+int function_trace_stop;
+
/*
* ftrace_disabled is set when an anomaly is discovered.
* ftrace_disabled is much stronger than ftrace_enabled.
@@ -44,6 +61,7 @@ static int ftrace_disabled __read_mostly;
static DEFINE_SPINLOCK(ftrace_lock);
static DEFINE_MUTEX(ftrace_sysctl_lock);
+static DEFINE_MUTEX(ftrace_start_lock);
static struct ftrace_ops ftrace_list_end __read_mostly =
{
@@ -52,6 +70,8 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
+ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
+ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
{
@@ -68,6 +88,21 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
};
}
+static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
+{
+ if (current->pid != ftrace_pid_trace)
+ return;
+
+ ftrace_pid_function(ip, parent_ip);
+}
+
+static void set_ftrace_pid_function(ftrace_func_t func)
+{
+ /* do not set ftrace_pid_function to itself! */
+ if (func != ftrace_pid_func)
+ ftrace_pid_function = func;
+}
+
/**
* clear_ftrace_function - reset the ftrace function
*
@@ -77,8 +112,24 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
void clear_ftrace_function(void)
{
ftrace_trace_function = ftrace_stub;
+ __ftrace_trace_function = ftrace_stub;
+ ftrace_pid_function = ftrace_stub;
}
+#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
+/*
+ * For those archs that do not test ftrace_trace_stop in their
+ * mcount call site, we need to do it from C.
+ */
+static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
+{
+ if (function_trace_stop)
+ return;
+
+ __ftrace_trace_function(ip, parent_ip);
+}
+#endif
+
static int __register_ftrace_function(struct ftrace_ops *ops)
{
/* should not be called from interrupt context */
@@ -95,14 +146,28 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
ftrace_list = ops;
if (ftrace_enabled) {
+ ftrace_func_t func;
+
+ if (ops->next == &ftrace_list_end)
+ func = ops->func;
+ else
+ func = ftrace_list_func;
+
+ if (ftrace_pid_trace >= 0) {
+ set_ftrace_pid_function(func);
+ func = ftrace_pid_func;
+ }
+
/*
* For one func, simply call it directly.
* For more than one func, call the chain.
*/
- if (ops->next == &ftrace_list_end)
- ftrace_trace_function = ops->func;
- else
- ftrace_trace_function = ftrace_list_func;
+#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
+ ftrace_trace_function = func;
+#else
+ __ftrace_trace_function = func;
+ ftrace_trace_function = ftrace_test_stop_func;
+#endif
}
spin_unlock(&ftrace_lock);
@@ -141,9 +206,19 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
if (ftrace_enabled) {
/* If we only have one func left, then call that directly */
- if (ftrace_list == &ftrace_list_end ||
- ftrace_list->next == &ftrace_list_end)
- ftrace_trace_function = ftrace_list->func;
+ if (ftrace_list->next == &ftrace_list_end) {
+ ftrace_func_t func = ftrace_list->func;
+
+ if (ftrace_pid_trace >= 0) {
+ set_ftrace_pid_function(func);
+ func = ftrace_pid_func;
+ }
+#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
+ ftrace_trace_function = func;
+#else
+ __ftrace_trace_function = func;
+#endif
+ }
}
out:
@@ -152,22 +227,39 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
return ret;
}
-#ifdef CONFIG_DYNAMIC_FTRACE
+static void ftrace_update_pid_func(void)
+{
+ ftrace_func_t func;
-#ifndef CONFIG_FTRACE_MCOUNT_RECORD
-/*
- * The hash lock is only needed when the recording of the mcount
- * callers are dynamic. That is, by the caller themselves and
- * not recorded via the compilation.
- */
-static DEFINE_SPINLOCK(ftrace_hash_lock);
-#define ftrace_hash_lock(flags) spin_lock_irqsave(&ftrace_hash_lock, flags)
-#define ftrace_hash_unlock(flags) \
- spin_unlock_irqrestore(&ftrace_hash_lock, flags)
+ /* should not be called from interrupt context */
+ spin_lock(&ftrace_lock);
+
+ if (ftrace_trace_function == ftrace_stub)
+ goto out;
+
+ func = ftrace_trace_function;
+
+ if (ftrace_pid_trace >= 0) {
+ set_ftrace_pid_function(func);
+ func = ftrace_pid_func;
+ } else {
+ if (func == ftrace_pid_func)
+ func = ftrace_pid_function;
+ }
+
+#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
+ ftrace_trace_function = func;
#else
-/* This is protected via the ftrace_lock with MCOUNT_RECORD. */
-#define ftrace_hash_lock(flags) do { (void)(flags); } while (0)
-#define ftrace_hash_unlock(flags) do { } while(0)
+ __ftrace_trace_function = func;
+#endif
+
+ out:
+ spin_unlock(&ftrace_lock);
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+#ifndef CONFIG_FTRACE_MCOUNT_RECORD
+# error Dynamic ftrace depends on MCOUNT_RECORD
#endif
/*
@@ -178,25 +270,20 @@ static DEFINE_SPINLOCK(ftrace_hash_lock);
*/
static unsigned long mcount_addr = MCOUNT_ADDR;
-static struct task_struct *ftraced_task;
-
enum {
FTRACE_ENABLE_CALLS = (1 << 0),
FTRACE_DISABLE_CALLS = (1 << 1),
FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
FTRACE_ENABLE_MCOUNT = (1 << 3),
FTRACE_DISABLE_MCOUNT = (1 << 4),
+ FTRACE_START_FUNC_RET = (1 << 5),
+ FTRACE_STOP_FUNC_RET = (1 << 6),
};
static int ftrace_filtered;
-static int tracing_on;
-static int frozen_record_count;
-static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
+static LIST_HEAD(ftrace_new_addrs);
-static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
-
-static DEFINE_MUTEX(ftraced_lock);
static DEFINE_MUTEX(ftrace_regex_lock);
struct ftrace_page {
@@ -214,16 +301,13 @@ struct ftrace_page {
static struct ftrace_page *ftrace_pages_start;
static struct ftrace_page *ftrace_pages;
-static int ftraced_trigger;
-static int ftraced_suspend;
-static int ftraced_stop;
-
-static int ftrace_record_suspend;
-
static struct dyn_ftrace *ftrace_free_records;
#ifdef CONFIG_KPROBES
+
+static int frozen_record_count;
+
static inline void freeze_record(struct dyn_ftrace *rec)
{
if (!(rec->flags & FTRACE_FL_FROZEN)) {
@@ -250,72 +334,6 @@ static inline int record_frozen(struct dyn_ftrace *rec)
# define record_frozen(rec) ({ 0; })
#endif /* CONFIG_KPROBES */
-int skip_trace(unsigned long ip)
-{
- unsigned long fl;
- struct dyn_ftrace *rec;
- struct hlist_node *t;
- struct hlist_head *head;
-
- if (frozen_record_count == 0)
- return 0;
-
- head = &ftrace_hash[hash_long(ip, FTRACE_HASHBITS)];
- hlist_for_each_entry_rcu(rec, t, head, node) {
- if (rec->ip == ip) {
- if (record_frozen(rec)) {
- if (rec->flags & FTRACE_FL_FAILED)
- return 1;
-
- if (!(rec->flags & FTRACE_FL_CONVERTED))
- return 1;
-
- if (!tracing_on || !ftrace_enabled)
- return 1;
-
- if (ftrace_filtered) {
- fl = rec->flags & (FTRACE_FL_FILTER |
- FTRACE_FL_NOTRACE);
- if (!fl || (fl & FTRACE_FL_NOTRACE))
- return 1;
- }
- }
- break;
- }
- }
-
- return 0;
-}
-
-static inline int
-ftrace_ip_in_hash(unsigned long ip, unsigned long key)
-{
- struct dyn_ftrace *p;
- struct hlist_node *t;
- int found = 0;
-
- hlist_for_each_entry_rcu(p, t, &ftrace_hash[key], node) {
- if (p->ip == ip) {
- found = 1;
- break;
- }
- }
-
- return found;
-}
-
-static inline void
-ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
-{
- hlist_add_head_rcu(&node->node, &ftrace_hash[key]);
-}
-
-/* called from kstop_machine */
-static inline void ftrace_del_hash(struct dyn_ftrace *node)
-{
- hlist_del(&node->node);
-}
-
static void ftrace_free_rec(struct dyn_ftrace *rec)
{
rec->ip = (unsigned long)ftrace_free_records;
@@ -346,7 +364,6 @@ void ftrace_release(void *start, unsigned long size)
}
}
spin_unlock(&ftrace_lock);
-
}
static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
@@ -358,10 +375,8 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
rec = ftrace_free_records;
if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
- WARN_ON_ONCE(1);
+ FTRACE_WARN_ON_ONCE(1);
ftrace_free_records = NULL;
- ftrace_disabled = 1;
- ftrace_enabled = 0;
return NULL;
}
@@ -371,179 +386,163 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
}
if (ftrace_pages->index == ENTRIES_PER_PAGE) {
- if (!ftrace_pages->next)
- return NULL;
+ if (!ftrace_pages->next) {
+ /* allocate another page */
+ ftrace_pages->next =
+ (void *)get_zeroed_page(GFP_KERNEL);
+ if (!ftrace_pages->next)
+ return NULL;
+ }
ftrace_pages = ftrace_pages->next;
}
return &ftrace_pages->records[ftrace_pages->index++];
}
-static void
+static struct dyn_ftrace *
ftrace_record_ip(unsigned long ip)
{
- struct dyn_ftrace *node;
- unsigned long flags;
- unsigned long key;
- int resched;
- int cpu;
-
- if (!ftrace_enabled || ftrace_disabled)
- return;
-
- resched = need_resched();
- preempt_disable_notrace();
-
- /*
- * We simply need to protect against recursion.
- * Use the the raw version of smp_processor_id and not
- * __get_cpu_var which can call debug hooks that can
- * cause a recursive crash here.
- */
- cpu = raw_smp_processor_id();
- per_cpu(ftrace_shutdown_disable_cpu, cpu)++;
- if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1)
- goto out;
-
- if (unlikely(ftrace_record_suspend))
- goto out;
-
- key = hash_long(ip, FTRACE_HASHBITS);
-
- WARN_ON_ONCE(key >= FTRACE_HASHSIZE);
+ struct dyn_ftrace *rec;
- if (ftrace_ip_in_hash(ip, key))
- goto out;
+ if (ftrace_disabled)
+ return NULL;
- ftrace_hash_lock(flags);
+ rec = ftrace_alloc_dyn_node(ip);
+ if (!rec)
+ return NULL;
- /* This ip may have hit the hash before the lock */
- if (ftrace_ip_in_hash(ip, key))
- goto out_unlock;
+ rec->ip = ip;
- node = ftrace_alloc_dyn_node(ip);
- if (!node)
- goto out_unlock;
+ list_add(&rec->list, &ftrace_new_addrs);
- node->ip = ip;
+ return rec;
+}
- ftrace_add_hash(node, key);
+static void print_ip_ins(const char *fmt, unsigned char *p)
+{
+ int i;
- ftraced_trigger = 1;
+ printk(KERN_CONT "%s", fmt);
- out_unlock:
- ftrace_hash_unlock(flags);
- out:
- per_cpu(ftrace_shutdown_disable_cpu, cpu)--;
+ for (i = 0; i < MCOUNT_INSN_SIZE; i++)
+ printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
+}
- /* prevent recursion with scheduler */
- if (resched)
- preempt_enable_no_resched_notrace();
- else
- preempt_enable_notrace();
+static void ftrace_bug(int failed, unsigned long ip)
+{
+ switch (failed) {
+ case -EFAULT:
+ FTRACE_WARN_ON_ONCE(1);
+ pr_info("ftrace faulted on modifying ");
+ print_ip_sym(ip);
+ break;
+ case -EINVAL:
+ FTRACE_WARN_ON_ONCE(1);
+ pr_info("ftrace failed to modify ");
+ print_ip_sym(ip);
+ print_ip_ins(" actual: ", (unsigned char *)ip);
+ printk(KERN_CONT "\n");
+ break;
+ case -EPERM:
+ FTRACE_WARN_ON_ONCE(1);
+ pr_info("ftrace faulted on writing ");
+ print_ip_sym(ip);
+ break;
+ default:
+ FTRACE_WARN_ON_ONCE(1);
+ pr_info("ftrace faulted on unknown error ");
+ print_ip_sym(ip);
+ }
}
-#define FTRACE_ADDR ((long)(ftrace_caller))
static int
-__ftrace_replace_code(struct dyn_ftrace *rec,
- unsigned char *old, unsigned char *new, int enable)
+__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
{
unsigned long ip, fl;
+ unsigned long ftrace_addr;
+
+ ftrace_addr = (unsigned long)ftrace_caller;
ip = rec->ip;
- if (ftrace_filtered && enable) {
+ /*
+ * If this record is not to be traced and
+ * it is not enabled then do nothing.
+ *
+ * If this record is not to be traced and
+ * it is enabled then disabled it.
+ *
+ */
+ if (rec->flags & FTRACE_FL_NOTRACE) {
+ if (rec->flags & FTRACE_FL_ENABLED)
+ rec->flags &= ~FTRACE_FL_ENABLED;
+ else
+ return 0;
+
+ } else if (ftrace_filtered && enable) {
/*
- * If filtering is on:
- *
- * If this record is set to be filtered and
- * is enabled then do nothing.
- *
- * If this record is set to be filtered and
- * it is not enabled, enable it.
- *
- * If this record is not set to be filtered
- * and it is not enabled do nothing.
- *
- * If this record is set not to trace then
- * do nothing.
- *
- * If this record is set not to trace and
- * it is enabled then disable it.
- *
- * If this record is not set to be filtered and
- * it is enabled, disable it.
+ * Filtering is on:
*/
- fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE |
- FTRACE_FL_ENABLED);
+ fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
- if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) ||
- (fl == (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) ||
- !fl || (fl == FTRACE_FL_NOTRACE))
+ /* Record is filtered and enabled, do nothing */
+ if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
return 0;
- /*
- * If it is enabled disable it,
- * otherwise enable it!
- */
- if (fl & FTRACE_FL_ENABLED) {
- /* swap new and old */
- new = old;
- old = ftrace_call_replace(ip, FTRACE_ADDR);
+ /* Record is not filtered and is not enabled do nothing */
+ if (!fl)
+ return 0;
+
+ /* Record is not filtered but enabled, disable it */
+ if (fl == FTRACE_FL_ENABLED)
rec->flags &= ~FTRACE_FL_ENABLED;
- } else {
- new = ftrace_call_replace(ip, FTRACE_ADDR);
+ else
+ /* Otherwise record is filtered but not enabled, enable it */
rec->flags |= FTRACE_FL_ENABLED;
- }
} else {
+ /* Disable or not filtered */
if (enable) {
- /*
- * If this record is set not to trace and is
- * not enabled, do nothing.
- */
- fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
- if (fl == FTRACE_FL_NOTRACE)
- return 0;
-
- new = ftrace_call_replace(ip, FTRACE_ADDR);
- } else
- old = ftrace_call_replace(ip, FTRACE_ADDR);
-
- if (enable) {
+ /* if record is enabled, do nothing */
if (rec->flags & FTRACE_FL_ENABLED)
return 0;
+
rec->flags |= FTRACE_FL_ENABLED;
+
} else {
+
+ /* if record is not enabled do nothing */
if (!(rec->flags & FTRACE_FL_ENABLED))
return 0;
+
rec->flags &= ~FTRACE_FL_ENABLED;
}
}
- return ftrace_modify_code(ip, old, new);
+ if (rec->flags & FTRACE_FL_ENABLED)
+ return ftrace_make_call(rec, ftrace_addr);
+ else
+ return ftrace_make_nop(NULL, rec, ftrace_addr);
}
static void ftrace_replace_code(int enable)
{
int i, failed;
- unsigned char *new = NULL, *old = NULL;
struct dyn_ftrace *rec;
struct ftrace_page *pg;
- if (enable)
- old = ftrace_nop_replace();
- else
- new = ftrace_nop_replace();
-
for (pg = ftrace_pages_start; pg; pg = pg->next) {
for (i = 0; i < pg->index; i++) {
rec = &pg->records[i];
- /* don't modify code that has already faulted */
- if (rec->flags & FTRACE_FL_FAILED)
+ /*
+ * Skip over free records and records that have
+ * failed.
+ */
+ if (rec->flags & FTRACE_FL_FREE ||
+ rec->flags & FTRACE_FL_FAILED)
continue;
/* ignore updates to this record's mcount site */
@@ -554,105 +553,52 @@ static void ftrace_replace_code(int enable)
unfreeze_record(rec);
}
- failed = __ftrace_replace_code(rec, old, new, enable);
+ failed = __ftrace_replace_code(rec, enable);
if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
rec->flags |= FTRACE_FL_FAILED;
if ((system_state == SYSTEM_BOOTING) ||
!core_kernel_text(rec->ip)) {
- ftrace_del_hash(rec);
ftrace_free_rec(rec);
- }
+ } else
+ ftrace_bug(failed, rec->ip);
}
}
}
}
-static void ftrace_shutdown_replenish(void)
-{
- if (ftrace_pages->next)
- return;
-
- /* allocate another page */
- ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
-}
-
-static void print_ip_ins(const char *fmt, unsigned char *p)
-{
- int i;
-
- printk(KERN_CONT "%s", fmt);
-
- for (i = 0; i < MCOUNT_INSN_SIZE; i++)
- printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
-}
-
static int
-ftrace_code_disable(struct dyn_ftrace *rec)
+ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
{
unsigned long ip;
- unsigned char *nop, *call;
- int failed;
+ int ret;
ip = rec->ip;
- nop = ftrace_nop_replace();
- call = ftrace_call_replace(ip, mcount_addr);
-
- failed = ftrace_modify_code(ip, call, nop);
- if (failed) {
- switch (failed) {
- case 1:
- WARN_ON_ONCE(1);
- pr_info("ftrace faulted on modifying ");
- print_ip_sym(ip);
- break;
- case 2:
- WARN_ON_ONCE(1);
- pr_info("ftrace failed to modify ");
- print_ip_sym(ip);
- print_ip_ins(" expected: ", call);
- print_ip_ins(" actual: ", (unsigned char *)ip);
- print_ip_ins(" replace: ", nop);
- printk(KERN_CONT "\n");
- break;
- }
-
+ ret = ftrace_make_nop(mod, rec, mcount_addr);
+ if (ret) {
+ ftrace_bug(ret, ip);
rec->flags |= FTRACE_FL_FAILED;
return 0;
}
return 1;
}
-static int __ftrace_update_code(void *ignore);
-
static int __ftrace_modify_code(void *data)
{
- unsigned long addr;
int *command = data;
- if (*command & FTRACE_ENABLE_CALLS) {
- /*
- * Update any recorded ips now that we have the
- * machine stopped
- */
- __ftrace_update_code(NULL);
+ if (*command & FTRACE_ENABLE_CALLS)
ftrace_replace_code(1);
- tracing_on = 1;
- } else if (*command & FTRACE_DISABLE_CALLS) {
+ else if (*command & FTRACE_DISABLE_CALLS)
ftrace_replace_code(0);
- tracing_on = 0;
- }
if (*command & FTRACE_UPDATE_TRACE_FUNC)
ftrace_update_ftrace_func(ftrace_trace_function);
- if (*command & FTRACE_ENABLE_MCOUNT) {
- addr = (unsigned long)ftrace_record_ip;
- ftrace_mcount_set(&addr);
- } else if (*command & FTRACE_DISABLE_MCOUNT) {
- addr = (unsigned long)ftrace_stub;
- ftrace_mcount_set(&addr);
- }
+ if (*command & FTRACE_START_FUNC_RET)
+ ftrace_enable_ftrace_graph_caller();
+ else if (*command & FTRACE_STOP_FUNC_RET)
+ ftrace_disable_ftrace_graph_caller();
return 0;
}
@@ -662,62 +608,44 @@ static void ftrace_run_update_code(int command)
stop_machine(__ftrace_modify_code, &command, NULL);
}
-void ftrace_disable_daemon(void)
-{
- /* Stop the daemon from calling kstop_machine */
- mutex_lock(&ftraced_lock);
- ftraced_stop = 1;
- mutex_unlock(&ftraced_lock);
-
- ftrace_force_update();
-}
-
-void ftrace_enable_daemon(void)
-{
- mutex_lock(&ftraced_lock);
- ftraced_stop = 0;
- mutex_unlock(&ftraced_lock);
-
- ftrace_force_update();
-}
-
static ftrace_func_t saved_ftrace_func;
+static int ftrace_start_up;
-static void ftrace_startup(void)
+static void ftrace_startup_enable(int command)
{
- int command = 0;
-
- if (unlikely(ftrace_disabled))
- return;
-
- mutex_lock(&ftraced_lock);
- ftraced_suspend++;
- if (ftraced_suspend == 1)
- command |= FTRACE_ENABLE_CALLS;
-
if (saved_ftrace_func != ftrace_trace_function) {
saved_ftrace_func = ftrace_trace_function;
command |= FTRACE_UPDATE_TRACE_FUNC;
}
if (!command || !ftrace_enabled)
- goto out;
+ return;
ftrace_run_update_code(command);
- out:
- mutex_unlock(&ftraced_lock);
}
-static void ftrace_shutdown(void)
+static void ftrace_startup(int command)
{
- int command = 0;
+ if (unlikely(ftrace_disabled))
+ return;
+
+ mutex_lock(&ftrace_start_lock);
+ ftrace_start_up++;
+ command |= FTRACE_ENABLE_CALLS;
+
+ ftrace_startup_enable(command);
+ mutex_unlock(&ftrace_start_lock);
+}
+
+static void ftrace_shutdown(int command)
+{
if (unlikely(ftrace_disabled))
return;
- mutex_lock(&ftraced_lock);
- ftraced_suspend--;
- if (!ftraced_suspend)
+ mutex_lock(&ftrace_start_lock);
+ ftrace_start_up--;
+ if (!ftrace_start_up)
command |= FTRACE_DISABLE_CALLS;
if (saved_ftrace_func != ftrace_trace_function) {
@@ -730,7 +658,7 @@ static void ftrace_shutdown(void)
ftrace_run_update_code(command);
out:
- mutex_unlock(&ftraced_lock);
+ mutex_unlock(&ftrace_start_lock);
}
static void ftrace_startup_sysctl(void)
@@ -740,15 +668,15 @@ static void ftrace_startup_sysctl(void)
if (unlikely(ftrace_disabled))
return;
- mutex_lock(&ftraced_lock);
+ mutex_lock(&ftrace_start_lock);
/* Force update next time */
saved_ftrace_func = NULL;
- /* ftraced_suspend is true if we want ftrace running */
- if (ftraced_suspend)
+ /* ftrace_start_up is true if we want ftrace running */
+ if (ftrace_start_up)
command |= FTRACE_ENABLE_CALLS;
ftrace_run_update_code(command);
- mutex_unlock(&ftraced_lock);
+ mutex_unlock(&ftrace_start_lock);
}
static void ftrace_shutdown_sysctl(void)
@@ -758,112 +686,50 @@ static void ftrace_shutdown_sysctl(void)
if (unlikely(ftrace_disabled))
return;
- mutex_lock(&ftraced_lock);
- /* ftraced_suspend is true if ftrace is running */
- if (ftraced_suspend)
+ mutex_lock(&ftrace_start_lock);
+ /* ftrace_start_up is true if ftrace is running */
+ if (ftrace_start_up)
command |= FTRACE_DISABLE_CALLS;
ftrace_run_update_code(command);
- mutex_unlock(&ftraced_lock);
+ mutex_unlock(&ftrace_start_lock);
}
static cycle_t ftrace_update_time;
static unsigned long ftrace_update_cnt;
unsigned long ftrace_update_tot_cnt;
-static int __ftrace_update_code(void *ignore)
+static int ftrace_update_code(struct module *mod)
{
- int i, save_ftrace_enabled;
+ struct dyn_ftrace *p, *t;
cycle_t start, stop;
- struct dyn_ftrace *p;
- struct hlist_node *t, *n;
- struct hlist_head *head, temp_list;
-
- /* Don't be recording funcs now */
- ftrace_record_suspend++;
- save_ftrace_enabled = ftrace_enabled;
- ftrace_enabled = 0;
start = ftrace_now(raw_smp_processor_id());
ftrace_update_cnt = 0;
- /* No locks needed, the machine is stopped! */
- for (i = 0; i < FTRACE_HASHSIZE; i++) {
- INIT_HLIST_HEAD(&temp_list);
- head = &ftrace_hash[i];
-
- /* all CPUS are stopped, we are safe to modify code */
- hlist_for_each_entry_safe(p, t, n, head, node) {
- /* Skip over failed records which have not been
- * freed. */
- if (p->flags & FTRACE_FL_FAILED)
- continue;
+ list_for_each_entry_safe(p, t, &ftrace_new_addrs, list) {
- /* Unconverted records are always at the head of the
- * hash bucket. Once we encounter a converted record,
- * simply skip over to the next bucket. Saves ftraced
- * some processor cycles (ftrace does its bid for
- * global warming :-p ). */
- if (p->flags & (FTRACE_FL_CONVERTED))
- break;
-
- /* Ignore updates to this record's mcount site.
- * Reintroduce this record at the head of this
- * bucket to attempt to "convert" it again if
- * the kprobe on it is unregistered before the
- * next run. */
- if (get_kprobe((void *)p->ip)) {
- ftrace_del_hash(p);
- INIT_HLIST_NODE(&p->node);
- hlist_add_head(&p->node, &temp_list);
- freeze_record(p);
- continue;
- } else {
- unfreeze_record(p);
- }
+ /* If something went wrong, bail without enabling anything */
+ if (unlikely(ftrace_disabled))
+ return -1;
- /* convert record (i.e, patch mcount-call with NOP) */
- if (ftrace_code_disable(p)) {
- p->flags |= FTRACE_FL_CONVERTED;
- ftrace_update_cnt++;
- } else {
- if ((system_state == SYSTEM_BOOTING) ||
- !core_kernel_text(p->ip)) {
- ftrace_del_hash(p);
- ftrace_free_rec(p);
- }
- }
- }
+ list_del_init(&p->list);
- hlist_for_each_entry_safe(p, t, n, &temp_list, node) {
- hlist_del(&p->node);
- INIT_HLIST_NODE(&p->node);
- hlist_add_head(&p->node, head);
- }
+ /* convert record (i.e, patch mcount-call with NOP) */
+ if (ftrace_code_disable(mod, p)) {
+ p->flags |= FTRACE_FL_CONVERTED;
+ ftrace_update_cnt++;
+ } else
+ ftrace_free_rec(p);
}
stop = ftrace_now(raw_smp_processor_id());
ftrace_update_time = stop - start;
ftrace_update_tot_cnt += ftrace_update_cnt;
- ftraced_trigger = 0;
-
- ftrace_enabled = save_ftrace_enabled;
- ftrace_record_suspend--;
return 0;
}
-static int ftrace_update_code(void)
-{
- if (unlikely(ftrace_disabled) ||
- !ftrace_enabled || !ftraced_trigger)
- return 0;
-
- stop_machine(__ftrace_update_code, NULL, NULL);
-
- return 1;
-}
-
static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
{
struct ftrace_page *pg;
@@ -892,8 +758,8 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
pg = ftrace_pages = ftrace_pages_start;
cnt = num_to_init / ENTRIES_PER_PAGE;
- pr_info("ftrace: allocating %ld hash entries in %d pages\n",
- num_to_init, cnt);
+ pr_info("ftrace: allocating %ld entries in %d pages\n",
+ num_to_init, cnt + 1);
for (i = 0; i < cnt; i++) {
pg->next = (void *)get_zeroed_page(GFP_KERNEL);
@@ -918,7 +784,6 @@ enum {
#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
struct ftrace_iterator {
- loff_t pos;
struct ftrace_page *pg;
unsigned idx;
unsigned flags;
@@ -943,6 +808,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
iter->pg = iter->pg->next;
iter->idx = 0;
goto retry;
+ } else {
+ iter->idx = -1;
}
} else {
rec = &iter->pg->records[iter->idx++];
@@ -954,6 +821,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
((iter->flags & FTRACE_ITER_FAILURES) &&
!(rec->flags & FTRACE_FL_FAILED)) ||
+ ((iter->flags & FTRACE_ITER_FILTER) &&
+ !(rec->flags & FTRACE_FL_FILTER)) ||
+
((iter->flags & FTRACE_ITER_NOTRACE) &&
!(rec->flags & FTRACE_FL_NOTRACE))) {
rec = NULL;
@@ -962,8 +832,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
}
spin_unlock(&ftrace_lock);
- iter->pos = *pos;
-
return rec;
}
@@ -971,16 +839,16 @@ static void *t_start(struct seq_file *m, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
void *p = NULL;
- loff_t l = -1;
- if (*pos != iter->pos) {
- for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l))
- ;
- } else {
- l = *pos;
- p = t_next(m, p, &l);
+ if (*pos > 0) {
+ if (iter->idx < 0)
+ return p;
+ (*pos)--;
+ iter->idx--;
}
+ p = t_next(m, p, pos);
+
return p;
}
@@ -1024,7 +892,6 @@ ftrace_avail_open(struct inode *inode, struct file *file)
return -ENOMEM;
iter->pg = ftrace_pages_start;
- iter->pos = -1;
ret = seq_open(file, &show_ftrace_seq_ops);
if (!ret) {
@@ -1111,7 +978,6 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
if (file->f_mode & FMODE_READ) {
iter->pg = ftrace_pages_start;
- iter->pos = -1;
iter->flags = enable ? FTRACE_ITER_FILTER :
FTRACE_ITER_NOTRACE;
@@ -1401,10 +1267,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
}
mutex_lock(&ftrace_sysctl_lock);
- mutex_lock(&ftraced_lock);
- if (iter->filtered && ftraced_suspend && ftrace_enabled)
+ mutex_lock(&ftrace_start_lock);
+ if (ftrace_start_up && ftrace_enabled)
ftrace_run_update_code(FTRACE_ENABLE_CALLS);
- mutex_unlock(&ftraced_lock);
+ mutex_unlock(&ftrace_start_lock);
mutex_unlock(&ftrace_sysctl_lock);
kfree(iter);
@@ -1424,55 +1290,6 @@ ftrace_notrace_release(struct inode *inode, struct file *file)
return ftrace_regex_release(inode, file, 0);
}
-static ssize_t
-ftraced_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- /* don't worry about races */
- char *buf = ftraced_stop ? "disabled\n" : "enabled\n";
- int r = strlen(buf);
-
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
-
-static ssize_t
-ftraced_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- char buf[64];
- long val;
- int ret;
-
- if (cnt >= sizeof(buf))
- return -EINVAL;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- if (strncmp(buf, "enable", 6) == 0)
- val = 1;
- else if (strncmp(buf, "disable", 7) == 0)
- val = 0;
- else {
- buf[cnt] = 0;
-
- ret = strict_strtoul(buf, 10, &val);
- if (ret < 0)
- return ret;
-
- val = !!val;
- }
-
- if (val)
- ftrace_enable_daemon();
- else
- ftrace_disable_daemon();
-
- filp->f_pos += cnt;
-
- return cnt;
-}
-
static struct file_operations ftrace_avail_fops = {
.open = ftrace_avail_open,
.read = seq_read,
@@ -1503,61 +1320,10 @@ static struct file_operations ftrace_notrace_fops = {
.release = ftrace_notrace_release,
};
-static struct file_operations ftraced_fops = {
- .open = tracing_open_generic,
- .read = ftraced_read,
- .write = ftraced_write,
-};
-
-/**
- * ftrace_force_update - force an update to all recording ftrace functions
- */
-int ftrace_force_update(void)
-{
- int ret = 0;
-
- if (unlikely(ftrace_disabled))
- return -ENODEV;
-
- mutex_lock(&ftrace_sysctl_lock);
- mutex_lock(&ftraced_lock);
-
- /*
- * If ftraced_trigger is not set, then there is nothing
- * to update.
- */
- if (ftraced_trigger && !ftrace_update_code())
- ret = -EBUSY;
-
- mutex_unlock(&ftraced_lock);
- mutex_unlock(&ftrace_sysctl_lock);
-
- return ret;
-}
-
-static void ftrace_force_shutdown(void)
-{
- struct task_struct *task;
- int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC;
-
- mutex_lock(&ftraced_lock);
- task = ftraced_task;
- ftraced_task = NULL;
- ftraced_suspend = -1;
- ftrace_run_update_code(command);
- mutex_unlock(&ftraced_lock);
-
- if (task)
- kthread_stop(task);
-}
-
-static __init int ftrace_init_debugfs(void)
+static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
{
- struct dentry *d_tracer;
struct dentry *entry;
- d_tracer = tracing_init_dentry();
-
entry = debugfs_create_file("available_filter_functions", 0444,
d_tracer, NULL, &ftrace_avail_fops);
if (!entry)
@@ -1581,47 +1347,47 @@ static __init int ftrace_init_debugfs(void)
pr_warning("Could not create debugfs "
"'set_ftrace_notrace' entry\n");
- entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer,
- NULL, &ftraced_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'ftraced_enabled' entry\n");
return 0;
}
-fs_initcall(ftrace_init_debugfs);
-
-#ifdef CONFIG_FTRACE_MCOUNT_RECORD
-static int ftrace_convert_nops(unsigned long *start,
+static int ftrace_convert_nops(struct module *mod,
+ unsigned long *start,
unsigned long *end)
{
unsigned long *p;
unsigned long addr;
unsigned long flags;
+ mutex_lock(&ftrace_start_lock);
p = start;
while (p < end) {
addr = ftrace_call_adjust(*p++);
- /* should not be called from interrupt context */
- spin_lock(&ftrace_lock);
+ /*
+ * Some architecture linkers will pad between
+ * the different mcount_loc sections of different
+ * object files to satisfy alignments.
+ * Skip any NULL pointers.
+ */
+ if (!addr)
+ continue;
ftrace_record_ip(addr);
- spin_unlock(&ftrace_lock);
- ftrace_shutdown_replenish();
}
- /* p is ignored */
+ /* disable interrupts to prevent kstop machine */
local_irq_save(flags);
- __ftrace_update_code(p);
+ ftrace_update_code(mod);
local_irq_restore(flags);
+ mutex_unlock(&ftrace_start_lock);
return 0;
}
-void ftrace_init_module(unsigned long *start, unsigned long *end)
+void ftrace_init_module(struct module *mod,
+ unsigned long *start, unsigned long *end)
{
if (ftrace_disabled || start == end)
return;
- ftrace_convert_nops(start, end);
+ ftrace_convert_nops(mod, start, end);
}
extern unsigned long __start_mcount_loc[];
@@ -1651,137 +1417,131 @@ void __init ftrace_init(void)
last_ftrace_enabled = ftrace_enabled = 1;
- ret = ftrace_convert_nops(__start_mcount_loc,
+ ret = ftrace_convert_nops(NULL,
+ __start_mcount_loc,
__stop_mcount_loc);
return;
failed:
ftrace_disabled = 1;
}
-#else /* CONFIG_FTRACE_MCOUNT_RECORD */
-static int ftraced(void *ignore)
-{
- unsigned long usecs;
- while (!kthread_should_stop()) {
+#else
- set_current_state(TASK_INTERRUPTIBLE);
+static int __init ftrace_nodyn_init(void)
+{
+ ftrace_enabled = 1;
+ return 0;
+}
+device_initcall(ftrace_nodyn_init);
- /* check once a second */
- schedule_timeout(HZ);
+static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
+static inline void ftrace_startup_enable(int command) { }
+/* Keep as macros so we do not need to define the commands */
+# define ftrace_startup(command) do { } while (0)
+# define ftrace_shutdown(command) do { } while (0)
+# define ftrace_startup_sysctl() do { } while (0)
+# define ftrace_shutdown_sysctl() do { } while (0)
+#endif /* CONFIG_DYNAMIC_FTRACE */
- if (unlikely(ftrace_disabled))
- continue;
+static ssize_t
+ftrace_pid_read(struct file *file, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ int r;
- mutex_lock(&ftrace_sysctl_lock);
- mutex_lock(&ftraced_lock);
- if (!ftraced_suspend && !ftraced_stop &&
- ftrace_update_code()) {
- usecs = nsecs_to_usecs(ftrace_update_time);
- if (ftrace_update_tot_cnt > 100000) {
- ftrace_update_tot_cnt = 0;
- pr_info("hm, dftrace overflow: %lu change%s"
- " (%lu total) in %lu usec%s\n",
- ftrace_update_cnt,
- ftrace_update_cnt != 1 ? "s" : "",
- ftrace_update_tot_cnt,
- usecs, usecs != 1 ? "s" : "");
- ftrace_disabled = 1;
- WARN_ON_ONCE(1);
- }
- }
- mutex_unlock(&ftraced_lock);
- mutex_unlock(&ftrace_sysctl_lock);
+ if (ftrace_pid_trace >= 0)
+ r = sprintf(buf, "%u\n", ftrace_pid_trace);
+ else
+ r = sprintf(buf, "no pid\n");
- ftrace_shutdown_replenish();
- }
- __set_current_state(TASK_RUNNING);
- return 0;
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
-static int __init ftrace_dynamic_init(void)
+static ssize_t
+ftrace_pid_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- struct task_struct *p;
- unsigned long addr;
+ char buf[64];
+ long val;
int ret;
- addr = (unsigned long)ftrace_record_ip;
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
- stop_machine(ftrace_dyn_arch_init, &addr, NULL);
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
- /* ftrace_dyn_arch_init places the return code in addr */
- if (addr) {
- ret = (int)addr;
- goto failed;
- }
+ buf[cnt] = 0;
- ret = ftrace_dyn_table_alloc(NR_TO_INIT);
- if (ret)
- goto failed;
+ ret = strict_strtol(buf, 10, &val);
+ if (ret < 0)
+ return ret;
- p = kthread_run(ftraced, NULL, "ftraced");
- if (IS_ERR(p)) {
- ret = -1;
- goto failed;
+ mutex_lock(&ftrace_start_lock);
+ if (ret < 0) {
+ /* disable pid tracing */
+ if (ftrace_pid_trace < 0)
+ goto out;
+ ftrace_pid_trace = -1;
+
+ } else {
+
+ if (ftrace_pid_trace == val)
+ goto out;
+
+ ftrace_pid_trace = val;
}
- last_ftrace_enabled = ftrace_enabled = 1;
- ftraced_task = p;
+ /* update the function call */
+ ftrace_update_pid_func();
+ ftrace_startup_enable(0);
- return 0;
+ out:
+ mutex_unlock(&ftrace_start_lock);
- failed:
- ftrace_disabled = 1;
- return ret;
+ return cnt;
}
-core_initcall(ftrace_dynamic_init);
-#endif /* CONFIG_FTRACE_MCOUNT_RECORD */
+static struct file_operations ftrace_pid_fops = {
+ .read = ftrace_pid_read,
+ .write = ftrace_pid_write,
+};
-#else
-# define ftrace_startup() do { } while (0)
-# define ftrace_shutdown() do { } while (0)
-# define ftrace_startup_sysctl() do { } while (0)
-# define ftrace_shutdown_sysctl() do { } while (0)
-# define ftrace_force_shutdown() do { } while (0)
-#endif /* CONFIG_DYNAMIC_FTRACE */
+static __init int ftrace_init_debugfs(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
+
+ ftrace_init_dyn_debugfs(d_tracer);
+
+ entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer,
+ NULL, &ftrace_pid_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'set_ftrace_pid' entry\n");
+ return 0;
+}
+
+fs_initcall(ftrace_init_debugfs);
/**
- * ftrace_kill_atomic - kill ftrace from critical sections
+ * ftrace_kill - kill ftrace
*
* This function should be used by panic code. It stops ftrace
* but in a not so nice way. If you need to simply kill ftrace
* from a non-atomic section, use ftrace_kill.
*/
-void ftrace_kill_atomic(void)
-{
- ftrace_disabled = 1;
- ftrace_enabled = 0;
-#ifdef CONFIG_DYNAMIC_FTRACE
- ftraced_suspend = -1;
-#endif
- clear_ftrace_function();
-}
-
-/**
- * ftrace_kill - totally shutdown ftrace
- *
- * This is a safety measure. If something was detected that seems
- * wrong, calling this function will keep ftrace from doing
- * any more modifications, and updates.
- * used when something went wrong.
- */
void ftrace_kill(void)
{
- mutex_lock(&ftrace_sysctl_lock);
ftrace_disabled = 1;
ftrace_enabled = 0;
-
clear_ftrace_function();
- mutex_unlock(&ftrace_sysctl_lock);
-
- /* Try to totally disable ftrace */
- ftrace_force_shutdown();
}
/**
@@ -1803,10 +1563,11 @@ int register_ftrace_function(struct ftrace_ops *ops)
return -1;
mutex_lock(&ftrace_sysctl_lock);
+
ret = __register_ftrace_function(ops);
- ftrace_startup();
- mutex_unlock(&ftrace_sysctl_lock);
+ ftrace_startup(0);
+ mutex_unlock(&ftrace_sysctl_lock);
return ret;
}
@@ -1822,7 +1583,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
mutex_lock(&ftrace_sysctl_lock);
ret = __unregister_ftrace_function(ops);
- ftrace_shutdown();
+ ftrace_shutdown(0);
mutex_unlock(&ftrace_sysctl_lock);
return ret;
@@ -1870,3 +1631,143 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
mutex_unlock(&ftrace_sysctl_lock);
return ret;
}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+static atomic_t ftrace_graph_active;
+
+/* The callbacks that hook a function */
+trace_func_graph_ret_t ftrace_graph_return =
+ (trace_func_graph_ret_t)ftrace_stub;
+trace_func_graph_ent_t ftrace_graph_entry =
+ (trace_func_graph_ent_t)ftrace_stub;
+
+/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
+static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
+{
+ int i;
+ int ret = 0;
+ unsigned long flags;
+ int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
+ struct task_struct *g, *t;
+
+ for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
+ ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH
+ * sizeof(struct ftrace_ret_stack),
+ GFP_KERNEL);
+ if (!ret_stack_list[i]) {
+ start = 0;
+ end = i;
+ ret = -ENOMEM;
+ goto free;
+ }
+ }
+
+ read_lock_irqsave(&tasklist_lock, flags);
+ do_each_thread(g, t) {
+ if (start == end) {
+ ret = -EAGAIN;
+ goto unlock;
+ }
+
+ if (t->ret_stack == NULL) {
+ t->curr_ret_stack = -1;
+ /* Make sure IRQs see the -1 first: */
+ barrier();
+ t->ret_stack = ret_stack_list[start++];
+ atomic_set(&t->trace_overrun, 0);
+ }
+ } while_each_thread(g, t);
+
+unlock:
+ read_unlock_irqrestore(&tasklist_lock, flags);
+free:
+ for (i = start; i < end; i++)
+ kfree(ret_stack_list[i]);
+ return ret;
+}
+
+/* Allocate a return stack for each task */
+static int start_graph_tracing(void)
+{
+ struct ftrace_ret_stack **ret_stack_list;
+ int ret;
+
+ ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE *
+ sizeof(struct ftrace_ret_stack *),
+ GFP_KERNEL);
+
+ if (!ret_stack_list)
+ return -ENOMEM;
+
+ do {
+ ret = alloc_retstack_tasklist(ret_stack_list);
+ } while (ret == -EAGAIN);
+
+ kfree(ret_stack_list);
+ return ret;
+}
+
+int register_ftrace_graph(trace_func_graph_ret_t retfunc,
+ trace_func_graph_ent_t entryfunc)
+{
+ int ret = 0;
+
+ mutex_lock(&ftrace_sysctl_lock);
+
+ atomic_inc(&ftrace_graph_active);
+ ret = start_graph_tracing();
+ if (ret) {
+ atomic_dec(&ftrace_graph_active);
+ goto out;
+ }
+
+ ftrace_graph_return = retfunc;
+ ftrace_graph_entry = entryfunc;
+
+ ftrace_startup(FTRACE_START_FUNC_RET);
+
+out:
+ mutex_unlock(&ftrace_sysctl_lock);
+ return ret;
+}
+
+void unregister_ftrace_graph(void)
+{
+ mutex_lock(&ftrace_sysctl_lock);
+
+ atomic_dec(&ftrace_graph_active);
+ ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
+ ftrace_graph_entry = (trace_func_graph_ent_t)ftrace_stub;
+ ftrace_shutdown(FTRACE_STOP_FUNC_RET);
+
+ mutex_unlock(&ftrace_sysctl_lock);
+}
+
+/* Allocate a return stack for newly created task */
+void ftrace_graph_init_task(struct task_struct *t)
+{
+ if (atomic_read(&ftrace_graph_active)) {
+ t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
+ * sizeof(struct ftrace_ret_stack),
+ GFP_KERNEL);
+ if (!t->ret_stack)
+ return;
+ t->curr_ret_stack = -1;
+ atomic_set(&t->trace_overrun, 0);
+ } else
+ t->ret_stack = NULL;
+}
+
+void ftrace_graph_exit_task(struct task_struct *t)
+{
+ struct ftrace_ret_stack *ret_stack = t->ret_stack;
+
+ t->ret_stack = NULL;
+ /* NULL must become visible to IRQs before we free it: */
+ barrier();
+
+ kfree(ret_stack);
+}
+#endif
+
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 94af1fe56bb..e206951603c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -16,14 +16,100 @@
#include <linux/list.h>
#include <linux/fs.h>
+#include "trace.h"
+
+/*
+ * A fast way to enable or disable all ring buffers is to
+ * call tracing_on or tracing_off. Turning off the ring buffers
+ * prevents all ring buffers from being recorded to.
+ * Turning this switch on, makes it OK to write to the
+ * ring buffer, if the ring buffer is enabled itself.
+ *
+ * There's three layers that must be on in order to write
+ * to the ring buffer.
+ *
+ * 1) This global flag must be set.
+ * 2) The ring buffer must be enabled for recording.
+ * 3) The per cpu buffer must be enabled for recording.
+ *
+ * In case of an anomaly, this global flag has a bit set that
+ * will permantly disable all ring buffers.
+ */
+
+/*
+ * Global flag to disable all recording to ring buffers
+ * This has two bits: ON, DISABLED
+ *
+ * ON DISABLED
+ * ---- ----------
+ * 0 0 : ring buffers are off
+ * 1 0 : ring buffers are on
+ * X 1 : ring buffers are permanently disabled
+ */
+
+enum {
+ RB_BUFFERS_ON_BIT = 0,
+ RB_BUFFERS_DISABLED_BIT = 1,
+};
+
+enum {
+ RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT,
+ RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT,
+};
+
+static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
+
+/**
+ * tracing_on - enable all tracing buffers
+ *
+ * This function enables all tracing buffers that may have been
+ * disabled with tracing_off.
+ */
+void tracing_on(void)
+{
+ set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
+}
+
+/**
+ * tracing_off - turn off all tracing buffers
+ *
+ * This function stops all tracing buffers from recording data.
+ * It does not disable any overhead the tracers themselves may
+ * be causing. This function simply causes all recording to
+ * the ring buffers to fail.
+ */
+void tracing_off(void)
+{
+ clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
+}
+
+/**
+ * tracing_off_permanent - permanently disable ring buffers
+ *
+ * This function, once called, will disable all ring buffers
+ * permanenty.
+ */
+void tracing_off_permanent(void)
+{
+ set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
+}
+
+#include "trace.h"
+
/* Up this if you want to test the TIME_EXTENTS and normalization */
#define DEBUG_SHIFT 0
/* FIXME!!! */
u64 ring_buffer_time_stamp(int cpu)
{
+ u64 time;
+
+ preempt_disable_notrace();
/* shift to debug/test normalization and TIME_EXTENTS */
- return sched_clock() << DEBUG_SHIFT;
+ time = sched_clock() << DEBUG_SHIFT;
+ preempt_enable_notrace();
+
+ return time;
}
void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
@@ -130,7 +216,7 @@ struct buffer_page {
static inline void free_buffer_page(struct buffer_page *bpage)
{
if (bpage->page)
- __free_page(bpage->page);
+ free_page((unsigned long)bpage->page);
kfree(bpage);
}
@@ -152,7 +238,8 @@ static inline int test_time_stamp(u64 delta)
struct ring_buffer_per_cpu {
int cpu;
struct ring_buffer *buffer;
- spinlock_t lock;
+ spinlock_t reader_lock; /* serialize readers */
+ raw_spinlock_t lock;
struct lock_class_key lock_key;
struct list_head pages;
struct buffer_page *head_page; /* read from head */
@@ -186,32 +273,16 @@ struct ring_buffer_iter {
u64 read_stamp;
};
+/* buffer may be either ring_buffer or ring_buffer_per_cpu */
#define RB_WARN_ON(buffer, cond) \
- do { \
- if (unlikely(cond)) { \
- atomic_inc(&buffer->record_disabled); \
- WARN_ON(1); \
- } \
- } while (0)
-
-#define RB_WARN_ON_RET(buffer, cond) \
- do { \
- if (unlikely(cond)) { \
+ ({ \
+ int _____ret = unlikely(cond); \
+ if (_____ret) { \
atomic_inc(&buffer->record_disabled); \
WARN_ON(1); \
- return -1; \
} \
- } while (0)
-
-#define RB_WARN_ON_ONCE(buffer, cond) \
- do { \
- static int once; \
- if (unlikely(cond) && !once) { \
- once++; \
- atomic_inc(&buffer->record_disabled); \
- WARN_ON(1); \
- } \
- } while (0)
+ _____ret; \
+ })
/**
* check_pages - integrity check of buffer pages
@@ -225,14 +296,18 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
struct list_head *head = &cpu_buffer->pages;
struct buffer_page *page, *tmp;
- RB_WARN_ON_RET(cpu_buffer, head->next->prev != head);
- RB_WARN_ON_RET(cpu_buffer, head->prev->next != head);
+ if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
+ return -1;
+ if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
+ return -1;
list_for_each_entry_safe(page, tmp, head, list) {
- RB_WARN_ON_RET(cpu_buffer,
- page->list.next->prev != &page->list);
- RB_WARN_ON_RET(cpu_buffer,
- page->list.prev->next != &page->list);
+ if (RB_WARN_ON(cpu_buffer,
+ page->list.next->prev != &page->list))
+ return -1;
+ if (RB_WARN_ON(cpu_buffer,
+ page->list.prev->next != &page->list))
+ return -1;
}
return 0;
@@ -289,7 +364,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
cpu_buffer->cpu = cpu;
cpu_buffer->buffer = buffer;
- spin_lock_init(&cpu_buffer->lock);
+ spin_lock_init(&cpu_buffer->reader_lock);
+ cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
INIT_LIST_HEAD(&cpu_buffer->pages);
page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
@@ -438,13 +514,15 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
synchronize_sched();
for (i = 0; i < nr_pages; i++) {
- BUG_ON(list_empty(&cpu_buffer->pages));
+ if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
+ return;
p = cpu_buffer->pages.next;
page = list_entry(p, struct buffer_page, list);
list_del_init(&page->list);
free_buffer_page(page);
}
- BUG_ON(list_empty(&cpu_buffer->pages));
+ if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
+ return;
rb_reset_cpu(cpu_buffer);
@@ -466,7 +544,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
synchronize_sched();
for (i = 0; i < nr_pages; i++) {
- BUG_ON(list_empty(pages));
+ if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
+ return;
p = pages->next;
page = list_entry(p, struct buffer_page, list);
list_del_init(&page->list);
@@ -503,6 +582,12 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
LIST_HEAD(pages);
int i, cpu;
+ /*
+ * Always succeed at resizing a non-existent buffer:
+ */
+ if (!buffer)
+ return size;
+
size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
size *= BUF_PAGE_SIZE;
buffer_size = buffer->pages * BUF_PAGE_SIZE;
@@ -521,7 +606,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
if (size < buffer_size) {
/* easy case, just free pages */
- BUG_ON(nr_pages >= buffer->pages);
+ if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) {
+ mutex_unlock(&buffer->mutex);
+ return -1;
+ }
rm_pages = buffer->pages - nr_pages;
@@ -540,7 +628,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
* add these pages to the cpu_buffers. Otherwise we just free
* them all and return -ENOMEM;
*/
- BUG_ON(nr_pages <= buffer->pages);
+ if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) {
+ mutex_unlock(&buffer->mutex);
+ return -1;
+ }
+
new_pages = nr_pages - buffer->pages;
for_each_buffer_cpu(buffer, cpu) {
@@ -563,7 +655,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
rb_insert_pages(cpu_buffer, &pages, new_pages);
}
- BUG_ON(!list_empty(&pages));
+ if (RB_WARN_ON(buffer, !list_empty(&pages))) {
+ mutex_unlock(&buffer->mutex);
+ return -1;
+ }
out:
buffer->pages = nr_pages;
@@ -576,6 +671,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
list_del_init(&page->list);
free_buffer_page(page);
}
+ mutex_unlock(&buffer->mutex);
return -ENOMEM;
}
@@ -651,7 +747,8 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
head += rb_event_length(event)) {
event = __rb_page_index(cpu_buffer->head_page, head);
- BUG_ON(rb_null_event(event));
+ if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
+ return;
/* Only count data entries */
if (event->type != RINGBUF_TYPE_DATA)
continue;
@@ -704,8 +801,9 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
addr &= PAGE_MASK;
while (cpu_buffer->commit_page->page != (void *)addr) {
- RB_WARN_ON(cpu_buffer,
- cpu_buffer->commit_page == cpu_buffer->tail_page);
+ if (RB_WARN_ON(cpu_buffer,
+ cpu_buffer->commit_page == cpu_buffer->tail_page))
+ return;
cpu_buffer->commit_page->commit =
cpu_buffer->commit_page->write;
rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
@@ -852,7 +950,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
if (write > BUF_PAGE_SIZE) {
struct buffer_page *next_page = tail_page;
- spin_lock_irqsave(&cpu_buffer->lock, flags);
+ local_irq_save(flags);
+ __raw_spin_lock(&cpu_buffer->lock);
rb_inc_page(cpu_buffer, &next_page);
@@ -860,7 +959,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
reader_page = cpu_buffer->reader_page;
/* we grabbed the lock before incrementing */
- RB_WARN_ON(cpu_buffer, next_page == reader_page);
+ if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
+ goto out_unlock;
/*
* If for some reason, we had an interrupt storm that made
@@ -928,7 +1028,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
rb_set_commit_to_write(cpu_buffer);
}
- spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+ __raw_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(flags);
/* fail and let the caller try again */
return ERR_PTR(-EAGAIN);
@@ -936,7 +1037,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
/* We reserved something on the buffer */
- BUG_ON(write > BUF_PAGE_SIZE);
+ if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
+ return NULL;
event = __rb_page_index(tail_page, tail);
rb_update_event(event, type, length);
@@ -951,7 +1053,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
return event;
out_unlock:
- spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+ __raw_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(flags);
return NULL;
}
@@ -966,7 +1069,9 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
if (unlikely(*delta > (1ULL << 59) && !once++)) {
printk(KERN_WARNING "Delta way too big! %llu"
" ts=%llu write stamp = %llu\n",
- *delta, *ts, cpu_buffer->write_stamp);
+ (unsigned long long)*delta,
+ (unsigned long long)*ts,
+ (unsigned long long)cpu_buffer->write_stamp);
WARN_ON(1);
}
@@ -1020,8 +1125,21 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event;
u64 ts, delta;
int commit = 0;
+ int nr_loops = 0;
again:
+ /*
+ * We allow for interrupts to reenter here and do a trace.
+ * If one does, it will cause this original code to loop
+ * back here. Even with heavy interrupts happening, this
+ * should only happen a few times in a row. If this happens
+ * 1000 times in a row, there must be either an interrupt
+ * storm or we have something buggy.
+ * Bail!
+ */
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
+ return NULL;
+
ts = ring_buffer_time_stamp(cpu_buffer->cpu);
/*
@@ -1043,7 +1161,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
/* Did the write stamp get updated already? */
if (unlikely(ts < cpu_buffer->write_stamp))
- goto again;
+ delta = 0;
if (test_time_stamp(delta)) {
@@ -1116,12 +1234,14 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
struct ring_buffer_event *event;
int cpu, resched;
+ if (ring_buffer_flags != RB_BUFFERS_ON)
+ return NULL;
+
if (atomic_read(&buffer->record_disabled))
return NULL;
/* If we are tracing schedule, we don't want to recurse */
- resched = need_resched();
- preempt_disable_notrace();
+ resched = ftrace_preempt_disable();
cpu = raw_smp_processor_id();
@@ -1152,10 +1272,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
return event;
out:
- if (resched)
- preempt_enable_notrace();
- else
- preempt_enable_notrace();
+ ftrace_preempt_enable(resched);
return NULL;
}
@@ -1197,12 +1314,9 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
/*
* Only the last preempt count needs to restore preemption.
*/
- if (preempt_count() == 1) {
- if (per_cpu(rb_need_resched, cpu))
- preempt_enable_no_resched_notrace();
- else
- preempt_enable_notrace();
- } else
+ if (preempt_count() == 1)
+ ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
+ else
preempt_enable_no_resched_notrace();
return 0;
@@ -1232,11 +1346,13 @@ int ring_buffer_write(struct ring_buffer *buffer,
int ret = -EBUSY;
int cpu, resched;
+ if (ring_buffer_flags != RB_BUFFERS_ON)
+ return -EBUSY;
+
if (atomic_read(&buffer->record_disabled))
return -EBUSY;
- resched = need_resched();
- preempt_disable_notrace();
+ resched = ftrace_preempt_disable();
cpu = raw_smp_processor_id();
@@ -1262,10 +1378,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
ret = 0;
out:
- if (resched)
- preempt_enable_no_resched_notrace();
- else
- preempt_enable_notrace();
+ ftrace_preempt_enable(resched);
return ret;
}
@@ -1424,14 +1537,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
return overruns;
}
-/**
- * ring_buffer_iter_reset - reset an iterator
- * @iter: The iterator to reset
- *
- * Resets the iterator, so that it will start from the beginning
- * again.
- */
-void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
+static void rb_iter_reset(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
@@ -1450,6 +1556,23 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
}
/**
+ * ring_buffer_iter_reset - reset an iterator
+ * @iter: The iterator to reset
+ *
+ * Resets the iterator, so that it will start from the beginning
+ * again.
+ */
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ rb_iter_reset(iter);
+ spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+}
+
+/**
* ring_buffer_iter_empty - check if an iterator has no more to read
* @iter: The iterator to check
*/
@@ -1530,10 +1653,23 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = NULL;
unsigned long flags;
+ int nr_loops = 0;
- spin_lock_irqsave(&cpu_buffer->lock, flags);
+ local_irq_save(flags);
+ __raw_spin_lock(&cpu_buffer->lock);
again:
+ /*
+ * This should normally only loop twice. But because the
+ * start of the reader inserts an empty page, it causes
+ * a case where we will loop three times. There should be no
+ * reason to loop four times (that I know of).
+ */
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) {
+ reader = NULL;
+ goto out;
+ }
+
reader = cpu_buffer->reader_page;
/* If there's more to read, return this page */
@@ -1541,8 +1677,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
goto out;
/* Never should we have an index greater than the size */
- RB_WARN_ON(cpu_buffer,
- cpu_buffer->reader_page->read > rb_page_size(reader));
+ if (RB_WARN_ON(cpu_buffer,
+ cpu_buffer->reader_page->read > rb_page_size(reader)))
+ goto out;
/* check if we caught up to the tail */
reader = NULL;
@@ -1581,7 +1718,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
goto again;
out:
- spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+ __raw_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(flags);
return reader;
}
@@ -1595,7 +1733,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
reader = rb_get_reader_page(cpu_buffer);
/* This function should not be called when buffer is empty */
- BUG_ON(!reader);
+ if (RB_WARN_ON(cpu_buffer, !reader))
+ return;
event = rb_reader_event(cpu_buffer);
@@ -1622,7 +1761,9 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
* Check if we are at the end of the buffer.
*/
if (iter->head >= rb_page_size(iter->head_page)) {
- BUG_ON(iter->head_page == cpu_buffer->commit_page);
+ if (RB_WARN_ON(buffer,
+ iter->head_page == cpu_buffer->commit_page))
+ return;
rb_inc_iter(iter);
return;
}
@@ -1635,8 +1776,10 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
* This should not be called to advance the header if we are
* at the tail of the buffer.
*/
- BUG_ON((iter->head_page == cpu_buffer->commit_page) &&
- (iter->head + length > rb_commit_index(cpu_buffer)));
+ if (RB_WARN_ON(cpu_buffer,
+ (iter->head_page == cpu_buffer->commit_page) &&
+ (iter->head + length > rb_commit_index(cpu_buffer))))
+ return;
rb_update_iter_read_stamp(iter, event);
@@ -1648,21 +1791,13 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
rb_advance_iter(iter);
}
-/**
- * ring_buffer_peek - peek at the next event to be read
- * @buffer: The ring buffer to read
- * @cpu: The cpu to peak at
- * @ts: The timestamp counter of this event.
- *
- * This will return the event that will be read next, but does
- * not consume the data.
- */
-struct ring_buffer_event *
-ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+static struct ring_buffer_event *
+rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
struct buffer_page *reader;
+ int nr_loops = 0;
if (!cpu_isset(cpu, buffer->cpumask))
return NULL;
@@ -1670,6 +1805,17 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
cpu_buffer = buffer->buffers[cpu];
again:
+ /*
+ * We repeat when a timestamp is encountered. It is possible
+ * to get multiple timestamps from an interrupt entering just
+ * as one timestamp is about to be written. The max times
+ * that this can happen is the number of nested interrupts we
+ * can have. Nesting 10 deep of interrupts is clearly
+ * an anomaly.
+ */
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+ return NULL;
+
reader = rb_get_reader_page(cpu_buffer);
if (!reader)
return NULL;
@@ -1706,20 +1852,13 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
return NULL;
}
-/**
- * ring_buffer_iter_peek - peek at the next event to be read
- * @iter: The ring buffer iterator
- * @ts: The timestamp counter of this event.
- *
- * This will return the event that will be read next, but does
- * not increment the iterator.
- */
-struct ring_buffer_event *
-ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+static struct ring_buffer_event *
+rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
{
struct ring_buffer *buffer;
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
+ int nr_loops = 0;
if (ring_buffer_iter_empty(iter))
return NULL;
@@ -1728,6 +1867,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
buffer = cpu_buffer->buffer;
again:
+ /*
+ * We repeat when a timestamp is encountered. It is possible
+ * to get multiple timestamps from an interrupt entering just
+ * as one timestamp is about to be written. The max times
+ * that this can happen is the number of nested interrupts we
+ * can have. Nesting 10 deep of interrupts is clearly
+ * an anomaly.
+ */
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+ return NULL;
+
if (rb_per_cpu_empty(cpu_buffer))
return NULL;
@@ -1763,6 +1913,51 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
}
/**
+ * ring_buffer_peek - peek at the next event to be read
+ * @buffer: The ring buffer to read
+ * @cpu: The cpu to peak at
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not consume the data.
+ */
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ struct ring_buffer_event *event;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ event = rb_buffer_peek(buffer, cpu, ts);
+ spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ return event;
+}
+
+/**
+ * ring_buffer_iter_peek - peek at the next event to be read
+ * @iter: The ring buffer iterator
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ event = rb_iter_peek(iter, ts);
+ spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ return event;
+}
+
+/**
* ring_buffer_consume - return an event and consume it
* @buffer: The ring buffer to get the next event from
*
@@ -1773,19 +1968,24 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
struct ring_buffer_event *
ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
{
- struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_event *event;
+ unsigned long flags;
if (!cpu_isset(cpu, buffer->cpumask))
return NULL;
- event = ring_buffer_peek(buffer, cpu, ts);
+ spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ event = rb_buffer_peek(buffer, cpu, ts);
if (!event)
- return NULL;
+ goto out;
- cpu_buffer = buffer->buffers[cpu];
rb_advance_reader(cpu_buffer);
+ out:
+ spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
return event;
}
@@ -1822,9 +2022,11 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
atomic_inc(&cpu_buffer->record_disabled);
synchronize_sched();
- spin_lock_irqsave(&cpu_buffer->lock, flags);
- ring_buffer_iter_reset(iter);
- spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+ spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ __raw_spin_lock(&cpu_buffer->lock);
+ rb_iter_reset(iter);
+ __raw_spin_unlock(&cpu_buffer->lock);
+ spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
return iter;
}
@@ -1856,12 +2058,17 @@ struct ring_buffer_event *
ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
{
struct ring_buffer_event *event;
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ unsigned long flags;
- event = ring_buffer_iter_peek(iter, ts);
+ spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ event = rb_iter_peek(iter, ts);
if (!event)
- return NULL;
+ goto out;
rb_advance_iter(iter);
+ out:
+ spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
return event;
}
@@ -1910,11 +2117,15 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
if (!cpu_isset(cpu, buffer->cpumask))
return;
- spin_lock_irqsave(&cpu_buffer->lock, flags);
+ spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ __raw_spin_lock(&cpu_buffer->lock);
rb_reset_cpu(cpu_buffer);
- spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+ __raw_spin_unlock(&cpu_buffer->lock);
+
+ spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
}
/**
@@ -2012,3 +2223,73 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
return 0;
}
+static ssize_t
+rb_simple_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ long *p = filp->private_data;
+ char buf[64];
+ int r;
+
+ if (test_bit(RB_BUFFERS_DISABLED_BIT, p))
+ r = sprintf(buf, "permanently disabled\n");
+ else
+ r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p));
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+rb_simple_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ long *p = filp->private_data;
+ char buf[64];
+ long val;
+ int ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ if (val)
+ set_bit(RB_BUFFERS_ON_BIT, p);
+ else
+ clear_bit(RB_BUFFERS_ON_BIT, p);
+
+ (*ppos)++;
+
+ return cnt;
+}
+
+static struct file_operations rb_simple_fops = {
+ .open = tracing_open_generic,
+ .read = rb_simple_read,
+ .write = rb_simple_write,
+};
+
+
+static __init int rb_init_debugfs(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ d_tracer = tracing_init_dentry();
+
+ entry = debugfs_create_file("tracing_on", 0644, d_tracer,
+ &ring_buffer_flags, &rb_simple_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'tracing_on' entry\n");
+
+ return 0;
+}
+
+fs_initcall(rb_init_debugfs);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d345d649d07..91887a280ab 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -30,10 +30,12 @@
#include <linux/gfp.h>
#include <linux/fs.h>
#include <linux/kprobes.h>
+#include <linux/seq_file.h>
#include <linux/writeback.h>
#include <linux/stacktrace.h>
#include <linux/ring_buffer.h>
+#include <linux/irqflags.h>
#include "trace.h"
@@ -42,6 +44,29 @@
unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX;
unsigned long __read_mostly tracing_thresh;
+/* For tracers that don't implement custom flags */
+static struct tracer_opt dummy_tracer_opt[] = {
+ { }
+};
+
+static struct tracer_flags dummy_tracer_flags = {
+ .val = 0,
+ .opts = dummy_tracer_opt
+};
+
+static int dummy_set_flag(u32 old_flags, u32 bit, int set)
+{
+ return 0;
+}
+
+/*
+ * Kill all tracing for good (never come back).
+ * It is initialized to 1 but will turn to zero if the initialization
+ * of the tracer is successful. But that is the only place that sets
+ * this back to zero.
+ */
+int tracing_disabled = 1;
+
static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
static inline void ftrace_disable_cpu(void)
@@ -61,7 +86,36 @@ static cpumask_t __read_mostly tracing_buffer_mask;
#define for_each_tracing_cpu(cpu) \
for_each_cpu_mask(cpu, tracing_buffer_mask)
-static int tracing_disabled = 1;
+/*
+ * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
+ *
+ * If there is an oops (or kernel panic) and the ftrace_dump_on_oops
+ * is set, then ftrace_dump is called. This will output the contents
+ * of the ftrace buffers to the console. This is very useful for
+ * capturing traces that lead to crashes and outputing it to a
+ * serial console.
+ *
+ * It is default off, but you can enable it with either specifying
+ * "ftrace_dump_on_oops" in the kernel command line, or setting
+ * /proc/sys/kernel/ftrace_dump_on_oops to true.
+ */
+int ftrace_dump_on_oops;
+
+static int tracing_set_tracer(char *buf);
+
+static int __init set_ftrace(char *str)
+{
+ tracing_set_tracer(str);
+ return 1;
+}
+__setup("ftrace", set_ftrace);
+
+static int __init set_ftrace_dump_on_oops(char *str)
+{
+ ftrace_dump_on_oops = 1;
+ return 1;
+}
+__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
long
ns2usecs(cycle_t nsec)
@@ -111,6 +165,19 @@ static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
/* tracer_enabled is used to toggle activation of a tracer */
static int tracer_enabled = 1;
+/**
+ * tracing_is_enabled - return tracer_enabled status
+ *
+ * This function is used by other tracers to know the status
+ * of the tracer_enabled flag. Tracers may use this function
+ * to know if it should enable their features when starting
+ * up. See irqsoff tracer for an example (start_irqsoff_tracer).
+ */
+int tracing_is_enabled(void)
+{
+ return tracer_enabled;
+}
+
/* function tracing enabled */
int ftrace_function_enabled;
@@ -152,8 +219,9 @@ static DEFINE_MUTEX(trace_types_lock);
/* trace_wait is a waitqueue for tasks blocked on trace_poll */
static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
-/* trace_flags holds iter_ctrl options */
-unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
+/* trace_flags holds trace_options default values */
+unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
+ TRACE_ITER_ANNOTATE;
/**
* trace_wake_up - wake up tasks waiting for trace input
@@ -192,13 +260,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
return nsecs / 1000;
}
-/*
- * TRACE_ITER_SYM_MASK masks the options in trace_flags that
- * control the output of kernel symbols.
- */
-#define TRACE_ITER_SYM_MASK \
- (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
-
/* These must match the bit postions in trace_iterator_flags */
static const char *trace_options[] = {
"print-parent",
@@ -212,6 +273,11 @@ static const char *trace_options[] = {
"stacktrace",
"sched-tree",
"ftrace_printk",
+ "ftrace_preempt",
+ "branch",
+ "annotate",
+ "userstacktrace",
+ "sym-userobj",
NULL
};
@@ -358,6 +424,28 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
return trace_seq_putmem(s, hex, j);
}
+static int
+trace_seq_path(struct trace_seq *s, struct path *path)
+{
+ unsigned char *p;
+
+ if (s->len >= (PAGE_SIZE - 1))
+ return 0;
+ p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
+ if (!IS_ERR(p)) {
+ p = mangle_path(s->buffer + s->len, p, "\n");
+ if (p) {
+ s->len = p - s->buffer;
+ return 1;
+ }
+ } else {
+ s->buffer[s->len++] = '?';
+ return 1;
+ }
+
+ return 0;
+}
+
static void
trace_seq_reset(struct trace_seq *s)
{
@@ -469,7 +557,15 @@ int register_tracer(struct tracer *type)
return -1;
}
+ /*
+ * When this gets called we hold the BKL which means that
+ * preemption is disabled. Various trace selftests however
+ * need to disable and enable preemption for successful tests.
+ * So we drop the BKL here and grab it after the tests again.
+ */
+ unlock_kernel();
mutex_lock(&trace_types_lock);
+
for (t = trace_types; t; t = t->next) {
if (strcmp(type->name, t->name) == 0) {
/* already found */
@@ -480,11 +576,18 @@ int register_tracer(struct tracer *type)
}
}
+ if (!type->set_flag)
+ type->set_flag = &dummy_set_flag;
+ if (!type->flags)
+ type->flags = &dummy_tracer_flags;
+ else
+ if (!type->flags->opts)
+ type->flags->opts = dummy_tracer_opt;
+
#ifdef CONFIG_FTRACE_STARTUP_TEST
if (type->selftest) {
struct tracer *saved_tracer = current_trace;
struct trace_array *tr = &global_trace;
- int saved_ctrl = tr->ctrl;
int i;
/*
* Run a selftest on this tracer.
@@ -493,25 +596,23 @@ int register_tracer(struct tracer *type)
* internal tracing to verify that everything is in order.
* If we fail, we do not register this tracer.
*/
- for_each_tracing_cpu(i) {
+ for_each_tracing_cpu(i)
tracing_reset(tr, i);
- }
+
current_trace = type;
- tr->ctrl = 0;
/* the test is responsible for initializing and enabling */
pr_info("Testing tracer %s: ", type->name);
ret = type->selftest(type, tr);
/* the test is responsible for resetting too */
current_trace = saved_tracer;
- tr->ctrl = saved_ctrl;
if (ret) {
printk(KERN_CONT "FAILED!\n");
goto out;
}
/* Only reset on passing, to avoid touching corrupted buffers */
- for_each_tracing_cpu(i) {
+ for_each_tracing_cpu(i)
tracing_reset(tr, i);
- }
+
printk(KERN_CONT "PASSED\n");
}
#endif
@@ -524,6 +625,7 @@ int register_tracer(struct tracer *type)
out:
mutex_unlock(&trace_types_lock);
+ lock_kernel();
return ret;
}
@@ -580,6 +682,91 @@ static void trace_init_cmdlines(void)
cmdline_idx = 0;
}
+static int trace_stop_count;
+static DEFINE_SPINLOCK(tracing_start_lock);
+
+/**
+ * ftrace_off_permanent - disable all ftrace code permanently
+ *
+ * This should only be called when a serious anomally has
+ * been detected. This will turn off the function tracing,
+ * ring buffers, and other tracing utilites. It takes no
+ * locks and can be called from any context.
+ */
+void ftrace_off_permanent(void)
+{
+ tracing_disabled = 1;
+ ftrace_stop();
+ tracing_off_permanent();
+}
+
+/**
+ * tracing_start - quick start of the tracer
+ *
+ * If tracing is enabled but was stopped by tracing_stop,
+ * this will start the tracer back up.
+ */
+void tracing_start(void)
+{
+ struct ring_buffer *buffer;
+ unsigned long flags;
+
+ if (tracing_disabled)
+ return;
+
+ spin_lock_irqsave(&tracing_start_lock, flags);
+ if (--trace_stop_count)
+ goto out;
+
+ if (trace_stop_count < 0) {
+ /* Someone screwed up their debugging */
+ WARN_ON_ONCE(1);
+ trace_stop_count = 0;
+ goto out;
+ }
+
+
+ buffer = global_trace.buffer;
+ if (buffer)
+ ring_buffer_record_enable(buffer);
+
+ buffer = max_tr.buffer;
+ if (buffer)
+ ring_buffer_record_enable(buffer);
+
+ ftrace_start();
+ out:
+ spin_unlock_irqrestore(&tracing_start_lock, flags);
+}
+
+/**
+ * tracing_stop - quick stop of the tracer
+ *
+ * Light weight way to stop tracing. Use in conjunction with
+ * tracing_start.
+ */
+void tracing_stop(void)
+{
+ struct ring_buffer *buffer;
+ unsigned long flags;
+
+ ftrace_stop();
+ spin_lock_irqsave(&tracing_start_lock, flags);
+ if (trace_stop_count++)
+ goto out;
+
+ buffer = global_trace.buffer;
+ if (buffer)
+ ring_buffer_record_disable(buffer);
+
+ buffer = max_tr.buffer;
+ if (buffer)
+ ring_buffer_record_disable(buffer);
+
+ out:
+ spin_unlock_irqrestore(&tracing_start_lock, flags);
+}
+
void trace_stop_cmdline_recording(void);
static void trace_save_cmdline(struct task_struct *tsk)
@@ -617,7 +804,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
spin_unlock(&trace_cmdline_lock);
}
-static char *trace_find_cmdline(int pid)
+char *trace_find_cmdline(int pid)
{
char *cmdline = "<...>";
unsigned map;
@@ -654,8 +841,13 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
entry->preempt_count = pc & 0xff;
entry->pid = (tsk) ? tsk->pid : 0;
+ entry->tgid = (tsk) ? tsk->tgid : 0;
entry->flags =
+#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
+#else
+ TRACE_FLAG_IRQS_NOSUPPORT |
+#endif
((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
@@ -686,6 +878,56 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
}
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static void __trace_graph_entry(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ struct ftrace_graph_ent *trace,
+ unsigned long flags,
+ int pc)
+{
+ struct ring_buffer_event *event;
+ struct ftrace_graph_ent_entry *entry;
+ unsigned long irq_flags;
+
+ if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+ return;
+
+ event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, flags, pc);
+ entry->ent.type = TRACE_GRAPH_ENT;
+ entry->graph_ent = *trace;
+ ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+}
+
+static void __trace_graph_return(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ struct ftrace_graph_ret *trace,
+ unsigned long flags,
+ int pc)
+{
+ struct ring_buffer_event *event;
+ struct ftrace_graph_ret_entry *entry;
+ unsigned long irq_flags;
+
+ if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+ return;
+
+ event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, flags, pc);
+ entry->ent.type = TRACE_GRAPH_RET;
+ entry->ret = *trace;
+ ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+}
+#endif
+
void
ftrace(struct trace_array *tr, struct trace_array_cpu *data,
unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -700,6 +942,7 @@ static void ftrace_trace_stack(struct trace_array *tr,
unsigned long flags,
int skip, int pc)
{
+#ifdef CONFIG_STACKTRACE
struct ring_buffer_event *event;
struct stack_entry *entry;
struct stack_trace trace;
@@ -725,6 +968,7 @@ static void ftrace_trace_stack(struct trace_array *tr,
save_stack_trace(&trace);
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+#endif
}
void __trace_stack(struct trace_array *tr,
@@ -735,6 +979,46 @@ void __trace_stack(struct trace_array *tr,
ftrace_trace_stack(tr, data, flags, skip, preempt_count());
}
+static void ftrace_trace_userstack(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ unsigned long flags, int pc)
+{
+#ifdef CONFIG_STACKTRACE
+ struct ring_buffer_event *event;
+ struct userstack_entry *entry;
+ struct stack_trace trace;
+ unsigned long irq_flags;
+
+ if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
+ return;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, flags, pc);
+ entry->ent.type = TRACE_USER_STACK;
+
+ memset(&entry->caller, 0, sizeof(entry->caller));
+
+ trace.nr_entries = 0;
+ trace.max_entries = FTRACE_STACK_ENTRIES;
+ trace.skip = 0;
+ trace.entries = entry->caller;
+
+ save_stack_trace_user(&trace);
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+#endif
+}
+
+void __trace_userstack(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ unsigned long flags)
+{
+ ftrace_trace_userstack(tr, data, flags, preempt_count());
+}
+
static void
ftrace_trace_special(void *__tr, void *__data,
unsigned long arg1, unsigned long arg2, unsigned long arg3,
@@ -758,6 +1042,7 @@ ftrace_trace_special(void *__tr, void *__data,
entry->arg3 = arg3;
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
ftrace_trace_stack(tr, data, irq_flags, 4, pc);
+ ftrace_trace_userstack(tr, data, irq_flags, pc);
trace_wake_up();
}
@@ -796,6 +1081,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry->next_cpu = task_cpu(next);
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
ftrace_trace_stack(tr, data, flags, 5, pc);
+ ftrace_trace_userstack(tr, data, flags, pc);
}
void
@@ -825,6 +1111,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry->next_cpu = task_cpu(wakee);
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
ftrace_trace_stack(tr, data, flags, 6, pc);
+ ftrace_trace_userstack(tr, data, flags, pc);
trace_wake_up();
}
@@ -834,26 +1121,28 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
{
struct trace_array *tr = &global_trace;
struct trace_array_cpu *data;
+ unsigned long flags;
int cpu;
int pc;
- if (tracing_disabled || !tr->ctrl)
+ if (tracing_disabled)
return;
pc = preempt_count();
- preempt_disable_notrace();
+ local_irq_save(flags);
cpu = raw_smp_processor_id();
data = tr->data[cpu];
- if (likely(!atomic_read(&data->disabled)))
+ if (likely(atomic_inc_return(&data->disabled) == 1))
ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
- preempt_enable_notrace();
+ atomic_dec(&data->disabled);
+ local_irq_restore(flags);
}
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
static void
-function_trace_call(unsigned long ip, unsigned long parent_ip)
+function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
{
struct trace_array *tr = &global_trace;
struct trace_array_cpu *data;
@@ -865,12 +1154,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
if (unlikely(!ftrace_function_enabled))
return;
- if (skip_trace(ip))
- return;
-
pc = preempt_count();
- resched = need_resched();
- preempt_disable_notrace();
+ resched = ftrace_preempt_disable();
local_save_flags(flags);
cpu = raw_smp_processor_id();
data = tr->data[cpu];
@@ -880,11 +1165,83 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
trace_function(tr, data, ip, parent_ip, flags, pc);
atomic_dec(&data->disabled);
- if (resched)
- preempt_enable_no_resched_notrace();
- else
- preempt_enable_notrace();
+ ftrace_preempt_enable(resched);
+}
+
+static void
+function_trace_call(unsigned long ip, unsigned long parent_ip)
+{
+ struct trace_array *tr = &global_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+ int pc;
+
+ if (unlikely(!ftrace_function_enabled))
+ return;
+
+ /*
+ * Need to use raw, since this must be called before the
+ * recursive protection is performed.
+ */
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+
+ if (likely(disabled == 1)) {
+ pc = preempt_count();
+ trace_function(tr, data, ip, parent_ip, flags, pc);
+ }
+
+ atomic_dec(&data->disabled);
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+void trace_graph_entry(struct ftrace_graph_ent *trace)
+{
+ struct trace_array *tr = &global_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+ int pc;
+
+ raw_local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+ if (likely(disabled == 1)) {
+ pc = preempt_count();
+ __trace_graph_entry(tr, data, trace, flags, pc);
+ }
+ atomic_dec(&data->disabled);
+ raw_local_irq_restore(flags);
+}
+
+void trace_graph_return(struct ftrace_graph_ret *trace)
+{
+ struct trace_array *tr = &global_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+ int pc;
+
+ raw_local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+ if (likely(disabled == 1)) {
+ pc = preempt_count();
+ __trace_graph_return(tr, data, trace, flags, pc);
+ }
+ atomic_dec(&data->disabled);
+ raw_local_irq_restore(flags);
}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
static struct ftrace_ops trace_ops __read_mostly =
{
@@ -894,9 +1251,14 @@ static struct ftrace_ops trace_ops __read_mostly =
void tracing_start_function_trace(void)
{
ftrace_function_enabled = 0;
+
+ if (trace_flags & TRACE_ITER_PREEMPTONLY)
+ trace_ops.func = function_trace_call_preempt_only;
+ else
+ trace_ops.func = function_trace_call;
+
register_ftrace_function(&trace_ops);
- if (tracer_enabled)
- ftrace_function_enabled = 1;
+ ftrace_function_enabled = 1;
}
void tracing_stop_function_trace(void)
@@ -908,6 +1270,7 @@ void tracing_stop_function_trace(void)
enum trace_file_type {
TRACE_FILE_LAT_FMT = 1,
+ TRACE_FILE_ANNOTATE = 2,
};
static void trace_iterator_increment(struct trace_iterator *iter, int cpu)
@@ -1043,10 +1406,6 @@ static void *s_start(struct seq_file *m, loff_t *pos)
atomic_inc(&trace_record_cmdline_disabled);
- /* let the tracer grab locks here if needed */
- if (current_trace->start)
- current_trace->start(iter);
-
if (*pos != iter->pos) {
iter->ent = NULL;
iter->cpu = 0;
@@ -1073,28 +1432,24 @@ static void *s_start(struct seq_file *m, loff_t *pos)
static void s_stop(struct seq_file *m, void *p)
{
- struct trace_iterator *iter = m->private;
-
atomic_dec(&trace_record_cmdline_disabled);
-
- /* let the tracer release locks here if needed */
- if (current_trace && current_trace == iter->trace && iter->trace->stop)
- iter->trace->stop(iter);
-
mutex_unlock(&trace_types_lock);
}
-#define KRETPROBE_MSG "[unknown/kretprobe'd]"
-
#ifdef CONFIG_KRETPROBES
-static inline int kretprobed(unsigned long addr)
+static inline const char *kretprobed(const char *name)
{
- return addr == (unsigned long)kretprobe_trampoline;
+ static const char tramp_name[] = "kretprobe_trampoline";
+ int size = sizeof(tramp_name);
+
+ if (strncmp(tramp_name, name, size) == 0)
+ return "[unknown/kretprobe'd]";
+ return name;
}
#else
-static inline int kretprobed(unsigned long addr)
+static inline const char *kretprobed(const char *name)
{
- return 0;
+ return name;
}
#endif /* CONFIG_KRETPROBES */
@@ -1103,10 +1458,13 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
char str[KSYM_SYMBOL_LEN];
+ const char *name;
kallsyms_lookup(address, NULL, NULL, NULL, str);
- return trace_seq_printf(s, fmt, str);
+ name = kretprobed(str);
+
+ return trace_seq_printf(s, fmt, name);
#endif
return 1;
}
@@ -1117,9 +1475,12 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
{
#ifdef CONFIG_KALLSYMS
char str[KSYM_SYMBOL_LEN];
+ const char *name;
sprint_symbol(str, address);
- return trace_seq_printf(s, fmt, str);
+ name = kretprobed(str);
+
+ return trace_seq_printf(s, fmt, name);
#endif
return 1;
}
@@ -1130,7 +1491,7 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
# define IP_FMT "%016lx"
#endif
-static int
+int
seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
{
int ret;
@@ -1151,6 +1512,78 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
return ret;
}
+static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+ unsigned long ip, unsigned long sym_flags)
+{
+ struct file *file = NULL;
+ unsigned long vmstart = 0;
+ int ret = 1;
+
+ if (mm) {
+ const struct vm_area_struct *vma;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, ip);
+ if (vma) {
+ file = vma->vm_file;
+ vmstart = vma->vm_start;
+ }
+ if (file) {
+ ret = trace_seq_path(s, &file->f_path);
+ if (ret)
+ ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
+ }
+ up_read(&mm->mmap_sem);
+ }
+ if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
+ ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+ return ret;
+}
+
+static int
+seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
+ unsigned long sym_flags)
+{
+ struct mm_struct *mm = NULL;
+ int ret = 1;
+ unsigned int i;
+
+ if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
+ struct task_struct *task;
+ /*
+ * we do the lookup on the thread group leader,
+ * since individual threads might have already quit!
+ */
+ rcu_read_lock();
+ task = find_task_by_vpid(entry->ent.tgid);
+ if (task)
+ mm = get_task_mm(task);
+ rcu_read_unlock();
+ }
+
+ for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+ unsigned long ip = entry->caller[i];
+
+ if (ip == ULONG_MAX || !ret)
+ break;
+ if (i && ret)
+ ret = trace_seq_puts(s, " <- ");
+ if (!ip) {
+ if (ret)
+ ret = trace_seq_puts(s, "??");
+ continue;
+ }
+ if (!ret)
+ break;
+ if (ret)
+ ret = seq_print_user_ip(s, mm, ip, sym_flags);
+ }
+
+ if (mm)
+ mmput(mm);
+ return ret;
+}
+
static void print_lat_help_header(struct seq_file *m)
{
seq_puts(m, "# _------=> CPU# \n");
@@ -1246,7 +1679,8 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
trace_seq_printf(s, "%3d", cpu);
trace_seq_printf(s, "%c%c",
- (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.',
+ (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+ (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.',
((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
@@ -1324,6 +1758,23 @@ void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
trace_seq_putc(s, '\n');
}
+static void test_cpu_buff_start(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+
+ if (!(trace_flags & TRACE_ITER_ANNOTATE))
+ return;
+
+ if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
+ return;
+
+ if (cpu_isset(iter->cpu, iter->started))
+ return;
+
+ cpu_set(iter->cpu, iter->started);
+ trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
+}
+
static enum print_line_t
print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
{
@@ -1343,6 +1794,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
if (entry->type == TRACE_CONT)
return TRACE_TYPE_HANDLED;
+ test_cpu_buff_start(iter);
+
next_entry = find_next_entry(iter, NULL, &next_ts);
if (!next_entry)
next_ts = iter->ts;
@@ -1372,10 +1825,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
seq_print_ip_sym(s, field->ip, sym_flags);
trace_seq_puts(s, " (");
- if (kretprobed(field->parent_ip))
- trace_seq_puts(s, KRETPROBE_MSG);
- else
- seq_print_ip_sym(s, field->parent_ip, sym_flags);
+ seq_print_ip_sym(s, field->parent_ip, sym_flags);
trace_seq_puts(s, ")\n");
break;
}
@@ -1437,6 +1887,27 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
trace_seq_print_cont(s, iter);
break;
}
+ case TRACE_BRANCH: {
+ struct trace_branch *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_printf(s, "[%s] %s:%s:%d\n",
+ field->correct ? " ok " : " MISS ",
+ field->func,
+ field->file,
+ field->line);
+ break;
+ }
+ case TRACE_USER_STACK: {
+ struct userstack_entry *field;
+
+ trace_assign_type(field, entry);
+
+ seq_print_userip_objs(field, s, sym_flags);
+ trace_seq_putc(s, '\n');
+ break;
+ }
default:
trace_seq_printf(s, "Unknown type %d\n", entry->type);
}
@@ -1461,6 +1932,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
if (entry->type == TRACE_CONT)
return TRACE_TYPE_HANDLED;
+ test_cpu_buff_start(iter);
+
comm = trace_find_cmdline(iter->ent->pid);
t = ns2usecs(iter->ts);
@@ -1491,12 +1964,9 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
ret = trace_seq_printf(s, " <-");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- if (kretprobed(field->parent_ip))
- ret = trace_seq_puts(s, KRETPROBE_MSG);
- else
- ret = seq_print_ip_sym(s,
- field->parent_ip,
- sym_flags);
+ ret = seq_print_ip_sym(s,
+ field->parent_ip,
+ sym_flags);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
@@ -1573,6 +2043,37 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
trace_seq_print_cont(s, iter);
break;
}
+ case TRACE_GRAPH_RET: {
+ return print_graph_function(iter);
+ }
+ case TRACE_GRAPH_ENT: {
+ return print_graph_function(iter);
+ }
+ case TRACE_BRANCH: {
+ struct trace_branch *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_printf(s, "[%s] %s:%s:%d\n",
+ field->correct ? " ok " : " MISS ",
+ field->func,
+ field->file,
+ field->line);
+ break;
+ }
+ case TRACE_USER_STACK: {
+ struct userstack_entry *field;
+
+ trace_assign_type(field, entry);
+
+ ret = seq_print_userip_objs(field, s, sym_flags);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ ret = trace_seq_putc(s, '\n');
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ break;
+ }
}
return TRACE_TYPE_HANDLED;
}
@@ -1632,6 +2133,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
break;
}
case TRACE_SPECIAL:
+ case TRACE_USER_STACK:
case TRACE_STACK: {
struct special_entry *field;
@@ -1720,6 +2222,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
break;
}
case TRACE_SPECIAL:
+ case TRACE_USER_STACK:
case TRACE_STACK: {
struct special_entry *field;
@@ -1747,7 +2250,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
return TRACE_TYPE_HANDLED;
SEQ_PUT_FIELD_RET(s, entry->pid);
- SEQ_PUT_FIELD_RET(s, iter->cpu);
+ SEQ_PUT_FIELD_RET(s, entry->cpu);
SEQ_PUT_FIELD_RET(s, iter->ts);
switch (entry->type) {
@@ -1774,6 +2277,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
break;
}
case TRACE_SPECIAL:
+ case TRACE_USER_STACK:
case TRACE_STACK: {
struct special_entry *field;
@@ -1839,7 +2343,9 @@ static int s_show(struct seq_file *m, void *v)
seq_printf(m, "# tracer: %s\n", iter->trace->name);
seq_puts(m, "#\n");
}
- if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+ if (iter->trace && iter->trace->print_header)
+ iter->trace->print_header(m);
+ else if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
/* print nothing if the buffers are empty */
if (trace_empty(iter))
return 0;
@@ -1891,6 +2397,15 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
iter->trace = current_trace;
iter->pos = -1;
+ /* Notify the tracer early; before we stop tracing. */
+ if (iter->trace && iter->trace->open)
+ iter->trace->open(iter);
+
+ /* Annotate start of buffers if we had overruns */
+ if (ring_buffer_overruns(iter->tr->buffer))
+ iter->iter_flags |= TRACE_FILE_ANNOTATE;
+
+
for_each_tracing_cpu(cpu) {
iter->buffer_iter[cpu] =
@@ -1909,13 +2424,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
m->private = iter;
/* stop the trace while dumping */
- if (iter->tr->ctrl) {
- tracer_enabled = 0;
- ftrace_function_enabled = 0;
- }
-
- if (iter->trace && iter->trace->open)
- iter->trace->open(iter);
+ tracing_stop();
mutex_unlock(&trace_types_lock);
@@ -1928,6 +2437,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
ring_buffer_read_finish(iter->buffer_iter[cpu]);
}
mutex_unlock(&trace_types_lock);
+ kfree(iter);
return ERR_PTR(-ENOMEM);
}
@@ -1957,14 +2467,7 @@ int tracing_release(struct inode *inode, struct file *file)
iter->trace->close(iter);
/* reenable tracing if it was previously enabled */
- if (iter->tr->ctrl) {
- tracer_enabled = 1;
- /*
- * It is safe to enable function tracing even if it
- * isn't used
- */
- ftrace_function_enabled = 1;
- }
+ tracing_start();
mutex_unlock(&trace_types_lock);
seq_release(inode, file);
@@ -2180,13 +2683,16 @@ static struct file_operations tracing_cpumask_fops = {
};
static ssize_t
-tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
+tracing_trace_options_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
+ int i;
char *buf;
int r = 0;
int len = 0;
- int i;
+ u32 tracer_flags = current_trace->flags->val;
+ struct tracer_opt *trace_opts = current_trace->flags->opts;
+
/* calulate max size */
for (i = 0; trace_options[i]; i++) {
@@ -2194,6 +2700,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
len += 3; /* "no" and space */
}
+ /*
+ * Increase the size with names of options specific
+ * of the current tracer.
+ */
+ for (i = 0; trace_opts[i].name; i++) {
+ len += strlen(trace_opts[i].name);
+ len += 3; /* "no" and space */
+ }
+
/* +2 for \n and \0 */
buf = kmalloc(len + 2, GFP_KERNEL);
if (!buf)
@@ -2206,6 +2721,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
r += sprintf(buf + r, "no%s ", trace_options[i]);
}
+ for (i = 0; trace_opts[i].name; i++) {
+ if (tracer_flags & trace_opts[i].bit)
+ r += sprintf(buf + r, "%s ",
+ trace_opts[i].name);
+ else
+ r += sprintf(buf + r, "no%s ",
+ trace_opts[i].name);
+ }
+
r += sprintf(buf + r, "\n");
WARN_ON(r >= len + 2);
@@ -2216,13 +2740,48 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
return r;
}
+/* Try to assign a tracer specific option */
+static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
+{
+ struct tracer_flags *trace_flags = trace->flags;
+ struct tracer_opt *opts = NULL;
+ int ret = 0, i = 0;
+ int len;
+
+ for (i = 0; trace_flags->opts[i].name; i++) {
+ opts = &trace_flags->opts[i];
+ len = strlen(opts->name);
+
+ if (strncmp(cmp, opts->name, len) == 0) {
+ ret = trace->set_flag(trace_flags->val,
+ opts->bit, !neg);
+ break;
+ }
+ }
+ /* Not found */
+ if (!trace_flags->opts[i].name)
+ return -EINVAL;
+
+ /* Refused to handle */
+ if (ret)
+ return ret;
+
+ if (neg)
+ trace_flags->val &= ~opts->bit;
+ else
+ trace_flags->val |= opts->bit;
+
+ return 0;
+}
+
static ssize_t
-tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
+tracing_trace_options_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
char buf[64];
char *cmp = buf;
int neg = 0;
+ int ret;
int i;
if (cnt >= sizeof(buf))
@@ -2249,11 +2808,13 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
break;
}
}
- /*
- * If no option could be set, return an error:
- */
- if (!trace_options[i])
- return -EINVAL;
+
+ /* If no option could be set, test the specific tracer options */
+ if (!trace_options[i]) {
+ ret = set_tracer_option(current_trace, cmp, neg);
+ if (ret)
+ return ret;
+ }
filp->f_pos += cnt;
@@ -2262,8 +2823,8 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
static struct file_operations tracing_iter_fops = {
.open = tracing_open_generic,
- .read = tracing_iter_ctrl_read,
- .write = tracing_iter_ctrl_write,
+ .read = tracing_trace_options_read,
+ .write = tracing_trace_options_write,
};
static const char readme_msg[] =
@@ -2277,9 +2838,9 @@ static const char readme_msg[] =
"# echo sched_switch > /debug/tracing/current_tracer\n"
"# cat /debug/tracing/current_tracer\n"
"sched_switch\n"
- "# cat /debug/tracing/iter_ctrl\n"
+ "# cat /debug/tracing/trace_options\n"
"noprint-parent nosym-offset nosym-addr noverbose\n"
- "# echo print-parent > /debug/tracing/iter_ctrl\n"
+ "# echo print-parent > /debug/tracing/trace_options\n"
"# echo 1 > /debug/tracing/tracing_enabled\n"
"# cat /debug/tracing/trace > /tmp/trace.txt\n"
"echo 0 > /debug/tracing/tracing_enabled\n"
@@ -2302,11 +2863,10 @@ static ssize_t
tracing_ctrl_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- struct trace_array *tr = filp->private_data;
char buf[64];
int r;
- r = sprintf(buf, "%ld\n", tr->ctrl);
+ r = sprintf(buf, "%u\n", tracer_enabled);
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
@@ -2334,16 +2894,18 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
val = !!val;
mutex_lock(&trace_types_lock);
- if (tr->ctrl ^ val) {
- if (val)
+ if (tracer_enabled ^ val) {
+ if (val) {
tracer_enabled = 1;
- else
+ if (current_trace->start)
+ current_trace->start(tr);
+ tracing_start();
+ } else {
tracer_enabled = 0;
-
- tr->ctrl = val;
-
- if (current_trace && current_trace->ctrl_update)
- current_trace->ctrl_update(tr);
+ tracing_stop();
+ if (current_trace->stop)
+ current_trace->stop(tr);
+ }
}
mutex_unlock(&trace_types_lock);
@@ -2369,28 +2931,11 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
-static ssize_t
-tracing_set_trace_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static int tracing_set_tracer(char *buf)
{
struct trace_array *tr = &global_trace;
struct tracer *t;
- char buf[max_tracer_type_len+1];
- int i;
- size_t ret;
-
- if (cnt > max_tracer_type_len)
- cnt = max_tracer_type_len;
- ret = cnt;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
-
- /* strip ending whitespace. */
- for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
- buf[i] = 0;
+ int ret = 0;
mutex_lock(&trace_types_lock);
for (t = trace_types; t; t = t->next) {
@@ -2404,18 +2949,52 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
if (t == current_trace)
goto out;
+ trace_branch_disable();
if (current_trace && current_trace->reset)
current_trace->reset(tr);
current_trace = t;
- if (t->init)
- t->init(tr);
+ if (t->init) {
+ ret = t->init(tr);
+ if (ret)
+ goto out;
+ }
+ trace_branch_enable(tr);
out:
mutex_unlock(&trace_types_lock);
- if (ret == cnt)
- filp->f_pos += cnt;
+ return ret;
+}
+
+static ssize_t
+tracing_set_trace_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[max_tracer_type_len+1];
+ int i;
+ size_t ret;
+ int err;
+
+ ret = cnt;
+
+ if (cnt > max_tracer_type_len)
+ cnt = max_tracer_type_len;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ /* strip ending whitespace. */
+ for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
+ buf[i] = 0;
+
+ err = tracing_set_tracer(buf);
+ if (err)
+ return err;
+
+ filp->f_pos += ret;
return ret;
}
@@ -2482,6 +3061,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
return -ENOMEM;
mutex_lock(&trace_types_lock);
+
+ /* trace pipe does not show start of buffer */
+ cpus_setall(iter->started);
+
iter->tr = &global_trace;
iter->trace = current_trace;
filp->private_data = iter;
@@ -2657,7 +3240,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
char buf[64];
int r;
- r = sprintf(buf, "%lu\n", tr->entries);
+ r = sprintf(buf, "%lu\n", tr->entries >> 10);
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
@@ -2667,8 +3250,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
{
unsigned long val;
char buf[64];
- int ret;
- struct trace_array *tr = filp->private_data;
+ int ret, cpu;
if (cnt >= sizeof(buf))
return -EINVAL;
@@ -2688,13 +3270,19 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
mutex_lock(&trace_types_lock);
- if (tr->ctrl) {
- cnt = -EBUSY;
- pr_info("ftrace: please disable tracing"
- " before modifying buffer size\n");
- goto out;
+ tracing_stop();
+
+ /* disable all cpu buffers */
+ for_each_tracing_cpu(cpu) {
+ if (global_trace.data[cpu])
+ atomic_inc(&global_trace.data[cpu]->disabled);
+ if (max_tr.data[cpu])
+ atomic_inc(&max_tr.data[cpu]->disabled);
}
+ /* value is in KB */
+ val <<= 10;
+
if (val != global_trace.entries) {
ret = ring_buffer_resize(global_trace.buffer, val);
if (ret < 0) {
@@ -2726,6 +3314,14 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
if (tracing_disabled)
cnt = -ENOMEM;
out:
+ for_each_tracing_cpu(cpu) {
+ if (global_trace.data[cpu])
+ atomic_dec(&global_trace.data[cpu]->disabled);
+ if (max_tr.data[cpu])
+ atomic_dec(&max_tr.data[cpu]->disabled);
+ }
+
+ tracing_start();
max_tr.entries = global_trace.entries;
mutex_unlock(&trace_types_lock);
@@ -2748,9 +3344,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
{
char *buf;
char *end;
- struct trace_array *tr = &global_trace;
- if (!tr->ctrl || tracing_disabled)
+ if (tracing_disabled)
return -EINVAL;
if (cnt > TRACE_BUF_SIZE)
@@ -2816,22 +3411,38 @@ static struct file_operations tracing_mark_fops = {
#ifdef CONFIG_DYNAMIC_FTRACE
+int __weak ftrace_arch_read_dyn_info(char *buf, int size)
+{
+ return 0;
+}
+
static ssize_t
-tracing_read_long(struct file *filp, char __user *ubuf,
+tracing_read_dyn_info(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
+ static char ftrace_dyn_info_buffer[1024];
+ static DEFINE_MUTEX(dyn_info_mutex);
unsigned long *p = filp->private_data;
- char buf[64];
+ char *buf = ftrace_dyn_info_buffer;
+ int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
int r;
- r = sprintf(buf, "%ld\n", *p);
+ mutex_lock(&dyn_info_mutex);
+ r = sprintf(buf, "%ld ", *p);
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
+ buf[r++] = '\n';
+
+ r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+
+ mutex_unlock(&dyn_info_mutex);
+
+ return r;
}
-static struct file_operations tracing_read_long_fops = {
+static struct file_operations tracing_dyn_info_fops = {
.open = tracing_open_generic,
- .read = tracing_read_long,
+ .read = tracing_read_dyn_info,
};
#endif
@@ -2872,10 +3483,10 @@ static __init int tracer_init_debugfs(void)
if (!entry)
pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
- entry = debugfs_create_file("iter_ctrl", 0644, d_tracer,
+ entry = debugfs_create_file("trace_options", 0644, d_tracer,
NULL, &tracing_iter_fops);
if (!entry)
- pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
+ pr_warning("Could not create debugfs 'trace_options' entry\n");
entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
NULL, &tracing_cpumask_fops);
@@ -2925,11 +3536,11 @@ static __init int tracer_init_debugfs(void)
pr_warning("Could not create debugfs "
"'trace_pipe' entry\n");
- entry = debugfs_create_file("trace_entries", 0644, d_tracer,
+ entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
&global_trace, &tracing_entries_fops);
if (!entry)
pr_warning("Could not create debugfs "
- "'trace_entries' entry\n");
+ "'buffer_size_kb' entry\n");
entry = debugfs_create_file("trace_marker", 0220, d_tracer,
NULL, &tracing_mark_fops);
@@ -2940,7 +3551,7 @@ static __init int tracer_init_debugfs(void)
#ifdef CONFIG_DYNAMIC_FTRACE
entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
&ftrace_update_tot_cnt,
- &tracing_read_long_fops);
+ &tracing_dyn_info_fops);
if (!entry)
pr_warning("Could not create debugfs "
"'dyn_ftrace_total_info' entry\n");
@@ -2963,7 +3574,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
unsigned long flags, irq_flags;
int cpu, len = 0, size, pc;
- if (!tr->ctrl || tracing_disabled)
+ if (tracing_disabled)
return 0;
pc = preempt_count();
@@ -3021,7 +3632,8 @@ EXPORT_SYMBOL_GPL(__ftrace_printk);
static int trace_panic_handler(struct notifier_block *this,
unsigned long event, void *unused)
{
- ftrace_dump();
+ if (ftrace_dump_on_oops)
+ ftrace_dump();
return NOTIFY_OK;
}
@@ -3037,7 +3649,8 @@ static int trace_die_handler(struct notifier_block *self,
{
switch (val) {
case DIE_OOPS:
- ftrace_dump();
+ if (ftrace_dump_on_oops)
+ ftrace_dump();
break;
default:
break;
@@ -3078,7 +3691,6 @@ trace_printk_seq(struct trace_seq *s)
trace_seq_reset(s);
}
-
void ftrace_dump(void)
{
static DEFINE_SPINLOCK(ftrace_dump_lock);
@@ -3097,12 +3709,15 @@ void ftrace_dump(void)
dump_ran = 1;
/* No turning back! */
- ftrace_kill_atomic();
+ ftrace_kill();
for_each_tracing_cpu(cpu) {
atomic_inc(&global_trace.data[cpu]->disabled);
}
+ /* don't look at user memory in panic mode */
+ trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
+
printk(KERN_TRACE "Dumping ftrace buffer:\n");
iter.tr = &global_trace;
@@ -3196,7 +3811,6 @@ __init static int tracer_alloc_buffers(void)
#endif
/* All seems OK, enable tracing */
- global_trace.ctrl = tracer_enabled;
tracing_disabled = 0;
atomic_notifier_chain_register(&panic_notifier_list,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f1f99572cde..f96f4e787ff 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -8,6 +8,7 @@
#include <linux/ring_buffer.h>
#include <linux/mmiotrace.h>
#include <linux/ftrace.h>
+#include <trace/boot.h>
enum trace_type {
__TRACE_FIRST_TYPE = 0,
@@ -21,7 +22,14 @@ enum trace_type {
TRACE_SPECIAL,
TRACE_MMIO_RW,
TRACE_MMIO_MAP,
- TRACE_BOOT,
+ TRACE_BRANCH,
+ TRACE_BOOT_CALL,
+ TRACE_BOOT_RET,
+ TRACE_GRAPH_RET,
+ TRACE_GRAPH_ENT,
+ TRACE_USER_STACK,
+ TRACE_BTS,
+ TRACE_POWER,
__TRACE_LAST_TYPE
};
@@ -38,6 +46,7 @@ struct trace_entry {
unsigned char flags;
unsigned char preempt_count;
int pid;
+ int tgid;
};
/*
@@ -48,6 +57,18 @@ struct ftrace_entry {
unsigned long ip;
unsigned long parent_ip;
};
+
+/* Function call entry */
+struct ftrace_graph_ent_entry {
+ struct trace_entry ent;
+ struct ftrace_graph_ent graph_ent;
+};
+
+/* Function return entry */
+struct ftrace_graph_ret_entry {
+ struct trace_entry ent;
+ struct ftrace_graph_ret ret;
+};
extern struct tracer boot_tracer;
/*
@@ -85,6 +106,11 @@ struct stack_entry {
unsigned long caller[FTRACE_STACK_ENTRIES];
};
+struct userstack_entry {
+ struct trace_entry ent;
+ unsigned long caller[FTRACE_STACK_ENTRIES];
+};
+
/*
* ftrace_printk entry:
*/
@@ -112,26 +138,54 @@ struct trace_mmiotrace_map {
struct mmiotrace_map map;
};
-struct trace_boot {
+struct trace_boot_call {
+ struct trace_entry ent;
+ struct boot_trace_call boot_call;
+};
+
+struct trace_boot_ret {
+ struct trace_entry ent;
+ struct boot_trace_ret boot_ret;
+};
+
+#define TRACE_FUNC_SIZE 30
+#define TRACE_FILE_SIZE 20
+struct trace_branch {
+ struct trace_entry ent;
+ unsigned line;
+ char func[TRACE_FUNC_SIZE+1];
+ char file[TRACE_FILE_SIZE+1];
+ char correct;
+};
+
+struct bts_entry {
+ struct trace_entry ent;
+ unsigned long from;
+ unsigned long to;
+};
+
+struct trace_power {
struct trace_entry ent;
- struct boot_trace initcall;
+ struct power_trace state_data;
};
/*
* trace_flag_type is an enumeration that holds different
* states when a trace occurs. These are:
- * IRQS_OFF - interrupts were disabled
- * NEED_RESCED - reschedule is requested
- * HARDIRQ - inside an interrupt handler
- * SOFTIRQ - inside a softirq handler
- * CONT - multiple entries hold the trace item
+ * IRQS_OFF - interrupts were disabled
+ * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
+ * NEED_RESCED - reschedule is requested
+ * HARDIRQ - inside an interrupt handler
+ * SOFTIRQ - inside a softirq handler
+ * CONT - multiple entries hold the trace item
*/
enum trace_flag_type {
TRACE_FLAG_IRQS_OFF = 0x01,
- TRACE_FLAG_NEED_RESCHED = 0x02,
- TRACE_FLAG_HARDIRQ = 0x04,
- TRACE_FLAG_SOFTIRQ = 0x08,
- TRACE_FLAG_CONT = 0x10,
+ TRACE_FLAG_IRQS_NOSUPPORT = 0x02,
+ TRACE_FLAG_NEED_RESCHED = 0x04,
+ TRACE_FLAG_HARDIRQ = 0x08,
+ TRACE_FLAG_SOFTIRQ = 0x10,
+ TRACE_FLAG_CONT = 0x20,
};
#define TRACE_BUF_SIZE 1024
@@ -170,7 +224,6 @@ struct trace_iterator;
struct trace_array {
struct ring_buffer *buffer;
unsigned long entries;
- long ctrl;
int cpu;
cycle_t time_start;
struct task_struct *waiter;
@@ -210,13 +263,22 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
+ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
IF_ASSIGN(var, ent, struct special_entry, 0); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
TRACE_MMIO_RW); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
TRACE_MMIO_MAP); \
- IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \
+ IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
+ IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
+ IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
+ IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \
+ TRACE_GRAPH_ENT); \
+ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
+ TRACE_GRAPH_RET); \
+ IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\
+ IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
__ftrace_bad_type(); \
} while (0)
@@ -227,29 +289,56 @@ enum print_line_t {
TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */
};
+
+/*
+ * An option specific to a tracer. This is a boolean value.
+ * The bit is the bit index that sets its value on the
+ * flags value in struct tracer_flags.
+ */
+struct tracer_opt {
+ const char *name; /* Will appear on the trace_options file */
+ u32 bit; /* Mask assigned in val field in tracer_flags */
+};
+
+/*
+ * The set of specific options for a tracer. Your tracer
+ * have to set the initial value of the flags val.
+ */
+struct tracer_flags {
+ u32 val;
+ struct tracer_opt *opts;
+};
+
+/* Makes more easy to define a tracer opt */
+#define TRACER_OPT(s, b) .name = #s, .bit = b
+
/*
* A specific tracer, represented by methods that operate on a trace array:
*/
struct tracer {
const char *name;
- void (*init)(struct trace_array *tr);
+ /* Your tracer should raise a warning if init fails */
+ int (*init)(struct trace_array *tr);
void (*reset)(struct trace_array *tr);
+ void (*start)(struct trace_array *tr);
+ void (*stop)(struct trace_array *tr);
void (*open)(struct trace_iterator *iter);
void (*pipe_open)(struct trace_iterator *iter);
void (*close)(struct trace_iterator *iter);
- void (*start)(struct trace_iterator *iter);
- void (*stop)(struct trace_iterator *iter);
ssize_t (*read)(struct trace_iterator *iter,
struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos);
- void (*ctrl_update)(struct trace_array *tr);
#ifdef CONFIG_FTRACE_STARTUP_TEST
int (*selftest)(struct tracer *trace,
struct trace_array *tr);
#endif
+ void (*print_header)(struct seq_file *m);
enum print_line_t (*print_line)(struct trace_iterator *iter);
+ /* If you handled the flag setting, return 0 */
+ int (*set_flag)(u32 old_flags, u32 bit, int set);
struct tracer *next;
int print_max;
+ struct tracer_flags *flags;
};
struct trace_seq {
@@ -277,8 +366,11 @@ struct trace_iterator {
unsigned long iter_flags;
loff_t pos;
long idx;
+
+ cpumask_t started;
};
+int tracing_is_enabled(void);
void trace_wake_up(void);
void tracing_reset(struct trace_array *tr, int cpu);
int tracing_open_generic(struct inode *inode, struct file *filp);
@@ -319,8 +411,17 @@ void trace_function(struct trace_array *tr,
unsigned long parent_ip,
unsigned long flags, int pc);
+void trace_graph_return(struct ftrace_graph_ret *trace);
+void trace_graph_entry(struct ftrace_graph_ent *trace);
+void trace_bts(struct trace_array *tr,
+ unsigned long from,
+ unsigned long to);
+
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
+void tracing_sched_switch_assign_trace(struct trace_array *tr);
+void tracing_stop_sched_switch_record(void);
+void tracing_start_sched_switch_record(void);
int register_tracer(struct tracer *type);
void unregister_tracer(struct tracer *type);
@@ -335,7 +436,7 @@ void update_max_tr_single(struct trace_array *tr,
extern cycle_t ftrace_now(int cpu);
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
void tracing_start_function_trace(void);
void tracing_stop_function_trace(void);
#else
@@ -356,6 +457,7 @@ struct tracer_switch_ops {
struct tracer_switch_ops *next;
};
+char *trace_find_cmdline(int pid);
#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -381,12 +483,18 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_sysprof(struct tracer *trace,
struct trace_array *tr);
+extern int trace_selftest_startup_branch(struct tracer *trace,
+ struct trace_array *tr);
#endif /* CONFIG_FTRACE_STARTUP_TEST */
extern void *head_page(struct trace_array_cpu *data);
extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
extern void trace_seq_print_cont(struct trace_seq *s,
struct trace_iterator *iter);
+
+extern int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
+ unsigned long sym_flags);
extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
size_t cnt);
extern long ns2usecs(cycle_t nsec);
@@ -394,6 +502,17 @@ extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args);
extern unsigned long trace_flags;
+/* Standard output formatting function used for function return traces */
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+extern enum print_line_t print_graph_function(struct trace_iterator *iter);
+#else
+static inline enum print_line_t
+print_graph_function(struct trace_iterator *iter)
+{
+ return TRACE_TYPE_UNHANDLED;
+}
+#endif
+
/*
* trace_iterator_flags is an enumeration that defines bit
* positions into trace_flags that controls the output.
@@ -413,8 +532,92 @@ enum trace_iterator_flags {
TRACE_ITER_STACKTRACE = 0x100,
TRACE_ITER_SCHED_TREE = 0x200,
TRACE_ITER_PRINTK = 0x400,
+ TRACE_ITER_PREEMPTONLY = 0x800,
+ TRACE_ITER_BRANCH = 0x1000,
+ TRACE_ITER_ANNOTATE = 0x2000,
+ TRACE_ITER_USERSTACKTRACE = 0x4000,
+ TRACE_ITER_SYM_USEROBJ = 0x8000
};
+/*
+ * TRACE_ITER_SYM_MASK masks the options in trace_flags that
+ * control the output of kernel symbols.
+ */
+#define TRACE_ITER_SYM_MASK \
+ (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
+
extern struct tracer nop_trace;
+/**
+ * ftrace_preempt_disable - disable preemption scheduler safe
+ *
+ * When tracing can happen inside the scheduler, there exists
+ * cases that the tracing might happen before the need_resched
+ * flag is checked. If this happens and the tracer calls
+ * preempt_enable (after a disable), a schedule might take place
+ * causing an infinite recursion.
+ *
+ * To prevent this, we read the need_recshed flag before
+ * disabling preemption. When we want to enable preemption we
+ * check the flag, if it is set, then we call preempt_enable_no_resched.
+ * Otherwise, we call preempt_enable.
+ *
+ * The rational for doing the above is that if need resched is set
+ * and we have yet to reschedule, we are either in an atomic location
+ * (where we do not need to check for scheduling) or we are inside
+ * the scheduler and do not want to resched.
+ */
+static inline int ftrace_preempt_disable(void)
+{
+ int resched;
+
+ resched = need_resched();
+ preempt_disable_notrace();
+
+ return resched;
+}
+
+/**
+ * ftrace_preempt_enable - enable preemption scheduler safe
+ * @resched: the return value from ftrace_preempt_disable
+ *
+ * This is a scheduler safe way to enable preemption and not miss
+ * any preemption checks. The disabled saved the state of preemption.
+ * If resched is set, then we were either inside an atomic or
+ * are inside the scheduler (we would have already scheduled
+ * otherwise). In this case, we do not want to call normal
+ * preempt_enable, but preempt_enable_no_resched instead.
+ */
+static inline void ftrace_preempt_enable(int resched)
+{
+ if (resched)
+ preempt_enable_no_resched_notrace();
+ else
+ preempt_enable_notrace();
+}
+
+#ifdef CONFIG_BRANCH_TRACER
+extern int enable_branch_tracing(struct trace_array *tr);
+extern void disable_branch_tracing(void);
+static inline int trace_branch_enable(struct trace_array *tr)
+{
+ if (trace_flags & TRACE_ITER_BRANCH)
+ return enable_branch_tracing(tr);
+ return 0;
+}
+static inline void trace_branch_disable(void)
+{
+ /* due to races, always disable */
+ disable_branch_tracing();
+}
+#else
+static inline int trace_branch_enable(struct trace_array *tr)
+{
+ return 0;
+}
+static inline void trace_branch_disable(void)
+{
+}
+#endif /* CONFIG_BRANCH_TRACER */
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index d0a5e50eeff..a4fa2c57e34 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -13,73 +13,117 @@
#include "trace.h"
static struct trace_array *boot_trace;
-static int trace_boot_enabled;
+static bool pre_initcalls_finished;
-
-/* Should be started after do_pre_smp_initcalls() in init/main.c */
+/* Tells the boot tracer that the pre_smp_initcalls are finished.
+ * So we are ready .
+ * It doesn't enable sched events tracing however.
+ * You have to call enable_boot_trace to do so.
+ */
void start_boot_trace(void)
{
- trace_boot_enabled = 1;
+ pre_initcalls_finished = true;
}
-void stop_boot_trace(void)
+void enable_boot_trace(void)
{
- trace_boot_enabled = 0;
+ if (pre_initcalls_finished)
+ tracing_start_sched_switch_record();
}
-void reset_boot_trace(struct trace_array *tr)
+void disable_boot_trace(void)
{
- stop_boot_trace();
+ if (pre_initcalls_finished)
+ tracing_stop_sched_switch_record();
}
-static void boot_trace_init(struct trace_array *tr)
+static void reset_boot_trace(struct trace_array *tr)
{
int cpu;
- boot_trace = tr;
- trace_boot_enabled = 0;
+ tr->time_start = ftrace_now(tr->cpu);
+
+ for_each_online_cpu(cpu)
+ tracing_reset(tr, cpu);
+}
+
+static int boot_trace_init(struct trace_array *tr)
+{
+ int cpu;
+ boot_trace = tr;
for_each_cpu_mask(cpu, cpu_possible_map)
tracing_reset(tr, cpu);
+
+ tracing_sched_switch_assign_trace(tr);
+ return 0;
}
-static void boot_trace_ctrl_update(struct trace_array *tr)
+static enum print_line_t
+initcall_call_print_line(struct trace_iterator *iter)
{
- if (tr->ctrl)
- start_boot_trace();
+ struct trace_entry *entry = iter->ent;
+ struct trace_seq *s = &iter->seq;
+ struct trace_boot_call *field;
+ struct boot_trace_call *call;
+ u64 ts;
+ unsigned long nsec_rem;
+ int ret;
+
+ trace_assign_type(field, entry);
+ call = &field->boot_call;
+ ts = iter->ts;
+ nsec_rem = do_div(ts, 1000000000);
+
+ ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
+ (unsigned long)ts, nsec_rem, call->func, call->caller);
+
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
else
- stop_boot_trace();
+ return TRACE_TYPE_HANDLED;
}
-static enum print_line_t initcall_print_line(struct trace_iterator *iter)
+static enum print_line_t
+initcall_ret_print_line(struct trace_iterator *iter)
{
- int ret;
struct trace_entry *entry = iter->ent;
- struct trace_boot *field = (struct trace_boot *)entry;
- struct boot_trace *it = &field->initcall;
struct trace_seq *s = &iter->seq;
- struct timespec calltime = ktime_to_timespec(it->calltime);
- struct timespec rettime = ktime_to_timespec(it->rettime);
-
- if (entry->type == TRACE_BOOT) {
- ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
- calltime.tv_sec,
- calltime.tv_nsec,
- it->func, it->caller);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
- "returned %d after %lld msecs\n",
- rettime.tv_sec,
- rettime.tv_nsec,
- it->func, it->result, it->duration);
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ struct trace_boot_ret *field;
+ struct boot_trace_ret *init_ret;
+ u64 ts;
+ unsigned long nsec_rem;
+ int ret;
+
+ trace_assign_type(field, entry);
+ init_ret = &field->boot_ret;
+ ts = iter->ts;
+ nsec_rem = do_div(ts, 1000000000);
+
+ ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
+ "returned %d after %llu msecs\n",
+ (unsigned long) ts,
+ nsec_rem,
+ init_ret->func, init_ret->result, init_ret->duration);
+
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ else
return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t initcall_print_line(struct trace_iterator *iter)
+{
+ struct trace_entry *entry = iter->ent;
+
+ switch (entry->type) {
+ case TRACE_BOOT_CALL:
+ return initcall_call_print_line(iter);
+ case TRACE_BOOT_RET:
+ return initcall_ret_print_line(iter);
+ default:
+ return TRACE_TYPE_UNHANDLED;
}
- return TRACE_TYPE_UNHANDLED;
}
struct tracer boot_tracer __read_mostly =
@@ -87,27 +131,53 @@ struct tracer boot_tracer __read_mostly =
.name = "initcall",
.init = boot_trace_init,
.reset = reset_boot_trace,
- .ctrl_update = boot_trace_ctrl_update,
.print_line = initcall_print_line,
};
-void trace_boot(struct boot_trace *it, initcall_t fn)
+void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
{
struct ring_buffer_event *event;
- struct trace_boot *entry;
- struct trace_array_cpu *data;
+ struct trace_boot_call *entry;
unsigned long irq_flags;
struct trace_array *tr = boot_trace;
- if (!trace_boot_enabled)
+ if (!pre_initcalls_finished)
return;
/* Get its name now since this function could
* disappear because it is in the .init section.
*/
- sprint_symbol(it->func, (unsigned long)fn);
+ sprint_symbol(bt->func, (unsigned long)fn);
+ preempt_disable();
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ goto out;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, 0, 0);
+ entry->ent.type = TRACE_BOOT_CALL;
+ entry->boot_call = *bt;
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+ trace_wake_up();
+
+ out:
+ preempt_enable();
+}
+
+void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
+{
+ struct ring_buffer_event *event;
+ struct trace_boot_ret *entry;
+ unsigned long irq_flags;
+ struct trace_array *tr = boot_trace;
+
+ if (!pre_initcalls_finished)
+ return;
+
+ sprint_symbol(bt->func, (unsigned long)fn);
preempt_disable();
- data = tr->data[smp_processor_id()];
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
&irq_flags);
@@ -115,8 +185,8 @@ void trace_boot(struct boot_trace *it, initcall_t fn)
goto out;
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, 0);
- entry->ent.type = TRACE_BOOT;
- entry->initcall = *it;
+ entry->ent.type = TRACE_BOOT_RET;
+ entry->boot_ret = *bt;
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
trace_wake_up();
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
new file mode 100644
index 00000000000..bc972753568
--- /dev/null
+++ b/kernel/trace/trace_branch.c
@@ -0,0 +1,342 @@
+/*
+ * unlikely profiler
+ *
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/irqflags.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/hash.h>
+#include <linux/fs.h>
+#include <asm/local.h>
+#include "trace.h"
+
+#ifdef CONFIG_BRANCH_TRACER
+
+static int branch_tracing_enabled __read_mostly;
+static DEFINE_MUTEX(branch_tracing_mutex);
+static struct trace_array *branch_tracer;
+
+static void
+probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+{
+ struct trace_array *tr = branch_tracer;
+ struct ring_buffer_event *event;
+ struct trace_branch *entry;
+ unsigned long flags, irq_flags;
+ int cpu, pc;
+ const char *p;
+
+ /*
+ * I would love to save just the ftrace_likely_data pointer, but
+ * this code can also be used by modules. Ugly things can happen
+ * if the module is unloaded, and then we go and read the
+ * pointer. This is slower, but much safer.
+ */
+
+ if (unlikely(!tr))
+ return;
+
+ raw_local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
+ goto out;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ goto out;
+
+ pc = preempt_count();
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, flags, pc);
+ entry->ent.type = TRACE_BRANCH;
+
+ /* Strip off the path, only save the file */
+ p = f->file + strlen(f->file);
+ while (p >= f->file && *p != '/')
+ p--;
+ p++;
+
+ strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
+ strncpy(entry->file, p, TRACE_FILE_SIZE);
+ entry->func[TRACE_FUNC_SIZE] = 0;
+ entry->file[TRACE_FILE_SIZE] = 0;
+ entry->line = f->line;
+ entry->correct = val == expect;
+
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+ out:
+ atomic_dec(&tr->data[cpu]->disabled);
+ raw_local_irq_restore(flags);
+}
+
+static inline
+void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+{
+ if (!branch_tracing_enabled)
+ return;
+
+ probe_likely_condition(f, val, expect);
+}
+
+int enable_branch_tracing(struct trace_array *tr)
+{
+ int ret = 0;
+
+ mutex_lock(&branch_tracing_mutex);
+ branch_tracer = tr;
+ /*
+ * Must be seen before enabling. The reader is a condition
+ * where we do not need a matching rmb()
+ */
+ smp_wmb();
+ branch_tracing_enabled++;
+ mutex_unlock(&branch_tracing_mutex);
+
+ return ret;
+}
+
+void disable_branch_tracing(void)
+{
+ mutex_lock(&branch_tracing_mutex);
+
+ if (!branch_tracing_enabled)
+ goto out_unlock;
+
+ branch_tracing_enabled--;
+
+ out_unlock:
+ mutex_unlock(&branch_tracing_mutex);
+}
+
+static void start_branch_trace(struct trace_array *tr)
+{
+ enable_branch_tracing(tr);
+}
+
+static void stop_branch_trace(struct trace_array *tr)
+{
+ disable_branch_tracing();
+}
+
+static int branch_trace_init(struct trace_array *tr)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ tracing_reset(tr, cpu);
+
+ start_branch_trace(tr);
+ return 0;
+}
+
+static void branch_trace_reset(struct trace_array *tr)
+{
+ stop_branch_trace(tr);
+}
+
+struct tracer branch_trace __read_mostly =
+{
+ .name = "branch",
+ .init = branch_trace_init,
+ .reset = branch_trace_reset,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_branch,
+#endif
+};
+
+__init static int init_branch_trace(void)
+{
+ return register_tracer(&branch_trace);
+}
+
+device_initcall(init_branch_trace);
+#else
+static inline
+void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+{
+}
+#endif /* CONFIG_BRANCH_TRACER */
+
+void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
+{
+ /*
+ * I would love to have a trace point here instead, but the
+ * trace point code is so inundated with unlikely and likely
+ * conditions that the recursive nightmare that exists is too
+ * much to try to get working. At least for now.
+ */
+ trace_likely_condition(f, val, expect);
+
+ /* FIXME: Make this atomic! */
+ if (val == expect)
+ f->correct++;
+ else
+ f->incorrect++;
+}
+EXPORT_SYMBOL(ftrace_likely_update);
+
+struct ftrace_pointer {
+ void *start;
+ void *stop;
+ int hit;
+};
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ const struct ftrace_pointer *f = m->private;
+ struct ftrace_branch_data *p = v;
+
+ (*pos)++;
+
+ if (v == (void *)1)
+ return f->start;
+
+ ++p;
+
+ if ((void *)p >= (void *)f->stop)
+ return NULL;
+
+ return p;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+ void *t = (void *)1;
+ loff_t l = 0;
+
+ for (; t && l < *pos; t = t_next(m, t, &l))
+ ;
+
+ return t;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+ const struct ftrace_pointer *fp = m->private;
+ struct ftrace_branch_data *p = v;
+ const char *f;
+ long percent;
+
+ if (v == (void *)1) {
+ if (fp->hit)
+ seq_printf(m, " miss hit %% ");
+ else
+ seq_printf(m, " correct incorrect %% ");
+ seq_printf(m, " Function "
+ " File Line\n"
+ " ------- --------- - "
+ " -------- "
+ " ---- ----\n");
+ return 0;
+ }
+
+ /* Only print the file, not the path */
+ f = p->file + strlen(p->file);
+ while (f >= p->file && *f != '/')
+ f--;
+ f++;
+
+ /*
+ * The miss is overlayed on correct, and hit on incorrect.
+ */
+ if (p->correct) {
+ percent = p->incorrect * 100;
+ percent /= p->correct + p->incorrect;
+ } else
+ percent = p->incorrect ? 100 : -1;
+
+ seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
+ if (percent < 0)
+ seq_printf(m, " X ");
+ else
+ seq_printf(m, "%3ld ", percent);
+ seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
+ return 0;
+}
+
+static struct seq_operations tracing_likely_seq_ops = {
+ .start = t_start,
+ .next = t_next,
+ .stop = t_stop,
+ .show = t_show,
+};
+
+static int tracing_branch_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = seq_open(file, &tracing_likely_seq_ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = (void *)inode->i_private;
+ }
+
+ return ret;
+}
+
+static const struct file_operations tracing_branch_fops = {
+ .open = tracing_branch_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+};
+
+#ifdef CONFIG_PROFILE_ALL_BRANCHES
+extern unsigned long __start_branch_profile[];
+extern unsigned long __stop_branch_profile[];
+
+static const struct ftrace_pointer ftrace_branch_pos = {
+ .start = __start_branch_profile,
+ .stop = __stop_branch_profile,
+ .hit = 1,
+};
+
+#endif /* CONFIG_PROFILE_ALL_BRANCHES */
+
+extern unsigned long __start_annotated_branch_profile[];
+extern unsigned long __stop_annotated_branch_profile[];
+
+static const struct ftrace_pointer ftrace_annotated_branch_pos = {
+ .start = __start_annotated_branch_profile,
+ .stop = __stop_annotated_branch_profile,
+};
+
+static __init int ftrace_branch_init(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ d_tracer = tracing_init_dentry();
+
+ entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer,
+ (void *)&ftrace_annotated_branch_pos,
+ &tracing_branch_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'profile_annotatet_branch' entry\n");
+
+#ifdef CONFIG_PROFILE_ALL_BRANCHES
+ entry = debugfs_create_file("profile_branch", 0444, d_tracer,
+ (void *)&ftrace_branch_pos,
+ &tracing_branch_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs"
+ " 'profile_branch' entry\n");
+#endif
+
+ return 0;
+}
+
+device_initcall(ftrace_branch_init);
diff --git a/kernel/trace/trace_bts.c b/kernel/trace/trace_bts.c
new file mode 100644
index 00000000000..23b76e4690e
--- /dev/null
+++ b/kernel/trace/trace_bts.c
@@ -0,0 +1,276 @@
+/*
+ * BTS tracer
+ *
+ * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/kallsyms.h>
+
+#include <asm/ds.h>
+
+#include "trace.h"
+
+
+#define SIZEOF_BTS (1 << 13)
+
+static DEFINE_PER_CPU(struct bts_tracer *, tracer);
+static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
+
+#define this_tracer per_cpu(tracer, smp_processor_id())
+#define this_buffer per_cpu(buffer, smp_processor_id())
+
+
+/*
+ * Information to interpret a BTS record.
+ * This will go into an in-kernel BTS interface.
+ */
+static unsigned char sizeof_field;
+static unsigned long debugctl_mask;
+
+#define sizeof_bts (3 * sizeof_field)
+
+static void bts_trace_cpuinit(struct cpuinfo_x86 *c)
+{
+ switch (c->x86) {
+ case 0x6:
+ switch (c->x86_model) {
+ case 0x0 ... 0xC:
+ break;
+ case 0xD:
+ case 0xE: /* Pentium M */
+ sizeof_field = sizeof(long);
+ debugctl_mask = (1<<6)|(1<<7);
+ break;
+ default:
+ sizeof_field = 8;
+ debugctl_mask = (1<<6)|(1<<7);
+ break;
+ }
+ break;
+ case 0xF:
+ switch (c->x86_model) {
+ case 0x0:
+ case 0x1:
+ case 0x2: /* Netburst */
+ sizeof_field = sizeof(long);
+ debugctl_mask = (1<<2)|(1<<3);
+ break;
+ default:
+ /* sorry, don't know about them */
+ break;
+ }
+ break;
+ default:
+ /* sorry, don't know about them */
+ break;
+ }
+}
+
+static inline void bts_enable(void)
+{
+ unsigned long debugctl;
+
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | debugctl_mask);
+}
+
+static inline void bts_disable(void)
+{
+ unsigned long debugctl;
+
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl & ~debugctl_mask);
+}
+
+static void bts_trace_reset(struct trace_array *tr)
+{
+ int cpu;
+
+ tr->time_start = ftrace_now(tr->cpu);
+
+ for_each_online_cpu(cpu)
+ tracing_reset(tr, cpu);
+}
+
+static void bts_trace_start_cpu(void *arg)
+{
+ this_tracer =
+ ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
+ /* ovfl = */ NULL, /* th = */ (size_t)-1);
+ if (IS_ERR(this_tracer)) {
+ this_tracer = NULL;
+ return;
+ }
+
+ bts_enable();
+}
+
+static void bts_trace_start(struct trace_array *tr)
+{
+ int cpu;
+
+ bts_trace_reset(tr);
+
+ for_each_cpu_mask(cpu, cpu_possible_map)
+ smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
+}
+
+static void bts_trace_stop_cpu(void *arg)
+{
+ if (this_tracer) {
+ bts_disable();
+
+ ds_release_bts(this_tracer);
+ this_tracer = NULL;
+ }
+}
+
+static void bts_trace_stop(struct trace_array *tr)
+{
+ int cpu;
+
+ for_each_cpu_mask(cpu, cpu_possible_map)
+ smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
+}
+
+static int bts_trace_init(struct trace_array *tr)
+{
+ bts_trace_cpuinit(&boot_cpu_data);
+ bts_trace_reset(tr);
+ bts_trace_start(tr);
+
+ return 0;
+}
+
+static void bts_trace_print_header(struct seq_file *m)
+{
+#ifdef __i386__
+ seq_puts(m, "# CPU# FROM TO FUNCTION\n");
+ seq_puts(m, "# | | | |\n");
+#else
+ seq_puts(m,
+ "# CPU# FROM TO FUNCTION\n");
+ seq_puts(m,
+ "# | | | |\n");
+#endif
+}
+
+static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
+{
+ struct trace_entry *entry = iter->ent;
+ struct trace_seq *seq = &iter->seq;
+ struct bts_entry *it;
+
+ trace_assign_type(it, entry);
+
+ if (entry->type == TRACE_BTS) {
+ int ret;
+#ifdef CONFIG_KALLSYMS
+ char function[KSYM_SYMBOL_LEN];
+ sprint_symbol(function, it->from);
+#else
+ char *function = "<unknown>";
+#endif
+
+ ret = trace_seq_printf(seq, "%4d 0x%lx -> 0x%lx [%s]\n",
+ entry->cpu, it->from, it->to, function);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;;
+ return TRACE_TYPE_HANDLED;
+ }
+ return TRACE_TYPE_UNHANDLED;
+}
+
+void trace_bts(struct trace_array *tr, unsigned long from, unsigned long to)
+{
+ struct ring_buffer_event *event;
+ struct bts_entry *entry;
+ unsigned long irq;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, 0, from);
+ entry->ent.type = TRACE_BTS;
+ entry->ent.cpu = smp_processor_id();
+ entry->from = from;
+ entry->to = to;
+ ring_buffer_unlock_commit(tr->buffer, event, irq);
+}
+
+static void trace_bts_at(struct trace_array *tr, size_t index)
+{
+ const void *raw = NULL;
+ unsigned long from, to;
+ int err;
+
+ err = ds_access_bts(this_tracer, index, &raw);
+ if (err < 0)
+ return;
+
+ from = *(const unsigned long *)raw;
+ to = *(const unsigned long *)((const char *)raw + sizeof_field);
+
+ trace_bts(tr, from, to);
+}
+
+static void trace_bts_cpu(void *arg)
+{
+ struct trace_array *tr = (struct trace_array *) arg;
+ size_t index = 0, end = 0, i;
+ int err;
+
+ if (!this_tracer)
+ return;
+
+ bts_disable();
+
+ err = ds_get_bts_index(this_tracer, &index);
+ if (err < 0)
+ goto out;
+
+ err = ds_get_bts_end(this_tracer, &end);
+ if (err < 0)
+ goto out;
+
+ for (i = index; i < end; i++)
+ trace_bts_at(tr, i);
+
+ for (i = 0; i < index; i++)
+ trace_bts_at(tr, i);
+
+out:
+ bts_enable();
+}
+
+static void trace_bts_prepare(struct trace_iterator *iter)
+{
+ int cpu;
+
+ for_each_cpu_mask(cpu, cpu_possible_map)
+ smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
+}
+
+struct tracer bts_tracer __read_mostly =
+{
+ .name = "bts",
+ .init = bts_trace_init,
+ .reset = bts_trace_stop,
+ .print_header = bts_trace_print_header,
+ .print_line = bts_trace_print_line,
+ .start = bts_trace_start,
+ .stop = bts_trace_stop,
+ .open = trace_bts_prepare
+};
+
+__init static int init_bts_trace(void)
+{
+ return register_tracer(&bts_tracer);
+}
+device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index e90eb0c2c56..e74f6d0a321 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -42,32 +42,28 @@ static void stop_function_trace(struct trace_array *tr)
tracing_stop_cmdline_record();
}
-static void function_trace_init(struct trace_array *tr)
+static int function_trace_init(struct trace_array *tr)
{
- if (tr->ctrl)
- start_function_trace(tr);
+ start_function_trace(tr);
+ return 0;
}
static void function_trace_reset(struct trace_array *tr)
{
- if (tr->ctrl)
- stop_function_trace(tr);
+ stop_function_trace(tr);
}
-static void function_trace_ctrl_update(struct trace_array *tr)
+static void function_trace_start(struct trace_array *tr)
{
- if (tr->ctrl)
- start_function_trace(tr);
- else
- stop_function_trace(tr);
+ function_reset(tr);
}
static struct tracer function_trace __read_mostly =
{
- .name = "ftrace",
+ .name = "function",
.init = function_trace_init,
.reset = function_trace_reset,
- .ctrl_update = function_trace_ctrl_update,
+ .start = function_trace_start,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_function,
#endif
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
new file mode 100644
index 00000000000..894b50bca31
--- /dev/null
+++ b/kernel/trace/trace_functions_graph.c
@@ -0,0 +1,400 @@
+/*
+ *
+ * Function graph tracer.
+ * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ * Mostly borrowed from function tracer which
+ * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+#define TRACE_GRAPH_INDENT 2
+
+/* Flag options */
+#define TRACE_GRAPH_PRINT_OVERRUN 0x1
+#define TRACE_GRAPH_PRINT_CPU 0x2
+#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
+
+static struct tracer_opt trace_opts[] = {
+ /* Display overruns ? */
+ { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
+ /* Display CPU ? */
+ { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) },
+ /* Display Overhead ? */
+ { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) },
+ { } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+ /* Don't display overruns by default */
+ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD,
+ .opts = trace_opts
+};
+
+/* pid on the last trace processed */
+static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
+
+static int graph_trace_init(struct trace_array *tr)
+{
+ int cpu, ret;
+
+ for_each_online_cpu(cpu)
+ tracing_reset(tr, cpu);
+
+ ret = register_ftrace_graph(&trace_graph_return,
+ &trace_graph_entry);
+ if (ret)
+ return ret;
+ tracing_start_cmdline_record();
+
+ return 0;
+}
+
+static void graph_trace_reset(struct trace_array *tr)
+{
+ tracing_stop_cmdline_record();
+ unregister_ftrace_graph();
+}
+
+static inline int log10_cpu(int nb)
+{
+ if (nb / 100)
+ return 3;
+ if (nb / 10)
+ return 2;
+ return 1;
+}
+
+static enum print_line_t
+print_graph_cpu(struct trace_seq *s, int cpu)
+{
+ int i;
+ int ret;
+ int log10_this = log10_cpu(cpu);
+ int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map));
+
+
+ /*
+ * Start with a space character - to make it stand out
+ * to the right a bit when trace output is pasted into
+ * email:
+ */
+ ret = trace_seq_printf(s, " ");
+
+ /*
+ * Tricky - we space the CPU field according to the max
+ * number of online CPUs. On a 2-cpu system it would take
+ * a maximum of 1 digit - on a 128 cpu system it would
+ * take up to 3 digits:
+ */
+ for (i = 0; i < log10_all - log10_this; i++) {
+ ret = trace_seq_printf(s, " ");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+ ret = trace_seq_printf(s, "%d) ", cpu);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+
+/* If the pid changed since the last trace, output this event */
+static int verif_pid(struct trace_seq *s, pid_t pid, int cpu)
+{
+ char *comm, *prev_comm;
+ pid_t prev_pid;
+ int ret;
+
+ if (last_pid[cpu] != -1 && last_pid[cpu] == pid)
+ return 1;
+
+ prev_pid = last_pid[cpu];
+ last_pid[cpu] = pid;
+
+ comm = trace_find_cmdline(pid);
+ prev_comm = trace_find_cmdline(prev_pid);
+
+/*
+ * Context-switch trace line:
+
+ ------------------------------------------
+ | 1) migration/0--1 => sshd-1755
+ ------------------------------------------
+
+ */
+ ret = trace_seq_printf(s,
+ " ------------------------------------------\n");
+ ret += trace_seq_printf(s, " | %d) %s-%d => %s-%d\n",
+ cpu, prev_comm, prev_pid, comm, pid);
+ ret += trace_seq_printf(s,
+ " ------------------------------------------\n\n");
+ return ret;
+}
+
+static bool
+trace_branch_is_leaf(struct trace_iterator *iter,
+ struct ftrace_graph_ent_entry *curr)
+{
+ struct ring_buffer_iter *ring_iter;
+ struct ring_buffer_event *event;
+ struct ftrace_graph_ret_entry *next;
+
+ ring_iter = iter->buffer_iter[iter->cpu];
+
+ if (!ring_iter)
+ return false;
+
+ event = ring_buffer_iter_peek(ring_iter, NULL);
+
+ if (!event)
+ return false;
+
+ next = ring_buffer_event_data(event);
+
+ if (next->ent.type != TRACE_GRAPH_RET)
+ return false;
+
+ if (curr->ent.pid != next->ent.pid ||
+ curr->graph_ent.func != next->ret.func)
+ return false;
+
+ return true;
+}
+
+
+static inline int
+print_graph_duration(unsigned long long duration, struct trace_seq *s)
+{
+ unsigned long nsecs_rem = do_div(duration, 1000);
+ return trace_seq_printf(s, "%4llu.%3lu us | ", duration, nsecs_rem);
+}
+
+/* Signal a overhead of time execution to the output */
+static int
+print_graph_overhead(unsigned long long duration, struct trace_seq *s)
+{
+ /* Duration exceeded 100 msecs */
+ if (duration > 100000ULL)
+ return trace_seq_printf(s, "! ");
+
+ /* Duration exceeded 10 msecs */
+ if (duration > 10000ULL)
+ return trace_seq_printf(s, "+ ");
+
+ return trace_seq_printf(s, " ");
+}
+
+/* Case of a leaf function on its call entry */
+static enum print_line_t
+print_graph_entry_leaf(struct trace_iterator *iter,
+ struct ftrace_graph_ent_entry *entry, struct trace_seq *s)
+{
+ struct ftrace_graph_ret_entry *ret_entry;
+ struct ftrace_graph_ret *graph_ret;
+ struct ring_buffer_event *event;
+ struct ftrace_graph_ent *call;
+ unsigned long long duration;
+ int ret;
+ int i;
+
+ event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
+ ret_entry = ring_buffer_event_data(event);
+ graph_ret = &ret_entry->ret;
+ call = &entry->graph_ent;
+ duration = graph_ret->rettime - graph_ret->calltime;
+
+ /* Must not exceed 8 characters: 9999.999 us */
+ if (duration > 10000000ULL)
+ duration = 9999999ULL;
+
+ /* Overhead */
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+ ret = print_graph_overhead(duration, s);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+
+ /* Duration */
+ ret = print_graph_duration(duration, s);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Function */
+ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
+ ret = trace_seq_printf(s, " ");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+
+ ret = seq_print_ip_sym(s, call->func, 0);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ ret = trace_seq_printf(s, "();\n");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
+ struct trace_seq *s)
+{
+ int i;
+ int ret;
+ struct ftrace_graph_ent *call = &entry->graph_ent;
+
+ /* No overhead */
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+ ret = trace_seq_printf(s, " ");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+
+ /* No time */
+ ret = trace_seq_printf(s, " | ");
+
+ /* Function */
+ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
+ ret = trace_seq_printf(s, " ");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+
+ ret = seq_print_ip_sym(s, call->func, 0);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ ret = trace_seq_printf(s, "() {\n");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
+ struct trace_iterator *iter, int cpu)
+{
+ int ret;
+ struct trace_entry *ent = iter->ent;
+
+ /* Pid */
+ if (!verif_pid(s, ent->pid, cpu))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Cpu */
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+ ret = print_graph_cpu(s, cpu);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+
+ if (trace_branch_is_leaf(iter, field))
+ return print_graph_entry_leaf(iter, field, s);
+ else
+ return print_graph_entry_nested(field, s);
+
+}
+
+static enum print_line_t
+print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
+ struct trace_entry *ent, int cpu)
+{
+ int i;
+ int ret;
+ unsigned long long duration = trace->rettime - trace->calltime;
+
+ /* Must not exceed 8 characters: xxxx.yyy us */
+ if (duration > 10000000ULL)
+ duration = 9999999ULL;
+
+ /* Pid */
+ if (!verif_pid(s, ent->pid, cpu))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Cpu */
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+ ret = print_graph_cpu(s, cpu);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+
+ /* Overhead */
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+ ret = print_graph_overhead(duration, s);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+
+ /* Duration */
+ ret = print_graph_duration(duration, s);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Closing brace */
+ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
+ ret = trace_seq_printf(s, " ");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+
+ ret = trace_seq_printf(s, "}\n");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Overrun */
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
+ ret = trace_seq_printf(s, " (Overruns: %lu)\n",
+ trace->overrun);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+ return TRACE_TYPE_HANDLED;
+}
+
+enum print_line_t
+print_graph_function(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry = iter->ent;
+
+ switch (entry->type) {
+ case TRACE_GRAPH_ENT: {
+ struct ftrace_graph_ent_entry *field;
+ trace_assign_type(field, entry);
+ return print_graph_entry(field, s, iter,
+ iter->cpu);
+ }
+ case TRACE_GRAPH_RET: {
+ struct ftrace_graph_ret_entry *field;
+ trace_assign_type(field, entry);
+ return print_graph_return(&field->ret, s, entry, iter->cpu);
+ }
+ default:
+ return TRACE_TYPE_UNHANDLED;
+ }
+}
+
+static struct tracer graph_trace __read_mostly = {
+ .name = "function_graph",
+ .init = graph_trace_init,
+ .reset = graph_trace_reset,
+ .print_line = print_graph_function,
+ .flags = &tracer_flags,
+};
+
+static __init int init_graph_trace(void)
+{
+ return register_tracer(&graph_trace);
+}
+
+device_initcall(init_graph_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index a7db7f040ae..7c2e326bbc8 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -63,7 +63,7 @@ irq_trace(void)
*/
static __cacheline_aligned_in_smp unsigned long max_sequence;
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
/*
* irqsoff uses its own tracer function to keep the overhead down:
*/
@@ -104,7 +104,7 @@ static struct ftrace_ops trace_ops __read_mostly =
{
.func = irqsoff_tracer_call,
};
-#endif /* CONFIG_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
/*
* Should this new latency be reported/recorded?
@@ -353,15 +353,28 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
}
#endif /* CONFIG_PREEMPT_TRACER */
+/*
+ * save_tracer_enabled is used to save the state of the tracer_enabled
+ * variable when we disable it when we open a trace output file.
+ */
+static int save_tracer_enabled;
+
static void start_irqsoff_tracer(struct trace_array *tr)
{
register_ftrace_function(&trace_ops);
- tracer_enabled = 1;
+ if (tracing_is_enabled()) {
+ tracer_enabled = 1;
+ save_tracer_enabled = 1;
+ } else {
+ tracer_enabled = 0;
+ save_tracer_enabled = 0;
+ }
}
static void stop_irqsoff_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
+ save_tracer_enabled = 0;
unregister_ftrace_function(&trace_ops);
}
@@ -370,53 +383,55 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
irqsoff_trace = tr;
/* make sure that the tracer is visible */
smp_wmb();
-
- if (tr->ctrl)
- start_irqsoff_tracer(tr);
+ start_irqsoff_tracer(tr);
}
static void irqsoff_tracer_reset(struct trace_array *tr)
{
- if (tr->ctrl)
- stop_irqsoff_tracer(tr);
+ stop_irqsoff_tracer(tr);
}
-static void irqsoff_tracer_ctrl_update(struct trace_array *tr)
+static void irqsoff_tracer_start(struct trace_array *tr)
{
- if (tr->ctrl)
- start_irqsoff_tracer(tr);
- else
- stop_irqsoff_tracer(tr);
+ tracer_enabled = 1;
+ save_tracer_enabled = 1;
+}
+
+static void irqsoff_tracer_stop(struct trace_array *tr)
+{
+ tracer_enabled = 0;
+ save_tracer_enabled = 0;
}
static void irqsoff_tracer_open(struct trace_iterator *iter)
{
/* stop the trace while dumping */
- if (iter->tr->ctrl)
- stop_irqsoff_tracer(iter->tr);
+ tracer_enabled = 0;
}
static void irqsoff_tracer_close(struct trace_iterator *iter)
{
- if (iter->tr->ctrl)
- start_irqsoff_tracer(iter->tr);
+ /* restart tracing */
+ tracer_enabled = save_tracer_enabled;
}
#ifdef CONFIG_IRQSOFF_TRACER
-static void irqsoff_tracer_init(struct trace_array *tr)
+static int irqsoff_tracer_init(struct trace_array *tr)
{
trace_type = TRACER_IRQS_OFF;
__irqsoff_tracer_init(tr);
+ return 0;
}
static struct tracer irqsoff_tracer __read_mostly =
{
.name = "irqsoff",
.init = irqsoff_tracer_init,
.reset = irqsoff_tracer_reset,
+ .start = irqsoff_tracer_start,
+ .stop = irqsoff_tracer_stop,
.open = irqsoff_tracer_open,
.close = irqsoff_tracer_close,
- .ctrl_update = irqsoff_tracer_ctrl_update,
.print_max = 1,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_irqsoff,
@@ -428,11 +443,12 @@ static struct tracer irqsoff_tracer __read_mostly =
#endif
#ifdef CONFIG_PREEMPT_TRACER
-static void preemptoff_tracer_init(struct trace_array *tr)
+static int preemptoff_tracer_init(struct trace_array *tr)
{
trace_type = TRACER_PREEMPT_OFF;
__irqsoff_tracer_init(tr);
+ return 0;
}
static struct tracer preemptoff_tracer __read_mostly =
@@ -440,9 +456,10 @@ static struct tracer preemptoff_tracer __read_mostly =
.name = "preemptoff",
.init = preemptoff_tracer_init,
.reset = irqsoff_tracer_reset,
+ .start = irqsoff_tracer_start,
+ .stop = irqsoff_tracer_stop,
.open = irqsoff_tracer_open,
.close = irqsoff_tracer_close,
- .ctrl_update = irqsoff_tracer_ctrl_update,
.print_max = 1,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptoff,
@@ -456,11 +473,12 @@ static struct tracer preemptoff_tracer __read_mostly =
#if defined(CONFIG_IRQSOFF_TRACER) && \
defined(CONFIG_PREEMPT_TRACER)
-static void preemptirqsoff_tracer_init(struct trace_array *tr)
+static int preemptirqsoff_tracer_init(struct trace_array *tr)
{
trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
__irqsoff_tracer_init(tr);
+ return 0;
}
static struct tracer preemptirqsoff_tracer __read_mostly =
@@ -468,9 +486,10 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
.name = "preemptirqsoff",
.init = preemptirqsoff_tracer_init,
.reset = irqsoff_tracer_reset,
+ .start = irqsoff_tracer_start,
+ .stop = irqsoff_tracer_stop,
.open = irqsoff_tracer_open,
.close = irqsoff_tracer_close,
- .ctrl_update = irqsoff_tracer_ctrl_update,
.print_max = 1,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptirqsoff,
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index f28484618ff..2a98a206acc 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -18,46 +18,43 @@ struct header_iter {
static struct trace_array *mmio_trace_array;
static bool overrun_detected;
+static unsigned long prev_overruns;
static void mmio_reset_data(struct trace_array *tr)
{
int cpu;
overrun_detected = false;
+ prev_overruns = 0;
tr->time_start = ftrace_now(tr->cpu);
for_each_online_cpu(cpu)
tracing_reset(tr, cpu);
}
-static void mmio_trace_init(struct trace_array *tr)
+static int mmio_trace_init(struct trace_array *tr)
{
pr_debug("in %s\n", __func__);
mmio_trace_array = tr;
- if (tr->ctrl) {
- mmio_reset_data(tr);
- enable_mmiotrace();
- }
+
+ mmio_reset_data(tr);
+ enable_mmiotrace();
+ return 0;
}
static void mmio_trace_reset(struct trace_array *tr)
{
pr_debug("in %s\n", __func__);
- if (tr->ctrl)
- disable_mmiotrace();
+
+ disable_mmiotrace();
mmio_reset_data(tr);
mmio_trace_array = NULL;
}
-static void mmio_trace_ctrl_update(struct trace_array *tr)
+static void mmio_trace_start(struct trace_array *tr)
{
pr_debug("in %s\n", __func__);
- if (tr->ctrl) {
- mmio_reset_data(tr);
- enable_mmiotrace();
- } else {
- disable_mmiotrace();
- }
+ mmio_reset_data(tr);
}
static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
@@ -128,16 +125,12 @@ static void mmio_close(struct trace_iterator *iter)
static unsigned long count_overruns(struct trace_iterator *iter)
{
- int cpu;
unsigned long cnt = 0;
-/* FIXME: */
-#if 0
- for_each_online_cpu(cpu) {
- cnt += iter->overrun[cpu];
- iter->overrun[cpu] = 0;
- }
-#endif
- (void)cpu;
+ unsigned long over = ring_buffer_overruns(iter->tr->buffer);
+
+ if (over > prev_overruns)
+ cnt = over - prev_overruns;
+ prev_overruns = over;
return cnt;
}
@@ -298,10 +291,10 @@ static struct tracer mmio_tracer __read_mostly =
.name = "mmiotrace",
.init = mmio_trace_init,
.reset = mmio_trace_reset,
+ .start = mmio_trace_start,
.pipe_open = mmio_pipe_open,
.close = mmio_close,
.read = mmio_read,
- .ctrl_update = mmio_trace_ctrl_update,
.print_line = mmio_print_line,
};
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 4592b486251..b9767acd30a 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -12,6 +12,27 @@
#include "trace.h"
+/* Our two options */
+enum {
+ TRACE_NOP_OPT_ACCEPT = 0x1,
+ TRACE_NOP_OPT_REFUSE = 0x2
+};
+
+/* Options for the tracer (see trace_options file) */
+static struct tracer_opt nop_opts[] = {
+ /* Option that will be accepted by set_flag callback */
+ { TRACER_OPT(test_nop_accept, TRACE_NOP_OPT_ACCEPT) },
+ /* Option that will be refused by set_flag callback */
+ { TRACER_OPT(test_nop_refuse, TRACE_NOP_OPT_REFUSE) },
+ { } /* Always set a last empty entry */
+};
+
+static struct tracer_flags nop_flags = {
+ /* You can check your flags value here when you want. */
+ .val = 0, /* By default: all flags disabled */
+ .opts = nop_opts
+};
+
static struct trace_array *ctx_trace;
static void start_nop_trace(struct trace_array *tr)
@@ -24,7 +45,7 @@ static void stop_nop_trace(struct trace_array *tr)
/* Nothing to do! */
}
-static void nop_trace_init(struct trace_array *tr)
+static int nop_trace_init(struct trace_array *tr)
{
int cpu;
ctx_trace = tr;
@@ -32,33 +53,53 @@ static void nop_trace_init(struct trace_array *tr)
for_each_online_cpu(cpu)
tracing_reset(tr, cpu);
- if (tr->ctrl)
- start_nop_trace(tr);
+ start_nop_trace(tr);
+ return 0;
}
static void nop_trace_reset(struct trace_array *tr)
{
- if (tr->ctrl)
- stop_nop_trace(tr);
+ stop_nop_trace(tr);
}
-static void nop_trace_ctrl_update(struct trace_array *tr)
+/* It only serves as a signal handler and a callback to
+ * accept or refuse tthe setting of a flag.
+ * If you don't implement it, then the flag setting will be
+ * automatically accepted.
+ */
+static int nop_set_flag(u32 old_flags, u32 bit, int set)
{
- /* When starting a new trace, reset the buffers */
- if (tr->ctrl)
- start_nop_trace(tr);
- else
- stop_nop_trace(tr);
+ /*
+ * Note that you don't need to update nop_flags.val yourself.
+ * The tracing Api will do it automatically if you return 0
+ */
+ if (bit == TRACE_NOP_OPT_ACCEPT) {
+ printk(KERN_DEBUG "nop_test_accept flag set to %d: we accept."
+ " Now cat trace_options to see the result\n",
+ set);
+ return 0;
+ }
+
+ if (bit == TRACE_NOP_OPT_REFUSE) {
+ printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse."
+ "Now cat trace_options to see the result\n",
+ set);
+ return -EINVAL;
+ }
+
+ return 0;
}
+
struct tracer nop_trace __read_mostly =
{
.name = "nop",
.init = nop_trace_init,
.reset = nop_trace_reset,
- .ctrl_update = nop_trace_ctrl_update,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_nop,
#endif
+ .flags = &nop_flags,
+ .set_flag = nop_set_flag
};
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
new file mode 100644
index 00000000000..a7172a352f6
--- /dev/null
+++ b/kernel/trace/trace_power.c
@@ -0,0 +1,179 @@
+/*
+ * ring buffer based C-state tracer
+ *
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * Much is borrowed from trace_boot.c which is
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+
+#include "trace.h"
+
+static struct trace_array *power_trace;
+static int __read_mostly trace_power_enabled;
+
+
+static void start_power_trace(struct trace_array *tr)
+{
+ trace_power_enabled = 1;
+}
+
+static void stop_power_trace(struct trace_array *tr)
+{
+ trace_power_enabled = 0;
+}
+
+
+static int power_trace_init(struct trace_array *tr)
+{
+ int cpu;
+ power_trace = tr;
+
+ trace_power_enabled = 1;
+
+ for_each_cpu_mask(cpu, cpu_possible_map)
+ tracing_reset(tr, cpu);
+ return 0;
+}
+
+static enum print_line_t power_print_line(struct trace_iterator *iter)
+{
+ int ret = 0;
+ struct trace_entry *entry = iter->ent;
+ struct trace_power *field ;
+ struct power_trace *it;
+ struct trace_seq *s = &iter->seq;
+ struct timespec stamp;
+ struct timespec duration;
+
+ trace_assign_type(field, entry);
+ it = &field->state_data;
+ stamp = ktime_to_timespec(it->stamp);
+ duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
+
+ if (entry->type == TRACE_POWER) {
+ if (it->type == POWER_CSTATE)
+ ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
+ stamp.tv_sec,
+ stamp.tv_nsec,
+ it->state, iter->cpu,
+ duration.tv_sec,
+ duration.tv_nsec);
+ if (it->type == POWER_PSTATE)
+ ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
+ stamp.tv_sec,
+ stamp.tv_nsec,
+ it->state, iter->cpu);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ return TRACE_TYPE_HANDLED;
+ }
+ return TRACE_TYPE_UNHANDLED;
+}
+
+static struct tracer power_tracer __read_mostly =
+{
+ .name = "power",
+ .init = power_trace_init,
+ .start = start_power_trace,
+ .stop = stop_power_trace,
+ .reset = stop_power_trace,
+ .print_line = power_print_line,
+};
+
+static int init_power_trace(void)
+{
+ return register_tracer(&power_tracer);
+}
+device_initcall(init_power_trace);
+
+void trace_power_start(struct power_trace *it, unsigned int type,
+ unsigned int level)
+{
+ if (!trace_power_enabled)
+ return;
+
+ memset(it, 0, sizeof(struct power_trace));
+ it->state = level;
+ it->type = type;
+ it->stamp = ktime_get();
+}
+EXPORT_SYMBOL_GPL(trace_power_start);
+
+
+void trace_power_end(struct power_trace *it)
+{
+ struct ring_buffer_event *event;
+ struct trace_power *entry;
+ struct trace_array_cpu *data;
+ unsigned long irq_flags;
+ struct trace_array *tr = power_trace;
+
+ if (!trace_power_enabled)
+ return;
+
+ preempt_disable();
+ it->end = ktime_get();
+ data = tr->data[smp_processor_id()];
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ goto out;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, 0, 0);
+ entry->ent.type = TRACE_POWER;
+ entry->state_data = *it;
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+ trace_wake_up();
+
+ out:
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(trace_power_end);
+
+void trace_power_mark(struct power_trace *it, unsigned int type,
+ unsigned int level)
+{
+ struct ring_buffer_event *event;
+ struct trace_power *entry;
+ struct trace_array_cpu *data;
+ unsigned long irq_flags;
+ struct trace_array *tr = power_trace;
+
+ if (!trace_power_enabled)
+ return;
+
+ memset(it, 0, sizeof(struct power_trace));
+ it->state = level;
+ it->type = type;
+ it->stamp = ktime_get();
+ preempt_disable();
+ it->end = it->stamp;
+ data = tr->data[smp_processor_id()];
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ goto out;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, 0, 0);
+ entry->ent.type = TRACE_POWER;
+ entry->state_data = *it;
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+ trace_wake_up();
+
+ out:
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(trace_power_mark);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index b8f56beb1a6..863390557b4 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -16,7 +16,8 @@
static struct trace_array *ctx_trace;
static int __read_mostly tracer_enabled;
-static atomic_t sched_ref;
+static int sched_ref;
+static DEFINE_MUTEX(sched_register_mutex);
static void
probe_sched_switch(struct rq *__rq, struct task_struct *prev,
@@ -27,7 +28,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
int cpu;
int pc;
- if (!atomic_read(&sched_ref))
+ if (!sched_ref)
return;
tracing_record_cmdline(prev);
@@ -123,20 +124,18 @@ static void tracing_sched_unregister(void)
static void tracing_start_sched_switch(void)
{
- long ref;
-
- ref = atomic_inc_return(&sched_ref);
- if (ref == 1)
+ mutex_lock(&sched_register_mutex);
+ if (!(sched_ref++))
tracing_sched_register();
+ mutex_unlock(&sched_register_mutex);
}
static void tracing_stop_sched_switch(void)
{
- long ref;
-
- ref = atomic_dec_and_test(&sched_ref);
- if (ref)
+ mutex_lock(&sched_register_mutex);
+ if (!(--sched_ref))
tracing_sched_unregister();
+ mutex_unlock(&sched_register_mutex);
}
void tracing_start_cmdline_record(void)
@@ -149,40 +148,86 @@ void tracing_stop_cmdline_record(void)
tracing_stop_sched_switch();
}
+/**
+ * tracing_start_sched_switch_record - start tracing context switches
+ *
+ * Turns on context switch tracing for a tracer.
+ */
+void tracing_start_sched_switch_record(void)
+{
+ if (unlikely(!ctx_trace)) {
+ WARN_ON(1);
+ return;
+ }
+
+ tracing_start_sched_switch();
+
+ mutex_lock(&sched_register_mutex);
+ tracer_enabled++;
+ mutex_unlock(&sched_register_mutex);
+}
+
+/**
+ * tracing_stop_sched_switch_record - start tracing context switches
+ *
+ * Turns off context switch tracing for a tracer.
+ */
+void tracing_stop_sched_switch_record(void)
+{
+ mutex_lock(&sched_register_mutex);
+ tracer_enabled--;
+ WARN_ON(tracer_enabled < 0);
+ mutex_unlock(&sched_register_mutex);
+
+ tracing_stop_sched_switch();
+}
+
+/**
+ * tracing_sched_switch_assign_trace - assign a trace array for ctx switch
+ * @tr: trace array pointer to assign
+ *
+ * Some tracers might want to record the context switches in their
+ * trace. This function lets those tracers assign the trace array
+ * to use.
+ */
+void tracing_sched_switch_assign_trace(struct trace_array *tr)
+{
+ ctx_trace = tr;
+}
+
static void start_sched_trace(struct trace_array *tr)
{
sched_switch_reset(tr);
- tracing_start_cmdline_record();
- tracer_enabled = 1;
+ tracing_start_sched_switch_record();
}
static void stop_sched_trace(struct trace_array *tr)
{
- tracer_enabled = 0;
- tracing_stop_cmdline_record();
+ tracing_stop_sched_switch_record();
}
-static void sched_switch_trace_init(struct trace_array *tr)
+static int sched_switch_trace_init(struct trace_array *tr)
{
ctx_trace = tr;
-
- if (tr->ctrl)
- start_sched_trace(tr);
+ start_sched_trace(tr);
+ return 0;
}
static void sched_switch_trace_reset(struct trace_array *tr)
{
- if (tr->ctrl)
+ if (sched_ref)
stop_sched_trace(tr);
}
-static void sched_switch_trace_ctrl_update(struct trace_array *tr)
+static void sched_switch_trace_start(struct trace_array *tr)
{
- /* When starting a new trace, reset the buffers */
- if (tr->ctrl)
- start_sched_trace(tr);
- else
- stop_sched_trace(tr);
+ sched_switch_reset(tr);
+ tracing_start_sched_switch();
+}
+
+static void sched_switch_trace_stop(struct trace_array *tr)
+{
+ tracing_stop_sched_switch();
}
static struct tracer sched_switch_trace __read_mostly =
@@ -190,7 +235,8 @@ static struct tracer sched_switch_trace __read_mostly =
.name = "sched_switch",
.init = sched_switch_trace_init,
.reset = sched_switch_trace_reset,
- .ctrl_update = sched_switch_trace_ctrl_update,
+ .start = sched_switch_trace_start,
+ .stop = sched_switch_trace_stop,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_sched_switch,
#endif
@@ -198,14 +244,6 @@ static struct tracer sched_switch_trace __read_mostly =
__init static int init_sched_switch_trace(void)
{
- int ret = 0;
-
- if (atomic_read(&sched_ref))
- ret = tracing_sched_register();
- if (ret) {
- pr_info("error registering scheduler trace\n");
- return ret;
- }
return register_tracer(&sched_switch_trace);
}
device_initcall(init_sched_switch_trace);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index fe4a252c236..0067b49746c 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -31,7 +31,7 @@ static raw_spinlock_t wakeup_lock =
static void __wakeup_reset(struct trace_array *tr);
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
/*
* irqsoff uses its own tracer function to keep the overhead down:
*/
@@ -50,8 +50,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
return;
pc = preempt_count();
- resched = need_resched();
- preempt_disable_notrace();
+ resched = ftrace_preempt_disable();
cpu = raw_smp_processor_id();
data = tr->data[cpu];
@@ -81,22 +80,14 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
out:
atomic_dec(&data->disabled);
- /*
- * To prevent recursion from the scheduler, if the
- * resched flag was set before we entered, then
- * don't reschedule.
- */
- if (resched)
- preempt_enable_no_resched_notrace();
- else
- preempt_enable_notrace();
+ ftrace_preempt_enable(resched);
}
static struct ftrace_ops trace_ops __read_mostly =
{
.func = wakeup_tracer_call,
};
-#endif /* CONFIG_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
/*
* Should this new latency be reported/recorded?
@@ -271,6 +262,12 @@ out:
atomic_dec(&wakeup_trace->data[cpu]->disabled);
}
+/*
+ * save_tracer_enabled is used to save the state of the tracer_enabled
+ * variable when we disable it when we open a trace output file.
+ */
+static int save_tracer_enabled;
+
static void start_wakeup_tracer(struct trace_array *tr)
{
int ret;
@@ -309,7 +306,13 @@ static void start_wakeup_tracer(struct trace_array *tr)
register_ftrace_function(&trace_ops);
- tracer_enabled = 1;
+ if (tracing_is_enabled()) {
+ tracer_enabled = 1;
+ save_tracer_enabled = 1;
+ } else {
+ tracer_enabled = 0;
+ save_tracer_enabled = 0;
+ }
return;
fail_deprobe_wake_new:
@@ -321,49 +324,53 @@ fail_deprobe:
static void stop_wakeup_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
+ save_tracer_enabled = 0;
unregister_ftrace_function(&trace_ops);
unregister_trace_sched_switch(probe_wakeup_sched_switch);
unregister_trace_sched_wakeup_new(probe_wakeup);
unregister_trace_sched_wakeup(probe_wakeup);
}
-static void wakeup_tracer_init(struct trace_array *tr)
+static int wakeup_tracer_init(struct trace_array *tr)
{
wakeup_trace = tr;
-
- if (tr->ctrl)
- start_wakeup_tracer(tr);
+ start_wakeup_tracer(tr);
+ return 0;
}
static void wakeup_tracer_reset(struct trace_array *tr)
{
- if (tr->ctrl) {
- stop_wakeup_tracer(tr);
- /* make sure we put back any tasks we are tracing */
- wakeup_reset(tr);
- }
+ stop_wakeup_tracer(tr);
+ /* make sure we put back any tasks we are tracing */
+ wakeup_reset(tr);
+}
+
+static void wakeup_tracer_start(struct trace_array *tr)
+{
+ wakeup_reset(tr);
+ tracer_enabled = 1;
+ save_tracer_enabled = 1;
}
-static void wakeup_tracer_ctrl_update(struct trace_array *tr)
+static void wakeup_tracer_stop(struct trace_array *tr)
{
- if (tr->ctrl)
- start_wakeup_tracer(tr);
- else
- stop_wakeup_tracer(tr);
+ tracer_enabled = 0;
+ save_tracer_enabled = 0;
}
static void wakeup_tracer_open(struct trace_iterator *iter)
{
/* stop the trace while dumping */
- if (iter->tr->ctrl)
- stop_wakeup_tracer(iter->tr);
+ tracer_enabled = 0;
}
static void wakeup_tracer_close(struct trace_iterator *iter)
{
/* forget about any processes we were recording */
- if (iter->tr->ctrl)
- start_wakeup_tracer(iter->tr);
+ if (save_tracer_enabled) {
+ wakeup_reset(iter->tr);
+ tracer_enabled = 1;
+ }
}
static struct tracer wakeup_tracer __read_mostly =
@@ -371,9 +378,10 @@ static struct tracer wakeup_tracer __read_mostly =
.name = "wakeup",
.init = wakeup_tracer_init,
.reset = wakeup_tracer_reset,
+ .start = wakeup_tracer_start,
+ .stop = wakeup_tracer_stop,
.open = wakeup_tracer_open,
.close = wakeup_tracer_close,
- .ctrl_update = wakeup_tracer_ctrl_update,
.print_max = 1,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 09cf230d7ec..88c8eb70f54 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -13,6 +13,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
case TRACE_STACK:
case TRACE_PRINT:
case TRACE_SPECIAL:
+ case TRACE_BRANCH:
return 1;
}
return 0;
@@ -51,7 +52,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
int cpu, ret = 0;
/* Don't allow flipping of max traces now */
- raw_local_irq_save(flags);
+ local_irq_save(flags);
__raw_spin_lock(&ftrace_max_lock);
cnt = ring_buffer_entries(tr->buffer);
@@ -62,7 +63,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
break;
}
__raw_spin_unlock(&ftrace_max_lock);
- raw_local_irq_restore(flags);
+ local_irq_restore(flags);
if (count)
*count = cnt;
@@ -70,7 +71,12 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
return ret;
}
-#ifdef CONFIG_FTRACE
+static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
+{
+ printk(KERN_WARNING "Failed to init %s tracer, init returned %d\n",
+ trace->name, init_ret);
+}
+#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -99,13 +105,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
/* passed in by parameter to fool gcc from optimizing */
func();
- /* update the records */
- ret = ftrace_force_update();
- if (ret) {
- printk(KERN_CONT ".. ftraced failed .. ");
- return ret;
- }
-
/*
* Some archs *cough*PowerPC*cough* add charachters to the
* start of the function names. We simply put a '*' to
@@ -117,8 +116,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
ftrace_set_filter(func_name, strlen(func_name), 1);
/* enable tracing */
- tr->ctrl = 1;
- trace->init(tr);
+ ret = trace->init(tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ goto out;
+ }
/* Sleep for a 1/10 of a second */
msleep(100);
@@ -141,13 +143,13 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
msleep(100);
/* stop the tracing. */
- tr->ctrl = 0;
- trace->ctrl_update(tr);
+ tracing_stop();
ftrace_enabled = 0;
/* check the trace buffer */
ret = trace_test_buffer(tr, &count);
trace->reset(tr);
+ tracing_start();
/* we should only have one item */
if (!ret && count != 1) {
@@ -155,6 +157,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
ret = -1;
goto out;
}
+
out:
ftrace_enabled = save_ftrace_enabled;
tracer_enabled = save_tracer_enabled;
@@ -183,29 +186,26 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
/* make sure msleep has been recorded */
msleep(1);
- /* force the recorded functions to be traced */
- ret = ftrace_force_update();
- if (ret) {
- printk(KERN_CONT ".. ftraced failed .. ");
- return ret;
- }
-
/* start the tracing */
ftrace_enabled = 1;
tracer_enabled = 1;
- tr->ctrl = 1;
- trace->init(tr);
+ ret = trace->init(tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ goto out;
+ }
+
/* Sleep for a 1/10 of a second */
msleep(100);
/* stop the tracing. */
- tr->ctrl = 0;
- trace->ctrl_update(tr);
+ tracing_stop();
ftrace_enabled = 0;
/* check the trace buffer */
ret = trace_test_buffer(tr, &count);
trace->reset(tr);
+ tracing_start();
if (!ret && !count) {
printk(KERN_CONT ".. no entries found ..");
@@ -226,7 +226,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
return ret;
}
-#endif /* CONFIG_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
#ifdef CONFIG_IRQSOFF_TRACER
int
@@ -237,8 +237,12 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
int ret;
/* start the tracing */
- tr->ctrl = 1;
- trace->init(tr);
+ ret = trace->init(tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ return ret;
+ }
+
/* reset the max latency */
tracing_max_latency = 0;
/* disable interrupts for a bit */
@@ -246,13 +250,13 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
udelay(100);
local_irq_enable();
/* stop the tracing. */
- tr->ctrl = 0;
- trace->ctrl_update(tr);
+ tracing_stop();
/* check both trace buffers */
ret = trace_test_buffer(tr, NULL);
if (!ret)
ret = trace_test_buffer(&max_tr, &count);
trace->reset(tr);
+ tracing_start();
if (!ret && !count) {
printk(KERN_CONT ".. no entries found ..");
@@ -273,9 +277,26 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
unsigned long count;
int ret;
+ /*
+ * Now that the big kernel lock is no longer preemptable,
+ * and this is called with the BKL held, it will always
+ * fail. If preemption is already disabled, simply
+ * pass the test. When the BKL is removed, or becomes
+ * preemptible again, we will once again test this,
+ * so keep it in.
+ */
+ if (preempt_count()) {
+ printk(KERN_CONT "can not test ... force ");
+ return 0;
+ }
+
/* start the tracing */
- tr->ctrl = 1;
- trace->init(tr);
+ ret = trace->init(tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ return ret;
+ }
+
/* reset the max latency */
tracing_max_latency = 0;
/* disable preemption for a bit */
@@ -283,13 +304,13 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
udelay(100);
preempt_enable();
/* stop the tracing. */
- tr->ctrl = 0;
- trace->ctrl_update(tr);
+ tracing_stop();
/* check both trace buffers */
ret = trace_test_buffer(tr, NULL);
if (!ret)
ret = trace_test_buffer(&max_tr, &count);
trace->reset(tr);
+ tracing_start();
if (!ret && !count) {
printk(KERN_CONT ".. no entries found ..");
@@ -310,9 +331,25 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
unsigned long count;
int ret;
+ /*
+ * Now that the big kernel lock is no longer preemptable,
+ * and this is called with the BKL held, it will always
+ * fail. If preemption is already disabled, simply
+ * pass the test. When the BKL is removed, or becomes
+ * preemptible again, we will once again test this,
+ * so keep it in.
+ */
+ if (preempt_count()) {
+ printk(KERN_CONT "can not test ... force ");
+ return 0;
+ }
+
/* start the tracing */
- tr->ctrl = 1;
- trace->init(tr);
+ ret = trace->init(tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ goto out;
+ }
/* reset the max latency */
tracing_max_latency = 0;
@@ -326,27 +363,30 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
local_irq_enable();
/* stop the tracing. */
- tr->ctrl = 0;
- trace->ctrl_update(tr);
+ tracing_stop();
/* check both trace buffers */
ret = trace_test_buffer(tr, NULL);
- if (ret)
+ if (ret) {
+ tracing_start();
goto out;
+ }
ret = trace_test_buffer(&max_tr, &count);
- if (ret)
+ if (ret) {
+ tracing_start();
goto out;
+ }
if (!ret && !count) {
printk(KERN_CONT ".. no entries found ..");
ret = -1;
+ tracing_start();
goto out;
}
/* do the test by disabling interrupts first this time */
tracing_max_latency = 0;
- tr->ctrl = 1;
- trace->ctrl_update(tr);
+ tracing_start();
preempt_disable();
local_irq_disable();
udelay(100);
@@ -355,8 +395,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
local_irq_enable();
/* stop the tracing. */
- tr->ctrl = 0;
- trace->ctrl_update(tr);
+ tracing_stop();
/* check both trace buffers */
ret = trace_test_buffer(tr, NULL);
if (ret)
@@ -372,6 +411,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
out:
trace->reset(tr);
+ tracing_start();
tracing_max_latency = save_max;
return ret;
@@ -437,8 +477,12 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
wait_for_completion(&isrt);
/* start the tracing */
- tr->ctrl = 1;
- trace->init(tr);
+ ret = trace->init(tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ return ret;
+ }
+
/* reset the max latency */
tracing_max_latency = 0;
@@ -462,8 +506,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
msleep(100);
/* stop the tracing. */
- tr->ctrl = 0;
- trace->ctrl_update(tr);
+ tracing_stop();
/* check both trace buffers */
ret = trace_test_buffer(tr, NULL);
if (!ret)
@@ -471,6 +514,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
trace->reset(tr);
+ tracing_start();
tracing_max_latency = save_max;
@@ -494,16 +538,20 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
int ret;
/* start the tracing */
- tr->ctrl = 1;
- trace->init(tr);
+ ret = trace->init(tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ return ret;
+ }
+
/* Sleep for a 1/10 of a second */
msleep(100);
/* stop the tracing. */
- tr->ctrl = 0;
- trace->ctrl_update(tr);
+ tracing_stop();
/* check the trace buffer */
ret = trace_test_buffer(tr, &count);
trace->reset(tr);
+ tracing_start();
if (!ret && !count) {
printk(KERN_CONT ".. no entries found ..");
@@ -522,17 +570,48 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
int ret;
/* start the tracing */
- tr->ctrl = 1;
- trace->init(tr);
+ ret = trace->init(tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ return 0;
+ }
+
/* Sleep for a 1/10 of a second */
msleep(100);
/* stop the tracing. */
- tr->ctrl = 0;
- trace->ctrl_update(tr);
+ tracing_stop();
/* check the trace buffer */
ret = trace_test_buffer(tr, &count);
trace->reset(tr);
+ tracing_start();
return ret;
}
#endif /* CONFIG_SYSPROF_TRACER */
+
+#ifdef CONFIG_BRANCH_TRACER
+int
+trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
+{
+ unsigned long count;
+ int ret;
+
+ /* start the tracing */
+ ret = trace->init(tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ return ret;
+ }
+
+ /* Sleep for a 1/10 of a second */
+ msleep(100);
+ /* stop the tracing. */
+ tracing_stop();
+ /* check the trace buffer */
+ ret = trace_test_buffer(tr, &count);
+ trace->reset(tr);
+ tracing_start();
+
+ return ret;
+}
+#endif /* CONFIG_BRANCH_TRACER */
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 74c5d9a3afa..fde3be15c64 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -44,6 +44,10 @@ static inline void check_stack(void)
if (this_size <= max_stack_size)
return;
+ /* we do not handle interrupt stacks yet */
+ if (!object_is_on_stack(&this_size))
+ return;
+
raw_local_irq_save(flags);
__raw_spin_lock(&max_stack_lock);
@@ -103,8 +107,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
if (unlikely(!ftrace_enabled || stack_trace_disabled))
return;
- resched = need_resched();
- preempt_disable_notrace();
+ resched = ftrace_preempt_disable();
cpu = raw_smp_processor_id();
/* no atomic needed, we only modify this variable by this cpu */
@@ -116,10 +119,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
out:
per_cpu(trace_active, cpu)--;
/* prevent recursion in schedule */
- if (resched)
- preempt_enable_no_resched_notrace();
- else
- preempt_enable_notrace();
+ ftrace_preempt_enable(resched);
}
static struct ftrace_ops trace_ops __read_mostly =
@@ -180,11 +180,16 @@ static struct file_operations stack_max_size_fops = {
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
- long i = (long)m->private;
+ long i;
(*pos)++;
- i++;
+ if (v == SEQ_START_TOKEN)
+ i = 0;
+ else {
+ i = *(long *)v;
+ i++;
+ }
if (i >= max_stack_trace.nr_entries ||
stack_dump_trace[i] == ULONG_MAX)
@@ -197,12 +202,15 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
static void *t_start(struct seq_file *m, loff_t *pos)
{
- void *t = &m->private;
+ void *t = SEQ_START_TOKEN;
loff_t l = 0;
local_irq_disable();
__raw_spin_lock(&max_stack_lock);
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
for (; t && l < *pos; t = t_next(m, t, &l))
;
@@ -231,10 +239,10 @@ static int trace_lookup_stack(struct seq_file *m, long i)
static int t_show(struct seq_file *m, void *v)
{
- long i = *(long *)v;
+ long i;
int size;
- if (i < 0) {
+ if (v == SEQ_START_TOKEN) {
seq_printf(m, " Depth Size Location"
" (%d entries)\n"
" ----- ---- --------\n",
@@ -242,6 +250,8 @@ static int t_show(struct seq_file *m, void *v)
return 0;
}
+ i = *(long *)v;
+
if (i >= max_stack_trace.nr_entries ||
stack_dump_trace[i] == ULONG_MAX)
return 0;
@@ -271,10 +281,6 @@ static int stack_trace_open(struct inode *inode, struct file *file)
int ret;
ret = seq_open(file, &stack_trace_seq_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = (void *)-1;
- }
return ret;
}
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 9587d3bcba5..54960edb96d 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -261,27 +261,17 @@ static void stop_stack_trace(struct trace_array *tr)
mutex_unlock(&sample_timer_lock);
}
-static void stack_trace_init(struct trace_array *tr)
+static int stack_trace_init(struct trace_array *tr)
{
sysprof_trace = tr;
- if (tr->ctrl)
- start_stack_trace(tr);
+ start_stack_trace(tr);
+ return 0;
}
static void stack_trace_reset(struct trace_array *tr)
{
- if (tr->ctrl)
- stop_stack_trace(tr);
-}
-
-static void stack_trace_ctrl_update(struct trace_array *tr)
-{
- /* When starting a new trace, reset the buffers */
- if (tr->ctrl)
- start_stack_trace(tr);
- else
- stop_stack_trace(tr);
+ stop_stack_trace(tr);
}
static struct tracer stack_trace __read_mostly =
@@ -289,7 +279,6 @@ static struct tracer stack_trace __read_mostly =
.name = "sysprof",
.init = stack_trace_init,
.reset = stack_trace_reset,
- .ctrl_update = stack_trace_ctrl_update,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_sysprof,
#endif
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index f2b7c28a470..79602740bbb 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -43,6 +43,7 @@ static DEFINE_MUTEX(tracepoints_mutex);
*/
#define TRACEPOINT_HASH_BITS 6
#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS)
+static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
/*
* Note about RCU :
@@ -54,40 +55,43 @@ struct tracepoint_entry {
struct hlist_node hlist;
void **funcs;
int refcount; /* Number of times armed. 0 if disarmed. */
- struct rcu_head rcu;
- void *oldptr;
- unsigned char rcu_pending:1;
char name[0];
};
-static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
+struct tp_probes {
+ union {
+ struct rcu_head rcu;
+ struct list_head list;
+ } u;
+ void *probes[0];
+};
-static void free_old_closure(struct rcu_head *head)
+static inline void *allocate_probes(int count)
{
- struct tracepoint_entry *entry = container_of(head,
- struct tracepoint_entry, rcu);
- kfree(entry->oldptr);
- /* Make sure we free the data before setting the pending flag to 0 */
- smp_wmb();
- entry->rcu_pending = 0;
+ struct tp_probes *p = kmalloc(count * sizeof(void *)
+ + sizeof(struct tp_probes), GFP_KERNEL);
+ return p == NULL ? NULL : p->probes;
}
-static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old)
+static void rcu_free_old_probes(struct rcu_head *head)
{
- if (!old)
- return;
- entry->oldptr = old;
- entry->rcu_pending = 1;
- /* write rcu_pending before calling the RCU callback */
- smp_wmb();
- call_rcu_sched(&entry->rcu, free_old_closure);
+ kfree(container_of(head, struct tp_probes, u.rcu));
+}
+
+static inline void release_probes(void *old)
+{
+ if (old) {
+ struct tp_probes *tp_probes = container_of(old,
+ struct tp_probes, probes[0]);
+ call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes);
+ }
}
static void debug_print_probes(struct tracepoint_entry *entry)
{
int i;
- if (!tracepoint_debug)
+ if (!tracepoint_debug || !entry->funcs)
return;
for (i = 0; entry->funcs[i]; i++)
@@ -111,12 +115,13 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
return ERR_PTR(-EEXIST);
}
/* + 2 : one for new probe, one for NULL func */
- new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL);
+ new = allocate_probes(nr_probes + 2);
if (new == NULL)
return ERR_PTR(-ENOMEM);
if (old)
memcpy(new, old, nr_probes * sizeof(void *));
new[nr_probes] = probe;
+ new[nr_probes + 1] = NULL;
entry->refcount = nr_probes + 1;
entry->funcs = new;
debug_print_probes(entry);
@@ -131,6 +136,9 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
old = entry->funcs;
+ if (!old)
+ return ERR_PTR(-ENOENT);
+
debug_print_probes(entry);
/* (N -> M), (N > 1, M >= 0) probes */
for (nr_probes = 0; old[nr_probes]; nr_probes++) {
@@ -148,13 +156,13 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
int j = 0;
/* N -> M, (N > 1, M > 0) */
/* + 1 for NULL */
- new = kzalloc((nr_probes - nr_del + 1)
- * sizeof(void *), GFP_KERNEL);
+ new = allocate_probes(nr_probes - nr_del + 1);
if (new == NULL)
return ERR_PTR(-ENOMEM);
for (i = 0; old[i]; i++)
if ((probe && old[i] != probe))
new[j++] = old[i];
+ new[nr_probes - nr_del] = NULL;
entry->refcount = nr_probes - nr_del;
entry->funcs = new;
}
@@ -212,7 +220,6 @@ static struct tracepoint_entry *add_tracepoint(const char *name)
memcpy(&e->name[0], name, name_len);
e->funcs = NULL;
e->refcount = 0;
- e->rcu_pending = 0;
hlist_add_head(&e->hlist, head);
return e;
}
@@ -221,32 +228,10 @@ static struct tracepoint_entry *add_tracepoint(const char *name)
* Remove the tracepoint from the tracepoint hash table. Must be called with
* mutex_lock held.
*/
-static int remove_tracepoint(const char *name)
+static inline void remove_tracepoint(struct tracepoint_entry *e)
{
- struct hlist_head *head;
- struct hlist_node *node;
- struct tracepoint_entry *e;
- int found = 0;
- size_t len = strlen(name) + 1;
- u32 hash = jhash(name, len-1, 0);
-
- head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
- hlist_for_each_entry(e, node, head, hlist) {
- if (!strcmp(name, e->name)) {
- found = 1;
- break;
- }
- }
- if (!found)
- return -ENOENT;
- if (e->refcount)
- return -EBUSY;
hlist_del(&e->hlist);
- /* Make sure the call_rcu_sched has been executed */
- if (e->rcu_pending)
- rcu_barrier_sched();
kfree(e);
- return 0;
}
/*
@@ -277,6 +262,7 @@ static void set_tracepoint(struct tracepoint_entry **entry,
static void disable_tracepoint(struct tracepoint *elem)
{
elem->state = 0;
+ rcu_assign_pointer(elem->funcs, NULL);
}
/**
@@ -317,6 +303,23 @@ static void tracepoint_update_probes(void)
module_update_tracepoints();
}
+static void *tracepoint_add_probe(const char *name, void *probe)
+{
+ struct tracepoint_entry *entry;
+ void *old;
+
+ entry = get_tracepoint(name);
+ if (!entry) {
+ entry = add_tracepoint(name);
+ if (IS_ERR(entry))
+ return entry;
+ }
+ old = tracepoint_entry_add_probe(entry, probe);
+ if (IS_ERR(old) && !entry->refcount)
+ remove_tracepoint(entry);
+ return old;
+}
+
/**
* tracepoint_probe_register - Connect a probe to a tracepoint
* @name: tracepoint name
@@ -327,44 +330,36 @@ static void tracepoint_update_probes(void)
*/
int tracepoint_probe_register(const char *name, void *probe)
{
- struct tracepoint_entry *entry;
- int ret = 0;
void *old;
mutex_lock(&tracepoints_mutex);
- entry = get_tracepoint(name);
- if (!entry) {
- entry = add_tracepoint(name);
- if (IS_ERR(entry)) {
- ret = PTR_ERR(entry);
- goto end;
- }
- }
- /*
- * If we detect that a call_rcu_sched is pending for this tracepoint,
- * make sure it's executed now.
- */
- if (entry->rcu_pending)
- rcu_barrier_sched();
- old = tracepoint_entry_add_probe(entry, probe);
- if (IS_ERR(old)) {
- ret = PTR_ERR(old);
- goto end;
- }
+ old = tracepoint_add_probe(name, probe);
mutex_unlock(&tracepoints_mutex);
+ if (IS_ERR(old))
+ return PTR_ERR(old);
+
tracepoint_update_probes(); /* may update entry */
- mutex_lock(&tracepoints_mutex);
- entry = get_tracepoint(name);
- WARN_ON(!entry);
- if (entry->rcu_pending)
- rcu_barrier_sched();
- tracepoint_entry_free_old(entry, old);
-end:
- mutex_unlock(&tracepoints_mutex);
- return ret;
+ release_probes(old);
+ return 0;
}
EXPORT_SYMBOL_GPL(tracepoint_probe_register);
+static void *tracepoint_remove_probe(const char *name, void *probe)
+{
+ struct tracepoint_entry *entry;
+ void *old;
+
+ entry = get_tracepoint(name);
+ if (!entry)
+ return ERR_PTR(-ENOENT);
+ old = tracepoint_entry_remove_probe(entry, probe);
+ if (IS_ERR(old))
+ return old;
+ if (!entry->refcount)
+ remove_tracepoint(entry);
+ return old;
+}
+
/**
* tracepoint_probe_unregister - Disconnect a probe from a tracepoint
* @name: tracepoint name
@@ -377,33 +372,104 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register);
*/
int tracepoint_probe_unregister(const char *name, void *probe)
{
- struct tracepoint_entry *entry;
void *old;
- int ret = -ENOENT;
mutex_lock(&tracepoints_mutex);
- entry = get_tracepoint(name);
- if (!entry)
- goto end;
- if (entry->rcu_pending)
- rcu_barrier_sched();
- old = tracepoint_entry_remove_probe(entry, probe);
+ old = tracepoint_remove_probe(name, probe);
mutex_unlock(&tracepoints_mutex);
+ if (IS_ERR(old))
+ return PTR_ERR(old);
+
tracepoint_update_probes(); /* may update entry */
+ release_probes(old);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
+
+static LIST_HEAD(old_probes);
+static int need_update;
+
+static void tracepoint_add_old_probes(void *old)
+{
+ need_update = 1;
+ if (old) {
+ struct tp_probes *tp_probes = container_of(old,
+ struct tp_probes, probes[0]);
+ list_add(&tp_probes->u.list, &old_probes);
+ }
+}
+
+/**
+ * tracepoint_probe_register_noupdate - register a probe but not connect
+ * @name: tracepoint name
+ * @probe: probe handler
+ *
+ * caller must call tracepoint_probe_update_all()
+ */
+int tracepoint_probe_register_noupdate(const char *name, void *probe)
+{
+ void *old;
+
mutex_lock(&tracepoints_mutex);
- entry = get_tracepoint(name);
- if (!entry)
- goto end;
- if (entry->rcu_pending)
- rcu_barrier_sched();
- tracepoint_entry_free_old(entry, old);
- remove_tracepoint(name); /* Ignore busy error message */
- ret = 0;
-end:
+ old = tracepoint_add_probe(name, probe);
+ if (IS_ERR(old)) {
+ mutex_unlock(&tracepoints_mutex);
+ return PTR_ERR(old);
+ }
+ tracepoint_add_old_probes(old);
mutex_unlock(&tracepoints_mutex);
- return ret;
+ return 0;
}
-EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
+EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
+
+/**
+ * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect
+ * @name: tracepoint name
+ * @probe: probe function pointer
+ *
+ * caller must call tracepoint_probe_update_all()
+ */
+int tracepoint_probe_unregister_noupdate(const char *name, void *probe)
+{
+ void *old;
+
+ mutex_lock(&tracepoints_mutex);
+ old = tracepoint_remove_probe(name, probe);
+ if (IS_ERR(old)) {
+ mutex_unlock(&tracepoints_mutex);
+ return PTR_ERR(old);
+ }
+ tracepoint_add_old_probes(old);
+ mutex_unlock(&tracepoints_mutex);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate);
+
+/**
+ * tracepoint_probe_update_all - update tracepoints
+ */
+void tracepoint_probe_update_all(void)
+{
+ LIST_HEAD(release_probes);
+ struct tp_probes *pos, *next;
+
+ mutex_lock(&tracepoints_mutex);
+ if (!need_update) {
+ mutex_unlock(&tracepoints_mutex);
+ return;
+ }
+ if (!list_empty(&old_probes))
+ list_replace_init(&old_probes, &release_probes);
+ need_update = 0;
+ mutex_unlock(&tracepoints_mutex);
+
+ tracepoint_update_probes();
+ list_for_each_entry_safe(pos, next, &release_probes, u.list) {
+ list_del(&pos->u.list);
+ call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
+ }
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
/**
* tracepoint_get_iter_range - Get a next tracepoint iterator given a range.
@@ -475,3 +541,36 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter)
iter->tracepoint = NULL;
}
EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
+
+#ifdef CONFIG_MODULES
+
+int tracepoint_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+
+ switch (val) {
+ case MODULE_STATE_COMING:
+ tracepoint_update_probe_range(mod->tracepoints,
+ mod->tracepoints + mod->num_tracepoints);
+ break;
+ case MODULE_STATE_GOING:
+ tracepoint_update_probe_range(mod->tracepoints,
+ mod->tracepoints + mod->num_tracepoints);
+ break;
+ }
+ return 0;
+}
+
+struct notifier_block tracepoint_module_nb = {
+ .notifier_call = tracepoint_module_notify,
+ .priority = 0,
+};
+
+static int init_tracepoints(void)
+{
+ return register_module_notifier(&tracepoint_module_nb);
+}
+__initcall(init_tracepoints);
+
+#endif /* CONFIG_MODULES */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 714afad4653..d4dc69ddebd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -62,6 +62,7 @@ struct workqueue_struct {
const char *name;
int singlethread;
int freezeable; /* Freeze threads during suspend */
+ int rt;
#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map;
#endif
@@ -766,6 +767,7 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
{
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
struct workqueue_struct *wq = cwq->wq;
const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d";
struct task_struct *p;
@@ -781,7 +783,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
*/
if (IS_ERR(p))
return PTR_ERR(p);
-
+ if (cwq->wq->rt)
+ sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
cwq->thread = p;
return 0;
@@ -801,6 +804,7 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
struct workqueue_struct *__create_workqueue_key(const char *name,
int singlethread,
int freezeable,
+ int rt,
struct lock_class_key *key,
const char *lock_name)
{
@@ -822,6 +826,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
wq->singlethread = singlethread;
wq->freezeable = freezeable;
+ wq->rt = rt;
INIT_LIST_HEAD(&wq->list);
if (singlethread) {
@@ -965,6 +970,51 @@ undo:
return ret;
}
+#ifdef CONFIG_SMP
+struct work_for_cpu {
+ struct work_struct work;
+ long (*fn)(void *);
+ void *arg;
+ long ret;
+};
+
+static void do_work_for_cpu(struct work_struct *w)
+{
+ struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work);
+
+ wfc->ret = wfc->fn(wfc->arg);
+}
+
+/**
+ * work_on_cpu - run a function in user context on a particular cpu
+ * @cpu: the cpu to run on
+ * @fn: the function to run
+ * @arg: the function arg
+ *
+ * This will return -EINVAL in the cpu is not online, or the return value
+ * of @fn otherwise.
+ */
+long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
+{
+ struct work_for_cpu wfc;
+
+ INIT_WORK(&wfc.work, do_work_for_cpu);
+ wfc.fn = fn;
+ wfc.arg = arg;
+ get_online_cpus();
+ if (unlikely(!cpu_online(cpu)))
+ wfc.ret = -EINVAL;
+ else {
+ schedule_work_on(cpu, &wfc.work);
+ flush_work(&wfc.work);
+ }
+ put_online_cpus();
+
+ return wfc.ret;
+}
+EXPORT_SYMBOL_GPL(work_on_cpu);
+#endif /* CONFIG_SMP */
+
void __init init_workqueues(void)
{
cpu_populated_map = cpu_online_map;