Merge branch 'master' into next

author: James Morris <jmorris@namei.org> 2009-06-30 09:10:35 +1000
committer: James Morris <jmorris@namei.org> 2009-06-30 09:10:35 +1000
commit: ac7242142b03421c96b0a2f8d99f146d075614c2 (patch)
tree: b0b2ead65858c7a343d38affed86fe815e37e7e9 /kernel
parent: 89c86576ecde504da1eeb4f4882b2189ac2f9c4a (diff)
parent: 2bfdd79eaa0043346e773ba5f6cfd811ea31b73d (diff)
33 files changed, 1362 insertions, 1050 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0a32cb21ec9..2093a691f1c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -69,7 +69,7 @@ obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
-obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
+obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
@@ -96,6 +96,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
+obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
 obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 9442c3533ba..defc2e6f1e3 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -115,9 +115,6 @@ static atomic_t    audit_lost = ATOMIC_INIT(0);
 /* The netlink socket. */
 static struct sock *audit_sock;
 
-/* Inotify handle. */
-struct inotify_handle *audit_ih;
-
 /* Hash for inode-based rules */
 struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
 
@@ -136,7 +133,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
 static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
 
 /* Serialize requests from userspace. */
-static DEFINE_MUTEX(audit_cmd_mutex);
+DEFINE_MUTEX(audit_cmd_mutex);
 
 /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
  * audit records.  Since printk uses a 1024 byte buffer, this buffer
@@ -375,6 +372,25 @@ static void audit_hold_skb(struct sk_buff *skb)
 		kfree_skb(skb);
 }
 
+/*
+ * For one reason or another this nlh isn't getting delivered to the userspace
+ * audit daemon, just send it to printk.
+ */
+static void audit_printk_skb(struct sk_buff *skb)
+{
+	struct nlmsghdr *nlh = nlmsg_hdr(skb);
+	char *data = NLMSG_DATA(nlh);
+
+	if (nlh->nlmsg_type != AUDIT_EOE) {
+		if (printk_ratelimit())
+			printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data);
+		else
+			audit_log_lost("printk limit exceeded\n");
+	}
+
+	audit_hold_skb(skb);
+}
+
 static void kauditd_send_skb(struct sk_buff *skb)
 {
 	int err;
@@ -427,14 +443,8 @@ static int kauditd_thread(void *dummy)
 		if (skb) {
 			if (audit_pid)
 				kauditd_send_skb(skb);
-			else {
-				if (printk_ratelimit())
-					printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
-				else
-					audit_log_lost("printk limit exceeded\n");
-
-				audit_hold_skb(skb);
-			}
+			else
+				audit_printk_skb(skb);
 		} else {
 			DECLARE_WAITQUEUE(wait, current);
 			set_current_state(TASK_INTERRUPTIBLE);
@@ -495,42 +505,25 @@ int audit_send_list(void *_dest)
 	return 0;
 }
 
-#ifdef CONFIG_AUDIT_TREE
-static int prune_tree_thread(void *unused)
-{
-	mutex_lock(&audit_cmd_mutex);
-	audit_prune_trees();
-	mutex_unlock(&audit_cmd_mutex);
-	return 0;
-}
-
-void audit_schedule_prune(void)
-{
-	kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
-}
-#endif
-
 struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
 				 int multi, void *payload, int size)
 {
 	struct sk_buff	*skb;
 	struct nlmsghdr	*nlh;
-	int		len = NLMSG_SPACE(size);
 	void		*data;
 	int		flags = multi ? NLM_F_MULTI : 0;
 	int		t     = done  ? NLMSG_DONE  : type;
 
-	skb = alloc_skb(len, GFP_KERNEL);
+	skb = nlmsg_new(size, GFP_KERNEL);
 	if (!skb)
 		return NULL;
 
-	nlh		 = NLMSG_PUT(skb, pid, seq, t, size);
-	nlh->nlmsg_flags = flags;
-	data		 = NLMSG_DATA(nlh);
+	nlh	= NLMSG_NEW(skb, pid, seq, t, size, flags);
+	data	= NLMSG_DATA(nlh);
 	memcpy(data, payload, size);
 	return skb;
 
-nlmsg_failure:			/* Used by NLMSG_PUT */
+nlmsg_failure:			/* Used by NLMSG_NEW */
 	if (skb)
 		kfree_skb(skb);
 	return NULL;
@@ -926,28 +919,29 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 }
 
 /*
- * Get message from skb (based on rtnetlink_rcv_skb).  Each message is
- * processed by audit_receive_msg.  Malformed skbs with wrong length are
- * discarded silently.
+ * Get message from skb.  Each message is processed by audit_receive_msg.
+ * Malformed skbs with wrong length are discarded silently.
  */
 static void audit_receive_skb(struct sk_buff *skb)
 {
-	int		err;
-	struct nlmsghdr	*nlh;
-	u32		rlen;
+	struct nlmsghdr *nlh;
+	/*
+	 * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
+	 * if the nlmsg_len was not aligned
+	 */
+	int len;
+	int err;
 
-	while (skb->len >= NLMSG_SPACE(0)) {
-		nlh = nlmsg_hdr(skb);
-		if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
-			return;
-		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
-		if (rlen > skb->len)
-			rlen = skb->len;
-		if ((err = audit_receive_msg(skb, nlh))) {
+	nlh = nlmsg_hdr(skb);
+	len = skb->len;
+
+	while (NLMSG_OK(nlh, len)) {
+		err = audit_receive_msg(skb, nlh);
+		/* if err or if this message says it wants a response */
+		if (err || (nlh->nlmsg_flags & NLM_F_ACK))
 			netlink_ack(skb, nlh, err);
-		} else if (nlh->nlmsg_flags & NLM_F_ACK)
-			netlink_ack(skb, nlh, 0);
-		skb_pull(skb, rlen);
+
+		nlh = NLMSG_NEXT(nlh, len);
 	}
 }
 
@@ -959,13 +953,6 @@ static void audit_receive(struct sk_buff  *skb)
 	mutex_unlock(&audit_cmd_mutex);
 }
 
-#ifdef CONFIG_AUDITSYSCALL
-static const struct inotify_operations audit_inotify_ops = {
-	.handle_event	= audit_handle_ievent,
-	.destroy_watch	= audit_free_parent,
-};
-#endif
-
 /* Initialize audit support at boot time. */
 static int __init audit_init(void)
 {
@@ -991,12 +978,6 @@ static int __init audit_init(void)
 
 	audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
 
-#ifdef CONFIG_AUDITSYSCALL
-	audit_ih = inotify_init(&audit_inotify_ops);
-	if (IS_ERR(audit_ih))
-		audit_panic("cannot initialize inotify handle");
-#endif
-
 	for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
 		INIT_LIST_HEAD(&audit_inode_hash[i]);
 
@@ -1070,18 +1051,20 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
 			goto err;
 	}
 
-	ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
-	if (!ab->skb)
-		goto err;
-
 	ab->ctx = ctx;
 	ab->gfp_mask = gfp_mask;
-	nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
-	nlh->nlmsg_type = type;
-	nlh->nlmsg_flags = 0;
-	nlh->nlmsg_pid = 0;
-	nlh->nlmsg_seq = 0;
+
+	ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
+	if (!ab->skb)
+		goto nlmsg_failure;
+
+	nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
+
 	return ab;
+
+nlmsg_failure:                  /* Used by NLMSG_NEW */
+	kfree_skb(ab->skb);
+	ab->skb = NULL;
 err:
 	audit_buffer_free(ab);
 	return NULL;
@@ -1452,6 +1435,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
 	kfree(pathname);
 }
 
+void audit_log_key(struct audit_buffer *ab, char *key)
+{
+	audit_log_format(ab, " key=");
+	if (key)
+		audit_log_untrustedstring(ab, key);
+	else
+		audit_log_format(ab, "(null)");
+}
+
 /**
  * audit_log_end - end one audit record
  * @ab: the audit_buffer
@@ -1475,15 +1467,7 @@ void audit_log_end(struct audit_buffer *ab)
 			skb_queue_tail(&audit_skb_queue, ab->skb);
 			wake_up_interruptible(&kauditd_wait);
 		} else {
-			if (nlh->nlmsg_type != AUDIT_EOE) {
-				if (printk_ratelimit()) {
-					printk(KERN_NOTICE "type=%d %s\n",
-						nlh->nlmsg_type,
-						ab->skb->data + NLMSG_SPACE(0));
-				} else
-					audit_log_lost("printk limit exceeded\n");
-			}
-			audit_hold_skb(ab->skb);
+			audit_printk_skb(ab->skb);
 		}
 		ab->skb = NULL;
 	}
diff --git a/kernel/audit.h b/kernel/audit.h
index 16f18cac661..208687be4f3 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -53,18 +53,7 @@ enum audit_state {
 };
 
 /* Rule lists */
-struct audit_parent;
-
-struct audit_watch {
-	atomic_t		count;	/* reference count */
-	char			*path;	/* insertion path */
-	dev_t			dev;	/* associated superblock device */
-	unsigned long		ino;	/* associated inode number */
-	struct audit_parent	*parent; /* associated parent */
-	struct list_head	wlist;	/* entry in parent->watches list */
-	struct list_head	rules;	/* associated rules */
-};
-
+struct audit_watch;
 struct audit_tree;
 struct audit_chunk;
 
@@ -108,19 +97,28 @@ struct audit_netlink_list {
 
 int audit_send_list(void *);
 
-struct inotify_watch;
-/* Inotify handle */
-extern struct inotify_handle *audit_ih;
-
-extern void audit_free_parent(struct inotify_watch *);
-extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
-				const char *, struct inode *);
 extern int selinux_audit_rule_update(void);
 
 extern struct mutex audit_filter_mutex;
 extern void audit_free_rule_rcu(struct rcu_head *);
 extern struct list_head audit_filter_list[];
 
+/* audit watch functions */
+extern unsigned long audit_watch_inode(struct audit_watch *watch);
+extern dev_t audit_watch_dev(struct audit_watch *watch);
+extern void audit_put_watch(struct audit_watch *watch);
+extern void audit_get_watch(struct audit_watch *watch);
+extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
+extern int audit_add_watch(struct audit_krule *krule);
+extern void audit_remove_watch(struct audit_watch *watch);
+extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
+extern void audit_inotify_unregister(struct list_head *in_list);
+extern char *audit_watch_path(struct audit_watch *watch);
+extern struct list_head *audit_watch_rules(struct audit_watch *watch);
+
+extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
+					   struct audit_watch *watch);
+
 #ifdef CONFIG_AUDIT_TREE
 extern struct audit_chunk *audit_tree_lookup(const struct inode *);
 extern void audit_put_chunk(struct audit_chunk *);
@@ -130,10 +128,9 @@ extern int audit_add_tree_rule(struct audit_krule *);
 extern int audit_remove_tree_rule(struct audit_krule *);
 extern void audit_trim_trees(void);
 extern int audit_tag_tree(char *old, char *new);
-extern void audit_schedule_prune(void);
-extern void audit_prune_trees(void);
 extern const char *audit_tree_path(struct audit_tree *);
 extern void audit_put_tree(struct audit_tree *);
+extern void audit_kill_trees(struct list_head *);
 #else
 #define audit_remove_tree_rule(rule) BUG()
 #define audit_add_tree_rule(rule) -EINVAL
@@ -142,6 +139,7 @@ extern void audit_put_tree(struct audit_tree *);
 #define audit_put_tree(tree) (void)0
 #define audit_tag_tree(old, new) -EINVAL
 #define audit_tree_path(rule) ""	/* never called */
+#define audit_kill_trees(list) BUG()
 #endif
 
 extern char *audit_unpack_string(void **, size_t *, size_t);
@@ -160,7 +158,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
 	return 0;
 }
 extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
+extern struct list_head *audit_killed_trees(void);
 #else
 #define audit_signal_info(s,t) AUDIT_DISABLED
 #define audit_filter_inodes(t,c) AUDIT_DISABLED
 #endif
+
+extern struct mutex audit_cmd_mutex;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 1f6396d7668..2451dc6f328 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -2,6 +2,7 @@
 #include <linux/inotify.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
+#include <linux/kthread.h>
 
 struct audit_tree;
 struct audit_chunk;
@@ -441,13 +442,11 @@ static void kill_rules(struct audit_tree *tree)
 		if (rule->tree) {
 			/* not a half-baked one */
 			ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
-			audit_log_format(ab, "op=remove rule dir=");
+			audit_log_format(ab, "op=");
+			audit_log_string(ab, "remove rule");
+			audit_log_format(ab, " dir=");
 			audit_log_untrustedstring(ab, rule->tree->pathname);
-			if (rule->filterkey) {
-				audit_log_format(ab, " key=");
-				audit_log_untrustedstring(ab, rule->filterkey);
-			} else
-				audit_log_format(ab, " key=(null)");
+			audit_log_key(ab, rule->filterkey);
 			audit_log_format(ab, " list=%d res=1", rule->listnr);
 			audit_log_end(ab);
 			rule->tree = NULL;
@@ -519,6 +518,8 @@ static void trim_marked(struct audit_tree *tree)
 	}
 }
 
+static void audit_schedule_prune(void);
+
 /* called with audit_filter_mutex */
 int audit_remove_tree_rule(struct audit_krule *rule)
 {
@@ -824,10 +825,11 @@ int audit_tag_tree(char *old, char *new)
 
 /*
  * That gets run when evict_chunk() ends up needing to kill audit_tree.
- * Runs from a separate thread, with audit_cmd_mutex held.
+ * Runs from a separate thread.
  */
-void audit_prune_trees(void)
+static int prune_tree_thread(void *unused)
 {
+	mutex_lock(&audit_cmd_mutex);
 	mutex_lock(&audit_filter_mutex);
 
 	while (!list_empty(&prune_list)) {
@@ -844,6 +846,40 @@ void audit_prune_trees(void)
 	}
 
 	mutex_unlock(&audit_filter_mutex);
+	mutex_unlock(&audit_cmd_mutex);
+	return 0;
+}
+
+static void audit_schedule_prune(void)
+{
+	kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
+}
+
+/*
+ * ... and that one is done if evict_chunk() decides to delay until the end
+ * of syscall.  Runs synchronously.
+ */
+void audit_kill_trees(struct list_head *list)
+{
+	mutex_lock(&audit_cmd_mutex);
+	mutex_lock(&audit_filter_mutex);
+
+	while (!list_empty(list)) {
+		struct audit_tree *victim;
+
+		victim = list_entry(list->next, struct audit_tree, list);
+		kill_rules(victim);
+		list_del_init(&victim->list);
+
+		mutex_unlock(&audit_filter_mutex);
+
+		prune_one(victim);
+
+		mutex_lock(&audit_filter_mutex);
+	}
+
+	mutex_unlock(&audit_filter_mutex);
+	mutex_unlock(&audit_cmd_mutex);
 }
 
 /*
@@ -854,6 +890,8 @@ void audit_prune_trees(void)
 static void evict_chunk(struct audit_chunk *chunk)
 {
 	struct audit_tree *owner;
+	struct list_head *postponed = audit_killed_trees();
+	int need_prune = 0;
 	int n;
 
 	if (chunk->dead)
@@ -869,15 +907,21 @@ static void evict_chunk(struct audit_chunk *chunk)
 		owner->root = NULL;
 		list_del_init(&owner->same_root);
 		spin_unlock(&hash_lock);
-		kill_rules(owner);
-		list_move(&owner->list, &prune_list);
-		audit_schedule_prune();
+		if (!postponed) {
+			kill_rules(owner);
+			list_move(&owner->list, &prune_list);
+			need_prune = 1;
+		} else {
+			list_move(&owner->list, postponed);
+		}
 		spin_lock(&hash_lock);
 	}
 	list_del_rcu(&chunk->hash);
 	for (n = 0; n < chunk->count; n++)
 		list_del_init(&chunk->owners[n].list);
 	spin_unlock(&hash_lock);
+	if (need_prune)
+		audit_schedule_prune();
 	mutex_unlock(&audit_filter_mutex);
 }
 
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
new file mode 100644
index 00000000000..0e96dbc60ea
--- /dev/null
+++ b/kernel/audit_watch.c
@@ -0,0 +1,543 @@
+/* audit_watch.c -- watching inodes
+ *
+ * Copyright 2003-2009 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/audit.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/inotify.h>
+#include <linux/security.h>
+#include "audit.h"
+
+/*
+ * Reference counting:
+ *
+ * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
+ * 	event.  Each audit_watch holds a reference to its associated parent.
+ *
+ * audit_watch: if added to lists, lifetime is from audit_init_watch() to
+ * 	audit_remove_watch().  Additionally, an audit_watch may exist
+ * 	temporarily to assist in searching existing filter data.  Each
+ * 	audit_krule holds a reference to its associated watch.
+ */
+
+struct audit_watch {
+	atomic_t		count;	/* reference count */
+	char			*path;	/* insertion path */
+	dev_t			dev;	/* associated superblock device */
+	unsigned long		ino;	/* associated inode number */
+	struct audit_parent	*parent; /* associated parent */
+	struct list_head	wlist;	/* entry in parent->watches list */
+	struct list_head	rules;	/* associated rules */
+};
+
+struct audit_parent {
+	struct list_head	ilist;	/* entry in inotify registration list */
+	struct list_head	watches; /* associated watches */
+	struct inotify_watch	wdata;	/* inotify watch data */
+	unsigned		flags;	/* status flags */
+};
+
+/* Inotify handle. */
+struct inotify_handle *audit_ih;
+
+/*
+ * audit_parent status flags:
+ *
+ * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
+ * a filesystem event to ensure we're adding audit watches to a valid parent.
+ * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
+ * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
+ * we can receive while holding nameidata.
+ */
+#define AUDIT_PARENT_INVALID	0x001
+
+/* Inotify events we care about. */
+#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
+
+static void audit_free_parent(struct inotify_watch *i_watch)
+{
+	struct audit_parent *parent;
+
+	parent = container_of(i_watch, struct audit_parent, wdata);
+	WARN_ON(!list_empty(&parent->watches));
+	kfree(parent);
+}
+
+void audit_get_watch(struct audit_watch *watch)
+{
+	atomic_inc(&watch->count);
+}
+
+void audit_put_watch(struct audit_watch *watch)
+{
+	if (atomic_dec_and_test(&watch->count)) {
+		WARN_ON(watch->parent);
+		WARN_ON(!list_empty(&watch->rules));
+		kfree(watch->path);
+		kfree(watch);
+	}
+}
+
+void audit_remove_watch(struct audit_watch *watch)
+{
+	list_del(&watch->wlist);
+	put_inotify_watch(&watch->parent->wdata);
+	watch->parent = NULL;
+	audit_put_watch(watch); /* match initial get */
+}
+
+char *audit_watch_path(struct audit_watch *watch)
+{
+	return watch->path;
+}
+
+struct list_head *audit_watch_rules(struct audit_watch *watch)
+{
+	return &watch->rules;
+}
+
+unsigned long audit_watch_inode(struct audit_watch *watch)
+{
+	return watch->ino;
+}
+
+dev_t audit_watch_dev(struct audit_watch *watch)
+{
+	return watch->dev;
+}
+
+/* Initialize a parent watch entry. */
+static struct audit_parent *audit_init_parent(struct nameidata *ndp)
+{
+	struct audit_parent *parent;
+	s32 wd;
+
+	parent = kzalloc(sizeof(*parent), GFP_KERNEL);
+	if (unlikely(!parent))
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&parent->watches);
+	parent->flags = 0;
+
+	inotify_init_watch(&parent->wdata);
+	/* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
+	get_inotify_watch(&parent->wdata);
+	wd = inotify_add_watch(audit_ih, &parent->wdata,
+			       ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
+	if (wd < 0) {
+		audit_free_parent(&parent->wdata);
+		return ERR_PTR(wd);
+	}
+
+	return parent;
+}
+
+/* Initialize a watch entry. */
+static struct audit_watch *audit_init_watch(char *path)
+{
+	struct audit_watch *watch;
+
+	watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+	if (unlikely(!watch))
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&watch->rules);
+	atomic_set(&watch->count, 1);
+	watch->path = path;
+	watch->dev = (dev_t)-1;
+	watch->ino = (unsigned long)-1;
+
+	return watch;
+}
+
+/* Translate a watch string to kernel respresentation. */
+int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
+{
+	struct audit_watch *watch;
+
+	if (!audit_ih)
+		return -EOPNOTSUPP;
+
+	if (path[0] != '/' || path[len-1] == '/' ||
+	    krule->listnr != AUDIT_FILTER_EXIT ||
+	    op != Audit_equal ||
+	    krule->inode_f || krule->watch || krule->tree)
+		return -EINVAL;
+
+	watch = audit_init_watch(path);
+	if (IS_ERR(watch))
+		return PTR_ERR(watch);
+
+	audit_get_watch(watch);
+	krule->watch = watch;
+
+	return 0;
+}
+
+/* Duplicate the given audit watch.  The new watch's rules list is initialized
+ * to an empty list and wlist is undefined. */
+static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
+{
+	char *path;
+	struct audit_watch *new;
+
+	path = kstrdup(old->path, GFP_KERNEL);
+	if (unlikely(!path))
+		return ERR_PTR(-ENOMEM);
+
+	new = audit_init_watch(path);
+	if (IS_ERR(new)) {
+		kfree(path);
+		goto out;
+	}
+
+	new->dev = old->dev;
+	new->ino = old->ino;
+	get_inotify_watch(&old->parent->wdata);
+	new->parent = old->parent;
+
+out:
+	return new;
+}
+
+static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
+{
+	if (audit_enabled) {
+		struct audit_buffer *ab;
+		ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+		audit_log_format(ab, "auid=%u ses=%u op=",
+				 audit_get_loginuid(current),
+				 audit_get_sessionid(current));
+		audit_log_string(ab, op);
+		audit_log_format(ab, " path=");
+		audit_log_untrustedstring(ab, w->path);
+		audit_log_key(ab, r->filterkey);
+		audit_log_format(ab, " list=%d res=1", r->listnr);
+		audit_log_end(ab);
+	}
+}
+
+/* Update inode info in audit rules based on filesystem event. */
+static void audit_update_watch(struct audit_parent *parent,
+			       const char *dname, dev_t dev,
+			       unsigned long ino, unsigned invalidating)
+{
+	struct audit_watch *owatch, *nwatch, *nextw;
+	struct audit_krule *r, *nextr;
+	struct audit_entry *oentry, *nentry;
+
+	mutex_lock(&audit_filter_mutex);
+	list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
+		if (audit_compare_dname_path(dname, owatch->path, NULL))
+			continue;
+
+		/* If the update involves invalidating rules, do the inode-based
+		 * filtering now, so we don't omit records. */
+		if (invalidating && current->audit_context)
+			audit_filter_inodes(current, current->audit_context);
+
+		nwatch = audit_dupe_watch(owatch);
+		if (IS_ERR(nwatch)) {
+			mutex_unlock(&audit_filter_mutex);
+			audit_panic("error updating watch, skipping");
+			return;
+		}
+		nwatch->dev = dev;
+		nwatch->ino = ino;
+
+		list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
+
+			oentry = container_of(r, struct audit_entry, rule);
+			list_del(&oentry->rule.rlist);
+			list_del_rcu(&oentry->list);
+
+			nentry = audit_dupe_rule(&oentry->rule, nwatch);
+			if (IS_ERR(nentry)) {
+				list_del(&oentry->rule.list);
+				audit_panic("error updating watch, removing");
+			} else {
+				int h = audit_hash_ino((u32)ino);
+				list_add(&nentry->rule.rlist, &nwatch->rules);
+				list_add_rcu(&nentry->list, &audit_inode_hash[h]);
+				list_replace(&oentry->rule.list,
+					     &nentry->rule.list);
+			}
+
+			audit_watch_log_rule_change(r, owatch, "updated rules");
+
+			call_rcu(&oentry->rcu, audit_free_rule_rcu);
+		}
+
+		audit_remove_watch(owatch);
+		goto add_watch_to_parent; /* event applies to a single watch */
+	}
+	mutex_unlock(&audit_filter_mutex);
+	return;
+
+add_watch_to_parent:
+	list_add(&nwatch->wlist, &parent->watches);
+	mutex_unlock(&audit_filter_mutex);
+	return;
+}
+
+/* Remove all watches & rules associated with a parent that is going away. */
+static void audit_remove_parent_watches(struct audit_parent *parent)
+{
+	struct audit_watch *w, *nextw;
+	struct audit_krule *r, *nextr;
+	struct audit_entry *e;
+
+	mutex_lock(&audit_filter_mutex);
+	parent->flags |= AUDIT_PARENT_INVALID;
+	list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
+		list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
+			e = container_of(r, struct audit_entry, rule);
+			audit_watch_log_rule_change(r, w, "remove rule");
+			list_del(&r->rlist);
+			list_del(&r->list);
+			list_del_rcu(&e->list);
+			call_rcu(&e->rcu, audit_free_rule_rcu);
+		}
+		audit_remove_watch(w);
+	}
+	mutex_unlock(&audit_filter_mutex);
+}
+
+/* Unregister inotify watches for parents on in_list.
+ * Generates an IN_IGNORED event. */
+void audit_inotify_unregister(struct list_head *in_list)
+{
+	struct audit_parent *p, *n;
+
+	list_for_each_entry_safe(p, n, in_list, ilist) {
+		list_del(&p->ilist);
+		inotify_rm_watch(audit_ih, &p->wdata);
+		/* the unpin matching the pin in audit_do_del_rule() */
+		unpin_inotify_watch(&p->wdata);
+	}
+}
+
+/* Get path information necessary for adding watches. */
+static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw)
+{
+	struct nameidata *ndparent, *ndwatch;
+	int err;
+
+	ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
+	if (unlikely(!ndparent))
+		return -ENOMEM;
+
+	ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
+	if (unlikely(!ndwatch)) {
+		kfree(ndparent);
+		return -ENOMEM;
+	}
+
+	err = path_lookup(path, LOOKUP_PARENT, ndparent);
+	if (err) {
+		kfree(ndparent);
+		kfree(ndwatch);
+		return err;
+	}
+
+	err = path_lookup(path, 0, ndwatch);
+	if (err) {
+		kfree(ndwatch);
+		ndwatch = NULL;
+	}
+
+	*ndp = ndparent;
+	*ndw = ndwatch;
+
+	return 0;
+}
+
+/* Release resources used for watch path information. */
+static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
+{
+	if (ndp) {
+		path_put(&ndp->path);
+		kfree(ndp);
+	}
+	if (ndw) {
+		path_put(&ndw->path);
+		kfree(ndw);
+	}
+}
+
+/* Associate the given rule with an existing parent inotify_watch.
+ * Caller must hold audit_filter_mutex. */
+static void audit_add_to_parent(struct audit_krule *krule,
+				struct audit_parent *parent)
+{
+	struct audit_watch *w, *watch = krule->watch;
+	int watch_found = 0;
+
+	list_for_each_entry(w, &parent->watches, wlist) {
+		if (strcmp(watch->path, w->path))
+			continue;
+
+		watch_found = 1;
+
+		/* put krule's and initial refs to temporary watch */
+		audit_put_watch(watch);
+		audit_put_watch(watch);
+
+		audit_get_watch(w);
+		krule->watch = watch = w;
+		break;
+	}
+
+	if (!watch_found) {
+		get_inotify_watch(&parent->wdata);
+		watch->parent = parent;
+
+		list_add(&watch->wlist, &parent->watches);
+	}
+	list_add(&krule->rlist, &watch->rules);
+}
+
+/* Find a matching watch entry, or add this one.
+ * Caller must hold audit_filter_mutex. */
+int audit_add_watch(struct audit_krule *krule)
+{
+	struct audit_watch *watch = krule->watch;
+	struct inotify_watch *i_watch;
+	struct audit_parent *parent;
+	struct nameidata *ndp = NULL, *ndw = NULL;
+	int ret = 0;
+
+	mutex_unlock(&audit_filter_mutex);
+
+	/* Avoid calling path_lookup under audit_filter_mutex. */
+	ret = audit_get_nd(watch->path, &ndp, &ndw);
+	if (ret) {
+		/* caller expects mutex locked */
+		mutex_lock(&audit_filter_mutex);
+		goto error;
+	}
+
+	/* update watch filter fields */
+	if (ndw) {
+		watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
+		watch->ino = ndw->path.dentry->d_inode->i_ino;
+	}
+
+	/* The audit_filter_mutex must not be held during inotify calls because
+	 * we hold it during inotify event callback processing.  If an existing
+	 * inotify watch is found, inotify_find_watch() grabs a reference before
+	 * returning.
+	 */
+	if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
+			       &i_watch) < 0) {
+		parent = audit_init_parent(ndp);
+		if (IS_ERR(parent)) {
+			/* caller expects mutex locked */
+			mutex_lock(&audit_filter_mutex);
+			ret = PTR_ERR(parent);
+			goto error;
+		}
+	} else
+		parent = container_of(i_watch, struct audit_parent, wdata);
+
+	mutex_lock(&audit_filter_mutex);
+
+	/* parent was moved before we took audit_filter_mutex */
+	if (parent->flags & AUDIT_PARENT_INVALID)
+		ret = -ENOENT;
+	else
+		audit_add_to_parent(krule, parent);
+
+	/* match get in audit_init_parent or inotify_find_watch */
+	put_inotify_watch(&parent->wdata);
+
+error:
+	audit_put_nd(ndp, ndw);		/* NULL args OK */
+	return ret;
+
+}
+
+void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
+{
+	struct audit_watch *watch = krule->watch;
+	struct audit_parent *parent = watch->parent;
+
+	list_del(&krule->rlist);
+
+	if (list_empty(&watch->rules)) {
+		audit_remove_watch(watch);
+
+		if (list_empty(&parent->watches)) {
+			/* Put parent on the inotify un-registration
+			 * list.  Grab a reference before releasing
+			 * audit_filter_mutex, to be released in
+			 * audit_inotify_unregister().
+			 * If filesystem is going away, just leave
+			 * the sucker alone, eviction will take
+			 * care of it. */
+			if (pin_inotify_watch(&parent->wdata))
+				list_add(&parent->ilist, list);
+		}
+	}
+}
+
+/* Update watch data in audit rules based on inotify events. */
+static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
+			 u32 cookie, const char *dname, struct inode *inode)
+{
+	struct audit_parent *parent;
+
+	parent = container_of(i_watch, struct audit_parent, wdata);
+
+	if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
+		audit_update_watch(parent, dname, inode->i_sb->s_dev,
+				   inode->i_ino, 0);
+	else if (mask & (IN_DELETE|IN_MOVED_FROM))
+		audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
+	/* inotify automatically removes the watch and sends IN_IGNORED */
+	else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
+		audit_remove_parent_watches(parent);
+	/* inotify does not remove the watch, so remove it manually */
+	else if(mask & IN_MOVE_SELF) {
+		audit_remove_parent_watches(parent);
+		inotify_remove_watch_locked(audit_ih, i_watch);
+	} else if (mask & IN_IGNORED)
+		put_inotify_watch(i_watch);
+}
+
+static const struct inotify_operations audit_inotify_ops = {
+	.handle_event   = audit_handle_ievent,
+	.destroy_watch  = audit_free_parent,
+};
+
+static int __init audit_watch_init(void)
+{
+	audit_ih = inotify_init(&audit_inotify_ops);
+	if (IS_ERR(audit_ih))
+		audit_panic("cannot initialize inotify handle");
+	return 0;
+}
+subsys_initcall(audit_watch_init);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 713098ee5a0..a70604047f3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,7 +27,6 @@
 #include <linux/namei.h>
 #include <linux/netlink.h>
 #include <linux/sched.h>
-#include <linux/inotify.h>
 #include <linux/security.h>
 #include "audit.h"
 
@@ -44,36 +43,6 @@
  * 		be written directly provided audit_filter_mutex is held.
  */
 
-/*
- * Reference counting:
- *
- * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
- * 	event.  Each audit_watch holds a reference to its associated parent.
- *
- * audit_watch: if added to lists, lifetime is from audit_init_watch() to
- * 	audit_remove_watch().  Additionally, an audit_watch may exist
- * 	temporarily to assist in searching existing filter data.  Each
- * 	audit_krule holds a reference to its associated watch.
- */
-
-struct audit_parent {
-	struct list_head	ilist;	/* entry in inotify registration list */
-	struct list_head	watches; /* associated watches */
-	struct inotify_watch	wdata;	/* inotify watch data */
-	unsigned		flags;	/* status flags */
-};
-
-/*
- * audit_parent status flags:
- *
- * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
- * a filesystem event to ensure we're adding audit watches to a valid parent.
- * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
- * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
- * we can receive while holding nameidata.
- */
-#define AUDIT_PARENT_INVALID	0x001
-
 /* Audit filter lists, defined in <linux/audit.h> */
 struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 	LIST_HEAD_INIT(audit_filter_list[0]),
@@ -97,41 +66,6 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
 
 DEFINE_MUTEX(audit_filter_mutex);
 
-/* Inotify events we care about. */
-#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
-
-void audit_free_parent(struct inotify_watch *i_watch)
-{
-	struct audit_parent *parent;
-
-	parent = container_of(i_watch, struct audit_parent, wdata);
-	WARN_ON(!list_empty(&parent->watches));
-	kfree(parent);
-}
-
-static inline void audit_get_watch(struct audit_watch *watch)
-{
-	atomic_inc(&watch->count);
-}
-
-static void audit_put_watch(struct audit_watch *watch)
-{
-	if (atomic_dec_and_test(&watch->count)) {
-		WARN_ON(watch->parent);
-		WARN_ON(!list_empty(&watch->rules));
-		kfree(watch->path);
-		kfree(watch);
-	}
-}
-
-static void audit_remove_watch(struct audit_watch *watch)
-{
-	list_del(&watch->wlist);
-	put_inotify_watch(&watch->parent->wdata);
-	watch->parent = NULL;
-	audit_put_watch(watch); /* match initial get */
-}
-
 static inline void audit_free_rule(struct audit_entry *e)
 {
 	int i;
@@ -156,50 +90,6 @@ void audit_free_rule_rcu(struct rcu_head *head)
 	audit_free_rule(e);
 }
 
-/* Initialize a parent watch entry. */
-static struct audit_parent *audit_init_parent(struct nameidata *ndp)
-{
-	struct audit_parent *parent;
-	s32 wd;
-
-	parent = kzalloc(sizeof(*parent), GFP_KERNEL);
-	if (unlikely(!parent))
-		return ERR_PTR(-ENOMEM);
-
-	INIT_LIST_HEAD(&parent->watches);
-	parent->flags = 0;
-
-	inotify_init_watch(&parent->wdata);
-	/* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
-	get_inotify_watch(&parent->wdata);
-	wd = inotify_add_watch(audit_ih, &parent->wdata,
-			       ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
-	if (wd < 0) {
-		audit_free_parent(&parent->wdata);
-		return ERR_PTR(wd);
-	}
-
-	return parent;
-}
-
-/* Initialize a watch entry. */
-static struct audit_watch *audit_init_watch(char *path)
-{
-	struct audit_watch *watch;
-
-	watch = kzalloc(sizeof(*watch), GFP_KERNEL);
-	if (unlikely(!watch))
-		return ERR_PTR(-ENOMEM);
-
-	INIT_LIST_HEAD(&watch->rules);
-	atomic_set(&watch->count, 1);
-	watch->path = path;
-	watch->dev = (dev_t)-1;
-	watch->ino = (unsigned long)-1;
-
-	return watch;
-}
-
 /* Initialize an audit filterlist entry. */
 static inline struct audit_entry *audit_init_entry(u32 field_count)
 {
@@ -260,31 +150,6 @@ static inline int audit_to_inode(struct audit_krule *krule,
 	return 0;
 }
 
-/* Translate a watch string to kernel respresentation. */
-static int audit_to_watch(struct audit_krule *krule, char *path, int len,
-			  u32 op)
-{
-	struct audit_watch *watch;
-
-	if (!audit_ih)
-		return -EOPNOTSUPP;
-
-	if (path[0] != '/' || path[len-1] == '/' ||
-	    krule->listnr != AUDIT_FILTER_EXIT ||
-	    op != Audit_equal ||
-	    krule->inode_f || krule->watch || krule->tree)
-		return -EINVAL;
-
-	watch = audit_init_watch(path);
-	if (IS_ERR(watch))
-		return PTR_ERR(watch);
-
-	audit_get_watch(watch);
-	krule->watch = watch;
-
-	return 0;
-}
-
 static __u32 *classes[AUDIT_SYSCALL_CLASSES];
 
 int __init audit_register_class(int class, unsigned *list)
@@ -766,7 +631,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
 			break;
 		case AUDIT_WATCH:
 			data->buflen += data->values[i] =
-				audit_pack_string(&bufp, krule->watch->path);
+				audit_pack_string(&bufp,
+						  audit_watch_path(krule->watch));
 			break;
 		case AUDIT_DIR:
 			data->buflen += data->values[i] =
@@ -818,7 +684,8 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
 				return 1;
 			break;
 		case AUDIT_WATCH:
-			if (strcmp(a->watch->path, b->watch->path))
+			if (strcmp(audit_watch_path(a->watch),
+				   audit_watch_path(b->watch)))
 				return 1;
 			break;
 		case AUDIT_DIR:
@@ -844,32 +711,6 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
 	return 0;
 }
 
-/* Duplicate the given audit watch.  The new watch's rules list is initialized
- * to an empty list and wlist is undefined. */
-static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
-{
-	char *path;
-	struct audit_watch *new;
-
-	path = kstrdup(old->path, GFP_KERNEL);
-	if (unlikely(!path))
-		return ERR_PTR(-ENOMEM);
-
-	new = audit_init_watch(path);
-	if (IS_ERR(new)) {
-		kfree(path);
-		goto out;
-	}
-
-	new->dev = old->dev;
-	new->ino = old->ino;
-	get_inotify_watch(&old->parent->wdata);
-	new->parent = old->parent;
-
-out:
-	return new;
-}
-
 /* Duplicate LSM field information.  The lsm_rule is opaque, so must be
  * re-initialized. */
 static inline int audit_dupe_lsm_field(struct audit_field *df,
@@ -904,8 +745,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
  * rule with the new rule in the filterlist, then free the old rule.
  * The rlist element is undefined; list manipulations are handled apart from
  * the initial copy. */
-static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
-					   struct audit_watch *watch)
+struct audit_entry *audit_dupe_rule(struct audit_krule *old,
+				    struct audit_watch *watch)
 {
 	u32 fcount = old->field_count;
 	struct audit_entry *entry;
@@ -977,137 +818,6 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
 	return entry;
 }
 
-/* Update inode info in audit rules based on filesystem event. */
-static void audit_update_watch(struct audit_parent *parent,
-			       const char *dname, dev_t dev,
-			       unsigned long ino, unsigned invalidating)
-{
-	struct audit_watch *owatch, *nwatch, *nextw;
-	struct audit_krule *r, *nextr;
-	struct audit_entry *oentry, *nentry;
-
-	mutex_lock(&audit_filter_mutex);
-	list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
-		if (audit_compare_dname_path(dname, owatch->path, NULL))
-			continue;
-
-		/* If the update involves invalidating rules, do the inode-based
-		 * filtering now, so we don't omit records. */
-		if (invalidating && current->audit_context)
-			audit_filter_inodes(current, current->audit_context);
-
-		nwatch = audit_dupe_watch(owatch);
-		if (IS_ERR(nwatch)) {
-			mutex_unlock(&audit_filter_mutex);
-			audit_panic("error updating watch, skipping");
-			return;
-		}
-		nwatch->dev = dev;
-		nwatch->ino = ino;
-
-		list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
-
-			oentry = container_of(r, struct audit_entry, rule);
-			list_del(&oentry->rule.rlist);
-			list_del_rcu(&oentry->list);
-
-			nentry = audit_dupe_rule(&oentry->rule, nwatch);
-			if (IS_ERR(nentry)) {
-				list_del(&oentry->rule.list);
-				audit_panic("error updating watch, removing");
-			} else {
-				int h = audit_hash_ino((u32)ino);
-				list_add(&nentry->rule.rlist, &nwatch->rules);
-				list_add_rcu(&nentry->list, &audit_inode_hash[h]);
-				list_replace(&oentry->rule.list,
-					     &nentry->rule.list);
-			}
-
-			call_rcu(&oentry->rcu, audit_free_rule_rcu);
-		}
-
-		if (audit_enabled) {
-			struct audit_buffer *ab;
-			ab = audit_log_start(NULL, GFP_NOFS,
-				AUDIT_CONFIG_CHANGE);
-			audit_log_format(ab, "auid=%u ses=%u",
-				audit_get_loginuid(current),
-				audit_get_sessionid(current));
-			audit_log_format(ab,
-				" op=updated rules specifying path=");
-			audit_log_untrustedstring(ab, owatch->path);
-			audit_log_format(ab, " with dev=%u ino=%lu\n",
-				 dev, ino);
-			audit_log_format(ab, " list=%d res=1", r->listnr);
-			audit_log_end(ab);
-		}
-		audit_remove_watch(owatch);
-		goto add_watch_to_parent; /* event applies to a single watch */
-	}
-	mutex_unlock(&audit_filter_mutex);
-	return;
-
-add_watch_to_parent:
-	list_add(&nwatch->wlist, &parent->watches);
-	mutex_unlock(&audit_filter_mutex);
-	return;
-}
-
-/* Remove all watches & rules associated with a parent that is going away. */
-static void audit_remove_parent_watches(struct audit_parent *parent)
-{
-	struct audit_watch *w, *nextw;
-	struct audit_krule *r, *nextr;
-	struct audit_entry *e;
-
-	mutex_lock(&audit_filter_mutex);
-	parent->flags |= AUDIT_PARENT_INVALID;
-	list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
-		list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
-			e = container_of(r, struct audit_entry, rule);
-			if (audit_enabled) {
-				struct audit_buffer *ab;
-				ab = audit_log_start(NULL, GFP_NOFS,
-					AUDIT_CONFIG_CHANGE);
-				audit_log_format(ab, "auid=%u ses=%u",
-					audit_get_loginuid(current),
-					audit_get_sessionid(current));
-				audit_log_format(ab, " op=remove rule path=");
-				audit_log_untrustedstring(ab, w->path);
-				if (r->filterkey) {
-					audit_log_format(ab, " key=");
-					audit_log_untrustedstring(ab,
-							r->filterkey);
-				} else
-					audit_log_format(ab, " key=(null)");
-				audit_log_format(ab, " list=%d res=1",
-					r->listnr);
-				audit_log_end(ab);
-			}
-			list_del(&r->rlist);
-			list_del(&r->list);
-			list_del_rcu(&e->list);
-			call_rcu(&e->rcu, audit_free_rule_rcu);
-		}
-		audit_remove_watch(w);
-	}
-	mutex_unlock(&audit_filter_mutex);
-}
-
-/* Unregister inotify watches for parents on in_list.
- * Generates an IN_IGNORED event. */
-static void audit_inotify_unregister(struct list_head *in_list)
-{
-	struct audit_parent *p, *n;
-
-	list_for_each_entry_safe(p, n, in_list, ilist) {
-		list_del(&p->ilist);
-		inotify_rm_watch(audit_ih, &p->wdata);
-		/* the unpin matching the pin in audit_do_del_rule() */
-		unpin_inotify_watch(&p->wdata);
-	}
-}
-
 /* Find an existing audit rule.
  * Caller must hold audit_filter_mutex to prevent stale rule data. */
 static struct audit_entry *audit_find_rule(struct audit_entry *entry,
@@ -1145,134 +855,6 @@ out:
 	return found;
 }
 
-/* Get path information necessary for adding watches. */
-static int audit_get_nd(char *path, struct nameidata **ndp,
-			struct nameidata **ndw)
-{
-	struct nameidata *ndparent, *ndwatch;
-	int err;
-
-	ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
-	if (unlikely(!ndparent))
-		return -ENOMEM;
-
-	ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
-	if (unlikely(!ndwatch)) {
-		kfree(ndparent);
-		return -ENOMEM;
-	}
-
-	err = path_lookup(path, LOOKUP_PARENT, ndparent);
-	if (err) {
-		kfree(ndparent);
-		kfree(ndwatch);
-		return err;
-	}
-
-	err = path_lookup(path, 0, ndwatch);
-	if (err) {
-		kfree(ndwatch);
-		ndwatch = NULL;
-	}
-
-	*ndp = ndparent;
-	*ndw = ndwatch;
-
-	return 0;
-}
-
-/* Release resources used for watch path information. */
-static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
-{
-	if (ndp) {
-		path_put(&ndp->path);
-		kfree(ndp);
-	}
-	if (ndw) {
-		path_put(&ndw->path);
-		kfree(ndw);
-	}
-}
-
-/* Associate the given rule with an existing parent inotify_watch.
- * Caller must hold audit_filter_mutex. */
-static void audit_add_to_parent(struct audit_krule *krule,
-				struct audit_parent *parent)
-{
-	struct audit_watch *w, *watch = krule->watch;
-	int watch_found = 0;
-
-	list_for_each_entry(w, &parent->watches, wlist) {
-		if (strcmp(watch->path, w->path))
-			continue;
-
-		watch_found = 1;
-
-		/* put krule's and initial refs to temporary watch */
-		audit_put_watch(watch);
-		audit_put_watch(watch);
-
-		audit_get_watch(w);
-		krule->watch = watch = w;
-		break;
-	}
-
-	if (!watch_found) {
-		get_inotify_watch(&parent->wdata);
-		watch->parent = parent;
-
-		list_add(&watch->wlist, &parent->watches);
-	}
-	list_add(&krule->rlist, &watch->rules);
-}
-
-/* Find a matching watch entry, or add this one.
- * Caller must hold audit_filter_mutex. */
-static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
-			   struct nameidata *ndw)
-{
-	struct audit_watch *watch = krule->watch;
-	struct inotify_watch *i_watch;
-	struct audit_parent *parent;
-	int ret = 0;
-
-	/* update watch filter fields */
-	if (ndw) {
-		watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
-		watch->ino = ndw->path.dentry->d_inode->i_ino;
-	}
-
-	/* The audit_filter_mutex must not be held during inotify calls because
-	 * we hold it during inotify event callback processing.  If an existing
-	 * inotify watch is found, inotify_find_watch() grabs a reference before
-	 * returning.
-	 */
-	mutex_unlock(&audit_filter_mutex);
-
-	if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
-			       &i_watch) < 0) {
-		parent = audit_init_parent(ndp);
-		if (IS_ERR(parent)) {
-			/* caller expects mutex locked */
-			mutex_lock(&audit_filter_mutex);
-			return PTR_ERR(parent);
-		}
-	} else
-		parent = container_of(i_watch, struct audit_parent, wdata);
-
-	mutex_lock(&audit_filter_mutex);
-
-	/* parent was moved before we took audit_filter_mutex */
-	if (parent->flags & AUDIT_PARENT_INVALID)
-		ret = -ENOENT;
-	else
-		audit_add_to_parent(krule, parent);
-
-	/* match get in audit_init_parent or inotify_find_watch */
-	put_inotify_watch(&parent->wdata);
-	return ret;
-}
-
 static u64 prio_low = ~0ULL/2;
 static u64 prio_high = ~0ULL/2 - 1;
 
@@ -1282,7 +864,6 @@ static inline int audit_add_rule(struct audit_entry *entry)
 	struct audit_entry *e;
 	struct audit_watch *watch = entry->rule.watch;
 	struct audit_tree *tree = entry->rule.tree;
-	struct nameidata *ndp = NULL, *ndw = NULL;
 	struct list_head *list;
 	int h, err;
 #ifdef CONFIG_AUDITSYSCALL
@@ -1296,8 +877,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
 
 	mutex_lock(&audit_filter_mutex);
 	e = audit_find_rule(entry, &list);
-	mutex_unlock(&audit_filter_mutex);
 	if (e) {
+		mutex_unlock(&audit_filter_mutex);
 		err = -EEXIST;
 		/* normally audit_add_tree_rule() will free it on failure */
 		if (tree)
@@ -1305,22 +886,16 @@ static inline int audit_add_rule(struct audit_entry *entry)
 		goto error;
 	}
 
-	/* Avoid calling path_lookup under audit_filter_mutex. */
-	if (watch) {
-		err = audit_get_nd(watch->path, &ndp, &ndw);
-		if (err)
-			goto error;
-	}
-
-	mutex_lock(&audit_filter_mutex);
 	if (watch) {
 		/* audit_filter_mutex is dropped and re-taken during this call */
-		err = audit_add_watch(&entry->rule, ndp, ndw);
+		err = audit_add_watch(&entry->rule);
 		if (err) {
 			mutex_unlock(&audit_filter_mutex);
 			goto error;
 		}
-		h = audit_hash_ino((u32)watch->ino);
+		/* entry->rule.watch may have changed during audit_add_watch() */
+		watch = entry->rule.watch;
+		h = audit_hash_ino((u32)audit_watch_inode(watch));
 		list = &audit_inode_hash[h];
 	}
 	if (tree) {
@@ -1358,11 +933,9 @@ static inline int audit_add_rule(struct audit_entry *entry)
 #endif
 	mutex_unlock(&audit_filter_mutex);
 
-	audit_put_nd(ndp, ndw);		/* NULL args OK */
  	return 0;
 
 error:
-	audit_put_nd(ndp, ndw);		/* NULL args OK */
 	if (watch)
 		audit_put_watch(watch); /* tmp watch, matches initial get */
 	return err;
@@ -1372,7 +945,7 @@ error:
 static inline int audit_del_rule(struct audit_entry *entry)
 {
 	struct audit_entry  *e;
-	struct audit_watch *watch, *tmp_watch = entry->rule.watch;
+	struct audit_watch *watch = entry->rule.watch;
 	struct audit_tree *tree = entry->rule.tree;
 	struct list_head *list;
 	LIST_HEAD(inotify_list);
@@ -1394,29 +967,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
 		goto out;
 	}
 
-	watch = e->rule.watch;
-	if (watch) {
-		struct audit_parent *parent = watch->parent;
-
-		list_del(&e->rule.rlist);
-
-		if (list_empty(&watch->rules)) {
-			audit_remove_watch(watch);
-
-			if (list_empty(&parent->watches)) {
-				/* Put parent on the inotify un-registration
-				 * list.  Grab a reference before releasing
-				 * audit_filter_mutex, to be released in
-				 * audit_inotify_unregister().
-				 * If filesystem is going away, just leave
-				 * the sucker alone, eviction will take
-				 * care of it.
-				 */
-				if (pin_inotify_watch(&parent->wdata))
-					list_add(&parent->ilist, &inotify_list);
-			}
-		}
-	}
+	if (e->rule.watch)
+		audit_remove_watch_rule(&e->rule, &inotify_list);
 
 	if (e->rule.tree)
 		audit_remove_tree_rule(&e->rule);
@@ -1438,8 +990,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
 		audit_inotify_unregister(&inotify_list);
 
 out:
-	if (tmp_watch)
-		audit_put_watch(tmp_watch); /* match initial get */
+	if (watch)
+		audit_put_watch(watch); /* match initial get */
 	if (tree)
 		audit_put_tree(tree);	/* that's the temporary one */
 
@@ -1527,11 +1079,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
 			security_release_secctx(ctx, len);
 		}
 	}
-	audit_log_format(ab, " op=%s rule key=", action);
-	if (rule->filterkey)
-		audit_log_untrustedstring(ab, rule->filterkey);
-	else
-		audit_log_format(ab, "(null)");
+	audit_log_format(ab, " op=");
+	audit_log_string(ab, action);
+	audit_log_key(ab, rule->filterkey);
 	audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
 	audit_log_end(ab);
 }
@@ -1595,7 +1145,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 			return PTR_ERR(entry);
 
 		err = audit_add_rule(entry);
-		audit_log_rule_change(loginuid, sessionid, sid, "add",
+		audit_log_rule_change(loginuid, sessionid, sid, "add rule",
 				      &entry->rule, !err);
 
 		if (err)
@@ -1611,7 +1161,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 			return PTR_ERR(entry);
 
 		err = audit_del_rule(entry);
-		audit_log_rule_change(loginuid, sessionid, sid, "remove",
+		audit_log_rule_change(loginuid, sessionid, sid, "remove rule",
 				      &entry->rule, !err);
 
 		audit_free_rule(entry);
@@ -1793,7 +1343,7 @@ static int update_lsm_rule(struct audit_krule *r)
 		list_del(&r->list);
 	} else {
 		if (watch) {
-			list_add(&nentry->rule.rlist, &watch->rules);
+			list_add(&nentry->rule.rlist, audit_watch_rules(watch));
 			list_del(&r->rlist);
 		} else if (tree)
 			list_replace_init(&r->rlist, &nentry->rule.rlist);
@@ -1829,27 +1379,3 @@ int audit_update_lsm_rules(void)
 
 	return err;
 }
-
-/* Update watch data in audit rules based on inotify events. */
-void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
-			 u32 cookie, const char *dname, struct inode *inode)
-{
-	struct audit_parent *parent;
-
-	parent = container_of(i_watch, struct audit_parent, wdata);
-
-	if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
-		audit_update_watch(parent, dname, inode->i_sb->s_dev,
-				   inode->i_ino, 0);
-	else if (mask & (IN_DELETE|IN_MOVED_FROM))
-		audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
-	/* inotify automatically removes the watch and sends IN_IGNORED */
-	else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
-		audit_remove_parent_watches(parent);
-	/* inotify does not remove the watch, so remove it manually */
-	else if(mask & IN_MOVE_SELF) {
-		audit_remove_parent_watches(parent);
-		inotify_remove_watch_locked(audit_ih, i_watch);
-	} else if (mask & IN_IGNORED)
-		put_inotify_watch(i_watch);
-}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7d6ac7c1f41..68d3c6a0ecd 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -199,6 +199,7 @@ struct audit_context {
 
 	struct audit_tree_refs *trees, *first_trees;
 	int tree_count;
+	struct list_head killed_trees;
 
 	int type;
 	union {
@@ -548,9 +549,9 @@ static int audit_filter_rules(struct task_struct *tsk,
 			}
 			break;
 		case AUDIT_WATCH:
-			if (name && rule->watch->ino != (unsigned long)-1)
-				result = (name->dev == rule->watch->dev &&
-					  name->ino == rule->watch->ino);
+			if (name && audit_watch_inode(rule->watch) != (unsigned long)-1)
+				result = (name->dev == audit_watch_dev(rule->watch) &&
+					  name->ino == audit_watch_inode(rule->watch));
 			break;
 		case AUDIT_DIR:
 			if (ctx)
@@ -853,6 +854,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
 	if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
 		return NULL;
 	audit_zero_context(context, state);
+	INIT_LIST_HEAD(&context->killed_trees);
 	return context;
 }
 
@@ -1024,8 +1026,8 @@ static int audit_log_single_execve_arg(struct audit_context *context,
 {
 	char arg_num_len_buf[12];
 	const char __user *tmp_p = p;
-	/* how many digits are in arg_num? 3 is the length of " a=" */
-	size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3;
+	/* how many digits are in arg_num? 5 is the length of ' a=""' */
+	size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
 	size_t len, len_left, to_send;
 	size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
 	unsigned int i, has_cntl = 0, too_long = 0;
@@ -1137,7 +1139,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
 		if (has_cntl)
 			audit_log_n_hex(*ab, buf, to_send);
 		else
-			audit_log_format(*ab, "\"%s\"", buf);
+			audit_log_string(*ab, buf);
 
 		p += to_send;
 		len_left -= to_send;
@@ -1372,11 +1374,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 
 
 	audit_log_task_info(ab, tsk);
-	if (context->filterkey) {
-		audit_log_format(ab, " key=");
-		audit_log_untrustedstring(ab, context->filterkey);
-	} else
-		audit_log_format(ab, " key=(null)");
+	audit_log_key(ab, context->filterkey);
 	audit_log_end(ab);
 
 	for (aux = context->aux; aux; aux = aux->next) {
@@ -1549,6 +1547,8 @@ void audit_free(struct task_struct *tsk)
 	/* that can happen only if we are called from do_exit() */
 	if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
 		audit_log_exit(context, tsk);
+	if (!list_empty(&context->killed_trees))
+		audit_kill_trees(&context->killed_trees);
 
 	audit_free_context(context);
 }
@@ -1692,6 +1692,9 @@ void audit_syscall_exit(int valid, long return_code)
 	context->in_syscall = 0;
 	context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
 
+	if (!list_empty(&context->killed_trees))
+		audit_kill_trees(&context->killed_trees);
+
 	if (context->previous) {
 		struct audit_context *new_context = context->previous;
 		context->previous  = NULL;
@@ -2525,3 +2528,11 @@ void audit_core_dumps(long signr)
 	audit_log_format(ab, " sig=%ld", signr);
 	audit_log_end(ab);
 }
+
+struct list_head *audit_killed_trees(void)
+{
+	struct audit_context *ctx = current->audit_context;
+	if (likely(!ctx || !ctx->in_syscall))
+		return NULL;
+	return &ctx->killed_trees;
+}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 395b6974dc8..8ce10043e4a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -34,14 +34,11 @@ static struct {
 	 * an ongoing cpu hotplug operation.
 	 */
 	int refcount;
-} cpu_hotplug;
-
-void __init cpu_hotplug_init(void)
-{
-	cpu_hotplug.active_writer = NULL;
-	mutex_init(&cpu_hotplug.lock);
-	cpu_hotplug.refcount = 0;
-}
+} cpu_hotplug = {
+	.active_writer = NULL,
+	.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
+	.refcount = 0,
+};
 
 #ifdef CONFIG_HOTPLUG_CPU
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 13ae64001fe..628d41f0dd5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1197,8 +1197,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 	}
 
 	traced = ptrace_reparented(p);
-
-	if (likely(!traced)) {
+	/*
+	 * It can be ptraced but not reparented, check
+	 * !task_detached() to filter out sub-threads.
+	 */
+	if (likely(!traced) && likely(!task_detached(p))) {
 		struct signal_struct *psig;
 		struct signal_struct *sig;
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 80b5ce71659..794c862125f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -284,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key)
 	drop_futex_key_refs(key);
 }
 
+/*
+ * fault_in_user_writeable - fault in user address and verify RW access
+ * @uaddr:	pointer to faulting user space address
+ *
+ * Slow path to fixup the fault we just took in the atomic write
+ * access to @uaddr.
+ *
+ * We have no generic implementation of a non destructive write to the
+ * user address. We know that we faulted in the atomic pagefault
+ * disabled section so we can as well avoid the #PF overhead by
+ * calling get_user_pages() right away.
+ */
+static int fault_in_user_writeable(u32 __user *uaddr)
+{
+	int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
+				 1, 1, 0, NULL, NULL);
+	return ret < 0 ? ret : 0;
+}
+
 /**
  * futex_top_waiter() - Return the highest priority waiter on a futex
  * @hb:     the hash bucket the futex_q's reside in
@@ -896,7 +915,6 @@ retry:
 retry_private:
 	op_ret = futex_atomic_op_inuser(op, uaddr2);
 	if (unlikely(op_ret < 0)) {
-		u32 dummy;
 
 		double_unlock_hb(hb1, hb2);
 
@@ -914,7 +932,7 @@ retry_private:
 			goto out_put_keys;
 		}
 
-		ret = get_user(dummy, uaddr2);
+		ret = fault_in_user_writeable(uaddr2);
 		if (ret)
 			goto out_put_keys;
 
@@ -1204,7 +1222,7 @@ retry_private:
 			double_unlock_hb(hb1, hb2);
 			put_futex_key(fshared, &key2);
 			put_futex_key(fshared, &key1);
-			ret = get_user(curval2, uaddr2);
+			ret = fault_in_user_writeable(uaddr2);
 			if (!ret)
 				goto retry;
 			goto out;
@@ -1482,7 +1500,7 @@ retry:
 handle_fault:
 	spin_unlock(q->lock_ptr);
 
-	ret = get_user(uval, uaddr);
+	ret = fault_in_user_writeable(uaddr);
 
 	spin_lock(q->lock_ptr);
 
@@ -1807,7 +1825,6 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct futex_hash_bucket *hb;
-	u32 uval;
 	struct futex_q q;
 	int res, ret;
 
@@ -1909,16 +1926,9 @@ out:
 	return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
 uaddr_faulted:
-	/*
-	 * We have to r/w  *(int __user *)uaddr, and we have to modify it
-	 * atomically.  Therefore, if we continue to fault after get_user()
-	 * below, we need to handle the fault ourselves, while still holding
-	 * the mmap_sem.  This can occur if the uaddr is under contention as
-	 * we have to drop the mmap_sem in order to call get_user().
-	 */
 	queue_unlock(&q, hb);
 
-	ret = get_user(uval, uaddr);
+	ret = fault_in_user_writeable(uaddr);
 	if (ret)
 		goto out_put_key;
 
@@ -2013,17 +2023,10 @@ out:
 	return ret;
 
 pi_faulted:
-	/*
-	 * We have to r/w  *(int __user *)uaddr, and we have to modify it
-	 * atomically.  Therefore, if we continue to fault after get_user()
-	 * below, we need to handle the fault ourselves, while still holding
-	 * the mmap_sem.  This can occur if the uaddr is under contention as
-	 * we have to drop the mmap_sem in order to call get_user().
-	 */
 	spin_unlock(&hb->lock);
 	put_futex_key(fshared, &key);
 
-	ret = get_user(uval, uaddr);
+	ret = fault_in_user_writeable(uaddr);
 	if (!ret)
 		goto retry;
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index aaf5c9d0577..50da6767290 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -856,7 +856,7 @@ EXPORT_SYMBOL(free_irq);
  *	still called in hard interrupt context and has to check
  *	whether the interrupt originates from the device. If yes it
  *	needs to disable the interrupt on the device and return
- *	IRQ_THREAD_WAKE which will wake up the handler thread and run
+ *	IRQ_WAKE_THREAD which will wake up the handler thread and run
  *	@thread_fn. This split handler design is necessary to support
  *	shared interrupts.
  *
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 29b685f551a..1a933a221ea 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -124,7 +124,7 @@ void perf_enable(void)
 
 static void get_ctx(struct perf_counter_context *ctx)
 {
-	atomic_inc(&ctx->refcount);
+	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 }
 
 static void free_ctx(struct rcu_head *head)
@@ -175,6 +175,11 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
 			spin_unlock_irqrestore(&ctx->lock, *flags);
 			goto retry;
 		}
+
+		if (!atomic_inc_not_zero(&ctx->refcount)) {
+			spin_unlock_irqrestore(&ctx->lock, *flags);
+			ctx = NULL;
+		}
 	}
 	rcu_read_unlock();
 	return ctx;
@@ -193,7 +198,6 @@ static struct perf_counter_context *perf_pin_task_context(struct task_struct *ta
 	ctx = perf_lock_task_context(task, &flags);
 	if (ctx) {
 		++ctx->pin_count;
-		get_ctx(ctx);
 		spin_unlock_irqrestore(&ctx->lock, flags);
 	}
 	return ctx;
@@ -1283,7 +1287,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
 		if (!interrupts) {
 			perf_disable();
 			counter->pmu->disable(counter);
-			atomic_set(&hwc->period_left, 0);
+			atomic64_set(&hwc->period_left, 0);
 			counter->pmu->enable(counter);
 			perf_enable();
 		}
@@ -1459,11 +1463,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 			put_ctx(parent_ctx);
 			ctx->parent_ctx = NULL;		/* no longer a clone */
 		}
-		/*
-		 * Get an extra reference before dropping the lock so that
-		 * this context won't get freed if the task exits.
-		 */
-		get_ctx(ctx);
 		spin_unlock_irqrestore(&ctx->lock, flags);
 	}
 
@@ -1553,7 +1552,7 @@ static int perf_release(struct inode *inode, struct file *file)
 static ssize_t
 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 {
-	u64 values[3];
+	u64 values[4];
 	int n;
 
 	/*
@@ -1620,22 +1619,6 @@ static void perf_counter_reset(struct perf_counter *counter)
 	perf_counter_update_userpage(counter);
 }
 
-static void perf_counter_for_each_sibling(struct perf_counter *counter,
-					  void (*func)(struct perf_counter *))
-{
-	struct perf_counter_context *ctx = counter->ctx;
-	struct perf_counter *sibling;
-
-	WARN_ON_ONCE(ctx->parent_ctx);
-	mutex_lock(&ctx->mutex);
-	counter = counter->group_leader;
-
-	func(counter);
-	list_for_each_entry(sibling, &counter->sibling_list, list_entry)
-		func(sibling);
-	mutex_unlock(&ctx->mutex);
-}
-
 /*
  * Holding the top-level counter's child_mutex means that any
  * descendant process that has inherited this counter will block
@@ -1658,14 +1641,18 @@ static void perf_counter_for_each_child(struct perf_counter *counter,
 static void perf_counter_for_each(struct perf_counter *counter,
 				  void (*func)(struct perf_counter *))
 {
-	struct perf_counter *child;
+	struct perf_counter_context *ctx = counter->ctx;
+	struct perf_counter *sibling;
 
-	WARN_ON_ONCE(counter->ctx->parent_ctx);
-	mutex_lock(&counter->child_mutex);
-	perf_counter_for_each_sibling(counter, func);
-	list_for_each_entry(child, &counter->child_list, child_list)
-		perf_counter_for_each_sibling(child, func);
-	mutex_unlock(&counter->child_mutex);
+	WARN_ON_ONCE(ctx->parent_ctx);
+	mutex_lock(&ctx->mutex);
+	counter = counter->group_leader;
+
+	perf_counter_for_each_child(counter, func);
+	func(counter);
+	list_for_each_entry(sibling, &counter->sibling_list, list_entry)
+		perf_counter_for_each_child(counter, func);
+	mutex_unlock(&ctx->mutex);
 }
 
 static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
@@ -1806,6 +1793,12 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct perf_mmap_data *data;
 	int ret = VM_FAULT_SIGBUS;
 
+	if (vmf->flags & FAULT_FLAG_MKWRITE) {
+		if (vmf->pgoff == 0)
+			ret = 0;
+		return ret;
+	}
+
 	rcu_read_lock();
 	data = rcu_dereference(counter->data);
 	if (!data)
@@ -1819,9 +1812,16 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		if ((unsigned)nr > data->nr_pages)
 			goto unlock;
 
+		if (vmf->flags & FAULT_FLAG_WRITE)
+			goto unlock;
+
 		vmf->page = virt_to_page(data->data_pages[nr]);
 	}
+
 	get_page(vmf->page);
+	vmf->page->mapping = vma->vm_file->f_mapping;
+	vmf->page->index   = vmf->pgoff;
+
 	ret = 0;
 unlock:
 	rcu_read_unlock();
@@ -1874,6 +1874,14 @@ fail:
 	return -ENOMEM;
 }
 
+static void perf_mmap_free_page(unsigned long addr)
+{
+	struct page *page = virt_to_page(addr);
+
+	page->mapping = NULL;
+	__free_page(page);
+}
+
 static void __perf_mmap_data_free(struct rcu_head *rcu_head)
 {
 	struct perf_mmap_data *data;
@@ -1881,9 +1889,10 @@ static void __perf_mmap_data_free(struct rcu_head *rcu_head)
 
 	data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
 
-	free_page((unsigned long)data->user_page);
+	perf_mmap_free_page((unsigned long)data->user_page);
 	for (i = 0; i < data->nr_pages; i++)
-		free_page((unsigned long)data->data_pages[i]);
+		perf_mmap_free_page((unsigned long)data->data_pages[i]);
+
 	kfree(data);
 }
 
@@ -1920,9 +1929,10 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 }
 
 static struct vm_operations_struct perf_mmap_vmops = {
-	.open  = perf_mmap_open,
-	.close = perf_mmap_close,
-	.fault = perf_mmap_fault,
+	.open		= perf_mmap_open,
+	.close		= perf_mmap_close,
+	.fault		= perf_mmap_fault,
+	.page_mkwrite	= perf_mmap_fault,
 };
 
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1936,7 +1946,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	long user_extra, extra;
 	int ret = 0;
 
-	if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
+	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
 	vma_size = vma->vm_end - vma->vm_start;
@@ -1995,10 +2005,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	atomic_long_add(user_extra, &user->locked_vm);
 	vma->vm_mm->locked_vm += extra;
 	counter->data->nr_locked = extra;
+	if (vma->vm_flags & VM_WRITE)
+		counter->data->writable = 1;
+
 unlock:
 	mutex_unlock(&counter->mmap_mutex);
 
-	vma->vm_flags &= ~VM_MAYWRITE;
 	vma->vm_flags |= VM_RESERVED;
 	vma->vm_ops = &perf_mmap_vmops;
 
@@ -2175,11 +2187,38 @@ struct perf_output_handle {
 	unsigned long		head;
 	unsigned long		offset;
 	int			nmi;
-	int			overflow;
+	int			sample;
 	int			locked;
 	unsigned long		flags;
 };
 
+static bool perf_output_space(struct perf_mmap_data *data,
+			      unsigned int offset, unsigned int head)
+{
+	unsigned long tail;
+	unsigned long mask;
+
+	if (!data->writable)
+		return true;
+
+	mask = (data->nr_pages << PAGE_SHIFT) - 1;
+	/*
+	 * Userspace could choose to issue a mb() before updating the tail
+	 * pointer. So that all reads will be completed before the write is
+	 * issued.
+	 */
+	tail = ACCESS_ONCE(data->user_page->data_tail);
+	smp_rmb();
+
+	offset = (offset - tail) & mask;
+	head   = (head   - tail) & mask;
+
+	if ((int)(head - offset) < 0)
+		return false;
+
+	return true;
+}
+
 static void perf_output_wakeup(struct perf_output_handle *handle)
 {
 	atomic_set(&handle->data->poll, POLL_IN);
@@ -2270,12 +2309,57 @@ out:
 	local_irq_restore(handle->flags);
 }
 
+static void perf_output_copy(struct perf_output_handle *handle,
+			     const void *buf, unsigned int len)
+{
+	unsigned int pages_mask;
+	unsigned int offset;
+	unsigned int size;
+	void **pages;
+
+	offset		= handle->offset;
+	pages_mask	= handle->data->nr_pages - 1;
+	pages		= handle->data->data_pages;
+
+	do {
+		unsigned int page_offset;
+		int nr;
+
+		nr	    = (offset >> PAGE_SHIFT) & pages_mask;
+		page_offset = offset & (PAGE_SIZE - 1);
+		size	    = min_t(unsigned int, PAGE_SIZE - page_offset, len);
+
+		memcpy(pages[nr] + page_offset, buf, size);
+
+		len	    -= size;
+		buf	    += size;
+		offset	    += size;
+	} while (len);
+
+	handle->offset = offset;
+
+	/*
+	 * Check we didn't copy past our reservation window, taking the
+	 * possible unsigned int wrap into account.
+	 */
+	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
+}
+
+#define perf_output_put(handle, x) \
+	perf_output_copy((handle), &(x), sizeof(x))
+
 static int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_counter *counter, unsigned int size,
-			     int nmi, int overflow)
+			     int nmi, int sample)
 {
 	struct perf_mmap_data *data;
 	unsigned int offset, head;
+	int have_lost;
+	struct {
+		struct perf_event_header header;
+		u64			 id;
+		u64			 lost;
+	} lost_event;
 
 	/*
 	 * For inherited counters we send all the output towards the parent.
@@ -2288,19 +2372,25 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	if (!data)
 		goto out;
 
-	handle->data	 = data;
-	handle->counter	 = counter;
-	handle->nmi	 = nmi;
-	handle->overflow = overflow;
+	handle->data	= data;
+	handle->counter	= counter;
+	handle->nmi	= nmi;
+	handle->sample	= sample;
 
 	if (!data->nr_pages)
 		goto fail;
 
+	have_lost = atomic_read(&data->lost);
+	if (have_lost)
+		size += sizeof(lost_event);
+
 	perf_output_lock(handle);
 
 	do {
 		offset = head = atomic_long_read(&data->head);
 		head += size;
+		if (unlikely(!perf_output_space(data, offset, head)))
+			goto fail;
 	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
 
 	handle->offset	= offset;
@@ -2309,55 +2399,27 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
 		atomic_set(&data->wakeup, 1);
 
+	if (have_lost) {
+		lost_event.header.type = PERF_EVENT_LOST;
+		lost_event.header.misc = 0;
+		lost_event.header.size = sizeof(lost_event);
+		lost_event.id          = counter->id;
+		lost_event.lost        = atomic_xchg(&data->lost, 0);
+
+		perf_output_put(handle, lost_event);
+	}
+
 	return 0;
 
 fail:
-	perf_output_wakeup(handle);
+	atomic_inc(&data->lost);
+	perf_output_unlock(handle);
 out:
 	rcu_read_unlock();
 
 	return -ENOSPC;
 }
 
-static void perf_output_copy(struct perf_output_handle *handle,
-			     const void *buf, unsigned int len)
-{
-	unsigned int pages_mask;
-	unsigned int offset;
-	unsigned int size;
-	void **pages;
-
-	offset		= handle->offset;
-	pages_mask	= handle->data->nr_pages - 1;
-	pages		= handle->data->data_pages;
-
-	do {
-		unsigned int page_offset;
-		int nr;
-
-		nr	    = (offset >> PAGE_SHIFT) & pages_mask;
-		page_offset = offset & (PAGE_SIZE - 1);
-		size	    = min_t(unsigned int, PAGE_SIZE - page_offset, len);
-
-		memcpy(pages[nr] + page_offset, buf, size);
-
-		len	    -= size;
-		buf	    += size;
-		offset	    += size;
-	} while (len);
-
-	handle->offset = offset;
-
-	/*
-	 * Check we didn't copy past our reservation window, taking the
-	 * possible unsigned int wrap into account.
-	 */
-	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
-}
-
-#define perf_output_put(handle, x) \
-	perf_output_copy((handle), &(x), sizeof(x))
-
 static void perf_output_end(struct perf_output_handle *handle)
 {
 	struct perf_counter *counter = handle->counter;
@@ -2365,7 +2427,7 @@ static void perf_output_end(struct perf_output_handle *handle)
 
 	int wakeup_events = counter->attr.wakeup_events;
 
-	if (handle->overflow && wakeup_events) {
+	if (handle->sample && wakeup_events) {
 		int events = atomic_inc_return(&data->events);
 		if (events >= wakeup_events) {
 			atomic_sub(wakeup_events, &data->events);
@@ -2970,7 +3032,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
 }
 
 /*
- * Generic counter overflow handling.
+ * Generic counter overflow handling, sampling.
  */
 
 int perf_counter_overflow(struct perf_counter *counter, int nmi,
@@ -3109,20 +3171,15 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 }
 
 static void perf_swcounter_overflow(struct perf_counter *counter,
-				    int nmi, struct pt_regs *regs, u64 addr)
+				    int nmi, struct perf_sample_data *data)
 {
-	struct perf_sample_data data = {
-		.regs	= regs,
-		.addr	= addr,
-		.period	= counter->hw.last_period,
-	};
+	data->period = counter->hw.last_period;
 
 	perf_swcounter_update(counter);
 	perf_swcounter_set_period(counter);
-	if (perf_counter_overflow(counter, nmi, &data))
+	if (perf_counter_overflow(counter, nmi, data))
 		/* soft-disable the counter */
 		;
-
 }
 
 static int perf_swcounter_is_counting(struct perf_counter *counter)
@@ -3187,18 +3244,18 @@ static int perf_swcounter_match(struct perf_counter *counter,
 }
 
 static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
-			       int nmi, struct pt_regs *regs, u64 addr)
+			       int nmi, struct perf_sample_data *data)
 {
 	int neg = atomic64_add_negative(nr, &counter->hw.count);
 
-	if (counter->hw.sample_period && !neg && regs)
-		perf_swcounter_overflow(counter, nmi, regs, addr);
+	if (counter->hw.sample_period && !neg && data->regs)
+		perf_swcounter_overflow(counter, nmi, data);
 }
 
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
-				     enum perf_type_id type, u32 event,
-				     u64 nr, int nmi, struct pt_regs *regs,
-				     u64 addr)
+				     enum perf_type_id type,
+				     u32 event, u64 nr, int nmi,
+				     struct perf_sample_data *data)
 {
 	struct perf_counter *counter;
 
@@ -3207,8 +3264,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-		if (perf_swcounter_match(counter, type, event, regs))
-			perf_swcounter_add(counter, nr, nmi, regs, addr);
+		if (perf_swcounter_match(counter, type, event, data->regs))
+			perf_swcounter_add(counter, nr, nmi, data);
 	}
 	rcu_read_unlock();
 }
@@ -3227,9 +3284,9 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
 	return &cpuctx->recursion[0];
 }
 
-static void __perf_swcounter_event(enum perf_type_id type, u32 event,
-				   u64 nr, int nmi, struct pt_regs *regs,
-				   u64 addr)
+static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
+				    u64 nr, int nmi,
+				    struct perf_sample_data *data)
 {
 	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
 	int *recursion = perf_swcounter_recursion_context(cpuctx);
@@ -3242,7 +3299,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event,
 	barrier();
 
 	perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
-				 nr, nmi, regs, addr);
+				 nr, nmi, data);
 	rcu_read_lock();
 	/*
 	 * doesn't really matter which of the child contexts the
@@ -3250,7 +3307,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event,
 	 */
 	ctx = rcu_dereference(current->perf_counter_ctxp);
 	if (ctx)
-		perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr);
+		perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
 	rcu_read_unlock();
 
 	barrier();
@@ -3263,7 +3320,12 @@ out:
 void
 perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
 {
-	__perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr);
+	struct perf_sample_data data = {
+		.regs = regs,
+		.addr = addr,
+	};
+
+	do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
 }
 
 static void perf_swcounter_read(struct perf_counter *counter)
@@ -3404,36 +3466,18 @@ static const struct pmu perf_ops_task_clock = {
 	.read		= task_clock_perf_counter_read,
 };
 
-/*
- * Software counter: cpu migrations
- */
-void perf_counter_task_migration(struct task_struct *task, int cpu)
-{
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-	struct perf_counter_context *ctx;
-
-	perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
-				 PERF_COUNT_SW_CPU_MIGRATIONS,
-				 1, 1, NULL, 0);
-
-	ctx = perf_pin_task_context(task);
-	if (ctx) {
-		perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
-					 PERF_COUNT_SW_CPU_MIGRATIONS,
-					 1, 1, NULL, 0);
-		perf_unpin_context(ctx);
-	}
-}
-
 #ifdef CONFIG_EVENT_PROFILE
 void perf_tpcounter_event(int event_id)
 {
-	struct pt_regs *regs = get_irq_regs();
+	struct perf_sample_data data = {
+		.regs = get_irq_regs();
+		.addr = 0,
+	};
 
-	if (!regs)
-		regs = task_pt_regs(current);
+	if (!data.regs)
+		data.regs = task_pt_regs(current);
 
-	__perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
+	do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data);
 }
 EXPORT_SYMBOL_GPL(perf_tpcounter_event);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 247fd0fedd0..7c9098d186e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1978,7 +1978,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
 #endif
-		perf_counter_task_migration(p, new_cpu);
+		perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
+				     1, 1, NULL, 0);
 	}
 	p->se.vruntime -= old_cfsrq->min_vruntime -
 					 new_cfsrq->min_vruntime;
@@ -7822,7 +7823,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 		free_rootdomain(old_rd);
 }
 
-static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
+static int init_rootdomain(struct root_domain *rd, bool bootmem)
 {
 	gfp_t gfp = GFP_KERNEL;
 
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 7deffc9f0e5..e6c251790dd 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -152,7 +152,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
  *
  * Returns: -ENOMEM if memory fails.
  */
-int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
+int cpupri_init(struct cpupri *cp, bool bootmem)
 {
 	gfp_t gfp = GFP_KERNEL;
 	int i;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 467ca72f165..70c7e0b7994 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -162,7 +162,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 {
 	s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
 		spread, rq0_min_vruntime, spread0;
-	struct rq *rq = &per_cpu(runqueues, cpu);
+	struct rq *rq = cpu_rq(cpu);
 	struct sched_entity *last;
 	unsigned long flags;
 
@@ -191,7 +191,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	if (last)
 		max_vruntime = last->vruntime;
 	min_vruntime = cfs_rq->min_vruntime;
-	rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
+	rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
 	spin_unlock_irqrestore(&rq->lock, flags);
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "MIN_vruntime",
 			SPLIT_NS(MIN_vruntime));
@@ -248,7 +248,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 
 static void print_cpu(struct seq_file *m, int cpu)
 {
-	struct rq *rq = &per_cpu(runqueues, cpu);
+	struct rq *rq = cpu_rq(cpu);
 
 #ifdef CONFIG_X86
 	{
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5f9650e8fe7..ba7fd6e9556 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -430,12 +430,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 	for_each_sched_entity(se) {
 		struct load_weight *load;
+		struct load_weight lw;
 
 		cfs_rq = cfs_rq_of(se);
 		load = &cfs_rq->load;
 
 		if (unlikely(!se->on_rq)) {
-			struct load_weight lw = cfs_rq->load;
+			lw = cfs_rq->load;
 
 			update_load_add(&lw, se->load.weight);
 			load = &lw;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62e4ff9968b..98e02328c67 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -335,7 +335,10 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_timer_migration,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one,
 	},
 #endif
 	{
@@ -744,6 +747,14 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "panic_on_io_nmi",
+		.data		= &panic_on_io_nmi,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
 		.ctl_name	= KERN_BOOTLOADER_TYPE,
 		.procname	= "bootloader_type",
 		.data		= &bootloader_type,
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2aff39c6f10..e0f59a21c06 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -222,6 +222,15 @@ void tick_nohz_stop_sched_tick(int inidle)
 
 	cpu = smp_processor_id();
 	ts = &per_cpu(tick_cpu_sched, cpu);
+
+	/*
+	 * Call to tick_nohz_start_idle stops the last_update_time from being
+	 * updated. Thus, it must not be called in the event we are called from
+	 * irq_exit() with the prior state different than idle.
+	 */
+	if (!inidle && !ts->inidle)
+		goto end;
+
 	now = tick_nohz_start_idle(ts);
 
 	/*
@@ -239,9 +248,6 @@ void tick_nohz_stop_sched_tick(int inidle)
 	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
 		goto end;
 
-	if (!inidle && !ts->inidle)
-		goto end;
-
 	ts->inidle = 1;
 
 	if (need_resched())
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c994530d166..4cde8b9c716 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -96,7 +96,7 @@ static DEFINE_MUTEX(show_mutex);
 /*
  * Collection status, active/inactive:
  */
-static int __read_mostly active;
+int __read_mostly timer_stats_active;
 
 /*
  * Beginning/end timestamps of measurement:
@@ -242,7 +242,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
 	struct entry *entry, input;
 	unsigned long flags;
 
-	if (likely(!active))
+	if (likely(!timer_stats_active))
 		return;
 
 	lock = &per_cpu(lookup_lock, raw_smp_processor_id());
@@ -254,7 +254,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
 	input.timer_flag = timer_flag;
 
 	spin_lock_irqsave(lock, flags);
-	if (!active)
+	if (!timer_stats_active)
 		goto out_unlock;
 
 	entry = tstat_lookup(&input, comm);
@@ -290,7 +290,7 @@ static int tstats_show(struct seq_file *m, void *v)
 	/*
 	 * If still active then calculate up to now:
 	 */
-	if (active)
+	if (timer_stats_active)
 		time_stop = ktime_get();
 
 	time = ktime_sub(time_stop, time_start);
@@ -368,18 +368,18 @@ static ssize_t tstats_write(struct file *file, const char __user *buf,
 	mutex_lock(&show_mutex);
 	switch (ctl[0]) {
 	case '0':
-		if (active) {
-			active = 0;
+		if (timer_stats_active) {
+			timer_stats_active = 0;
 			time_stop = ktime_get();
 			sync_access();
 		}
 		break;
 	case '1':
-		if (!active) {
+		if (!timer_stats_active) {
 			reset_entries();
 			time_start = ktime_get();
 			smp_mb();
-			active = 1;
+			timer_stats_active = 1;
 		}
 		break;
 	default:
diff --git a/kernel/timer.c b/kernel/timer.c
index 54d3912f8ca..0b36b9e5cc8 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -380,6 +380,8 @@ static void timer_stats_account_timer(struct timer_list *timer)
 {
 	unsigned int flag = 0;
 
+	if (likely(!timer->start_site))
+		return;
 	if (unlikely(tbase_get_deferrable(timer->base)))
 		flag |= TIMER_STATS_FLAG_DEFERRABLE;
 
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 61071fecc82..1551f47e766 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -18,6 +18,13 @@ config HAVE_FUNCTION_TRACER
 config HAVE_FUNCTION_GRAPH_TRACER
 	bool
 
+config HAVE_FUNCTION_GRAPH_FP_TEST
+	bool
+	help
+	 An arch may pass in a unique value (frame pointer) to both the
+	 entering and exiting of a function. On exit, the value is compared
+	 and if it does not match, then it will panic the kernel.
+
 config HAVE_FUNCTION_TRACE_MCOUNT_TEST
 	bool
 	help
@@ -121,6 +128,7 @@ config FUNCTION_GRAPH_TRACER
 	bool "Kernel Function Graph Tracer"
 	depends on HAVE_FUNCTION_GRAPH_TRACER
 	depends on FUNCTION_TRACER
+	depends on !X86_32 || !CC_OPTIMIZE_FOR_SIZE
 	default y
 	help
 	  Enable the kernel to trace a function at both its return
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index bb60732ade0..f3716bf04df 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -291,7 +291,9 @@ function_stat_next(void *v, int idx)
 	pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
 
  again:
-	rec++;
+	if (idx != 0)
+		rec++;
+
 	if ((void *)rec >= (void *)&pg->records[pg->index]) {
 		pg = pg->next;
 		if (!pg)
@@ -1224,6 +1226,13 @@ static void ftrace_shutdown(int command)
 		return;
 
 	ftrace_start_up--;
+	/*
+	 * Just warn in case of unbalance, no need to kill ftrace, it's not
+	 * critical but the ftrace_call callers may be never nopped again after
+	 * further ftrace uses.
+	 */
+	WARN_ON_ONCE(ftrace_start_up < 0);
+
 	if (!ftrace_start_up)
 		command |= FTRACE_DISABLE_CALLS;
 
@@ -1410,10 +1419,20 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
 {
 	struct ftrace_iterator *iter = m->private;
 	void *p = NULL;
+	loff_t l;
+
+	if (!(iter->flags & FTRACE_ITER_HASH))
+		*pos = 0;
 
 	iter->flags |= FTRACE_ITER_HASH;
 
-	return t_hash_next(m, p, pos);
+	iter->hidx = 0;
+	for (l = 0; l <= *pos; ) {
+		p = t_hash_next(m, p, &l);
+		if (!p)
+			break;
+	}
+	return p;
 }
 
 static int t_hash_show(struct seq_file *m, void *v)
@@ -1460,8 +1479,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 			iter->pg = iter->pg->next;
 			iter->idx = 0;
 			goto retry;
-		} else {
-			iter->idx = -1;
 		}
 	} else {
 		rec = &iter->pg->records[iter->idx++];
@@ -1490,6 +1507,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 {
 	struct ftrace_iterator *iter = m->private;
 	void *p = NULL;
+	loff_t l;
 
 	mutex_lock(&ftrace_lock);
 	/*
@@ -1501,23 +1519,21 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 		if (*pos > 0)
 			return t_hash_start(m, pos);
 		iter->flags |= FTRACE_ITER_PRINTALL;
-		(*pos)++;
 		return iter;
 	}
 
 	if (iter->flags & FTRACE_ITER_HASH)
 		return t_hash_start(m, pos);
 
-	if (*pos > 0) {
-		if (iter->idx < 0)
-			return p;
-		(*pos)--;
-		iter->idx--;
+	iter->pg = ftrace_pages_start;
+	iter->idx = 0;
+	for (l = 0; l <= *pos; ) {
+		p = t_next(m, p, &l);
+		if (!p)
+			break;
 	}
 
-	p = t_next(m, p, pos);
-
-	if (!p)
+	if (!p && iter->flags & FTRACE_ITER_FILTER)
 		return t_hash_start(m, pos);
 
 	return p;
@@ -2493,32 +2509,31 @@ int ftrace_graph_count;
 unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
 
 static void *
-g_next(struct seq_file *m, void *v, loff_t *pos)
+__g_next(struct seq_file *m, loff_t *pos)
 {
 	unsigned long *array = m->private;
-	int index = *pos;
-
-	(*pos)++;
 
-	if (index >= ftrace_graph_count)
+	if (*pos >= ftrace_graph_count)
 		return NULL;
+	return &array[*pos];
+}
 
-	return &array[index];
+static void *
+g_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return __g_next(m, pos);
 }
 
 static void *g_start(struct seq_file *m, loff_t *pos)
 {
-	void *p = NULL;
-
 	mutex_lock(&graph_lock);
 
 	/* Nothing, tell g_show to print all functions are enabled */
 	if (!ftrace_graph_count && !*pos)
 		return (void *)1;
 
-	p = g_next(m, p, pos);
-
-	return p;
+	return __g_next(m, pos);
 }
 
 static void g_stop(struct seq_file *m, void *p)
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 86cdf671d7e..1edaa9516e8 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -186,7 +186,7 @@ static int kmem_trace_init(struct trace_array *tr)
 	int cpu;
 	kmemtrace_array = tr;
 
-	for_each_cpu_mask(cpu, cpu_possible_map)
+	for_each_cpu(cpu, cpu_possible_mask)
 		tracing_reset(tr, cpu);
 
 	kmemtrace_start_probes();
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index dc4dc70171c..bf27bb7a63e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -206,6 +206,7 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
 #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
 #define RB_ALIGNMENT		4U
 #define RB_MAX_SMALL_DATA	(RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
+#define RB_EVNT_MIN_SIZE	8U	/* two 32bit words */
 
 /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
 #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -415,6 +416,8 @@ struct ring_buffer_per_cpu {
 	unsigned long			overrun;
 	unsigned long			read;
 	local_t				entries;
+	local_t				committing;
+	local_t				commits;
 	u64				write_stamp;
 	u64				read_stamp;
 	atomic_t			record_disabled;
@@ -618,12 +621,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 	kfree(cpu_buffer);
 }
 
-/*
- * Causes compile errors if the struct buffer_page gets bigger
- * than the struct page.
- */
-extern int ring_buffer_page_too_big(void);
-
 #ifdef CONFIG_HOTPLUG_CPU
 static int rb_cpu_notify(struct notifier_block *self,
 			 unsigned long action, void *hcpu);
@@ -646,11 +643,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
 	int bsize;
 	int cpu;
 
-	/* Paranoid! Optimizes out when all is well */
-	if (sizeof(struct buffer_page) > sizeof(struct page))
-		ring_buffer_page_too_big();
-
-
 	/* keep it in its own cache line */
 	buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
 			 GFP_KERNEL);
@@ -666,8 +658,8 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
 	buffer->reader_lock_key = key;
 
 	/* need at least two pages */
-	if (buffer->pages == 1)
-		buffer->pages++;
+	if (buffer->pages < 2)
+		buffer->pages = 2;
 
 	/*
 	 * In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -1011,12 +1003,12 @@ rb_event_index(struct ring_buffer_event *event)
 {
 	unsigned long addr = (unsigned long)event;
 
-	return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
+	return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
 }
 
 static inline int
-rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
-	     struct ring_buffer_event *event)
+rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+		   struct ring_buffer_event *event)
 {
 	unsigned long addr = (unsigned long)event;
 	unsigned long index;
@@ -1029,31 +1021,6 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
 }
 
 static void
-rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
-		    struct ring_buffer_event *event)
-{
-	unsigned long addr = (unsigned long)event;
-	unsigned long index;
-
-	index = rb_event_index(event);
-	addr &= PAGE_MASK;
-
-	while (cpu_buffer->commit_page->page != (void *)addr) {
-		if (RB_WARN_ON(cpu_buffer,
-			  cpu_buffer->commit_page == cpu_buffer->tail_page))
-			return;
-		cpu_buffer->commit_page->page->commit =
-			cpu_buffer->commit_page->write;
-		rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
-		cpu_buffer->write_stamp =
-			cpu_buffer->commit_page->page->time_stamp;
-	}
-
-	/* Now set the commit to the event's index */
-	local_set(&cpu_buffer->commit_page->page->commit, index);
-}
-
-static void
 rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	/*
@@ -1171,6 +1138,60 @@ static unsigned rb_calculate_event_length(unsigned length)
 	return length;
 }
 
+static inline void
+rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
+	      struct buffer_page *tail_page,
+	      unsigned long tail, unsigned long length)
+{
+	struct ring_buffer_event *event;
+
+	/*
+	 * Only the event that crossed the page boundary
+	 * must fill the old tail_page with padding.
+	 */
+	if (tail >= BUF_PAGE_SIZE) {
+		local_sub(length, &tail_page->write);
+		return;
+	}
+
+	event = __rb_page_index(tail_page, tail);
+	kmemcheck_annotate_bitfield(event, bitfield);
+
+	/*
+	 * If this event is bigger than the minimum size, then
+	 * we need to be careful that we don't subtract the
+	 * write counter enough to allow another writer to slip
+	 * in on this page.
+	 * We put in a discarded commit instead, to make sure
+	 * that this space is not used again.
+	 *
+	 * If we are less than the minimum size, we don't need to
+	 * worry about it.
+	 */
+	if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
+		/* No room for any events */
+
+		/* Mark the rest of the page with padding */
+		rb_event_set_padding(event);
+
+		/* Set the write back to the previous setting */
+		local_sub(length, &tail_page->write);
+		return;
+	}
+
+	/* Put in a discarded event */
+	event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
+	event->type_len = RINGBUF_TYPE_PADDING;
+	/* time delta must be non zero */
+	event->time_delta = 1;
+	/* Account for this as an entry */
+	local_inc(&tail_page->entries);
+	local_inc(&cpu_buffer->entries);
+
+	/* Set write to end of buffer */
+	length = (tail + length) - BUF_PAGE_SIZE;
+	local_sub(length, &tail_page->write);
+}
 
 static struct ring_buffer_event *
 rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
@@ -1180,7 +1201,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 {
 	struct buffer_page *next_page, *head_page, *reader_page;
 	struct ring_buffer *buffer = cpu_buffer->buffer;
-	struct ring_buffer_event *event;
 	bool lock_taken = false;
 	unsigned long flags;
 
@@ -1265,27 +1285,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 		cpu_buffer->tail_page->page->time_stamp = *ts;
 	}
 
-	/*
-	 * The actual tail page has moved forward.
-	 */
-	if (tail < BUF_PAGE_SIZE) {
-		/* Mark the rest of the page with padding */
-		event = __rb_page_index(tail_page, tail);
-		kmemcheck_annotate_bitfield(event, bitfield);
-		rb_event_set_padding(event);
-	}
-
-	/* Set the write back to the previous setting */
-	local_sub(length, &tail_page->write);
-
-	/*
-	 * If this was a commit entry that failed,
-	 * increment that too
-	 */
-	if (tail_page == cpu_buffer->commit_page &&
-	    tail == rb_commit_index(cpu_buffer)) {
-		rb_set_commit_to_write(cpu_buffer);
-	}
+	rb_reset_tail(cpu_buffer, tail_page, tail, length);
 
 	__raw_spin_unlock(&cpu_buffer->lock);
 	local_irq_restore(flags);
@@ -1295,7 +1295,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 
  out_reset:
 	/* reset write */
-	local_sub(length, &tail_page->write);
+	rb_reset_tail(cpu_buffer, tail_page, tail, length);
 
 	if (likely(lock_taken))
 		__raw_spin_unlock(&cpu_buffer->lock);
@@ -1325,9 +1325,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
 	/* We reserved something on the buffer */
 
-	if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
-		return NULL;
-
 	event = __rb_page_index(tail_page, tail);
 	kmemcheck_annotate_bitfield(event, bitfield);
 	rb_update_event(event, type, length);
@@ -1337,11 +1334,11 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		local_inc(&tail_page->entries);
 
 	/*
-	 * If this is a commit and the tail is zero, then update
-	 * this page's time stamp.
+	 * If this is the first commit on the page, then update
+	 * its timestamp.
 	 */
-	if (!tail && rb_is_commit(cpu_buffer, event))
-		cpu_buffer->commit_page->page->time_stamp = *ts;
+	if (!tail)
+		tail_page->page->time_stamp = *ts;
 
 	return event;
 }
@@ -1410,16 +1407,16 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		return -EAGAIN;
 
 	/* Only a commited time event can update the write stamp */
-	if (rb_is_commit(cpu_buffer, event)) {
+	if (rb_event_is_commit(cpu_buffer, event)) {
 		/*
-		 * If this is the first on the page, then we need to
-		 * update the page itself, and just put in a zero.
+		 * If this is the first on the page, then it was
+		 * updated with the page itself. Try to discard it
+		 * and if we can't just make it zero.
 		 */
 		if (rb_event_index(event)) {
 			event->time_delta = *delta & TS_MASK;
 			event->array[0] = *delta >> TS_SHIFT;
 		} else {
-			cpu_buffer->commit_page->page->time_stamp = *ts;
 			/* try to discard, since we do not need this */
 			if (!rb_try_to_discard(cpu_buffer, event)) {
 				/* nope, just zero it */
@@ -1445,6 +1442,44 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 	return ret;
 }
 
+static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	local_inc(&cpu_buffer->committing);
+	local_inc(&cpu_buffer->commits);
+}
+
+static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	unsigned long commits;
+
+	if (RB_WARN_ON(cpu_buffer,
+		       !local_read(&cpu_buffer->committing)))
+		return;
+
+ again:
+	commits = local_read(&cpu_buffer->commits);
+	/* synchronize with interrupts */
+	barrier();
+	if (local_read(&cpu_buffer->committing) == 1)
+		rb_set_commit_to_write(cpu_buffer);
+
+	local_dec(&cpu_buffer->committing);
+
+	/* synchronize with interrupts */
+	barrier();
+
+	/*
+	 * Need to account for interrupts coming in between the
+	 * updating of the commit page and the clearing of the
+	 * committing counter.
+	 */
+	if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
+	    !local_read(&cpu_buffer->committing)) {
+		local_inc(&cpu_buffer->committing);
+		goto again;
+	}
+}
+
 static struct ring_buffer_event *
 rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 		      unsigned long length)
@@ -1454,6 +1489,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	int commit = 0;
 	int nr_loops = 0;
 
+	rb_start_commit(cpu_buffer);
+
 	length = rb_calculate_event_length(length);
  again:
 	/*
@@ -1466,7 +1503,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	 * Bail!
 	 */
 	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
-		return NULL;
+		goto out_fail;
 
 	ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
 
@@ -1497,7 +1534,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 
 			commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
 			if (commit == -EBUSY)
-				return NULL;
+				goto out_fail;
 
 			if (commit == -EAGAIN)
 				goto again;
@@ -1511,30 +1548,23 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	if (unlikely(PTR_ERR(event) == -EAGAIN))
 		goto again;
 
-	if (!event) {
-		if (unlikely(commit))
-			/*
-			 * Ouch! We needed a timestamp and it was commited. But
-			 * we didn't get our event reserved.
-			 */
-			rb_set_commit_to_write(cpu_buffer);
-		return NULL;
-	}
+	if (!event)
+		goto out_fail;
 
-	/*
-	 * If the timestamp was commited, make the commit our entry
-	 * now so that we will update it when needed.
-	 */
-	if (unlikely(commit))
-		rb_set_commit_event(cpu_buffer, event);
-	else if (!rb_is_commit(cpu_buffer, event))
+	if (!rb_event_is_commit(cpu_buffer, event))
 		delta = 0;
 
 	event->time_delta = delta;
 
 	return event;
+
+ out_fail:
+	rb_end_commit(cpu_buffer);
+	return NULL;
 }
 
+#ifdef CONFIG_TRACING
+
 #define TRACE_RECURSIVE_DEPTH 16
 
 static int trace_recursive_lock(void)
@@ -1565,6 +1595,13 @@ static void trace_recursive_unlock(void)
 	current->trace_recursion--;
 }
 
+#else
+
+#define trace_recursive_lock()		(0)
+#define trace_recursive_unlock()	do { } while (0)
+
+#endif
+
 static DEFINE_PER_CPU(int, rb_need_resched);
 
 /**
@@ -1642,13 +1679,14 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
 {
 	local_inc(&cpu_buffer->entries);
 
-	/* Only process further if we own the commit */
-	if (!rb_is_commit(cpu_buffer, event))
-		return;
-
-	cpu_buffer->write_stamp += event->time_delta;
+	/*
+	 * The event first in the commit queue updates the
+	 * time stamp.
+	 */
+	if (rb_event_is_commit(cpu_buffer, event))
+		cpu_buffer->write_stamp += event->time_delta;
 
-	rb_set_commit_to_write(cpu_buffer);
+	rb_end_commit(cpu_buffer);
 }
 
 /**
@@ -1737,15 +1775,15 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 	/* The event is discarded regardless */
 	rb_event_discard(event);
 
+	cpu = smp_processor_id();
+	cpu_buffer = buffer->buffers[cpu];
+
 	/*
 	 * This must only be called if the event has not been
 	 * committed yet. Thus we can assume that preemption
 	 * is still disabled.
 	 */
-	RB_WARN_ON(buffer, preemptible());
-
-	cpu = smp_processor_id();
-	cpu_buffer = buffer->buffers[cpu];
+	RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
 
 	if (!rb_try_to_discard(cpu_buffer, event))
 		goto out;
@@ -1756,13 +1794,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 	 */
 	local_inc(&cpu_buffer->entries);
  out:
-	/*
-	 * If a write came in and pushed the tail page
-	 * we still need to update the commit pointer
-	 * if we were the commit.
-	 */
-	if (rb_is_commit(cpu_buffer, event))
-		rb_set_commit_to_write(cpu_buffer);
+	rb_end_commit(cpu_buffer);
 
 	trace_recursive_unlock();
 
@@ -2446,6 +2478,21 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
 
+static inline int rb_ok_to_lock(void)
+{
+	/*
+	 * If an NMI die dumps out the content of the ring buffer
+	 * do not grab locks. We also permanently disable the ring
+	 * buffer too. A one time deal is all you get from reading
+	 * the ring buffer from an NMI.
+	 */
+	if (likely(!in_nmi() && !oops_in_progress))
+		return 1;
+
+	tracing_off_permanent();
+	return 0;
+}
+
 /**
  * ring_buffer_peek - peek at the next event to be read
  * @buffer: The ring buffer to read
@@ -2461,14 +2508,20 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 	struct ring_buffer_event *event;
 	unsigned long flags;
+	int dolock;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return NULL;
 
+	dolock = rb_ok_to_lock();
  again:
-	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	local_irq_save(flags);
+	if (dolock)
+		spin_lock(&cpu_buffer->reader_lock);
 	event = rb_buffer_peek(buffer, cpu, ts);
-	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+	if (dolock)
+		spin_unlock(&cpu_buffer->reader_lock);
+	local_irq_restore(flags);
 
 	if (event && event->type_len == RINGBUF_TYPE_PADDING) {
 		cpu_relax();
@@ -2520,6 +2573,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_event *event = NULL;
 	unsigned long flags;
+	int dolock;
+
+	dolock = rb_ok_to_lock();
 
  again:
 	/* might be called in atomic */
@@ -2529,7 +2585,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
 		goto out;
 
 	cpu_buffer = buffer->buffers[cpu];
-	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	local_irq_save(flags);
+	if (dolock)
+		spin_lock(&cpu_buffer->reader_lock);
 
 	event = rb_buffer_peek(buffer, cpu, ts);
 	if (!event)
@@ -2538,7 +2596,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
 	rb_advance_reader(cpu_buffer);
 
  out_unlock:
-	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+	if (dolock)
+		spin_unlock(&cpu_buffer->reader_lock);
+	local_irq_restore(flags);
 
  out:
 	preempt_enable();
@@ -2680,6 +2740,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->overrun = 0;
 	cpu_buffer->read = 0;
 	local_set(&cpu_buffer->entries, 0);
+	local_set(&cpu_buffer->committing, 0);
+	local_set(&cpu_buffer->commits, 0);
 
 	cpu_buffer->write_stamp = 0;
 	cpu_buffer->read_stamp = 0;
@@ -2734,12 +2796,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset);
 int ring_buffer_empty(struct ring_buffer *buffer)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long flags;
+	int dolock;
 	int cpu;
+	int ret;
+
+	dolock = rb_ok_to_lock();
 
 	/* yes this is racy, but if you don't like the race, lock the buffer */
 	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
-		if (!rb_per_cpu_empty(cpu_buffer))
+		local_irq_save(flags);
+		if (dolock)
+			spin_lock(&cpu_buffer->reader_lock);
+		ret = rb_per_cpu_empty(cpu_buffer);
+		if (dolock)
+			spin_unlock(&cpu_buffer->reader_lock);
+		local_irq_restore(flags);
+
+		if (!ret)
 			return 0;
 	}
 
@@ -2755,14 +2830,23 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
 int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long flags;
+	int dolock;
 	int ret;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return 1;
 
+	dolock = rb_ok_to_lock();
+
 	cpu_buffer = buffer->buffers[cpu];
+	local_irq_save(flags);
+	if (dolock)
+		spin_lock(&cpu_buffer->reader_lock);
 	ret = rb_per_cpu_empty(cpu_buffer);
-
+	if (dolock)
+		spin_unlock(&cpu_buffer->reader_lock);
+	local_irq_restore(flags);
 
 	return ret;
 }
@@ -3029,6 +3113,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_page);
 
+#ifdef CONFIG_TRACING
 static ssize_t
 rb_simple_read(struct file *filp, char __user *ubuf,
 	       size_t cnt, loff_t *ppos)
@@ -3096,6 +3181,7 @@ static __init int rb_init_debugfs(void)
 }
 
 fs_initcall(rb_init_debugfs);
+#endif
 
 #ifdef CONFIG_HOTPLUG_CPU
 static int rb_cpu_notify(struct notifier_block *self,
@@ -3108,7 +3194,7 @@ static int rb_cpu_notify(struct notifier_block *self,
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
-		if (cpu_isset(cpu, *buffer->cpumask))
+		if (cpumask_test_cpu(cpu, buffer->cpumask))
 			return NOTIFY_OK;
 
 		buffer->buffers[cpu] =
@@ -3119,7 +3205,7 @@ static int rb_cpu_notify(struct notifier_block *self,
 			return NOTIFY_OK;
 		}
 		smp_wmb();
-		cpu_set(cpu, *buffer->cpumask);
+		cpumask_set_cpu(cpu, buffer->cpumask);
 		break;
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 8d68e149a8b..573d3cc762c 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -102,8 +102,10 @@ static enum event_status read_page(int cpu)
 			event = (void *)&rpage->data[i];
 			switch (event->type_len) {
 			case RINGBUF_TYPE_PADDING:
-				/* We don't expect any padding */
-				KILL_TEST();
+				/* failed writes may be discarded events */
+				if (!event->time_delta)
+					KILL_TEST();
+				inc = event->array[0] + 4;
 				break;
 			case RINGBUF_TYPE_TIME_EXTEND:
 				inc = 8;
@@ -119,7 +121,7 @@ static enum event_status read_page(int cpu)
 					KILL_TEST();
 					break;
 				}
-				inc = event->array[0];
+				inc = event->array[0] + 4;
 				break;
 			default:
 				entry = ring_buffer_event_data(event);
@@ -201,7 +203,7 @@ static void ring_buffer_producer(void)
 	 * Hammer the buffer for 10 secs (this may
 	 * make the system stall)
 	 */
-	pr_info("Starting ring buffer hammer\n");
+	trace_printk("Starting ring buffer hammer\n");
 	do_gettimeofday(&start_tv);
 	do {
 		struct ring_buffer_event *event;
@@ -237,7 +239,7 @@ static void ring_buffer_producer(void)
 #endif
 
 	} while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
-	pr_info("End ring buffer hammer\n");
+	trace_printk("End ring buffer hammer\n");
 
 	if (consumer) {
 		/* Init both completions here to avoid races */
@@ -260,49 +262,50 @@ static void ring_buffer_producer(void)
 	overruns = ring_buffer_overruns(buffer);
 
 	if (kill_test)
-		pr_info("ERROR!\n");
-	pr_info("Time:     %lld (usecs)\n", time);
-	pr_info("Overruns: %lld\n", overruns);
+		trace_printk("ERROR!\n");
+	trace_printk("Time:     %lld (usecs)\n", time);
+	trace_printk("Overruns: %lld\n", overruns);
 	if (disable_reader)
-		pr_info("Read:     (reader disabled)\n");
+		trace_printk("Read:     (reader disabled)\n");
 	else
-		pr_info("Read:     %ld  (by %s)\n", read,
+		trace_printk("Read:     %ld  (by %s)\n", read,
 			read_events ? "events" : "pages");
-	pr_info("Entries:  %lld\n", entries);
-	pr_info("Total:    %lld\n", entries + overruns + read);
-	pr_info("Missed:   %ld\n", missed);
-	pr_info("Hit:      %ld\n", hit);
+	trace_printk("Entries:  %lld\n", entries);
+	trace_printk("Total:    %lld\n", entries + overruns + read);
+	trace_printk("Missed:   %ld\n", missed);
+	trace_printk("Hit:      %ld\n", hit);
 
 	/* Convert time from usecs to millisecs */
 	do_div(time, USEC_PER_MSEC);
 	if (time)
 		hit /= (long)time;
 	else
-		pr_info("TIME IS ZERO??\n");
+		trace_printk("TIME IS ZERO??\n");
 
-	pr_info("Entries per millisec: %ld\n", hit);
+	trace_printk("Entries per millisec: %ld\n", hit);
 
 	if (hit) {
 		/* Calculate the average time in nanosecs */
 		avg = NSEC_PER_MSEC / hit;
-		pr_info("%ld ns per entry\n", avg);
+		trace_printk("%ld ns per entry\n", avg);
 	}
 
 	if (missed) {
 		if (time)
 			missed /= (long)time;
 
-		pr_info("Total iterations per millisec: %ld\n", hit + missed);
+		trace_printk("Total iterations per millisec: %ld\n",
+			     hit + missed);
 
 		/* it is possible that hit + missed will overflow and be zero */
 		if (!(hit + missed)) {
-			pr_info("hit + missed overflowed and totalled zero!\n");
+			trace_printk("hit + missed overflowed and totalled zero!\n");
 			hit--; /* make it non zero */
 		}
 
 		/* Caculate the average time in nanosecs */
 		avg = NSEC_PER_MSEC / (hit + missed);
-		pr_info("%ld ns per entry\n", avg);
+		trace_printk("%ld ns per entry\n", avg);
 	}
 }
 
@@ -353,7 +356,7 @@ static int ring_buffer_producer_thread(void *arg)
 
 		ring_buffer_producer();
 
-		pr_info("Sleeping for 10 secs\n");
+		trace_printk("Sleeping for 10 secs\n");
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(HZ * SLEEP_TIME);
 		__set_current_state(TASK_RUNNING);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c1878bfb2e1..3aa0a0dfdfa 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -284,13 +284,12 @@ void trace_wake_up(void)
 static int __init set_buf_size(char *str)
 {
 	unsigned long buf_size;
-	int ret;
 
 	if (!str)
 		return 0;
-	ret = strict_strtoul(str, 0, &buf_size);
+	buf_size = memparse(str, &str);
 	/* nr_entries can not be zero */
-	if (ret < 0 || buf_size == 0)
+	if (buf_size == 0)
 		return 0;
 	trace_buf_size = buf_size;
 	return 1;
@@ -2053,25 +2052,23 @@ static int tracing_open(struct inode *inode, struct file *file)
 static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct tracer *t = m->private;
+	struct tracer *t = v;
 
 	(*pos)++;
 
 	if (t)
 		t = t->next;
 
-	m->private = t;
-
 	return t;
 }
 
 static void *t_start(struct seq_file *m, loff_t *pos)
 {
-	struct tracer *t = m->private;
+	struct tracer *t;
 	loff_t l = 0;
 
 	mutex_lock(&trace_types_lock);
-	for (; t && l < *pos; t = t_next(m, t, &l))
+	for (t = trace_types; t && l < *pos; t = t_next(m, t, &l))
 		;
 
 	return t;
@@ -2107,18 +2104,10 @@ static struct seq_operations show_traces_seq_ops = {
 
 static int show_traces_open(struct inode *inode, struct file *file)
 {
-	int ret;
-
 	if (tracing_disabled)
 		return -ENODEV;
 
-	ret = seq_open(file, &show_traces_seq_ops);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-		m->private = trace_types;
-	}
-
-	return ret;
+	return seq_open(file, &show_traces_seq_ops);
 }
 
 static ssize_t
@@ -2191,11 +2180,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
 	if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
 		return -ENOMEM;
 
-	mutex_lock(&tracing_cpumask_update_lock);
 	err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
 	if (err)
 		goto err_unlock;
 
+	mutex_lock(&tracing_cpumask_update_lock);
+
 	local_irq_disable();
 	__raw_spin_lock(&ftrace_max_lock);
 	for_each_tracing_cpu(cpu) {
@@ -2223,8 +2213,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
 	return count;
 
 err_unlock:
-	mutex_unlock(&tracing_cpumask_update_lock);
-	free_cpumask_var(tracing_cpumask);
+	free_cpumask_var(tracing_cpumask_new);
 
 	return err;
 }
@@ -3626,7 +3615,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
 	struct trace_seq *s;
 	unsigned long cnt;
 
-	s = kmalloc(sizeof(*s), GFP_ATOMIC);
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
 	if (!s)
 		return ENOMEM;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6e735d4771f..3548ae5cc78 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -597,6 +597,7 @@ print_graph_function(struct trace_iterator *iter)
 
 extern struct pid *ftrace_pid_trace;
 
+#ifdef CONFIG_FUNCTION_TRACER
 static inline int ftrace_trace_task(struct task_struct *task)
 {
 	if (!ftrace_pid_trace)
@@ -604,6 +605,12 @@ static inline int ftrace_trace_task(struct task_struct *task)
 
 	return test_tsk_trace_trace(task);
 }
+#else
+static inline int ftrace_trace_task(struct task_struct *task)
+{
+	return 1;
+}
+#endif
 
 /*
  * trace_iterator_flags is an enumeration that defines bit
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index aa08be69a1b..53c8fd376a8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -300,10 +300,18 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void *t_start(struct seq_file *m, loff_t *pos)
 {
+	struct ftrace_event_call *call = NULL;
+	loff_t l;
+
 	mutex_lock(&event_mutex);
-	if (*pos == 0)
-		m->private = ftrace_events.next;
-	return t_next(m, NULL, pos);
+
+	m->private = ftrace_events.next;
+	for (l = 0; l <= *pos; ) {
+		call = t_next(m, NULL, &l);
+		if (!call)
+			break;
+	}
+	return call;
 }
 
 static void *
@@ -332,10 +340,18 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void *s_start(struct seq_file *m, loff_t *pos)
 {
+	struct ftrace_event_call *call = NULL;
+	loff_t l;
+
 	mutex_lock(&event_mutex);
-	if (*pos == 0)
-		m->private = ftrace_events.next;
-	return s_next(m, NULL, pos);
+
+	m->private = ftrace_events.next;
+	for (l = 0; l <= *pos; ) {
+		call = s_next(m, NULL, &l);
+		if (!call)
+			break;
+	}
+	return call;
 }
 
 static int t_show(struct seq_file *m, void *v)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index db6e54bdb59..936c621bbf4 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -27,8 +27,6 @@
 #include "trace.h"
 #include "trace_output.h"
 
-static DEFINE_MUTEX(filter_mutex);
-
 enum filter_op_ids
 {
 	OP_OR,
@@ -178,7 +176,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
 static int filter_pred_strloc(struct filter_pred *pred, void *event,
 			      int val1, int val2)
 {
-	int str_loc = *(int *)(event + pred->offset);
+	unsigned short str_loc = *(unsigned short *)(event + pred->offset);
 	char *addr = (char *)(event + str_loc);
 	int cmp, match;
 
@@ -294,12 +292,12 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
 {
 	struct event_filter *filter = call->filter;
 
-	mutex_lock(&filter_mutex);
+	mutex_lock(&event_mutex);
 	if (filter->filter_string)
 		trace_seq_printf(s, "%s\n", filter->filter_string);
 	else
 		trace_seq_printf(s, "none\n");
-	mutex_unlock(&filter_mutex);
+	mutex_unlock(&event_mutex);
 }
 
 void print_subsystem_event_filter(struct event_subsystem *system,
@@ -307,12 +305,12 @@ void print_subsystem_event_filter(struct event_subsystem *system,
 {
 	struct event_filter *filter = system->filter;
 
-	mutex_lock(&filter_mutex);
+	mutex_lock(&event_mutex);
 	if (filter->filter_string)
 		trace_seq_printf(s, "%s\n", filter->filter_string);
 	else
 		trace_seq_printf(s, "none\n");
-	mutex_unlock(&filter_mutex);
+	mutex_unlock(&event_mutex);
 }
 
 static struct ftrace_event_field *
@@ -381,6 +379,7 @@ void destroy_preds(struct ftrace_event_call *call)
 			filter_free_pred(filter->preds[i]);
 	}
 	kfree(filter->preds);
+	kfree(filter->filter_string);
 	kfree(filter);
 	call->filter = NULL;
 }
@@ -433,7 +432,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
 		filter->n_preds = 0;
 	}
 
-	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
 		if (!call->define_fields)
 			continue;
@@ -443,7 +441,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
 			remove_filter_string(call->filter);
 		}
 	}
-	mutex_unlock(&event_mutex);
 }
 
 static int filter_add_pred_fn(struct filter_parse_state *ps,
@@ -546,6 +543,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
 	filter_pred_fn_t fn;
 	unsigned long long val;
 	int string_type;
+	int ret;
 
 	pred->fn = filter_pred_none;
 
@@ -581,7 +579,11 @@ static int filter_add_pred(struct filter_parse_state *ps,
 			pred->not = 1;
 		return filter_add_pred_fn(ps, call, pred, fn);
 	} else {
-		if (strict_strtoull(pred->str_val, 0, &val)) {
+		if (field->is_signed)
+			ret = strict_strtoll(pred->str_val, 0, &val);
+		else
+			ret = strict_strtoull(pred->str_val, 0, &val);
+		if (ret) {
 			parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
 			return -EINVAL;
 		}
@@ -625,7 +627,6 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
 	filter->preds[filter->n_preds] = pred;
 	filter->n_preds++;
 
-	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
 
 		if (!call->define_fields)
@@ -636,14 +637,12 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
 
 		err = filter_add_pred(ps, call, pred);
 		if (err) {
-			mutex_unlock(&event_mutex);
 			filter_free_subsystem_preds(system);
 			parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
 			goto out;
 		}
 		replace_filter_string(call->filter, filter_string);
 	}
-	mutex_unlock(&event_mutex);
 out:
 	return err;
 }
@@ -1070,12 +1069,12 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
 
 	struct filter_parse_state *ps;
 
-	mutex_lock(&filter_mutex);
+	mutex_lock(&event_mutex);
 
 	if (!strcmp(strstrip(filter_string), "0")) {
 		filter_disable_preds(call);
 		remove_filter_string(call->filter);
-		mutex_unlock(&filter_mutex);
+		mutex_unlock(&event_mutex);
 		return 0;
 	}
 
@@ -1103,7 +1102,7 @@ out:
 	postfix_clear(ps);
 	kfree(ps);
 out_unlock:
-	mutex_unlock(&filter_mutex);
+	mutex_unlock(&event_mutex);
 
 	return err;
 }
@@ -1115,12 +1114,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
 
 	struct filter_parse_state *ps;
 
-	mutex_lock(&filter_mutex);
+	mutex_lock(&event_mutex);
 
 	if (!strcmp(strstrip(filter_string), "0")) {
 		filter_free_subsystem_preds(system);
 		remove_filter_string(system->filter);
-		mutex_unlock(&filter_mutex);
+		mutex_unlock(&event_mutex);
 		return 0;
 	}
 
@@ -1148,7 +1147,7 @@ out:
 	postfix_clear(ps);
 	kfree(ps);
 out_unlock:
-	mutex_unlock(&filter_mutex);
+	mutex_unlock(&event_mutex);
 
 	return err;
 }
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c9a0b7df44f..7402144bff2 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -193,9 +193,11 @@ static void tracing_start_function_trace(void)
 static void tracing_stop_function_trace(void)
 {
 	ftrace_function_enabled = 0;
-	/* OK if they are not registered */
-	unregister_ftrace_function(&trace_stack_ops);
-	unregister_ftrace_function(&trace_ops);
+
+	if (func_flags.val & TRACE_FUNC_OPT_STACK)
+		unregister_ftrace_function(&trace_stack_ops);
+	else
+		unregister_ftrace_function(&trace_ops);
 }
 
 static int func_set_flag(u32 old_flags, u32 bit, int set)
@@ -300,8 +302,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
 	if (count == -1)
 		seq_printf(m, ":unlimited\n");
 	else
-		seq_printf(m, ":count=%ld", count);
-	seq_putc(m, '\n');
+		seq_printf(m, ":count=%ld\n", count);
 
 	return 0;
 }
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8b592418d8b..d2249abafb5 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -57,7 +57,8 @@ static struct tracer_flags tracer_flags = {
 
 /* Add a function return address to the trace stack on thread info.*/
 int
-ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
+ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
+			 unsigned long frame_pointer)
 {
 	unsigned long long calltime;
 	int index;
@@ -85,6 +86,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
 	current->ret_stack[index].func = func;
 	current->ret_stack[index].calltime = calltime;
 	current->ret_stack[index].subtime = 0;
+	current->ret_stack[index].fp = frame_pointer;
 	*depth = index;
 
 	return 0;
@@ -92,7 +94,8 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
 
 /* Retrieve a function return address to the trace stack on thread info.*/
 static void
-ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
+ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
+			unsigned long frame_pointer)
 {
 	int index;
 
@@ -106,6 +109,31 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
 		return;
 	}
 
+#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
+	/*
+	 * The arch may choose to record the frame pointer used
+	 * and check it here to make sure that it is what we expect it
+	 * to be. If gcc does not set the place holder of the return
+	 * address in the frame pointer, and does a copy instead, then
+	 * the function graph trace will fail. This test detects this
+	 * case.
+	 *
+	 * Currently, x86_32 with optimize for size (-Os) makes the latest
+	 * gcc do the above.
+	 */
+	if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
+		ftrace_graph_stop();
+		WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
+		     "  from func %pF return to %lx\n",
+		     current->ret_stack[index].fp,
+		     frame_pointer,
+		     (void *)current->ret_stack[index].func,
+		     current->ret_stack[index].ret);
+		*ret = (unsigned long)panic;
+		return;
+	}
+#endif
+
 	*ret = current->ret_stack[index].ret;
 	trace->func = current->ret_stack[index].func;
 	trace->calltime = current->ret_stack[index].calltime;
@@ -117,12 +145,12 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
  * Send the trace to the ring-buffer.
  * @return the original return address.
  */
-unsigned long ftrace_return_to_handler(void)
+unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
 {
 	struct ftrace_graph_ret trace;
 	unsigned long ret;
 
-	ftrace_pop_return_trace(&trace, &ret);
+	ftrace_pop_return_trace(&trace, &ret, frame_pointer);
 	trace.rettime = trace_clock_local();
 	ftrace_graph_return(&trace);
 	barrier();
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 9bece9687b6..7b627811082 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -155,25 +155,19 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
 EXPORT_SYMBOL_GPL(__ftrace_vprintk);
 
 static void *
-t_next(struct seq_file *m, void *v, loff_t *pos)
+t_start(struct seq_file *m, loff_t *pos)
 {
-	const char **fmt = m->private;
-	const char **next = fmt;
-
-	(*pos)++;
+	const char **fmt = __start___trace_bprintk_fmt + *pos;
 
 	if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
 		return NULL;
-
-	next = fmt;
-	m->private = ++next;
-
 	return fmt;
 }
 
-static void *t_start(struct seq_file *m, loff_t *pos)
+static void *t_next(struct seq_file *m, void * v, loff_t *pos)
 {
-	return t_next(m, NULL, pos);
+	(*pos)++;
+	return t_start(m, pos);
 }
 
 static int t_show(struct seq_file *m, void *v)
@@ -224,15 +218,7 @@ static const struct seq_operations show_format_seq_ops = {
 static int
 ftrace_formats_open(struct inode *inode, struct file *file)
 {
-	int ret;
-
-	ret = seq_open(file, &show_format_seq_ops);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-
-		m->private = __start___trace_bprintk_fmt;
-	}
-	return ret;
+	return seq_open(file, &show_format_seq_ops);
 }
 
 static const struct file_operations ftrace_formats_fops = {
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index c00643733f4..e66f5e49334 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -199,17 +199,13 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 	mutex_lock(&session->stat_mutex);
 
 	/* If we are in the beginning of the file, print the headers */
-	if (!*pos && session->ts->stat_headers) {
-		(*pos)++;
+	if (!*pos && session->ts->stat_headers)
 		return SEQ_START_TOKEN;
-	}
 
 	node = rb_first(&session->stat_root);
 	for (i = 0; node && i < *pos; i++)
 		node = rb_next(node);
 
-	(*pos)++;
-
 	return node;
 }
author	James Morris <jmorris@namei.org>	2009-06-30 09:10:35 +1000
committer	James Morris <jmorris@namei.org>	2009-06-30 09:10:35 +1000
commit	ac7242142b03421c96b0a2f8d99f146d075614c2 (patch)
tree	b0b2ead65858c7a343d38affed86fe815e37e7e9 /kernel
parent	89c86576ecde504da1eeb4f4882b2189ac2f9c4a (diff)
parent	2bfdd79eaa0043346e773ba5f6cfd811ea31b73d (diff)