From 1b2439dbb703ae8d95a9ce7ece6b7800b80f41f0 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 15 Aug 2008 15:29:38 -0700
Subject: debug: add notifier chain debugging

during some development we suspected a case where we left something
in a notifier chain that was from a module that was unloaded already...
and that sort of thing is rather hard to track down.

This patch adds a very simple sanity check (which isn't all that
expensive) to make sure the notifier we're about to call is
actually from either the kernel itself of from a still-loaded
module, avoiding a hard-to-chase-down crash.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/notifier.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'kernel')

diff --git a/kernel/notifier.c b/kernel/notifier.c
index 823be11584e..143fdd77dbf 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -21,6 +21,10 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
 static int notifier_chain_register(struct notifier_block **nl,
 		struct notifier_block *n)
 {
+	if (!kernel_text_address((unsigned long)n->notifier_call)) {
+		WARN(1, "Invalid notifier registered!");
+		return 0;
+	}
 	while ((*nl) != NULL) {
 		if (n->priority > (*nl)->priority)
 			break;
@@ -34,6 +38,10 @@ static int notifier_chain_register(struct notifier_block **nl,
 static int notifier_chain_cond_register(struct notifier_block **nl,
 		struct notifier_block *n)
 {
+	if (!kernel_text_address((unsigned long)n->notifier_call)) {
+		WARN(1, "Invalid notifier registered!");
+		return 0;
+	}
 	while ((*nl) != NULL) {
 		if ((*nl) == n)
 			return 0;
@@ -82,6 +90,14 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
 
 	while (nb && nr_to_call) {
 		next_nb = rcu_dereference(nb->next);
+
+#ifdef CONFIG_DEBUG_NOTIFIERS
+		if (!kernel_text_address((unsigned long)nb->notifier_call)) {
+			WARN(1, "Invalid notifier called!");
+			nb = next_nb;
+			continue;
+		}
+#endif
 		ret = nb->notifier_call(nb, val, v);
 
 		if (nr_calls)
-- 
cgit v1.2.3


From fb822db465bd9fd4208eef1af4490539b236c54e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 20 Aug 2008 11:17:40 +0200
Subject: softlockup: increase hung tasks check from 2 minutes to 8 minutes

Andrew says:

> Seems that about 100% of the reports we get of this warning triggering
> are sys_sync, transaction commit, etc.

increase the timeout. If it still triggers for people, we can kill it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/softlockup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index b75b492fbfc..17a05806533 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -164,7 +164,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
 /*
  * Zero means infinite timeout - no checking done:
  */
-unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
+unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
 
 unsigned long __read_mostly sysctl_hung_task_warnings = 10;
 
-- 
cgit v1.2.3


From ab7476cf76e560f0efda2a631a70aabe93009025 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 15 Aug 2008 15:29:38 -0700
Subject: debug: add notifier chain debugging, v2

- unbreak ia64 (and powerpc) where function pointers dont
  point at code but at data (reported by Tony Luck)

[ mingo@elte.hu: various cleanups ]

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/extable.c  | 16 ++++++++++++++++
 kernel/notifier.c | 10 +---------
 2 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/extable.c b/kernel/extable.c
index a26cb2e1702..adf0cc9c02d 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -66,3 +66,19 @@ int kernel_text_address(unsigned long addr)
 		return 1;
 	return module_text_address(addr) != NULL;
 }
+
+/*
+ * On some architectures (PPC64, IA64) function pointers
+ * are actually only tokens to some data that then holds the
+ * real function address. As a result, to find if a function
+ * pointer is part of the kernel text, we need to do some
+ * special dereferencing first.
+ */
+int func_ptr_is_kernel_text(void *ptr)
+{
+	unsigned long addr;
+	addr = (unsigned long) dereference_function_descriptor(ptr);
+	if (core_kernel_text(addr))
+		return 1;
+	return module_text_address(addr) != NULL;
+}
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 143fdd77dbf..0f39e398ef6 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -21,10 +21,6 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
 static int notifier_chain_register(struct notifier_block **nl,
 		struct notifier_block *n)
 {
-	if (!kernel_text_address((unsigned long)n->notifier_call)) {
-		WARN(1, "Invalid notifier registered!");
-		return 0;
-	}
 	while ((*nl) != NULL) {
 		if (n->priority > (*nl)->priority)
 			break;
@@ -38,10 +34,6 @@ static int notifier_chain_register(struct notifier_block **nl,
 static int notifier_chain_cond_register(struct notifier_block **nl,
 		struct notifier_block *n)
 {
-	if (!kernel_text_address((unsigned long)n->notifier_call)) {
-		WARN(1, "Invalid notifier registered!");
-		return 0;
-	}
 	while ((*nl) != NULL) {
 		if ((*nl) == n)
 			return 0;
@@ -92,7 +84,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
 		next_nb = rcu_dereference(nb->next);
 
 #ifdef CONFIG_DEBUG_NOTIFIERS
-		if (!kernel_text_address((unsigned long)nb->notifier_call)) {
+		if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
 			WARN(1, "Invalid notifier called!");
 			nb = next_nb;
 			continue;
-- 
cgit v1.2.3


From 6918bc5c830e890681eabb3c6cb6b8d117a52d14 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 23 Sep 2008 15:33:41 +0200
Subject: lockstat: fixup signed division

Some recent modification to this code made me notice the little todo mark.
Now that we have more elaborate 64-bit division functions this isn't hard.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep_proc.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 20dbcbf9c7d..8d3a6eba8d5 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -470,11 +470,12 @@ static void seq_line(struct seq_file *m, char c, int offset, int length)
 
 static void snprint_time(char *buf, size_t bufsiz, s64 nr)
 {
-	unsigned long rem;
+	s64 div;
+	s32 rem;
 
 	nr += 5; /* for display rounding */
-	rem = do_div(nr, 1000); /* XXX: do_div_signed */
-	snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10);
+	div = div_s64_rem(nr, 1000, &rem);
+	snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10);
 }
 
 static void seq_time(struct seq_file *m, s64 time)
-- 
cgit v1.2.3


From 38d47c1b7075bd7ec3881141bb3629da58f88dab Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 26 Sep 2008 19:32:20 +0200
Subject: futex: rely on get_user_pages() for shared futexes

On the way of getting rid of the mmap_sem requirement for shared futexes,
start by relying on get_user_pages().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 162 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 80 insertions(+), 82 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 7d1136e97c1..a4c39fa0a7a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -161,6 +161,45 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
 		&& key1->both.offset == key2->both.offset);
 }
 
+/*
+ * Take a reference to the resource addressed by a key.
+ * Can be called while holding spinlocks.
+ *
+ */
+static void get_futex_key_refs(union futex_key *key)
+{
+	if (!key->both.ptr)
+		return;
+
+	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+	case FUT_OFF_INODE:
+		atomic_inc(&key->shared.inode->i_count);
+		break;
+	case FUT_OFF_MMSHARED:
+		atomic_inc(&key->private.mm->mm_count);
+		break;
+	}
+}
+
+/*
+ * Drop a reference to the resource addressed by a key.
+ * The hash bucket spinlock must not be held.
+ */
+static void drop_futex_key_refs(union futex_key *key)
+{
+	if (!key->both.ptr)
+		return;
+
+	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+	case FUT_OFF_INODE:
+		iput(key->shared.inode);
+		break;
+	case FUT_OFF_MMSHARED:
+		mmdrop(key->private.mm);
+		break;
+	}
+}
+
 /**
  * get_futex_key - Get parameters which are the keys for a futex.
  * @uaddr: virtual address of the futex
@@ -184,7 +223,6 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
 	struct page *page;
 	int err;
 
@@ -210,98 +248,47 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
 		key->private.address = address;
 		return 0;
 	}
-	/*
-	 * The futex is hashed differently depending on whether
-	 * it's in a shared or private mapping.  So check vma first.
-	 */
-	vma = find_extend_vma(mm, address);
-	if (unlikely(!vma))
-		return -EFAULT;
 
-	/*
-	 * Permissions.
-	 */
-	if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
-		return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
+again:
+	err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
+	if (err < 0)
+		return err;
+
+	lock_page(page);
+	if (!page->mapping) {
+		unlock_page(page);
+		put_page(page);
+		goto again;
+	}
 
 	/*
 	 * Private mappings are handled in a simple way.
 	 *
 	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
 	 * it's a read-only handle, it's expected that futexes attach to
-	 * the object not the particular process.  Therefore we use
-	 * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
-	 * mappings of _writable_ handles.
+	 * the object not the particular process.
 	 */
-	if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
-		key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
+	if (PageAnon(page)) {
+		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
 		key->private.mm = mm;
 		key->private.address = address;
-		return 0;
-	}
-
-	/*
-	 * Linear file mappings are also simple.
-	 */
-	key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
-	key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
-	if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
-		key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
-				     + vma->vm_pgoff);
-		return 0;
+	} else {
+		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
+		key->shared.inode = page->mapping->host;
+		key->shared.pgoff = page->index;
 	}
 
-	/*
-	 * We could walk the page table to read the non-linear
-	 * pte, and get the page index without fetching the page
-	 * from swap.  But that's a lot of code to duplicate here
-	 * for a rare case, so we simply fetch the page.
-	 */
-	err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
-	if (err >= 0) {
-		key->shared.pgoff =
-			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-		put_page(page);
-		return 0;
-	}
-	return err;
-}
+	get_futex_key_refs(key);
 
-/*
- * Take a reference to the resource addressed by a key.
- * Can be called while holding spinlocks.
- *
- */
-static void get_futex_key_refs(union futex_key *key)
-{
-	if (key->both.ptr == NULL)
-		return;
-	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
-		case FUT_OFF_INODE:
-			atomic_inc(&key->shared.inode->i_count);
-			break;
-		case FUT_OFF_MMSHARED:
-			atomic_inc(&key->private.mm->mm_count);
-			break;
-	}
+	unlock_page(page);
+	put_page(page);
+	return 0;
 }
 
-/*
- * Drop a reference to the resource addressed by a key.
- * The hash bucket spinlock must not be held.
- */
-static void drop_futex_key_refs(union futex_key *key)
+static inline
+void put_futex_key(struct rw_semaphore *fshared, union futex_key *key)
 {
-	if (!key->both.ptr)
-		return;
-	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
-		case FUT_OFF_INODE:
-			iput(key->shared.inode);
-			break;
-		case FUT_OFF_MMSHARED:
-			mmdrop(key->private.mm);
-			break;
-	}
+	drop_futex_key_refs(key);
 }
 
 static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
@@ -385,6 +372,7 @@ static int refill_pi_state_cache(void)
 	/* pi_mutex gets initialized later */
 	pi_state->owner = NULL;
 	atomic_set(&pi_state->refcount, 1);
+	pi_state->key = FUTEX_KEY_INIT;
 
 	current->pi_state_cache = pi_state;
 
@@ -462,7 +450,7 @@ void exit_pi_state_list(struct task_struct *curr)
 	struct list_head *next, *head = &curr->pi_state_list;
 	struct futex_pi_state *pi_state;
 	struct futex_hash_bucket *hb;
-	union futex_key key;
+	union futex_key key = FUTEX_KEY_INIT;
 
 	if (!futex_cmpxchg_enabled)
 		return;
@@ -725,7 +713,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
 	struct plist_head *head;
-	union futex_key key;
+	union futex_key key = FUTEX_KEY_INIT;
 	int ret;
 
 	if (!bitset)
@@ -760,6 +748,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
 
 	spin_unlock(&hb->lock);
 out:
+	put_futex_key(fshared, &key);
 	futex_unlock_mm(fshared);
 	return ret;
 }
@@ -773,7 +762,7 @@ futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
 	      u32 __user *uaddr2,
 	      int nr_wake, int nr_wake2, int op)
 {
-	union futex_key key1, key2;
+	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
 	struct futex_hash_bucket *hb1, *hb2;
 	struct plist_head *head;
 	struct futex_q *this, *next;
@@ -873,6 +862,8 @@ retry:
 	if (hb1 != hb2)
 		spin_unlock(&hb2->lock);
 out:
+	put_futex_key(fshared, &key2);
+	put_futex_key(fshared, &key1);
 	futex_unlock_mm(fshared);
 
 	return ret;
@@ -886,7 +877,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
 			 u32 __user *uaddr2,
 			 int nr_wake, int nr_requeue, u32 *cmpval)
 {
-	union futex_key key1, key2;
+	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
 	struct futex_hash_bucket *hb1, *hb2;
 	struct plist_head *head1;
 	struct futex_q *this, *next;
@@ -974,6 +965,8 @@ out_unlock:
 		drop_futex_key_refs(&key1);
 
 out:
+	put_futex_key(fshared, &key2);
+	put_futex_key(fshared, &key1);
 	futex_unlock_mm(fshared);
 	return ret;
 }
@@ -1220,6 +1213,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
  retry:
 	futex_lock_mm(fshared);
 
+	q.key = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr, fshared, &q.key);
 	if (unlikely(ret != 0))
 		goto out_release_sem;
@@ -1360,6 +1354,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 	queue_unlock(&q, hb);
 
  out_release_sem:
+	put_futex_key(fshared, &q.key);
 	futex_unlock_mm(fshared);
 	return ret;
 }
@@ -1411,6 +1406,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
  retry:
 	futex_lock_mm(fshared);
 
+	q.key = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr, fshared, &q.key);
 	if (unlikely(ret != 0))
 		goto out_release_sem;
@@ -1625,6 +1621,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 	queue_unlock(&q, hb);
 
  out_release_sem:
+	put_futex_key(fshared, &q.key);
 	futex_unlock_mm(fshared);
 	if (to)
 		destroy_hrtimer_on_stack(&to->timer);
@@ -1671,7 +1668,7 @@ static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
 	struct futex_q *this, *next;
 	u32 uval;
 	struct plist_head *head;
-	union futex_key key;
+	union futex_key key = FUTEX_KEY_INIT;
 	int ret, attempt = 0;
 
 retry:
@@ -1744,6 +1741,7 @@ retry_unlocked:
 out_unlock:
 	spin_unlock(&hb->lock);
 out:
+	put_futex_key(fshared, &key);
 	futex_unlock_mm(fshared);
 
 	return ret;
-- 
cgit v1.2.3


From 61270708ecf1cda148e84fbf6e0703ee5aa81814 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 26 Sep 2008 19:32:21 +0200
Subject: futex: reduce mmap_sem usage

now that we rely on get_user_pages() for the shared key handling
move all the mmap_sem stuff closely around the slow paths.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 83 +++-------------------------------------------------------
 1 file changed, 4 insertions(+), 79 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index a4c39fa0a7a..6a726684217 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -122,24 +122,6 @@ struct futex_hash_bucket {
 
 static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
 
-/*
- * Take mm->mmap_sem, when futex is shared
- */
-static inline void futex_lock_mm(struct rw_semaphore *fshared)
-{
-	if (fshared)
-		down_read(fshared);
-}
-
-/*
- * Release mm->mmap_sem, when the futex is shared
- */
-static inline void futex_unlock_mm(struct rw_semaphore *fshared)
-{
-	if (fshared)
-		up_read(fshared);
-}
-
 /*
  * We hash on the keys returned from get_futex_key (see below).
  */
@@ -250,7 +232,9 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
 	}
 
 again:
+	down_read(&mm->mmap_sem);
 	err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
+	up_read(&mm->mmap_sem);
 	if (err < 0)
 		return err;
 
@@ -327,8 +311,7 @@ static int futex_handle_fault(unsigned long address,
 	if (attempt > 2)
 		return ret;
 
-	if (!fshared)
-		down_read(&mm->mmap_sem);
+	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, address);
 	if (vma && address >= vma->vm_start &&
 	    (vma->vm_flags & VM_WRITE)) {
@@ -348,8 +331,7 @@ static int futex_handle_fault(unsigned long address,
 				current->min_flt++;
 		}
 	}
-	if (!fshared)
-		up_read(&mm->mmap_sem);
+	up_read(&mm->mmap_sem);
 	return ret;
 }
 
@@ -719,8 +701,6 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
 	if (!bitset)
 		return -EINVAL;
 
-	futex_lock_mm(fshared);
-
 	ret = get_futex_key(uaddr, fshared, &key);
 	if (unlikely(ret != 0))
 		goto out;
@@ -749,7 +729,6 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
 	spin_unlock(&hb->lock);
 out:
 	put_futex_key(fshared, &key);
-	futex_unlock_mm(fshared);
 	return ret;
 }
 
@@ -769,8 +748,6 @@ futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
 	int ret, op_ret, attempt = 0;
 
 retryfull:
-	futex_lock_mm(fshared);
-
 	ret = get_futex_key(uaddr1, fshared, &key1);
 	if (unlikely(ret != 0))
 		goto out;
@@ -821,12 +798,6 @@ retry:
 			goto retry;
 		}
 
-		/*
-		 * If we would have faulted, release mmap_sem,
-		 * fault it in and start all over again.
-		 */
-		futex_unlock_mm(fshared);
-
 		ret = get_user(dummy, uaddr2);
 		if (ret)
 			return ret;
@@ -864,7 +835,6 @@ retry:
 out:
 	put_futex_key(fshared, &key2);
 	put_futex_key(fshared, &key1);
-	futex_unlock_mm(fshared);
 
 	return ret;
 }
@@ -884,8 +854,6 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
 	int ret, drop_count = 0;
 
  retry:
-	futex_lock_mm(fshared);
-
 	ret = get_futex_key(uaddr1, fshared, &key1);
 	if (unlikely(ret != 0))
 		goto out;
@@ -908,12 +876,6 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
 			if (hb1 != hb2)
 				spin_unlock(&hb2->lock);
 
-			/*
-			 * If we would have faulted, release mmap_sem, fault
-			 * it in and start all over again.
-			 */
-			futex_unlock_mm(fshared);
-
 			ret = get_user(curval, uaddr1);
 
 			if (!ret)
@@ -967,7 +929,6 @@ out_unlock:
 out:
 	put_futex_key(fshared, &key2);
 	put_futex_key(fshared, &key1);
-	futex_unlock_mm(fshared);
 	return ret;
 }
 
@@ -1211,8 +1172,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 	q.pi_state = NULL;
 	q.bitset = bitset;
  retry:
-	futex_lock_mm(fshared);
-
 	q.key = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr, fshared, &q.key);
 	if (unlikely(ret != 0))
@@ -1245,12 +1204,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 	if (unlikely(ret)) {
 		queue_unlock(&q, hb);
 
-		/*
-		 * If we would have faulted, release mmap_sem, fault it in and
-		 * start all over again.
-		 */
-		futex_unlock_mm(fshared);
-
 		ret = get_user(uval, uaddr);
 
 		if (!ret)
@@ -1264,12 +1217,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 	/* Only actually queue if *uaddr contained val.  */
 	queue_me(&q, hb);
 
-	/*
-	 * Now the futex is queued and we have checked the data, we
-	 * don't want to hold mmap_sem while we sleep.
-	 */
-	futex_unlock_mm(fshared);
-
 	/*
 	 * There might have been scheduling since the queue_me(), as we
 	 * cannot hold a spinlock across the get_user() in case it
@@ -1355,7 +1302,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 
  out_release_sem:
 	put_futex_key(fshared, &q.key);
-	futex_unlock_mm(fshared);
 	return ret;
 }
 
@@ -1404,8 +1350,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 
 	q.pi_state = NULL;
  retry:
-	futex_lock_mm(fshared);
-
 	q.key = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr, fshared, &q.key);
 	if (unlikely(ret != 0))
@@ -1495,7 +1439,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 			 * exit to complete.
 			 */
 			queue_unlock(&q, hb);
-			futex_unlock_mm(fshared);
 			cond_resched();
 			goto retry;
 
@@ -1527,12 +1470,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 	 */
 	queue_me(&q, hb);
 
-	/*
-	 * Now the futex is queued and we have checked the data, we
-	 * don't want to hold mmap_sem while we sleep.
-	 */
-	futex_unlock_mm(fshared);
-
 	WARN_ON(!q.pi_state);
 	/*
 	 * Block on the PI mutex:
@@ -1545,7 +1482,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 		ret = ret ? 0 : -EWOULDBLOCK;
 	}
 
-	futex_lock_mm(fshared);
 	spin_lock(q.lock_ptr);
 
 	if (!ret) {
@@ -1611,7 +1547,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 
 	/* Unqueue and drop the lock */
 	unqueue_me_pi(&q);
-	futex_unlock_mm(fshared);
 
 	if (to)
 		destroy_hrtimer_on_stack(&to->timer);
@@ -1622,7 +1557,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 
  out_release_sem:
 	put_futex_key(fshared, &q.key);
-	futex_unlock_mm(fshared);
 	if (to)
 		destroy_hrtimer_on_stack(&to->timer);
 	return ret;
@@ -1646,8 +1580,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 		goto retry_unlocked;
 	}
 
-	futex_unlock_mm(fshared);
-
 	ret = get_user(uval, uaddr);
 	if (!ret && (uval != -EFAULT))
 		goto retry;
@@ -1679,10 +1611,6 @@ retry:
 	 */
 	if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
 		return -EPERM;
-	/*
-	 * First take all the futex related locks:
-	 */
-	futex_lock_mm(fshared);
 
 	ret = get_futex_key(uaddr, fshared, &key);
 	if (unlikely(ret != 0))
@@ -1742,7 +1670,6 @@ out_unlock:
 	spin_unlock(&hb->lock);
 out:
 	put_futex_key(fshared, &key);
-	futex_unlock_mm(fshared);
 
 	return ret;
 
@@ -1766,8 +1693,6 @@ pi_faulted:
 		goto retry_unlocked;
 	}
 
-	futex_unlock_mm(fshared);
-
 	ret = get_user(uval, uaddr);
 	if (!ret && (uval != -EFAULT))
 		goto retry;
-- 
cgit v1.2.3


From 734b05b10e51d4ba38c8fc3ee02e846aab09eedf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 26 Sep 2008 19:32:22 +0200
Subject: futex: use fast_gup()

Change the get_user_pages() call with fast_gup() which doesn't require holding
the mmap_sem thereby removing the mmap_sem from all fast paths.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 6a726684217..facf17d1a70 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -232,9 +232,7 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
 	}
 
 again:
-	down_read(&mm->mmap_sem);
-	err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
-	up_read(&mm->mmap_sem);
+	err = get_user_pages_fast(address, 1, 0, &page);
 	if (err < 0)
 		return err;
 
-- 
cgit v1.2.3


From c2f9f20154bfb137ccdf8c9159992429a40dfe20 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 26 Sep 2008 19:32:23 +0200
Subject: futex: cleanup fshared

fshared doesn't need to be a rw_sem pointer anymore, so clean that up.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 48 +++++++++++++++++++-----------------------------
 1 file changed, 19 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index facf17d1a70..60b47bb9e3d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -200,8 +200,7 @@ static void drop_futex_key_refs(union futex_key *key)
  * For other futexes, it points to &current->mm->mmap_sem and
  * caller must have taken the reader lock. but NOT any spinlocks.
  */
-static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
-			 union futex_key *key)
+static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
@@ -268,7 +267,7 @@ again:
 }
 
 static inline
-void put_futex_key(struct rw_semaphore *fshared, union futex_key *key)
+void put_futex_key(int fshared, union futex_key *key)
 {
 	drop_futex_key_refs(key);
 }
@@ -297,10 +296,8 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
 
 /*
  * Fault handling.
- * if fshared is non NULL, current->mm->mmap_sem is already held
  */
-static int futex_handle_fault(unsigned long address,
-			      struct rw_semaphore *fshared, int attempt)
+static int futex_handle_fault(unsigned long address, int attempt)
 {
 	struct vm_area_struct * vma;
 	struct mm_struct *mm = current->mm;
@@ -687,8 +684,7 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
-static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
-		      int nr_wake, u32 bitset)
+static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
@@ -735,8 +731,7 @@ out:
  * to this virtual address:
  */
 static int
-futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
-	      u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
 	      int nr_wake, int nr_wake2, int op)
 {
 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -790,7 +785,7 @@ retry:
 		 */
 		if (attempt++) {
 			ret = futex_handle_fault((unsigned long)uaddr2,
-						 fshared, attempt);
+						 attempt);
 			if (ret)
 				goto out;
 			goto retry;
@@ -841,8 +836,7 @@ out:
  * Requeue all waiters hashed on one physical page to another
  * physical page.
  */
-static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
-			 u32 __user *uaddr2,
+static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
 			 int nr_wake, int nr_requeue, u32 *cmpval)
 {
 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -1048,8 +1042,7 @@ static void unqueue_me_pi(struct futex_q *q)
  * private futexes.
  */
 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-				struct task_struct *newowner,
-				struct rw_semaphore *fshared)
+				struct task_struct *newowner, int fshared)
 {
 	u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
 	struct futex_pi_state *pi_state = q->pi_state;
@@ -1128,7 +1121,7 @@ retry:
 handle_fault:
 	spin_unlock(q->lock_ptr);
 
-	ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++);
+	ret = futex_handle_fault((unsigned long)uaddr, attempt++);
 
 	spin_lock(q->lock_ptr);
 
@@ -1152,7 +1145,7 @@ handle_fault:
 
 static long futex_wait_restart(struct restart_block *restart);
 
-static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
+static int futex_wait(u32 __user *uaddr, int fshared,
 		      u32 val, ktime_t *abs_time, u32 bitset)
 {
 	struct task_struct *curr = current;
@@ -1307,13 +1300,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 static long futex_wait_restart(struct restart_block *restart)
 {
 	u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
-	struct rw_semaphore *fshared = NULL;
+	int fshared = 0;
 	ktime_t t;
 
 	t.tv64 = restart->futex.time;
 	restart->fn = do_no_restart_syscall;
 	if (restart->futex.flags & FLAGS_SHARED)
-		fshared = &current->mm->mmap_sem;
+		fshared = 1;
 	return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
 				restart->futex.bitset);
 }
@@ -1325,7 +1318,7 @@ static long futex_wait_restart(struct restart_block *restart)
  * if there are waiters then it will block, it does PI, etc. (Due to
  * races the kernel might see a 0 value of the futex too.)
  */
-static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
+static int futex_lock_pi(u32 __user *uaddr, int fshared,
 			 int detect, ktime_t *time, int trylock)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
@@ -1571,8 +1564,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 	queue_unlock(&q, hb);
 
 	if (attempt++) {
-		ret = futex_handle_fault((unsigned long)uaddr, fshared,
-					 attempt);
+		ret = futex_handle_fault((unsigned long)uaddr, attempt);
 		if (ret)
 			goto out_release_sem;
 		goto retry_unlocked;
@@ -1592,7 +1584,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
  * This is the in-kernel slowpath: we look up the PI state (if any),
  * and do the rt-mutex unlock.
  */
-static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
+static int futex_unlock_pi(u32 __user *uaddr, int fshared)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
@@ -1683,8 +1675,7 @@ pi_faulted:
 	spin_unlock(&hb->lock);
 
 	if (attempt++) {
-		ret = futex_handle_fault((unsigned long)uaddr, fshared,
-					 attempt);
+		ret = futex_handle_fault((unsigned long)uaddr, attempt);
 		if (ret)
 			goto out;
 		uval = 0;
@@ -1816,8 +1807,7 @@ retry:
 		 * PI futexes happens in exit_pi_state():
 		 */
 		if (!pi && (uval & FUTEX_WAITERS))
-			futex_wake(uaddr, &curr->mm->mmap_sem, 1,
-				   FUTEX_BITSET_MATCH_ANY);
+			futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
 	}
 	return 0;
 }
@@ -1913,10 +1903,10 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 {
 	int ret = -ENOSYS;
 	int cmd = op & FUTEX_CMD_MASK;
-	struct rw_semaphore *fshared = NULL;
+	int fshared = 0;
 
 	if (!(op & FUTEX_PRIVATE_FLAG))
-		fshared = &current->mm->mmap_sem;
+		fshared = 1;
 
 	switch (cmd) {
 	case FUTEX_WAIT:
-- 
cgit v1.2.3


From 42569c39917a08e8de1e8b5685463be7b74baebd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 30 Sep 2008 12:33:07 +0200
Subject: futex: fixup get_futex_key() for private futexes

With the get_user_pages_fast() patches we made get_futex_key() obtain a
reference on the returned key, but failed to do so for private futexes.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 60b47bb9e3d..62cbd648e28 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -227,6 +227,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
 			return -EFAULT;
 		key->private.mm = mm;
 		key->private.address = address;
+		get_futex_key_refs(key);
 		return 0;
 	}
 
-- 
cgit v1.2.3


From 7317d7b87edb41a9135e30be1ec3f7ef817c53dd Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Tue, 30 Sep 2008 20:50:27 +1000
Subject: sched: improve preempt debugging

This patch helped me out with a problem I recently had....

Basically, when the kernel lock is held, then preempt_count underflow does not
get detected until it is released which may be a long time (and arbitrarily,
eg at different points it may be rescheduled). If the bkl is released at
schedule, the resulting output is actually fairly cryptic...

With any other lock that elevates preempt_count, it is illegal to schedule
under it (which would get found pretty quickly). bkl allows scheduling with
preempt_count elevated, which makes underflows hard to debug.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 98890807375..ec3bd1f398b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4305,7 +4305,7 @@ void __kprobes sub_preempt_count(int val)
 	/*
 	 * Underflow?
 	 */
-	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+       if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
 		return;
 	/*
 	 * Is the spinlock portion underflowing?
-- 
cgit v1.2.3


From c7e78cff6b7518212247fb20b1dc6411540dc9af Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 16 Oct 2008 23:17:09 +0200
Subject: lockstat: contend with points

We currently only provide points that have to wait on contention, also
lists the points we have to wait for.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c      | 33 +++++++++++++++++++++------------
 kernel/lockdep_proc.c | 21 +++++++++++++++++++--
 kernel/mutex.c        |  2 +-
 3 files changed, 41 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index dbda475b13b..234a9dccb4b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -136,16 +136,16 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
 #ifdef CONFIG_LOCK_STAT
 static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
 
-static int lock_contention_point(struct lock_class *class, unsigned long ip)
+static int lock_point(unsigned long points[], unsigned long ip)
 {
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
-		if (class->contention_point[i] == 0) {
-			class->contention_point[i] = ip;
+	for (i = 0; i < LOCKSTAT_POINTS; i++) {
+		if (points[i] == 0) {
+			points[i] = ip;
 			break;
 		}
-		if (class->contention_point[i] == ip)
+		if (points[i] == ip)
 			break;
 	}
 
@@ -185,6 +185,9 @@ struct lock_class_stats lock_stats(struct lock_class *class)
 		for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
 			stats.contention_point[i] += pcs->contention_point[i];
 
+		for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
+			stats.contending_point[i] += pcs->contending_point[i];
+
 		lock_time_add(&pcs->read_waittime, &stats.read_waittime);
 		lock_time_add(&pcs->write_waittime, &stats.write_waittime);
 
@@ -209,6 +212,7 @@ void clear_lock_stats(struct lock_class *class)
 		memset(cpu_stats, 0, sizeof(struct lock_class_stats));
 	}
 	memset(class->contention_point, 0, sizeof(class->contention_point));
+	memset(class->contending_point, 0, sizeof(class->contending_point));
 }
 
 static struct lock_class_stats *get_lock_stats(struct lock_class *class)
@@ -3001,7 +3005,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
 	struct held_lock *hlock, *prev_hlock;
 	struct lock_class_stats *stats;
 	unsigned int depth;
-	int i, point;
+	int i, contention_point, contending_point;
 
 	depth = curr->lockdep_depth;
 	if (DEBUG_LOCKS_WARN_ON(!depth))
@@ -3025,18 +3029,22 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
 found_it:
 	hlock->waittime_stamp = sched_clock();
 
-	point = lock_contention_point(hlock_class(hlock), ip);
+	contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
+	contending_point = lock_point(hlock_class(hlock)->contending_point,
+				      lock->ip);
 
 	stats = get_lock_stats(hlock_class(hlock));
-	if (point < ARRAY_SIZE(stats->contention_point))
-		stats->contention_point[point]++;
+	if (contention_point < LOCKSTAT_POINTS)
+		stats->contention_point[contention_point]++;
+	if (contending_point < LOCKSTAT_POINTS)
+		stats->contending_point[contending_point]++;
 	if (lock->cpu != smp_processor_id())
 		stats->bounces[bounce_contended + !!hlock->read]++;
 	put_lock_stats(stats);
 }
 
 static void
-__lock_acquired(struct lockdep_map *lock)
+__lock_acquired(struct lockdep_map *lock, unsigned long ip)
 {
 	struct task_struct *curr = current;
 	struct held_lock *hlock, *prev_hlock;
@@ -3085,6 +3093,7 @@ found_it:
 	put_lock_stats(stats);
 
 	lock->cpu = cpu;
+	lock->ip = ip;
 }
 
 void lock_contended(struct lockdep_map *lock, unsigned long ip)
@@ -3106,7 +3115,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
 }
 EXPORT_SYMBOL_GPL(lock_contended);
 
-void lock_acquired(struct lockdep_map *lock)
+void lock_acquired(struct lockdep_map *lock, unsigned long ip)
 {
 	unsigned long flags;
 
@@ -3119,7 +3128,7 @@ void lock_acquired(struct lockdep_map *lock)
 	raw_local_irq_save(flags);
 	check_flags(flags);
 	current->lockdep_recursion = 1;
-	__lock_acquired(lock);
+	__lock_acquired(lock, ip);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 8d3a6eba8d5..13716b81389 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -557,7 +557,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
 	if (stats->read_holdtime.nr)
 		namelen += 2;
 
-	for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
+	for (i = 0; i < LOCKSTAT_POINTS; i++) {
 		char sym[KSYM_SYMBOL_LEN];
 		char ip[32];
 
@@ -574,6 +574,23 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
 				stats->contention_point[i],
 				ip, sym);
 	}
+	for (i = 0; i < LOCKSTAT_POINTS; i++) {
+		char sym[KSYM_SYMBOL_LEN];
+		char ip[32];
+
+		if (class->contending_point[i] == 0)
+			break;
+
+		if (!i)
+			seq_line(m, '-', 40-namelen, namelen);
+
+		sprint_symbol(sym, class->contending_point[i]);
+		snprintf(ip, sizeof(ip), "[<%p>]",
+				(void *)class->contending_point[i]);
+		seq_printf(m, "%40s %14lu %29s %s\n", name,
+				stats->contending_point[i],
+				ip, sym);
+	}
 	if (i) {
 		seq_puts(m, "\n");
 		seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
@@ -583,7 +600,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
 
 static void seq_header(struct seq_file *m)
 {
-	seq_printf(m, "lock_stat version 0.2\n");
+	seq_printf(m, "lock_stat version 0.3\n");
 	seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
 	seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
 			"%14s %14s\n",
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 12c779dc65d..39a3816b68d 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -184,7 +184,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	}
 
 done:
-	lock_acquired(&lock->dep_map);
+	lock_acquired(&lock->dep_map, ip);
 	/* got the lock - rejoice! */
 	mutex_remove_waiter(lock, &waiter, task_thread_info(task));
 	debug_mutex_set_owner(lock, task_thread_info(task));
-- 
cgit v1.2.3


From 0eec481e8fb000a209fda9bf8f466aca87dc1150 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 15 Oct 2008 14:56:37 +0800
Subject: markers: simplify marker_set_format()

current marker_set_format() is complex this patch simplify it,
and decrease the overhead of marker_update_probes().

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/marker.c | 71 ++++++++++++++++++++-------------------------------------
 1 file changed, 25 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/kernel/marker.c b/kernel/marker.c
index e9c6b2bc940..75a1a17cd78 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -64,6 +64,7 @@ struct marker_entry {
 	void *oldptr;
 	int rcu_pending;
 	unsigned char ptype:1;
+	unsigned char format_allocated:1;
 	char name[0];	/* Contains name'\0'format'\0' */
 };
 
@@ -416,6 +417,7 @@ static struct marker_entry *add_marker(const char *name, const char *format)
 	e->single.probe_private = NULL;
 	e->multi = NULL;
 	e->ptype = 0;
+	e->format_allocated = 0;
 	e->refcount = 0;
 	e->rcu_pending = 0;
 	hlist_add_head(&e->hlist, head);
@@ -447,6 +449,8 @@ static int remove_marker(const char *name)
 	if (e->single.func != __mark_empty_function)
 		return -EBUSY;
 	hlist_del(&e->hlist);
+	if (e->format_allocated)
+		kfree(e->format);
 	/* Make sure the call_rcu has been executed */
 	if (e->rcu_pending)
 		rcu_barrier_sched();
@@ -457,57 +461,34 @@ static int remove_marker(const char *name)
 /*
  * Set the mark_entry format to the format found in the element.
  */
-static int marker_set_format(struct marker_entry **entry, const char *format)
+static int marker_set_format(struct marker_entry *entry, const char *format)
 {
-	struct marker_entry *e;
-	size_t name_len = strlen((*entry)->name) + 1;
-	size_t format_len = strlen(format) + 1;
-
-
-	e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
-			GFP_KERNEL);
-	if (!e)
+	entry->format = kstrdup(format, GFP_KERNEL);
+	if (!entry->format)
 		return -ENOMEM;
-	memcpy(&e->name[0], (*entry)->name, name_len);
-	e->format = &e->name[name_len];
-	memcpy(e->format, format, format_len);
-	if (strcmp(e->format, MARK_NOARGS) == 0)
-		e->call = marker_probe_cb_noarg;
-	else
-		e->call = marker_probe_cb;
-	e->single = (*entry)->single;
-	e->multi = (*entry)->multi;
-	e->ptype = (*entry)->ptype;
-	e->refcount = (*entry)->refcount;
-	e->rcu_pending = 0;
-	hlist_add_before(&e->hlist, &(*entry)->hlist);
-	hlist_del(&(*entry)->hlist);
-	/* Make sure the call_rcu has been executed */
-	if ((*entry)->rcu_pending)
-		rcu_barrier_sched();
-	kfree(*entry);
-	*entry = e;
+	entry->format_allocated = 1;
+
 	trace_mark(core_marker_format, "name %s format %s",
-			e->name, e->format);
+			entry->name, entry->format);
 	return 0;
 }
 
 /*
  * Sets the probe callback corresponding to one marker.
  */
-static int set_marker(struct marker_entry **entry, struct marker *elem,
+static int set_marker(struct marker_entry *entry, struct marker *elem,
 		int active)
 {
 	int ret;
-	WARN_ON(strcmp((*entry)->name, elem->name) != 0);
+	WARN_ON(strcmp(entry->name, elem->name) != 0);
 
-	if ((*entry)->format) {
-		if (strcmp((*entry)->format, elem->format) != 0) {
+	if (entry->format) {
+		if (strcmp(entry->format, elem->format) != 0) {
 			printk(KERN_NOTICE
 				"Format mismatch for probe %s "
 				"(%s), marker (%s)\n",
-				(*entry)->name,
-				(*entry)->format,
+				entry->name,
+				entry->format,
 				elem->format);
 			return -EPERM;
 		}
@@ -523,34 +504,33 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
 	 * pass from a "safe" callback (with argument) to an "unsafe"
 	 * callback (does not set arguments).
 	 */
-	elem->call = (*entry)->call;
+	elem->call = entry->call;
 	/*
 	 * Sanity check :
 	 * We only update the single probe private data when the ptr is
 	 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
 	 */
 	WARN_ON(elem->single.func != __mark_empty_function
-		&& elem->single.probe_private
-		!= (*entry)->single.probe_private &&
-		!elem->ptype);
-	elem->single.probe_private = (*entry)->single.probe_private;
+		&& elem->single.probe_private != entry->single.probe_private
+		&& !elem->ptype);
+	elem->single.probe_private = entry->single.probe_private;
 	/*
 	 * Make sure the private data is valid when we update the
 	 * single probe ptr.
 	 */
 	smp_wmb();
-	elem->single.func = (*entry)->single.func;
+	elem->single.func = entry->single.func;
 	/*
 	 * We also make sure that the new probe callbacks array is consistent
 	 * before setting a pointer to it.
 	 */
-	rcu_assign_pointer(elem->multi, (*entry)->multi);
+	rcu_assign_pointer(elem->multi, entry->multi);
 	/*
 	 * Update the function or multi probe array pointer before setting the
 	 * ptype.
 	 */
 	smp_wmb();
-	elem->ptype = (*entry)->ptype;
+	elem->ptype = entry->ptype;
 	elem->state = active;
 
 	return 0;
@@ -594,8 +574,7 @@ void marker_update_probe_range(struct marker *begin,
 	for (iter = begin; iter < end; iter++) {
 		mark_entry = get_marker(iter->name);
 		if (mark_entry) {
-			set_marker(&mark_entry, iter,
-					!!mark_entry->refcount);
+			set_marker(mark_entry, iter, !!mark_entry->refcount);
 			/*
 			 * ignore error, continue
 			 */
@@ -657,7 +636,7 @@ int marker_probe_register(const char *name, const char *format,
 			ret = PTR_ERR(entry);
 	} else if (format) {
 		if (!entry->format)
-			ret = marker_set_format(&entry, format);
+			ret = marker_set_format(entry, format);
 		else if (strcmp(entry->format, format))
 			ret = -EPERM;
 	}
-- 
cgit v1.2.3


From 505e371da195fad20cb8aaf45407a2849774d6d0 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 15 Oct 2008 14:56:42 +0800
Subject: markers: remove exported symbol marker_probe_cb_noarg()

marker_probe_cb_noarg() should not be seen by outer code.
this patch remove it.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/marker.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/marker.c b/kernel/marker.c
index 75a1a17cd78..23192646555 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
  *
  * Should be connected to markers "MARK_NOARGS".
  */
-void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
+static void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
 {
 	va_list args;	/* not initialized */
 	char ptype;
@@ -198,7 +198,6 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
 	}
 	rcu_read_unlock_sched();
 }
-EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
 
 static void free_old_closure(struct rcu_head *head)
 {
-- 
cgit v1.2.3


From 4de62748e69c31fc4fd5bc43b73e9cf60a17ec53 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 15 Oct 2008 14:56:47 +0800
Subject: markers: let marker_table be close to its comments

marker_table is defined far from its comments, this fix make
cleanup for it.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/marker.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/marker.c b/kernel/marker.c
index 23192646555..0f2a944329d 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -43,6 +43,7 @@ static DEFINE_MUTEX(markers_mutex);
  */
 #define MARKER_HASH_BITS 6
 #define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
+static struct hlist_head marker_table[MARKER_TABLE_SIZE];
 
 /*
  * Note about RCU :
@@ -68,8 +69,6 @@ struct marker_entry {
 	char name[0];	/* Contains name'\0'format'\0' */
 };
 
-static struct hlist_head marker_table[MARKER_TABLE_SIZE];
-
 /**
  * __mark_empty_function - Empty probe callback
  * @probe_private: probe private data
-- 
cgit v1.2.3


From 5d9881ea1440f046ee851bbaa2a2962543336a11 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Wed, 22 Oct 2008 11:38:01 +0800
Subject: markers: break the redundant loop in kernel/marker.c

Impact: cleanup, no functionality changed

Because e->name is unique in list, we don't need to continue the iteration
after matched.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/marker.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/marker.c b/kernel/marker.c
index 0f2a944329d..2898b647d41 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -825,8 +825,6 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
 			if (!e->ptype) {
 				if (num == 0 && e->single.func == probe)
 					return e->single.probe_private;
-				else
-					break;
 			} else {
 				struct marker_probe_closure *closure;
 				int match = 0;
@@ -838,6 +836,7 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
 						return closure[i].probe_private;
 				}
 			}
+			break;
 		}
 	}
 	return ERR_PTR(-ENOENT);
-- 
cgit v1.2.3


From 944ac4259e39801c843a915c3da8194ac9af0440 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 23 Oct 2008 19:26:08 -0400
Subject: ftrace: ftrace dump on oops control

Impact: add (default-off) dump-trace-on-oops flag

Currently, ftrace is set up to dump its contents to the console if the
kernel panics or oops. This can be annoying if you have trace data in
the buffers and you experience an oops, but the trace data is old or
static.

Usually when you want ftrace to dump its contents is when you are debugging
your system and you have set up ftrace to trace the events leading to
an oops.

This patch adds a control variable called "ftrace_dump_on_oops" that will
enable the ftrace dump to console on oops. This variable is default off
but a developer can enable it either through the kernel command line
by adding "ftrace_dump_on_oops" or at run time by setting (or disabling)
/proc/sys/kernel/ftrace_dump_on_oops.

v2:

   Replaced /** with /* as Randy explained that kernel-doc does
    not yet handle variables.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sysctl.c      | 10 ++++++++++
 kernel/trace/trace.c | 29 ++++++++++++++++++++++++++---
 2 files changed, 36 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a13bd4dfaeb..84754f5801e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -484,6 +484,16 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &ftrace_enable_sysctl,
 	},
 #endif
+#ifdef CONFIG_TRACING
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "ftrace_dump_on_opps",
+		.data		= &ftrace_dump_on_oops,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 #ifdef CONFIG_MODULES
 	{
 		.ctl_name	= KERN_MODPROBE,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d345d649d07..47f46cbdd86 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -63,6 +63,28 @@ static cpumask_t __read_mostly		tracing_buffer_mask;
 
 static int tracing_disabled = 1;
 
+/*
+ * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
+ *
+ * If there is an oops (or kernel panic) and the ftrace_dump_on_oops
+ * is set, then ftrace_dump is called. This will output the contents
+ * of the ftrace buffers to the console.  This is very useful for
+ * capturing traces that lead to crashes and outputing it to a
+ * serial console.
+ *
+ * It is default off, but you can enable it with either specifying
+ * "ftrace_dump_on_oops" in the kernel command line, or setting
+ * /proc/sys/kernel/ftrace_dump_on_oops to true.
+ */
+int ftrace_dump_on_oops;
+
+static int __init set_ftrace_dump_on_oops(char *str)
+{
+	ftrace_dump_on_oops = 1;
+	return 1;
+}
+__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
+
 long
 ns2usecs(cycle_t nsec)
 {
@@ -3021,7 +3043,8 @@ EXPORT_SYMBOL_GPL(__ftrace_printk);
 static int trace_panic_handler(struct notifier_block *this,
 			       unsigned long event, void *unused)
 {
-	ftrace_dump();
+	if (ftrace_dump_on_oops)
+		ftrace_dump();
 	return NOTIFY_OK;
 }
 
@@ -3037,7 +3060,8 @@ static int trace_die_handler(struct notifier_block *self,
 {
 	switch (val) {
 	case DIE_OOPS:
-		ftrace_dump();
+		if (ftrace_dump_on_oops)
+			ftrace_dump();
 		break;
 	default:
 		break;
@@ -3078,7 +3102,6 @@ trace_printk_seq(struct trace_seq *s)
 	trace_seq_reset(s);
 }
 
-
 void ftrace_dump(void)
 {
 	static DEFINE_SPINLOCK(ftrace_dump_lock);
-- 
cgit v1.2.3


From eab172294d5e24464f332dd8e94a57a9819c81c4 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 29 Oct 2008 17:03:22 +0800
Subject: sched: cleanup for alloc_rt/fair_sched_group()

Impact: cleanup

Remove checking parent == NULL. It won't be NULLL, because we dynamically
create sub task_group only, and sub task_group always has its parent.
(root task_group is statically defined)

Also replace kmalloc_node(GFP_ZERO) with kzalloc_node().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e8819bc6f46..7dd6c860773 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8472,7 +8472,7 @@ static
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct cfs_rq *cfs_rq;
-	struct sched_entity *se, *parent_se;
+	struct sched_entity *se;
 	struct rq *rq;
 	int i;
 
@@ -8488,18 +8488,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
 
-		cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
-				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+				      GFP_KERNEL, cpu_to_node(i));
 		if (!cfs_rq)
 			goto err;
 
-		se = kmalloc_node(sizeof(struct sched_entity),
-				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		se = kzalloc_node(sizeof(struct sched_entity),
+				  GFP_KERNEL, cpu_to_node(i));
 		if (!se)
 			goto err;
 
-		parent_se = parent ? parent->se[i] : NULL;
-		init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
+		init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
 	}
 
 	return 1;
@@ -8560,7 +8559,7 @@ static
 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct rt_rq *rt_rq;
-	struct sched_rt_entity *rt_se, *parent_se;
+	struct sched_rt_entity *rt_se;
 	struct rq *rq;
 	int i;
 
@@ -8577,18 +8576,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
 
-		rt_rq = kmalloc_node(sizeof(struct rt_rq),
-				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		rt_rq = kzalloc_node(sizeof(struct rt_rq),
+				     GFP_KERNEL, cpu_to_node(i));
 		if (!rt_rq)
 			goto err;
 
-		rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
-				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+				     GFP_KERNEL, cpu_to_node(i));
 		if (!rt_se)
 			goto err;
 
-		parent_se = parent ? parent->rt_se[i] : NULL;
-		init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
+		init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
 	}
 
 	return 1;
-- 
cgit v1.2.3


From f3f0d7b026ae34d6ed5ae67cd4dd5909f9cd70a5 Mon Sep 17 00:00:00 2001
From: Tim Shimmin <tes@sgi.com>
Date: Thu, 30 Oct 2008 18:30:09 +1100
Subject: [XFS] remove restricted chown parameter from xfs linux

On Linux all filesystems are supposed to be operating under Posix'
restricted chown. Restricted chown means it restricts chown to the owner
unless you have CAP_FOWNER.

NOTE: that 2 files outside of fs/xfs have been modified too for this
change.

Reviewed-by: Dave Chinner <david@fromorbit.com>

SGI-PV: 988919

SGI-Modid: 2.6.x-xfs-melb:linux:32413b

Signed-off-by: Tim Shimmin <tes@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: David Chinner <david@fromorbit.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 kernel/sysctl_check.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index c35da23ab8f..fafeb48f27c 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -730,7 +730,6 @@ static const struct trans_ctl_table trans_fs_quota_table[] = {
 };
 
 static const struct trans_ctl_table trans_fs_xfs_table[] = {
-	{ XFS_RESTRICT_CHOWN,	"restrict_chown" },
 	{ XFS_SGID_INHERIT,	"irix_sgid_inherit" },
 	{ XFS_SYMLINK_MODE,	"irix_symlink_mode" },
 	{ XFS_PANIC_MASK,	"panic_mask" },
-- 
cgit v1.2.3


From 34f3a814eef8069a24e5b3ebcf27aba9dabac2ea Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 30 Oct 2008 15:23:32 +0800
Subject: sched: switch sched_features to seqfile

Impact: cleanup

So handling of sched_features read is simplified.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 52 ++++++++++++++++------------------------------------
 1 file changed, 16 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 7dd6c860773..5419df9cc5c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -703,45 +703,18 @@ static __read_mostly char *sched_feat_names[] = {
 
 #undef SCHED_FEAT
 
-static int sched_feat_open(struct inode *inode, struct file *filp)
+static int sched_feat_show(struct seq_file *m, void *v)
 {
-	filp->private_data = inode->i_private;
-	return 0;
-}
-
-static ssize_t
-sched_feat_read(struct file *filp, char __user *ubuf,
-		size_t cnt, loff_t *ppos)
-{
-	char *buf;
-	int r = 0;
-	int len = 0;
 	int i;
 
 	for (i = 0; sched_feat_names[i]; i++) {
-		len += strlen(sched_feat_names[i]);
-		len += 4;
+		if (!(sysctl_sched_features & (1UL << i)))
+			seq_puts(m, "NO_");
+		seq_printf(m, "%s ", sched_feat_names[i]);
 	}
+	seq_puts(m, "\n");
 
-	buf = kmalloc(len + 2, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	for (i = 0; sched_feat_names[i]; i++) {
-		if (sysctl_sched_features & (1UL << i))
-			r += sprintf(buf + r, "%s ", sched_feat_names[i]);
-		else
-			r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
-	}
-
-	r += sprintf(buf + r, "\n");
-	WARN_ON(r >= len + 2);
-
-	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-
-	kfree(buf);
-
-	return r;
+	return 0;
 }
 
 static ssize_t
@@ -786,10 +759,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, sched_feat_show, NULL);
+}
+
 static struct file_operations sched_feat_fops = {
-	.open	= sched_feat_open,
-	.read	= sched_feat_read,
-	.write	= sched_feat_write,
+	.open		= sched_feat_open,
+	.write		= sched_feat_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
 };
 
 static __init int sched_init_debug(void)
-- 
cgit v1.2.3


From b807c3d0f8e39ed7cbbbe6da162650e305e8de15 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 30 Oct 2008 16:08:33 -0400
Subject: ftrace: nmi update statistics

Impact: add more debug info to /debugfs/tracing/dyn_ftrace_total_info

This patch adds dynamic ftrace NMI update statistics to the
/debugfs/tracing/dyn_ftrace_total_info stat file.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a610ca77155..bc36febc077 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2815,22 +2815,39 @@ static struct file_operations tracing_mark_fops = {
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
+#define DYN_INFO_BUF_SIZE 1023
+static char ftrace_dyn_info_buffer[DYN_INFO_BUF_SIZE+1];
+static DEFINE_MUTEX(dyn_info_mutex);
+
+int __weak ftrace_arch_read_dyn_info(char *buf, int size)
+{
+	return 0;
+}
+
 static ssize_t
-tracing_read_long(struct file *filp, char __user *ubuf,
+tracing_read_dyn_info(struct file *filp, char __user *ubuf,
 		  size_t cnt, loff_t *ppos)
 {
 	unsigned long *p = filp->private_data;
-	char buf[64];
+	char *buf = ftrace_dyn_info_buffer;
 	int r;
 
-	r = sprintf(buf, "%ld\n", *p);
+	mutex_lock(&dyn_info_mutex);
+	r = sprintf(buf, "%ld ", *p);
 
-	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+	r += ftrace_arch_read_dyn_info(buf+r, DYN_INFO_BUF_SIZE-r);
+	buf[r++] = '\n';
+
+	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+
+	mutex_unlock(&dyn_info_mutex);
+
+	return r;
 }
 
-static struct file_operations tracing_read_long_fops = {
+static struct file_operations tracing_dyn_info_fops = {
 	.open		= tracing_open_generic,
-	.read		= tracing_read_long,
+	.read		= tracing_read_dyn_info,
 };
 #endif
 
@@ -2939,7 +2956,7 @@ static __init int tracer_init_debugfs(void)
 #ifdef CONFIG_DYNAMIC_FTRACE
 	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
 				    &ftrace_update_tot_cnt,
-				    &tracing_read_long_fops);
+				    &tracing_dyn_info_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs "
 			   "'dyn_ftrace_total_info' entry\n");
-- 
cgit v1.2.3


From a26a2a27396c0a0877aa701f8f92d08ba550a6c9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 31 Oct 2008 00:03:22 -0400
Subject: ftrace: nmi safe code clean ups

Impact: cleanup

This patch cleans up the NMI safe code for dynamic ftrace as suggested
by Andrew Morton.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bc36febc077..7f86067d760 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2815,10 +2815,6 @@ static struct file_operations tracing_mark_fops = {
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
-#define DYN_INFO_BUF_SIZE 1023
-static char ftrace_dyn_info_buffer[DYN_INFO_BUF_SIZE+1];
-static DEFINE_MUTEX(dyn_info_mutex);
-
 int __weak ftrace_arch_read_dyn_info(char *buf, int size)
 {
 	return 0;
@@ -2828,14 +2824,17 @@ static ssize_t
 tracing_read_dyn_info(struct file *filp, char __user *ubuf,
 		  size_t cnt, loff_t *ppos)
 {
+	static char ftrace_dyn_info_buffer[1024];
+	static DEFINE_MUTEX(dyn_info_mutex);
 	unsigned long *p = filp->private_data;
 	char *buf = ftrace_dyn_info_buffer;
+	int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
 	int r;
 
 	mutex_lock(&dyn_info_mutex);
 	r = sprintf(buf, "%ld ", *p);
 
-	r += ftrace_arch_read_dyn_info(buf+r, DYN_INFO_BUF_SIZE-r);
+	r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
 	buf[r++] = '\n';
 
 	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-- 
cgit v1.2.3


From 8bb8c4386d08f2cc5d871d22f220d35032213f84 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 1 Nov 2008 00:13:49 +0100
Subject: sched, ftrace: trace sched.c

Impact: allow function tracing within sched.c

Its useful to see what happens in sched.c.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 9a3ec66a9d8..e1af0397214 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
 CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
-CFLAGS_REMOVE_sched.o = -mno-spe -pg
 endif
 
 obj-$(CONFIG_FREEZER) += freezer.o
-- 
cgit v1.2.3


From d9e540762f5cdd89f24e518ad1fd31142d0b9726 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 1 Nov 2008 19:57:37 +0100
Subject: ftrace: ftrace_dump_on_oops=[tracer]

Impact: add new (optional) debug boot option

In order to facilitate early boot trouble, allow one to specify a tracer
on the kernel boot line.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 58 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 38 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bdb1df00fb1..482583eb800 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -79,6 +79,15 @@ static int tracing_disabled = 1;
  */
 int ftrace_dump_on_oops;
 
+static int tracing_set_tracer(char *buf);
+
+static int __init set_ftrace(char *str)
+{
+	tracing_set_tracer(str);
+	return 1;
+}
+__setup("ftrace", set_ftrace);
+
 static int __init set_ftrace_dump_on_oops(char *str)
 {
 	ftrace_dump_on_oops = 1;
@@ -2394,29 +2403,11 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
-static ssize_t
-tracing_set_trace_write(struct file *filp, const char __user *ubuf,
-			size_t cnt, loff_t *ppos)
+static int tracing_set_tracer(char *buf)
 {
 	struct trace_array *tr = &global_trace;
 	struct tracer *t;
-	char buf[max_tracer_type_len+1];
-	int i;
-	size_t ret;
-
-	ret = cnt;
-
-	if (cnt > max_tracer_type_len)
-		cnt = max_tracer_type_len;
-
-	if (copy_from_user(&buf, ubuf, cnt))
-		return -EFAULT;
-
-	buf[cnt] = 0;
-
-	/* strip ending whitespace. */
-	for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
-		buf[i] = 0;
+	int ret = 0;
 
 	mutex_lock(&trace_types_lock);
 	for (t = trace_types; t; t = t->next) {
@@ -2440,6 +2431,33 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
  out:
 	mutex_unlock(&trace_types_lock);
 
+	return ret;
+}
+
+static ssize_t
+tracing_set_trace_write(struct file *filp, const char __user *ubuf,
+			size_t cnt, loff_t *ppos)
+{
+	char buf[max_tracer_type_len+1];
+	int i;
+	size_t ret;
+
+	if (cnt > max_tracer_type_len)
+		cnt = max_tracer_type_len;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	/* strip ending whitespace. */
+	for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
+		buf[i] = 0;
+
+	ret = tracing_set_tracer(buf);
+	if (!ret)
+		ret = cnt;
+
 	if (ret > 0)
 		filp->f_pos += ret;
 
-- 
cgit v1.2.3


From 19dba33c43a2f0f2aa727ae075ec3b11330775ef Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 28 Oct 2008 10:51:49 +0800
Subject: tracepoint: simplification for tracepoints using RCU

Impact: simplify implementation

Now, unused memory is handled by struct tp_probes.

old code use these three field to handle unused memory.
struct tracepoint_entry {
	...
	struct rcu_head rcu;
	void *oldptr;
	unsigned char rcu_pending:1;
	...
};

in this way, unused memory is handled by struct tracepoint_entry.
it bring reenter bug(it was fixed) and tracepoint.c is filled
full of ".*rcu.*" code statements. this patch removes all these.

and:
  rcu_barrier_sched() is removed.
  Do not need regain tracepoints_mutex after tracepoint_update_probes()
  several little cleanup.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/tracepoint.c | 111 ++++++++++++++++++----------------------------------
 1 file changed, 37 insertions(+), 74 deletions(-)

(limited to 'kernel')

diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index af8c8566488..3e22867184e 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -43,6 +43,7 @@ static DEFINE_MUTEX(tracepoints_mutex);
  */
 #define TRACEPOINT_HASH_BITS 6
 #define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS)
+static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
 
 /*
  * Note about RCU :
@@ -54,40 +55,40 @@ struct tracepoint_entry {
 	struct hlist_node hlist;
 	void **funcs;
 	int refcount;	/* Number of times armed. 0 if disarmed. */
-	struct rcu_head rcu;
-	void *oldptr;
-	unsigned char rcu_pending:1;
 	char name[0];
 };
 
-static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
+struct tp_probes {
+	struct rcu_head rcu;
+	void *probes[0];
+};
 
-static void free_old_closure(struct rcu_head *head)
+static inline void *allocate_probes(int count)
 {
-	struct tracepoint_entry *entry = container_of(head,
-		struct tracepoint_entry, rcu);
-	kfree(entry->oldptr);
-	/* Make sure we free the data before setting the pending flag to 0 */
-	smp_wmb();
-	entry->rcu_pending = 0;
+	struct tp_probes *p  = kmalloc(count * sizeof(void *)
+			+ sizeof(struct tp_probes), GFP_KERNEL);
+	return p == NULL ? NULL : p->probes;
 }
 
-static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old)
+static void rcu_free_old_probes(struct rcu_head *head)
 {
-	if (!old)
-		return;
-	entry->oldptr = old;
-	entry->rcu_pending = 1;
-	/* write rcu_pending before calling the RCU callback */
-	smp_wmb();
-	call_rcu_sched(&entry->rcu, free_old_closure);
+	kfree(container_of(head, struct tp_probes, rcu));
+}
+
+static inline void release_probes(void *old)
+{
+	if (old) {
+		struct tp_probes *tp_probes = container_of(old,
+			struct tp_probes, probes[0]);
+		call_rcu(&tp_probes->rcu, rcu_free_old_probes);
+	}
 }
 
 static void debug_print_probes(struct tracepoint_entry *entry)
 {
 	int i;
 
-	if (!tracepoint_debug)
+	if (!tracepoint_debug || !entry->funcs)
 		return;
 
 	for (i = 0; entry->funcs[i]; i++)
@@ -111,12 +112,13 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
 				return ERR_PTR(-EEXIST);
 	}
 	/* + 2 : one for new probe, one for NULL func */
-	new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL);
+	new = allocate_probes(nr_probes + 2);
 	if (new == NULL)
 		return ERR_PTR(-ENOMEM);
 	if (old)
 		memcpy(new, old, nr_probes * sizeof(void *));
 	new[nr_probes] = probe;
+	new[nr_probes + 1] = NULL;
 	entry->refcount = nr_probes + 1;
 	entry->funcs = new;
 	debug_print_probes(entry);
@@ -132,7 +134,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
 	old = entry->funcs;
 
 	if (!old)
-		return NULL;
+		return ERR_PTR(-ENOENT);
 
 	debug_print_probes(entry);
 	/* (N -> M), (N > 1, M >= 0) probes */
@@ -151,13 +153,13 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
 		int j = 0;
 		/* N -> M, (N > 1, M > 0) */
 		/* + 1 for NULL */
-		new = kzalloc((nr_probes - nr_del + 1)
-			* sizeof(void *), GFP_KERNEL);
+		new = allocate_probes(nr_probes - nr_del + 1);
 		if (new == NULL)
 			return ERR_PTR(-ENOMEM);
 		for (i = 0; old[i]; i++)
 			if ((probe && old[i] != probe))
 				new[j++] = old[i];
+		new[nr_probes - nr_del] = NULL;
 		entry->refcount = nr_probes - nr_del;
 		entry->funcs = new;
 	}
@@ -215,7 +217,6 @@ static struct tracepoint_entry *add_tracepoint(const char *name)
 	memcpy(&e->name[0], name, name_len);
 	e->funcs = NULL;
 	e->refcount = 0;
-	e->rcu_pending = 0;
 	hlist_add_head(&e->hlist, head);
 	return e;
 }
@@ -224,32 +225,10 @@ static struct tracepoint_entry *add_tracepoint(const char *name)
  * Remove the tracepoint from the tracepoint hash table. Must be called with
  * mutex_lock held.
  */
-static int remove_tracepoint(const char *name)
+static inline void remove_tracepoint(struct tracepoint_entry *e)
 {
-	struct hlist_head *head;
-	struct hlist_node *node;
-	struct tracepoint_entry *e;
-	int found = 0;
-	size_t len = strlen(name) + 1;
-	u32 hash = jhash(name, len-1, 0);
-
-	head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
-	hlist_for_each_entry(e, node, head, hlist) {
-		if (!strcmp(name, e->name)) {
-			found = 1;
-			break;
-		}
-	}
-	if (!found)
-		return -ENOENT;
-	if (e->refcount)
-		return -EBUSY;
 	hlist_del(&e->hlist);
-	/* Make sure the call_rcu_sched has been executed */
-	if (e->rcu_pending)
-		rcu_barrier_sched();
 	kfree(e);
-	return 0;
 }
 
 /*
@@ -343,25 +322,17 @@ int tracepoint_probe_register(const char *name, void *probe)
 			goto end;
 		}
 	}
-	/*
-	 * If we detect that a call_rcu_sched is pending for this tracepoint,
-	 * make sure it's executed now.
-	 */
-	if (entry->rcu_pending)
-		rcu_barrier_sched();
 	old = tracepoint_entry_add_probe(entry, probe);
 	if (IS_ERR(old)) {
+		if (!entry->refcount)
+			remove_tracepoint(entry);
 		ret = PTR_ERR(old);
 		goto end;
 	}
 	mutex_unlock(&tracepoints_mutex);
 	tracepoint_update_probes();		/* may update entry */
-	mutex_lock(&tracepoints_mutex);
-	entry = get_tracepoint(name);
-	WARN_ON(!entry);
-	if (entry->rcu_pending)
-		rcu_barrier_sched();
-	tracepoint_entry_free_old(entry, old);
+	release_probes(old);
+	return 0;
 end:
 	mutex_unlock(&tracepoints_mutex);
 	return ret;
@@ -388,25 +359,17 @@ int tracepoint_probe_unregister(const char *name, void *probe)
 	entry = get_tracepoint(name);
 	if (!entry)
 		goto end;
-	if (entry->rcu_pending)
-		rcu_barrier_sched();
 	old = tracepoint_entry_remove_probe(entry, probe);
-	if (!old) {
-		printk(KERN_WARNING "Warning: Trying to unregister a probe"
-				    "that doesn't exist\n");
+	if (IS_ERR(old)) {
+		ret = PTR_ERR(old);
 		goto end;
 	}
+	if (!entry->refcount)
+		remove_tracepoint(entry);
 	mutex_unlock(&tracepoints_mutex);
 	tracepoint_update_probes();		/* may update entry */
-	mutex_lock(&tracepoints_mutex);
-	entry = get_tracepoint(name);
-	if (!entry)
-		goto end;
-	if (entry->rcu_pending)
-		rcu_barrier_sched();
-	tracepoint_entry_free_old(entry, old);
-	remove_tracepoint(name);	/* Ignore busy error message */
-	ret = 0;
+	release_probes(old);
+	return 0;
 end:
 	mutex_unlock(&tracepoints_mutex);
 	return ret;
-- 
cgit v1.2.3


From 127cafbb276266b1b8da967bfe25a062ab1d42ab Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 28 Oct 2008 10:51:53 +0800
Subject: tracepoint: introduce *_noupdate APIs.

Impact: add new tracepoint APIs to allow the batched registration of probes

new APIs separate tracepoint_probe_register(),
tracepoint_probe_unregister() into 2 steps. The first step of them
is just update tracepoint_entry, not connect or disconnect.

this patch introduces tracepoint_probe_update_all() for update all.

these APIs are very useful for registering lots of probes
but just updating once. Another very important thing is that
*_noupdate APIs do not require module_mutex.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/tracepoint.c | 170 ++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 132 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 3e22867184e..e96590f17de 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -59,7 +59,10 @@ struct tracepoint_entry {
 };
 
 struct tp_probes {
-	struct rcu_head rcu;
+	union {
+		struct rcu_head rcu;
+		struct list_head list;
+	} u;
 	void *probes[0];
 };
 
@@ -72,7 +75,7 @@ static inline void *allocate_probes(int count)
 
 static void rcu_free_old_probes(struct rcu_head *head)
 {
-	kfree(container_of(head, struct tp_probes, rcu));
+	kfree(container_of(head, struct tp_probes, u.rcu));
 }
 
 static inline void release_probes(void *old)
@@ -80,7 +83,7 @@ static inline void release_probes(void *old)
 	if (old) {
 		struct tp_probes *tp_probes = container_of(old,
 			struct tp_probes, probes[0]);
-		call_rcu(&tp_probes->rcu, rcu_free_old_probes);
+		call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes);
 	}
 }
 
@@ -299,6 +302,23 @@ static void tracepoint_update_probes(void)
 	module_update_tracepoints();
 }
 
+static void *tracepoint_add_probe(const char *name, void *probe)
+{
+	struct tracepoint_entry *entry;
+	void *old;
+
+	entry = get_tracepoint(name);
+	if (!entry) {
+		entry = add_tracepoint(name);
+		if (IS_ERR(entry))
+			return entry;
+	}
+	old = tracepoint_entry_add_probe(entry, probe);
+	if (IS_ERR(old) && !entry->refcount)
+		remove_tracepoint(entry);
+	return old;
+}
+
 /**
  * tracepoint_probe_register -  Connect a probe to a tracepoint
  * @name: tracepoint name
@@ -309,36 +329,36 @@ static void tracepoint_update_probes(void)
  */
 int tracepoint_probe_register(const char *name, void *probe)
 {
-	struct tracepoint_entry *entry;
-	int ret = 0;
 	void *old;
 
 	mutex_lock(&tracepoints_mutex);
-	entry = get_tracepoint(name);
-	if (!entry) {
-		entry = add_tracepoint(name);
-		if (IS_ERR(entry)) {
-			ret = PTR_ERR(entry);
-			goto end;
-		}
-	}
-	old = tracepoint_entry_add_probe(entry, probe);
-	if (IS_ERR(old)) {
-		if (!entry->refcount)
-			remove_tracepoint(entry);
-		ret = PTR_ERR(old);
-		goto end;
-	}
+	old = tracepoint_add_probe(name, probe);
 	mutex_unlock(&tracepoints_mutex);
+	if (IS_ERR(old))
+		return PTR_ERR(old);
+
 	tracepoint_update_probes();		/* may update entry */
 	release_probes(old);
 	return 0;
-end:
-	mutex_unlock(&tracepoints_mutex);
-	return ret;
 }
 EXPORT_SYMBOL_GPL(tracepoint_probe_register);
 
+static void *tracepoint_remove_probe(const char *name, void *probe)
+{
+	struct tracepoint_entry *entry;
+	void *old;
+
+	entry = get_tracepoint(name);
+	if (!entry)
+		return ERR_PTR(-ENOENT);
+	old = tracepoint_entry_remove_probe(entry, probe);
+	if (IS_ERR(old))
+		return old;
+	if (!entry->refcount)
+		remove_tracepoint(entry);
+	return old;
+}
+
 /**
  * tracepoint_probe_unregister -  Disconnect a probe from a tracepoint
  * @name: tracepoint name
@@ -351,31 +371,105 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register);
  */
 int tracepoint_probe_unregister(const char *name, void *probe)
 {
-	struct tracepoint_entry *entry;
 	void *old;
-	int ret = -ENOENT;
 
 	mutex_lock(&tracepoints_mutex);
-	entry = get_tracepoint(name);
-	if (!entry)
-		goto end;
-	old = tracepoint_entry_remove_probe(entry, probe);
-	if (IS_ERR(old)) {
-		ret = PTR_ERR(old);
-		goto end;
-	}
-	if (!entry->refcount)
-		remove_tracepoint(entry);
+	old = tracepoint_remove_probe(name, probe);
 	mutex_unlock(&tracepoints_mutex);
+	if (IS_ERR(old))
+		return PTR_ERR(old);
+
 	tracepoint_update_probes();		/* may update entry */
 	release_probes(old);
 	return 0;
-end:
-	mutex_unlock(&tracepoints_mutex);
-	return ret;
 }
 EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
 
+static LIST_HEAD(old_probes);
+static int need_update;
+
+static void tracepoint_add_old_probes(void *old)
+{
+	need_update = 1;
+	if (old) {
+		struct tp_probes *tp_probes = container_of(old,
+			struct tp_probes, probes[0]);
+		list_add(&tp_probes->u.list, &old_probes);
+	}
+}
+
+/**
+ * tracepoint_probe_register_noupdate -  register a probe but not connect
+ * @name: tracepoint name
+ * @probe: probe handler
+ *
+ * caller must call tracepoint_probe_update_all()
+ */
+int tracepoint_probe_register_noupdate(const char *name, void *probe)
+{
+	void *old;
+
+	mutex_lock(&tracepoints_mutex);
+	old = tracepoint_add_probe(name, probe);
+	if (IS_ERR(old)) {
+		mutex_unlock(&tracepoints_mutex);
+		return PTR_ERR(old);
+	}
+	tracepoint_add_old_probes(old);
+	mutex_unlock(&tracepoints_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
+
+/**
+ * tracepoint_probe_unregister_noupdate -  remove a probe but not disconnect
+ * @name: tracepoint name
+ * @probe: probe function pointer
+ *
+ * caller must call tracepoint_probe_update_all()
+ */
+int tracepoint_probe_unregister_noupdate(const char *name, void *probe)
+{
+	void *old;
+
+	mutex_lock(&tracepoints_mutex);
+	old = tracepoint_remove_probe(name, probe);
+	if (IS_ERR(old)) {
+		mutex_unlock(&tracepoints_mutex);
+		return PTR_ERR(old);
+	}
+	tracepoint_add_old_probes(old);
+	mutex_unlock(&tracepoints_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate);
+
+/**
+ * tracepoint_probe_update_all -  update tracepoints
+ */
+void tracepoint_probe_update_all(void)
+{
+	LIST_HEAD(release_probes);
+	struct tp_probes *pos, *next;
+
+	mutex_lock(&tracepoints_mutex);
+	if (!need_update) {
+		mutex_unlock(&tracepoints_mutex);
+		return;
+	}
+	if (!list_empty(&old_probes))
+		list_replace_init(&old_probes, &release_probes);
+	need_update = 0;
+	mutex_unlock(&tracepoints_mutex);
+
+	tracepoint_update_probes();
+	list_for_each_entry_safe(pos, next, &release_probes, u.list) {
+		list_del(&pos->u.list);
+		call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
+	}
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
+
 /**
  * tracepoint_get_iter_range - Get a next tracepoint iterator given a range.
  * @tracepoint: current tracepoints (in), next tracepoint (out)
-- 
cgit v1.2.3


From e113a745f693af196c8081b328bf42def086989b Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Fri, 31 Oct 2008 08:03:41 -0500
Subject: sched/rt: small optimization to update_curr_rt()

Impact: micro-optimization to SCHED_FIFO/RR scheduling

A very minor improvement, but might it be better to check sched_rt_runtime(rt_rq)
before taking the rt_runtime_lock?

Peter Zijlstra observes:

> Yes, I think its ok to do so.
>
> Like pointed out in the other thread, there are two races:
>
>  - sched_rt_runtime() going to RUNTIME_INF, and that will be handled
>    properly by sched_rt_runtime_exceeded()
>
>  - sched_rt_runtime() going to !RUNTIME_INF, and here we can miss an
>    accounting cycle, but I don't think that is something to worry too
>    much about.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

--

 kernel/sched_rt.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
---
 kernel/sched_rt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d9ba9d5f99d..c7963d5d062 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -537,13 +537,13 @@ static void update_curr_rt(struct rq *rq)
 	for_each_sched_rt_entity(rt_se) {
 		rt_rq = rt_rq_of_se(rt_se);
 
-		spin_lock(&rt_rq->rt_runtime_lock);
 		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
+			spin_lock(&rt_rq->rt_runtime_lock);
 			rt_rq->rt_time += delta_exec;
 			if (sched_rt_runtime_exceeded(rt_rq))
 				resched_task(curr);
+			spin_unlock(&rt_rq->rt_runtime_lock);
 		}
-		spin_unlock(&rt_rq->rt_runtime_lock);
 	}
 }
 
-- 
cgit v1.2.3


From be19ef82e068e92a28df50341938fdeb5ea56436 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 3 Nov 2008 14:35:12 +0100
Subject: rcu: make rcu-stall debug printout more standard

Impact: change debug printout

Change "RCU detected CPU stall" to "INFO: RCU detected CPU stall"
message, to make it easier for tools to pick up the warning.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 37f72e55154..e503a002f33 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -191,7 +191,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
 
 	/* OK, time to rat on our buddy... */
 
-	printk(KERN_ERR "RCU detected CPU stalls:");
+	printk(KERN_ERR "INFO: RCU detected CPU stalls:");
 	for_each_possible_cpu(cpu) {
 		if (cpu_isset(cpu, rcp->cpumask))
 			printk(" %d", cpu);
@@ -204,7 +204,7 @@ static void print_cpu_stall(struct rcu_ctrlblk *rcp)
 {
 	unsigned long flags;
 
-	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
+	printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
 			smp_processor_id(), jiffies,
 			jiffies - rcp->gp_start);
 	dump_stack();
-- 
cgit v1.2.3


From 8f0a056fcb2f83a069fb5d60c2383304b7456687 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 3 Nov 2008 23:15:55 -0500
Subject: ftrace: introduce ftrace_preempt_disable()/enable()

Impact: add new ftrace-plugin internal APIs

Parts of the tracer needs to be careful about schedule recursion.
If the NEED_RESCHED flag is set, a preempt_enable will call schedule.
Inside the schedule function, the NEED_RESCHED flag is cleared.

The problem arises when a trace happens in the schedule function but before
NEED_RESCHED is cleared. The race is as follows:

schedule()
  >> tracer called

    trace_function()
       preempt_disable()
       [ record trace ]
       preempt_enable()  <<- here's the issue.

         [check NEED_RESCHED]
          schedule()
          [ Repeat the above, over and over again ]

The naive approach is simply to use preempt_enable_no_schedule instead.
The problem with that approach is that, although we solve the schedule
recursion issue, we now might lose a preemption check when not in the
schedule function.

  trace_function()
    preempt_disable()
    [ record trace ]
    [Interrupt comes in and sets NEED_RESCHED]
    preempt_enable_no_resched()
    [continue without scheduling]

The way ftrace handles this problem is with the following approach:

	int resched;

	resched = need_resched();
	preempt_disable_notrace();
	[record trace]
	if (resched)
		preempt_enable_no_sched_notrace();
	else
		preempt_enable_notrace();

This may seem like the opposite of what we want. If resched is set
then we call the "no_sched" version??  The reason we do this is because
if NEED_RESCHED is set before we disable preemption, there's two reasons
for that:

  1) we are in an atomic code path
  2) we are already on our way to the schedule function, and maybe even
     in the schedule function, but have yet to clear the flag.

Both the above cases we do not want to schedule.

This solution has already been implemented within the ftrace infrastructure.
But the problem is that it has been implemented several times. This patch
encapsulates this code to two nice functions.

  resched = ftrace_preempt_disable();
  [ record trace]
  ftrace_preempt_enable(resched);

This way the tracers do not need to worry about getting it right.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8465ad05270..10c6dae7689 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -419,4 +419,52 @@ enum trace_iterator_flags {
 
 extern struct tracer nop_trace;
 
+/**
+ * ftrace_preempt_disable - disable preemption scheduler safe
+ *
+ * When tracing can happen inside the scheduler, there exists
+ * cases that the tracing might happen before the need_resched
+ * flag is checked. If this happens and the tracer calls
+ * preempt_enable (after a disable), a schedule might take place
+ * causing an infinite recursion.
+ *
+ * To prevent this, we read the need_recshed flag before
+ * disabling preemption. When we want to enable preemption we
+ * check the flag, if it is set, then we call preempt_enable_no_resched.
+ * Otherwise, we call preempt_enable.
+ *
+ * The rational for doing the above is that if need resched is set
+ * and we have yet to reschedule, we are either in an atomic location
+ * (where we do not need to check for scheduling) or we are inside
+ * the scheduler and do not want to resched.
+ */
+static inline int ftrace_preempt_disable(void)
+{
+	int resched;
+
+	resched = need_resched();
+	preempt_disable_notrace();
+
+	return resched;
+}
+
+/**
+ * ftrace_preempt_enable - enable preemption scheduler safe
+ * @resched: the return value from ftrace_preempt_disable
+ *
+ * This is a scheduler safe way to enable preemption and not miss
+ * any preemption checks. The disabled saved the state of preemption.
+ * If resched is set, then we were either inside an atomic or
+ * are inside the scheduler (we would have already scheduled
+ * otherwise). In this case, we do not want to call normal
+ * preempt_enable, but preempt_enable_no_resched instead.
+ */
+static inline void ftrace_preempt_enable(int resched)
+{
+	if (resched)
+		preempt_enable_no_resched_notrace();
+	else
+		preempt_enable_notrace();
+}
+
 #endif /* _LINUX_KERNEL_TRACE_H */
-- 
cgit v1.2.3


From 182e9f5f704ed6b9175142fe8da33c9ce0c52b52 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 3 Nov 2008 23:15:56 -0500
Subject: ftrace: insert in the ftrace_preempt_disable()/enable() functions

Impact: use new, consolidated APIs in ftrace plugins

This patch replaces the schedule safe preempt disable code with the
ftrace_preempt_disable() and ftrace_preempt_enable() safe functions.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c        | 27 +++++++++------------------
 kernel/trace/trace.c              |  8 ++------
 kernel/trace/trace_sched_wakeup.c | 13 ++-----------
 kernel/trace/trace_stack.c        |  8 ++------
 4 files changed, 15 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index cedf4e26828..151f6a74867 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -16,6 +16,8 @@
 #include <linux/list.h>
 #include <linux/fs.h>
 
+#include "trace.h"
+
 /* Up this if you want to test the TIME_EXTENTS and normalization */
 #define DEBUG_SHIFT 0
 
@@ -1122,8 +1124,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
 		return NULL;
 
 	/* If we are tracing schedule, we don't want to recurse */
-	resched = need_resched();
-	preempt_disable_notrace();
+	resched = ftrace_preempt_disable();
 
 	cpu = raw_smp_processor_id();
 
@@ -1154,10 +1155,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
 	return event;
 
  out:
-	if (resched)
-		preempt_enable_notrace();
-	else
-		preempt_enable_notrace();
+	ftrace_preempt_enable(resched);
 	return NULL;
 }
 
@@ -1199,12 +1197,9 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 	/*
 	 * Only the last preempt count needs to restore preemption.
 	 */
-	if (preempt_count() == 1) {
-		if (per_cpu(rb_need_resched, cpu))
-			preempt_enable_no_resched_notrace();
-		else
-			preempt_enable_notrace();
-	} else
+	if (preempt_count() == 1)
+		ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
+	else
 		preempt_enable_no_resched_notrace();
 
 	return 0;
@@ -1237,8 +1232,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
 	if (atomic_read(&buffer->record_disabled))
 		return -EBUSY;
 
-	resched = need_resched();
-	preempt_disable_notrace();
+	resched = ftrace_preempt_disable();
 
 	cpu = raw_smp_processor_id();
 
@@ -1264,10 +1258,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
 
 	ret = 0;
  out:
-	if (resched)
-		preempt_enable_no_resched_notrace();
-	else
-		preempt_enable_notrace();
+	ftrace_preempt_enable(resched);
 
 	return ret;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e4c40c868d6..3e7bf5eb900 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -904,8 +904,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 		return;
 
 	pc = preempt_count();
-	resched = need_resched();
-	preempt_disable_notrace();
+	resched = ftrace_preempt_disable();
 	local_save_flags(flags);
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
@@ -915,10 +914,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 		trace_function(tr, data, ip, parent_ip, flags, pc);
 
 	atomic_dec(&data->disabled);
-	if (resched)
-		preempt_enable_no_resched_notrace();
-	else
-		preempt_enable_notrace();
+	ftrace_preempt_enable(resched);
 }
 
 static struct ftrace_ops trace_ops __read_mostly =
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 3ae93f16b56..7bc4abf6fca 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -50,8 +50,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
 		return;
 
 	pc = preempt_count();
-	resched = need_resched();
-	preempt_disable_notrace();
+	resched = ftrace_preempt_disable();
 
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
@@ -81,15 +80,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
  out:
 	atomic_dec(&data->disabled);
 
-	/*
-	 * To prevent recursion from the scheduler, if the
-	 * resched flag was set before we entered, then
-	 * don't reschedule.
-	 */
-	if (resched)
-		preempt_enable_no_resched_notrace();
-	else
-		preempt_enable_notrace();
+	ftrace_preempt_enable(resched);
 }
 
 static struct ftrace_ops trace_ops __read_mostly =
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index be682b62fe5..d39e8b7de6a 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -107,8 +107,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
 	if (unlikely(!ftrace_enabled || stack_trace_disabled))
 		return;
 
-	resched = need_resched();
-	preempt_disable_notrace();
+	resched = ftrace_preempt_disable();
 
 	cpu = raw_smp_processor_id();
 	/* no atomic needed, we only modify this variable by this cpu */
@@ -120,10 +119,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
  out:
 	per_cpu(trace_active, cpu)--;
 	/* prevent recursion in schedule */
-	if (resched)
-		preempt_enable_no_resched_notrace();
-	else
-		preempt_enable_notrace();
+	ftrace_preempt_enable(resched);
 }
 
 static struct ftrace_ops trace_ops __read_mostly =
-- 
cgit v1.2.3


From b2a866f9344cb30d7ddf5d67b5b8393daf8bef07 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 3 Nov 2008 23:15:57 -0500
Subject: ftrace: function tracer with irqs disabled

Impact: disable interrupts during trace entry creation (as opposed to preempt)

To help with performance, I set the ftracer to not disable interrupts,
and only to disable preemption. If an interrupt occurred, it would not
be traced, because the function tracer protects itself from recursion.
This may be faster, but the trace output might miss some traces.

This patch makes the fuction trace disable interrupts, but it also
adds a runtime feature to disable preemption instead. It does this by
having two different tracer functions. When the function tracer is
enabled, it will check to see which version is requested (irqs disabled
or preemption disabled). Then it will use the corresponding function
as the tracer.

Irq disabling is the default behaviour, but if the user wants better
performance, with the chance of missing traces, then they can choose
the preempt disabled version.

Running hackbench 3 times with the irqs disabled and 3 times with
the preempt disabled function tracer yielded:

tracing type       times            entries recorded
------------      --------          ----------------
irq disabled      43.393            166433066
                  43.282            166172618
                  43.298            166256704

preempt disabled  38.969            159871710
                  38.943            159972935
                  39.325            161056510

Average:

   irqs disabled:  43.324           166287462
preempt disabled:  39.079           160300385

 preempt is 10.8 percent faster than irqs disabled.

I wrote a patch to count function trace recursion and reran hackbench.

With irq disabled: 1,150 times the function tracer did not trace due to
  recursion.
with preempt disabled: 5,117,718 times.

The thousand times with irq disabled could be due to NMIs, or simply a case
where it called a function that was not protected by notrace.

But we also see that a large amount of the trace is lost with the
preempt version.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 40 +++++++++++++++++++++++++++++++++++++++-
 kernel/trace/trace.h |  1 +
 2 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3e7bf5eb900..d576dbd6def 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -244,6 +244,7 @@ static const char *trace_options[] = {
 	"stacktrace",
 	"sched-tree",
 	"ftrace_printk",
+	"ftrace_preempt",
 	NULL
 };
 
@@ -891,7 +892,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
 
 #ifdef CONFIG_FUNCTION_TRACER
 static void
-function_trace_call(unsigned long ip, unsigned long parent_ip)
+function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
 {
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
@@ -917,6 +918,37 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 	ftrace_preempt_enable(resched);
 }
 
+static void
+function_trace_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+	int pc;
+
+	if (unlikely(!ftrace_function_enabled))
+		return;
+
+	/*
+	 * Need to use raw, since this must be called before the
+	 * recursive protection is performed.
+	 */
+	raw_local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1)) {
+		pc = preempt_count();
+		trace_function(tr, data, ip, parent_ip, flags, pc);
+	}
+
+	atomic_dec(&data->disabled);
+	raw_local_irq_restore(flags);
+}
+
 static struct ftrace_ops trace_ops __read_mostly =
 {
 	.func = function_trace_call,
@@ -925,6 +957,12 @@ static struct ftrace_ops trace_ops __read_mostly =
 void tracing_start_function_trace(void)
 {
 	ftrace_function_enabled = 0;
+
+	if (trace_flags & TRACE_ITER_PREEMPTONLY)
+		trace_ops.func = function_trace_call_preempt_only;
+	else
+		trace_ops.func = function_trace_call;
+
 	register_ftrace_function(&trace_ops);
 	if (tracer_enabled)
 		ftrace_function_enabled = 1;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 10c6dae7689..bb547e933af 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -415,6 +415,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_STACKTRACE		= 0x100,
 	TRACE_ITER_SCHED_TREE		= 0x200,
 	TRACE_ITER_PRINTK		= 0x400,
+	TRACE_ITER_PREEMPTONLY		= 0x800,
 };
 
 extern struct tracer nop_trace;
-- 
cgit v1.2.3


From eefd796a8e831408ce17e633d73d70430748c47a Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 4 Nov 2008 16:15:37 +0800
Subject: sched debug: remove sd_level_to_string()

Impact: cleanup

Just use the newly introduced sd->name.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 25 +------------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5419df9cc5c..7ac59bae87d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6602,28 +6602,6 @@ early_initcall(migration_init);
 
 #ifdef CONFIG_SCHED_DEBUG
 
-static inline const char *sd_level_to_string(enum sched_domain_level lvl)
-{
-	switch (lvl) {
-	case SD_LV_NONE:
-			return "NONE";
-	case SD_LV_SIBLING:
-			return "SIBLING";
-	case SD_LV_MC:
-			return "MC";
-	case SD_LV_CPU:
-			return "CPU";
-	case SD_LV_NODE:
-			return "NODE";
-	case SD_LV_ALLNODES:
-			return "ALLNODES";
-	case SD_LV_MAX:
-			return "MAX";
-
-	}
-	return "MAX";
-}
-
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  cpumask_t *groupmask)
 {
@@ -6643,8 +6621,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		return -1;
 	}
 
-	printk(KERN_CONT "span %s level %s\n",
-		str, sd_level_to_string(sd->level));
+	printk(KERN_CONT "span %s level %s\n", str, sd->name);
 
 	if (!cpu_isset(cpu, sd->span)) {
 		printk(KERN_ERR "ERROR: domain->span does not contain "
-- 
cgit v1.2.3


From 0a0db8f5c9d4bbb9bbfcc2b6cb6bce2d0ef4d73d Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 4 Nov 2008 16:17:05 +0800
Subject: sched debug: remove NULL checking in print_cfs/rt_rq()

Impact: cleanup

cfs->tg is initialized in init_tg_cfs_entry() with tg != NULL, and
will never be invalidated to NULL. And the underlying cgroup of a
valid task_group is always valid.

Same for rt->tg.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_debug.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5ae17762ec3..d25cefe3f0e 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -121,14 +121,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 
 #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
 	char path[128] = "";
-	struct cgroup *cgroup = NULL;
 	struct task_group *tg = cfs_rq->tg;
 
-	if (tg)
-		cgroup = tg->css.cgroup;
-
-	if (cgroup)
-		cgroup_path(cgroup, path, sizeof(path));
+	cgroup_path(tg->css.cgroup, path, sizeof(path));
 
 	SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
 #else
@@ -193,14 +188,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
 #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
 	char path[128] = "";
-	struct cgroup *cgroup = NULL;
 	struct task_group *tg = rt_rq->tg;
 
-	if (tg)
-		cgroup = tg->css.cgroup;
-
-	if (cgroup)
-		cgroup_path(cgroup, path, sizeof(path));
+	cgroup_path(tg->css.cgroup, path, sizeof(path));
 
 	SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
 #else
-- 
cgit v1.2.3


From a17e2260926f681a0eb983c1e3cb859ba2064bce Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 4 Nov 2008 16:19:13 +0800
Subject: sched: remove redundant call to unregister_sched_domain_sysctl()

Impact: cleanup

The sysctl has been unregistered by partition_sched_domains().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 7ac59bae87d..3cb94fad33c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7691,8 +7691,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
 	cpumask_t tmpmask;
 	int i;
 
-	unregister_sched_domain_sysctl();
-
 	for_each_cpu_mask_nr(i, *cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
 	synchronize_sched();
-- 
cgit v1.2.3


From faa2f98f856e89d1afb6e4a91707284d242e816e Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 4 Nov 2008 16:20:23 +0800
Subject: sched: add sanity check in partition_sched_domains()

Impact: cleanup, add debug check

It's wrong to make dattr_new = NULL if doms_new == NULL, it introduces
memory leak if dattr_new != NULL. Fortunately dattr_new is always NULL
in this case. So remove the code and add a sanity check.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3cb94fad33c..213cad5e50a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7767,7 +7767,7 @@ match1:
 		ndoms_cur = 0;
 		doms_new = &fallback_doms;
 		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-		dattr_new = NULL;
+		WARN_ON_ONCE(dattr_new);
 	}
 
 	/* Build new domains */
-- 
cgit v1.2.3


From 3299b4dd1180762da831be5eb6adc44553eaec26 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 4 Nov 2008 11:58:21 +0100
Subject: ftrace: sysctl typo

Impact: fix sysctl name typo

Steve must have needed more coffee ;-)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sysctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6b6b727258b..65d4a9ba79e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -487,7 +487,7 @@ static struct ctl_table kern_table[] = {
 #ifdef CONFIG_TRACING
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "ftrace_dump_on_opps",
+		.procname	= "ftrace_dump_on_oops",
 		.data		= &ftrace_dump_on_oops,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-- 
cgit v1.2.3


From 71566a0d161edec70361b7f90f6e54af6a6d5d05 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 31 Oct 2008 12:57:20 +0100
Subject: tracing/fastboot: Enable boot tracing only during initcalls

Impact: modify boot tracer

We used to disable the initcall tracing at a specified time (IE: end
of builtin initcalls). But we don't need it anymore. It will be
stopped when initcalls are finished.

However we want two things:

_Start this tracing only after pre-smp initcalls are finished.

_Since we are planning to trace sched_switches at the same time, we
want to enable them only during the initcall execution.

For this purpose, this patch introduce two functions to enable/disable
the sched_switch tracing during boot.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_boot.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index d0a5e50eeff..d104d5b4641 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -13,23 +13,29 @@
 #include "trace.h"
 
 static struct trace_array *boot_trace;
-static int trace_boot_enabled;
+static bool pre_initcalls_finished;
 
-
-/* Should be started after do_pre_smp_initcalls() in init/main.c */
+/* Tells the boot tracer that the pre_smp_initcalls are finished.
+ * So we are ready .
+ * It doesn't enable sched events tracing however.
+ * You have to call enable_boot_trace to do so.
+ */
 void start_boot_trace(void)
 {
-	trace_boot_enabled = 1;
+	pre_initcalls_finished = true;
+}
+
+void enable_boot_trace(void)
+{
 }
 
-void stop_boot_trace(void)
+void disable_boot_trace(void)
 {
-	trace_boot_enabled = 0;
 }
 
 void reset_boot_trace(struct trace_array *tr)
 {
-	stop_boot_trace();
+	disable_boot_trace();
 }
 
 static void boot_trace_init(struct trace_array *tr)
@@ -37,8 +43,6 @@ static void boot_trace_init(struct trace_array *tr)
 	int cpu;
 	boot_trace = tr;
 
-	trace_boot_enabled = 0;
-
 	for_each_cpu_mask(cpu, cpu_possible_map)
 		tracing_reset(tr, cpu);
 }
@@ -46,9 +50,9 @@ static void boot_trace_init(struct trace_array *tr)
 static void boot_trace_ctrl_update(struct trace_array *tr)
 {
 	if (tr->ctrl)
-		start_boot_trace();
+		enable_boot_trace();
 	else
-		stop_boot_trace();
+		disable_boot_trace();
 }
 
 static enum print_line_t initcall_print_line(struct trace_iterator *iter)
@@ -99,7 +103,7 @@ void trace_boot(struct boot_trace *it, initcall_t fn)
 	unsigned long irq_flags;
 	struct trace_array *tr = boot_trace;
 
-	if (!trace_boot_enabled)
+	if (!pre_initcalls_finished)
 		return;
 
 	/* Get its name now since this function could
-- 
cgit v1.2.3


From 07695fa04e8a3384b0c855398ce1f7885bd7dc3b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 31 Oct 2008 13:08:28 +0100
Subject: tracing/ftrace: fix a race condition in sched_switch tracer

Impact: fix race condition in sched_switch tracer

This patch fixes a race condition in the sched_switch tracer. If
several tasks (IE: concurrent initcalls) are playing with
tracing_start_cmdline_record() and tracing_stop_cmdline_record(), the
following situation could happen:

_ Task A and B are using the same tracepoint probe. Task A holds it.
  Task B is sleeping and doesn't hold it.

_ Task A frees the sched tracer, then sched_ref is decremented to 0.

_ Task A is preempted and hadn't yet unregistered its tracepoint
  probe, then B runs.

_ B increments sched_ref, sees it's 1 and then guess it has to
  register its probe. But it has not been freed by task A.

_ A lot of bad things can happen after that...

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_sched_switch.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index b8f56beb1a6..59de5141207 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -17,6 +17,7 @@
 static struct trace_array	*ctx_trace;
 static int __read_mostly	tracer_enabled;
 static atomic_t			sched_ref;
+static DEFINE_MUTEX(tracepoint_mutex);
 
 static void
 probe_sched_switch(struct rq *__rq, struct task_struct *prev,
@@ -125,18 +126,22 @@ static void tracing_start_sched_switch(void)
 {
 	long ref;
 
+	mutex_lock(&tracepoint_mutex);
 	ref = atomic_inc_return(&sched_ref);
 	if (ref == 1)
 		tracing_sched_register();
+	mutex_unlock(&tracepoint_mutex);
 }
 
 static void tracing_stop_sched_switch(void)
 {
 	long ref;
 
+	mutex_lock(&tracepoint_mutex);
 	ref = atomic_dec_and_test(&sched_ref);
 	if (ref)
 		tracing_sched_unregister();
+	mutex_unlock(&tracepoint_mutex);
 }
 
 void tracing_start_cmdline_record(void)
-- 
cgit v1.2.3


From e55f605c14679c30be41473e60b7ad26524cdc35 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 31 Oct 2008 13:14:28 +0100
Subject: tracing/ftrace: remove unused code in sched_switch tracer

Impact: cleanup

When init_sched_switch_trace() is called, it has no reason to start
the sched tracer if the sched_ref is not zero.

_ If this is non-zero, the tracer is already used, but we can register it
to the tracing engine. There is already a security which avoid the tracer
probes not to be resgistered twice.

_ If this is zero, this block will not be used.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_sched_switch.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 59de5141207..96620c71430 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -203,14 +203,6 @@ static struct tracer sched_switch_trace __read_mostly =
 
 __init static int init_sched_switch_trace(void)
 {
-	int ret = 0;
-
-	if (atomic_read(&sched_ref))
-		ret = tracing_sched_register();
-	if (ret) {
-		pr_info("error registering scheduler trace\n");
-		return ret;
-	}
 	return register_tracer(&sched_switch_trace);
 }
 device_initcall(init_sched_switch_trace);
-- 
cgit v1.2.3


From d7ad44b697c9d13e445ddc7d16f736fbac333249 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 31 Oct 2008 13:20:08 +0100
Subject: tracing/fastboot: use sched switch tracer from boot tracer

Impact: enhance boot trace output with scheduling events

Use the sched_switch tracer from the boot tracer.

We also can trace schedule events inside the initcalls.
Sched tracing is disabled after the initcall has finished and
then reenabled before the next one is started.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c              | 2 ++
 kernel/trace/trace.h              | 1 +
 kernel/trace/trace_boot.c         | 6 ++++++
 kernel/trace/trace_sched_switch.c | 6 +++---
 4 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e4c40c868d6..50d7018163f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3251,6 +3251,8 @@ __init static int tracer_alloc_buffers(void)
 
 	register_tracer(&nop_trace);
 #ifdef CONFIG_BOOT_TRACER
+	/* We don't want to launch sched_switch tracer yet */
+	global_trace.ctrl = 0;
 	register_tracer(&boot_tracer);
 	current_trace = &boot_tracer;
 	current_trace->init(&global_trace);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8465ad05270..9911277b268 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -49,6 +49,7 @@ struct ftrace_entry {
 	unsigned long		parent_ip;
 };
 extern struct tracer boot_tracer;
+extern struct tracer sched_switch_trace; /* Used by the boot tracer */
 
 /*
  * Context switch trace entry - which task (and prio) we switched from/to:
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index d104d5b4641..6bbc8794a6d 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -27,10 +27,14 @@ void start_boot_trace(void)
 
 void enable_boot_trace(void)
 {
+	if (pre_initcalls_finished)
+		tracing_start_cmdline_record();
 }
 
 void disable_boot_trace(void)
 {
+	if (pre_initcalls_finished)
+		tracing_stop_cmdline_record();
 }
 
 void reset_boot_trace(struct trace_array *tr)
@@ -45,6 +49,8 @@ static void boot_trace_init(struct trace_array *tr)
 
 	for_each_cpu_mask(cpu, cpu_possible_map)
 		tracing_reset(tr, cpu);
+
+	sched_switch_trace.init(tr);
 }
 
 static void boot_trace_ctrl_update(struct trace_array *tr)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 96620c71430..9d7bdac331d 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -127,6 +127,7 @@ static void tracing_start_sched_switch(void)
 	long ref;
 
 	mutex_lock(&tracepoint_mutex);
+	tracer_enabled = 1;
 	ref = atomic_inc_return(&sched_ref);
 	if (ref == 1)
 		tracing_sched_register();
@@ -138,6 +139,7 @@ static void tracing_stop_sched_switch(void)
 	long ref;
 
 	mutex_lock(&tracepoint_mutex);
+	tracer_enabled = 0;
 	ref = atomic_dec_and_test(&sched_ref);
 	if (ref)
 		tracing_sched_unregister();
@@ -158,12 +160,10 @@ static void start_sched_trace(struct trace_array *tr)
 {
 	sched_switch_reset(tr);
 	tracing_start_cmdline_record();
-	tracer_enabled = 1;
 }
 
 static void stop_sched_trace(struct trace_array *tr)
 {
-	tracer_enabled = 0;
 	tracing_stop_cmdline_record();
 }
 
@@ -190,7 +190,7 @@ static void sched_switch_trace_ctrl_update(struct trace_array *tr)
 		stop_sched_trace(tr);
 }
 
-static struct tracer sched_switch_trace __read_mostly =
+struct tracer sched_switch_trace __read_mostly =
 {
 	.name		= "sched_switch",
 	.init		= sched_switch_trace_init,
-- 
cgit v1.2.3


From efade6e7821c4219818e9da08f9315dfa617048b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 31 Oct 2008 13:28:58 +0100
Subject: tracing/ftrace: types and naming corrections for sched tracer

Impact: cleanup

This patch applies some corrections suggested by Steven Rostedt.

Change the type of shed_ref into int since it is used
into a Mutex, we don't need it anymore as an atomic
variable in the sched_switch tracer.
Also change the name of the register mutex.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_sched_switch.c | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 9d7bdac331d..969953bf678 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -16,8 +16,8 @@
 
 static struct trace_array	*ctx_trace;
 static int __read_mostly	tracer_enabled;
-static atomic_t			sched_ref;
-static DEFINE_MUTEX(tracepoint_mutex);
+static int			sched_ref;
+static DEFINE_MUTEX(sched_register_mutex);
 
 static void
 probe_sched_switch(struct rq *__rq, struct task_struct *prev,
@@ -28,7 +28,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
 	int cpu;
 	int pc;
 
-	if (!atomic_read(&sched_ref))
+	if (!sched_ref)
 		return;
 
 	tracing_record_cmdline(prev);
@@ -124,26 +124,22 @@ static void tracing_sched_unregister(void)
 
 static void tracing_start_sched_switch(void)
 {
-	long ref;
-
-	mutex_lock(&tracepoint_mutex);
-	tracer_enabled = 1;
-	ref = atomic_inc_return(&sched_ref);
-	if (ref == 1)
+	mutex_lock(&sched_register_mutex);
+	if (!(sched_ref++)) {
+		tracer_enabled = 1;
 		tracing_sched_register();
-	mutex_unlock(&tracepoint_mutex);
+	}
+	mutex_unlock(&sched_register_mutex);
 }
 
 static void tracing_stop_sched_switch(void)
 {
-	long ref;
-
-	mutex_lock(&tracepoint_mutex);
-	tracer_enabled = 0;
-	ref = atomic_dec_and_test(&sched_ref);
-	if (ref)
+	mutex_lock(&sched_register_mutex);
+	if (!(--sched_ref)) {
 		tracing_sched_unregister();
-	mutex_unlock(&tracepoint_mutex);
+		tracer_enabled = 0;
+	}
+	mutex_unlock(&sched_register_mutex);
 }
 
 void tracing_start_cmdline_record(void)
-- 
cgit v1.2.3


From 79a9d461fd521f133f0e66485aa9ed09c21f5191 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 31 Oct 2008 13:34:45 +0100
Subject: tracing/ftrace: fix a bug when switch current tracer to sched tracer

Impact: fix boot tracer + sched tracer coupling bug

Fix a bug that made the sched_switch tracer unable to run
if set as the current_tracer after the boot tracer.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_boot.c         | 4 ++--
 kernel/trace/trace_sched_switch.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 6bbc8794a6d..bd5046c9deb 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -37,9 +37,9 @@ void disable_boot_trace(void)
 		tracing_stop_cmdline_record();
 }
 
-void reset_boot_trace(struct trace_array *tr)
+static void reset_boot_trace(struct trace_array *tr)
 {
-	disable_boot_trace();
+	sched_switch_trace.reset(tr);
 }
 
 static void boot_trace_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 969953bf678..888944d3409 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -173,7 +173,7 @@ static void sched_switch_trace_init(struct trace_array *tr)
 
 static void sched_switch_trace_reset(struct trace_array *tr)
 {
-	if (tr->ctrl)
+	if (tr->ctrl && sched_ref)
 		stop_sched_trace(tr);
 }
 
-- 
cgit v1.2.3


From 1f29fae29709b4668979e244c09b2fa78ff1ad59 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Wed, 5 Nov 2008 16:08:52 -0600
Subject: file capabilities: add no_file_caps switch (v4)

Add a no_file_caps boot option when file capabilities are
compiled into the kernel (CONFIG_SECURITY_FILE_CAPABILITIES=y).

This allows distributions to ship a kernel with file capabilities
compiled in, without forcing users to use (and understand and
trust) them.

When no_file_caps is specified at boot, then when a process executes
a file, any file capabilities stored with that file will not be
used in the calculation of the process' new capability sets.

This means that booting with the no_file_caps boot option will
not be the same as booting a kernel with file capabilities
compiled out - in particular a task with  CAP_SETPCAP will not
have any chance of passing capabilities to another task (which
isn't "really" possible anyway, and which may soon by killed
altogether by David Howells in any case), and it will instead
be able to put new capabilities in its pI.  However since fI
will always be empty and pI is masked with fI, it gains the
task nothing.

We also support the extra prctl options, setting securebits and
dropping capabilities from the per-process bounding set.

The other remaining difference is that killpriv, task_setscheduler,
setioprio, and setnice will continue to be hooked.  That will
be noticable in the case where a root task changed its uid
while keeping some caps, and another task owned by the new uid
tries to change settings for the more privileged task.

Changelog:
	Nov 05 2008: (v4) trivial port on top of always-start-\
		with-clear-caps patch
	Sep 23 2008: nixed file_caps_enabled when file caps are
		not compiled in as it isn't used.
		Document no_file_caps in kernel-parameters.txt.

Signed-off-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Andrew G. Morgan <morgan@kernel.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/capability.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index 33e51e78c2d..e13a68535ad 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -33,6 +33,17 @@ EXPORT_SYMBOL(__cap_empty_set);
 EXPORT_SYMBOL(__cap_full_set);
 EXPORT_SYMBOL(__cap_init_eff_set);
 
+#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
+int file_caps_enabled = 1;
+
+static int __init file_caps_disable(char *str)
+{
+	file_caps_enabled = 0;
+	return 1;
+}
+__setup("no_file_caps", file_caps_disable);
+#endif
+
 /*
  * More recent versions of libcap are available from:
  *
-- 
cgit v1.2.3


From 60a7ecf42661f2b22168751298592da6ee210c9e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 5 Nov 2008 16:05:44 -0500
Subject: ftrace: add quick function trace stop

Impact: quick start and stop of function tracer

This patch adds a way to disable the function tracer quickly without
the need to run kstop_machine. It adds a new variable called
function_trace_stop which will stop the calls to functions from mcount
when set.  This is just an on/off switch and does not handle recursion
like preempt_disable().

It's main purpose is to help other tracers/debuggers start and stop tracing
fuctions without the need to call kstop_machine.

The config option HAVE_FUNCTION_TRACE_MCOUNT_TEST is added for archs
that implement the testing of the function_trace_stop in the mcount
arch dependent code. Otherwise, the test is done in the C code.

x86 is the only arch at the moment that supports this.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig  |  7 +++++++
 kernel/trace/ftrace.c | 47 +++++++++++++++++++++++++++++++++++++----------
 2 files changed, 44 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 33dbefd471e..fc4febc3334 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -9,6 +9,13 @@ config NOP_TRACER
 config HAVE_FUNCTION_TRACER
 	bool
 
+config HAVE_FUNCTION_TRACE_MCOUNT_TEST
+	bool
+	help
+	 This gets selected when the arch tests the function_trace_stop
+	 variable at the mcount call site. Otherwise, this variable
+	 is tested by the called function.
+
 config HAVE_DYNAMIC_FTRACE
 	bool
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4a39d24568c..896c71f0f4c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -47,6 +47,9 @@
 int ftrace_enabled __read_mostly;
 static int last_ftrace_enabled;
 
+/* Quick disabling of function tracer. */
+int function_trace_stop;
+
 /*
  * ftrace_disabled is set when an anomaly is discovered.
  * ftrace_disabled is much stronger than ftrace_enabled.
@@ -63,6 +66,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
 
 static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
 ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
+ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
 
 static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
 {
@@ -88,8 +92,23 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
 void clear_ftrace_function(void)
 {
 	ftrace_trace_function = ftrace_stub;
+	__ftrace_trace_function = ftrace_stub;
 }
 
+#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
+/*
+ * For those archs that do not test ftrace_trace_stop in their
+ * mcount call site, we need to do it from C.
+ */
+static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
+{
+	if (function_trace_stop)
+		return;
+
+	__ftrace_trace_function(ip, parent_ip);
+}
+#endif
+
 static int __register_ftrace_function(struct ftrace_ops *ops)
 {
 	/* should not be called from interrupt context */
@@ -110,10 +129,18 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
 		 * For one func, simply call it directly.
 		 * For more than one func, call the chain.
 		 */
+#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
 		if (ops->next == &ftrace_list_end)
 			ftrace_trace_function = ops->func;
 		else
 			ftrace_trace_function = ftrace_list_func;
+#else
+		if (ops->next == &ftrace_list_end)
+			__ftrace_trace_function = ops->func;
+		else
+			__ftrace_trace_function = ftrace_list_func;
+		ftrace_trace_function = ftrace_test_stop_func;
+#endif
 	}
 
 	spin_unlock(&ftrace_lock);
@@ -526,7 +553,7 @@ static void ftrace_run_update_code(int command)
 }
 
 static ftrace_func_t saved_ftrace_func;
-static int ftrace_start;
+static int ftrace_start_up;
 static DEFINE_MUTEX(ftrace_start_lock);
 
 static void ftrace_startup(void)
@@ -537,8 +564,8 @@ static void ftrace_startup(void)
 		return;
 
 	mutex_lock(&ftrace_start_lock);
-	ftrace_start++;
-	if (ftrace_start == 1)
+	ftrace_start_up++;
+	if (ftrace_start_up == 1)
 		command |= FTRACE_ENABLE_CALLS;
 
 	if (saved_ftrace_func != ftrace_trace_function) {
@@ -562,8 +589,8 @@ static void ftrace_shutdown(void)
 		return;
 
 	mutex_lock(&ftrace_start_lock);
-	ftrace_start--;
-	if (!ftrace_start)
+	ftrace_start_up--;
+	if (!ftrace_start_up)
 		command |= FTRACE_DISABLE_CALLS;
 
 	if (saved_ftrace_func != ftrace_trace_function) {
@@ -589,8 +616,8 @@ static void ftrace_startup_sysctl(void)
 	mutex_lock(&ftrace_start_lock);
 	/* Force update next time */
 	saved_ftrace_func = NULL;
-	/* ftrace_start is true if we want ftrace running */
-	if (ftrace_start)
+	/* ftrace_start_up is true if we want ftrace running */
+	if (ftrace_start_up)
 		command |= FTRACE_ENABLE_CALLS;
 
 	ftrace_run_update_code(command);
@@ -605,8 +632,8 @@ static void ftrace_shutdown_sysctl(void)
 		return;
 
 	mutex_lock(&ftrace_start_lock);
-	/* ftrace_start is true if ftrace is running */
-	if (ftrace_start)
+	/* ftrace_start_up is true if ftrace is running */
+	if (ftrace_start_up)
 		command |= FTRACE_DISABLE_CALLS;
 
 	ftrace_run_update_code(command);
@@ -1186,7 +1213,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
 
 	mutex_lock(&ftrace_sysctl_lock);
 	mutex_lock(&ftrace_start_lock);
-	if (iter->filtered && ftrace_start && ftrace_enabled)
+	if (iter->filtered && ftrace_start_up && ftrace_enabled)
 		ftrace_run_update_code(FTRACE_ENABLE_CALLS);
 	mutex_unlock(&ftrace_start_lock);
 	mutex_unlock(&ftrace_sysctl_lock);
-- 
cgit v1.2.3


From 0f04870148ecb825133bc2733f473b1c5773ac0b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 5 Nov 2008 16:05:44 -0500
Subject: ftrace: soft tracing stop and start

Impact: add way to quickly start stop tracing from the kernel

This patch adds a soft stop and start to the trace. This simply
disables function tracing via the ftrace_disabled flag, and
disables the trace buffers to prevent recording. The tracing
code may still be executed, but the trace will not be recorded.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 79 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 29ab40a764c..113aea9447e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -43,6 +43,15 @@
 unsigned long __read_mostly	tracing_max_latency = (cycle_t)ULONG_MAX;
 unsigned long __read_mostly	tracing_thresh;
 
+
+/*
+ * Kill all tracing for good (never come back).
+ * It is initialized to 1 but will turn to zero if the initialization
+ * of the tracer is successful. But that is the only place that sets
+ * this back to zero.
+ */
+int tracing_disabled = 1;
+
 static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
 
 static inline void ftrace_disable_cpu(void)
@@ -62,8 +71,6 @@ static cpumask_t __read_mostly		tracing_buffer_mask;
 #define for_each_tracing_cpu(cpu)	\
 	for_each_cpu_mask(cpu, tracing_buffer_mask)
 
-static int tracing_disabled = 1;
-
 /*
  * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
  *
@@ -613,6 +620,76 @@ static void trace_init_cmdlines(void)
 	cmdline_idx = 0;
 }
 
+static int trace_stop_count;
+static DEFINE_SPINLOCK(tracing_start_lock);
+
+/**
+ * tracing_start - quick start of the tracer
+ *
+ * If tracing is enabled but was stopped by tracing_stop,
+ * this will start the tracer back up.
+ */
+void tracing_start(void)
+{
+	struct ring_buffer *buffer;
+	unsigned long flags;
+
+	if (tracing_disabled)
+		return;
+
+	spin_lock_irqsave(&tracing_start_lock, flags);
+	if (--trace_stop_count)
+		goto out;
+
+	if (trace_stop_count < 0) {
+		/* Someone screwed up their debugging */
+		WARN_ON_ONCE(1);
+		trace_stop_count = 0;
+		goto out;
+	}
+
+
+	buffer = global_trace.buffer;
+	if (buffer)
+		ring_buffer_record_enable(buffer);
+
+	buffer = max_tr.buffer;
+	if (buffer)
+		ring_buffer_record_enable(buffer);
+
+	ftrace_start();
+ out:
+	spin_unlock_irqrestore(&tracing_start_lock, flags);
+}
+
+/**
+ * tracing_stop - quick stop of the tracer
+ *
+ * Light weight way to stop tracing. Use in conjunction with
+ * tracing_start.
+ */
+void tracing_stop(void)
+{
+	struct ring_buffer *buffer;
+	unsigned long flags;
+
+	ftrace_stop();
+	spin_lock_irqsave(&tracing_start_lock, flags);
+	if (trace_stop_count++)
+		goto out;
+
+	buffer = global_trace.buffer;
+	if (buffer)
+		ring_buffer_record_disable(buffer);
+
+	buffer = max_tr.buffer;
+	if (buffer)
+		ring_buffer_record_disable(buffer);
+
+ out:
+	spin_unlock_irqrestore(&tracing_start_lock, flags);
+}
+
 void trace_stop_cmdline_recording(void);
 
 static void trace_save_cmdline(struct task_struct *tsk)
-- 
cgit v1.2.3


From 9036990d462e09366f7297a2d1da6582c3e6b1d3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 5 Nov 2008 16:05:44 -0500
Subject: ftrace: restructure tracing start/stop infrastructure

Impact: change where tracing is started up and stopped

Currently, when a new tracer is selected via echo'ing a tracer name into
the current_tracer file, the startup is only done if tracing_enabled is
set to one. If tracing_enabled is changed to zero (by echo'ing 0 into
the tracing_enabled file) a full shutdown is performed.

The full startup and shutdown of a tracer can be expensive and the
user can lose out traces when echo'ing in 0 to the tracing_enabled file,
because the process takes too long. There can also be places that
the user would like to start and stop the tracer several times and
doing the full startup and shutdown of a tracer might be too expensive.

This patch performs the full startup and shutdown when a tracer is
selected. It also adds a way to do a quick start or stop of a tracer.
The quick version is just a flag that prevents the tracing from
taking place, but the overhead of the code is still there.

For example, the startup of a tracer may enable tracepoints, or enable
the function tracer.  The stop and start will just set a flag to
have the tracer ignore the calls when the tracepoint or function trace
is called.  The overhead of the tracer may still be present when
the tracer is stopped, but no tracing will occur. Setting the tracer
to the 'nop' tracer (or any other tracer) will perform the shutdown
of the tracer which will disable the tracepoint or disable the
function tracer.

The tracing_enabled file will simply start or stop tracing.

This change is all internal. The end result for the user should be the same
as before. If tracing_enabled is not set, no trace will happen.
If tracing_enabled is set, then the trace will happen. The tracing_enabled
variable is static between tracers. Enabling  tracing_enabled and
going to another tracer will keep tracing_enabled enabled. Same
is true with disabling tracing_enabled.

This patch will now provide a fast start/stop method to the users
for enabling or disabling tracing.

Note: There were two methods to the struct tracer that were never
 used: The methods start and stop. These were to be used as a hook
 to the reading of the trace output, but ended up not being
 necessary. These two methods are now used to enable the start
 and stop of each tracer, in case the tracer needs to do more than
 just not write into the buffer. For example, the irqsoff tracer
 must stop recording max latencies when tracing is stopped.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c              | 64 +++++++++++++++++----------------------
 kernel/trace/trace.h              |  5 +--
 kernel/trace/trace_functions.c    |  6 ++++
 kernel/trace/trace_irqsoff.c      | 41 ++++++++++++++++++++++---
 kernel/trace/trace_sched_switch.c | 13 ++++++++
 kernel/trace/trace_sched_wakeup.c | 39 +++++++++++++++++++++---
 6 files changed, 120 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 113aea9447e..ff1e9ed9b58 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -150,6 +150,19 @@ static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
 /* tracer_enabled is used to toggle activation of a tracer */
 static int			tracer_enabled = 1;
 
+/**
+ * tracing_is_enabled - return tracer_enabled status
+ *
+ * This function is used by other tracers to know the status
+ * of the tracer_enabled flag.  Tracers may use this function
+ * to know if it should enable their features when starting
+ * up. See irqsoff tracer for an example (start_irqsoff_tracer).
+ */
+int tracing_is_enabled(void)
+{
+	return tracer_enabled;
+}
+
 /* function tracing enabled */
 int				ftrace_function_enabled;
 
@@ -1041,8 +1054,7 @@ void tracing_start_function_trace(void)
 		trace_ops.func = function_trace_call;
 
 	register_ftrace_function(&trace_ops);
-	if (tracer_enabled)
-		ftrace_function_enabled = 1;
+	ftrace_function_enabled = 1;
 }
 
 void tracing_stop_function_trace(void)
@@ -1189,10 +1201,6 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 
 	atomic_inc(&trace_record_cmdline_disabled);
 
-	/* let the tracer grab locks here if needed */
-	if (current_trace->start)
-		current_trace->start(iter);
-
 	if (*pos != iter->pos) {
 		iter->ent = NULL;
 		iter->cpu = 0;
@@ -1219,14 +1227,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 
 static void s_stop(struct seq_file *m, void *p)
 {
-	struct trace_iterator *iter = m->private;
-
 	atomic_dec(&trace_record_cmdline_disabled);
-
-	/* let the tracer release locks here if needed */
-	if (current_trace && current_trace == iter->trace && iter->trace->stop)
-		iter->trace->stop(iter);
-
 	mutex_unlock(&trace_types_lock);
 }
 
@@ -2056,10 +2057,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 	m->private = iter;
 
 	/* stop the trace while dumping */
-	if (iter->tr->ctrl) {
-		tracer_enabled = 0;
-		ftrace_function_enabled = 0;
-	}
+	tracing_stop();
 
 	if (iter->trace && iter->trace->open)
 			iter->trace->open(iter);
@@ -2104,14 +2102,7 @@ int tracing_release(struct inode *inode, struct file *file)
 		iter->trace->close(iter);
 
 	/* reenable tracing if it was previously enabled */
-	if (iter->tr->ctrl) {
-		tracer_enabled = 1;
-		/*
-		 * It is safe to enable function tracing even if it
-		 * isn't used
-		 */
-		ftrace_function_enabled = 1;
-	}
+	tracing_start();
 	mutex_unlock(&trace_types_lock);
 
 	seq_release(inode, file);
@@ -2449,11 +2440,10 @@ static ssize_t
 tracing_ctrl_read(struct file *filp, char __user *ubuf,
 		  size_t cnt, loff_t *ppos)
 {
-	struct trace_array *tr = filp->private_data;
 	char buf[64];
 	int r;
 
-	r = sprintf(buf, "%ld\n", tr->ctrl);
+	r = sprintf(buf, "%u\n", tracer_enabled);
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
@@ -2481,16 +2471,18 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
 	val = !!val;
 
 	mutex_lock(&trace_types_lock);
-	if (tr->ctrl ^ val) {
-		if (val)
+	if (tracer_enabled ^ val) {
+		if (val) {
 			tracer_enabled = 1;
-		else
+			if (current_trace->start)
+				current_trace->start(tr);
+			tracing_start();
+		} else {
 			tracer_enabled = 0;
-
-		tr->ctrl = val;
-
-		if (current_trace && current_trace->ctrl_update)
-			current_trace->ctrl_update(tr);
+			tracing_stop();
+			if (current_trace->stop)
+				current_trace->stop(tr);
+		}
 	}
 	mutex_unlock(&trace_types_lock);
 
@@ -3372,7 +3364,7 @@ __init static int tracer_alloc_buffers(void)
 #endif
 
 	/* All seems OK, enable tracing */
-	global_trace.ctrl = tracer_enabled;
+	global_trace.ctrl = 1;
 	tracing_disabled = 0;
 
 	atomic_notifier_chain_register(&panic_notifier_list,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cc14a6bc109..3422489fad5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -237,11 +237,11 @@ struct tracer {
 	const char		*name;
 	void			(*init)(struct trace_array *tr);
 	void			(*reset)(struct trace_array *tr);
+	void			(*start)(struct trace_array *tr);
+	void			(*stop)(struct trace_array *tr);
 	void			(*open)(struct trace_iterator *iter);
 	void			(*pipe_open)(struct trace_iterator *iter);
 	void			(*close)(struct trace_iterator *iter);
-	void			(*start)(struct trace_iterator *iter);
-	void			(*stop)(struct trace_iterator *iter);
 	ssize_t			(*read)(struct trace_iterator *iter,
 					struct file *filp, char __user *ubuf,
 					size_t cnt, loff_t *ppos);
@@ -282,6 +282,7 @@ struct trace_iterator {
 	long			idx;
 };
 
+int tracing_is_enabled(void);
 void trace_wake_up(void);
 void tracing_reset(struct trace_array *tr, int cpu);
 int tracing_open_generic(struct inode *inode, struct file *filp);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 0f85a64003d..9f1b0de7128 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -62,11 +62,17 @@ static void function_trace_ctrl_update(struct trace_array *tr)
 		stop_function_trace(tr);
 }
 
+static void function_trace_start(struct trace_array *tr)
+{
+	function_reset(tr);
+}
+
 static struct tracer function_trace __read_mostly =
 {
 	.name	     = "function",
 	.init	     = function_trace_init,
 	.reset	     = function_trace_reset,
+	.start	     = function_trace_start,
 	.ctrl_update = function_trace_ctrl_update,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_function,
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 9c74071c10e..a87a20fa3fc 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -353,15 +353,28 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
 }
 #endif /* CONFIG_PREEMPT_TRACER */
 
+/*
+ * save_tracer_enabled is used to save the state of the tracer_enabled
+ * variable when we disable it when we open a trace output file.
+ */
+static int save_tracer_enabled;
+
 static void start_irqsoff_tracer(struct trace_array *tr)
 {
 	register_ftrace_function(&trace_ops);
-	tracer_enabled = 1;
+	if (tracing_is_enabled()) {
+		tracer_enabled = 1;
+		save_tracer_enabled = 1;
+	} else {
+		tracer_enabled = 0;
+		save_tracer_enabled = 0;
+	}
 }
 
 static void stop_irqsoff_tracer(struct trace_array *tr)
 {
 	tracer_enabled = 0;
+	save_tracer_enabled = 0;
 	unregister_ftrace_function(&trace_ops);
 }
 
@@ -389,17 +402,29 @@ static void irqsoff_tracer_ctrl_update(struct trace_array *tr)
 		stop_irqsoff_tracer(tr);
 }
 
+static void irqsoff_tracer_start(struct trace_array *tr)
+{
+	irqsoff_tracer_reset(tr);
+	tracer_enabled = 1;
+	save_tracer_enabled = 1;
+}
+
+static void irqsoff_tracer_stop(struct trace_array *tr)
+{
+	tracer_enabled = 0;
+	save_tracer_enabled = 0;
+}
+
 static void irqsoff_tracer_open(struct trace_iterator *iter)
 {
 	/* stop the trace while dumping */
-	if (iter->tr->ctrl)
-		stop_irqsoff_tracer(iter->tr);
+	tracer_enabled = 0;
 }
 
 static void irqsoff_tracer_close(struct trace_iterator *iter)
 {
-	if (iter->tr->ctrl)
-		start_irqsoff_tracer(iter->tr);
+	/* restart tracing */
+	tracer_enabled = save_tracer_enabled;
 }
 
 #ifdef CONFIG_IRQSOFF_TRACER
@@ -414,6 +439,8 @@ static struct tracer irqsoff_tracer __read_mostly =
 	.name		= "irqsoff",
 	.init		= irqsoff_tracer_init,
 	.reset		= irqsoff_tracer_reset,
+	.start		= irqsoff_tracer_start,
+	.stop		= irqsoff_tracer_stop,
 	.open		= irqsoff_tracer_open,
 	.close		= irqsoff_tracer_close,
 	.ctrl_update	= irqsoff_tracer_ctrl_update,
@@ -440,6 +467,8 @@ static struct tracer preemptoff_tracer __read_mostly =
 	.name		= "preemptoff",
 	.init		= preemptoff_tracer_init,
 	.reset		= irqsoff_tracer_reset,
+	.start		= irqsoff_tracer_start,
+	.stop		= irqsoff_tracer_stop,
 	.open		= irqsoff_tracer_open,
 	.close		= irqsoff_tracer_close,
 	.ctrl_update	= irqsoff_tracer_ctrl_update,
@@ -468,6 +497,8 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
 	.name		= "preemptirqsoff",
 	.init		= preemptirqsoff_tracer_init,
 	.reset		= irqsoff_tracer_reset,
+	.start		= irqsoff_tracer_start,
+	.stop		= irqsoff_tracer_stop,
 	.open		= irqsoff_tracer_open,
 	.close		= irqsoff_tracer_close,
 	.ctrl_update	= irqsoff_tracer_ctrl_update,
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 888944d3409..91c699be8c8 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -186,11 +186,24 @@ static void sched_switch_trace_ctrl_update(struct trace_array *tr)
 		stop_sched_trace(tr);
 }
 
+static void sched_switch_trace_start(struct trace_array *tr)
+{
+	sched_switch_reset(tr);
+	tracing_start_sched_switch();
+}
+
+static void sched_switch_trace_stop(struct trace_array *tr)
+{
+	tracing_stop_sched_switch();
+}
+
 struct tracer sched_switch_trace __read_mostly =
 {
 	.name		= "sched_switch",
 	.init		= sched_switch_trace_init,
 	.reset		= sched_switch_trace_reset,
+	.start		= sched_switch_trace_start,
+	.stop		= sched_switch_trace_stop,
 	.ctrl_update	= sched_switch_trace_ctrl_update,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_sched_switch,
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 7bc4abf6fca..240577bc8ba 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -262,6 +262,12 @@ out:
 	atomic_dec(&wakeup_trace->data[cpu]->disabled);
 }
 
+/*
+ * save_tracer_enabled is used to save the state of the tracer_enabled
+ * variable when we disable it when we open a trace output file.
+ */
+static int save_tracer_enabled;
+
 static void start_wakeup_tracer(struct trace_array *tr)
 {
 	int ret;
@@ -300,7 +306,13 @@ static void start_wakeup_tracer(struct trace_array *tr)
 
 	register_ftrace_function(&trace_ops);
 
-	tracer_enabled = 1;
+	if (tracing_is_enabled()) {
+		tracer_enabled = 1;
+		save_tracer_enabled = 1;
+	} else {
+		tracer_enabled = 0;
+		save_tracer_enabled = 0;
+	}
 
 	return;
 fail_deprobe_wake_new:
@@ -312,6 +324,7 @@ fail_deprobe:
 static void stop_wakeup_tracer(struct trace_array *tr)
 {
 	tracer_enabled = 0;
+	save_tracer_enabled = 0;
 	unregister_ftrace_function(&trace_ops);
 	unregister_trace_sched_switch(probe_wakeup_sched_switch);
 	unregister_trace_sched_wakeup_new(probe_wakeup);
@@ -343,18 +356,32 @@ static void wakeup_tracer_ctrl_update(struct trace_array *tr)
 		stop_wakeup_tracer(tr);
 }
 
+static void wakeup_tracer_start(struct trace_array *tr)
+{
+	wakeup_reset(tr);
+	tracer_enabled = 1;
+	save_tracer_enabled = 1;
+}
+
+static void wakeup_tracer_stop(struct trace_array *tr)
+{
+	tracer_enabled = 0;
+	save_tracer_enabled = 0;
+}
+
 static void wakeup_tracer_open(struct trace_iterator *iter)
 {
 	/* stop the trace while dumping */
-	if (iter->tr->ctrl)
-		stop_wakeup_tracer(iter->tr);
+	tracer_enabled = 0;
 }
 
 static void wakeup_tracer_close(struct trace_iterator *iter)
 {
 	/* forget about any processes we were recording */
-	if (iter->tr->ctrl)
-		start_wakeup_tracer(iter->tr);
+	if (save_tracer_enabled) {
+		wakeup_reset(iter->tr);
+		tracer_enabled = 1;
+	}
 }
 
 static struct tracer wakeup_tracer __read_mostly =
@@ -362,6 +389,8 @@ static struct tracer wakeup_tracer __read_mostly =
 	.name		= "wakeup",
 	.init		= wakeup_tracer_init,
 	.reset		= wakeup_tracer_reset,
+	.start		= wakeup_tracer_start,
+	.stop		= wakeup_tracer_stop,
 	.open		= wakeup_tracer_open,
 	.close		= wakeup_tracer_close,
 	.ctrl_update	= wakeup_tracer_ctrl_update,
-- 
cgit v1.2.3


From 3e03fb7f1da2e691644526c0d6df42d778716349 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 6 Nov 2008 00:09:43 -0500
Subject: ring-buffer: convert to raw spinlocks

Impact: no lockdep debugging of ring buffer

The problem with running lockdep on the ring buffer is that the
ring buffer is the core infrastructure of ftrace. What happens is
that the tracer will start tracing the lockdep code while lockdep
is testing the ring buffers locks.  This can cause lockdep to
fail due to testing cases that have not fully finished their
locking transition.

This patch converts the spin locks used by the ring buffer back
into raw spin locks which lockdep does not check.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 151f6a74867..a2dea500882 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -154,7 +154,7 @@ static inline int test_time_stamp(u64 delta)
 struct ring_buffer_per_cpu {
 	int				cpu;
 	struct ring_buffer		*buffer;
-	spinlock_t			lock;
+	raw_spinlock_t			lock;
 	struct lock_class_key		lock_key;
 	struct list_head		pages;
 	struct buffer_page		*head_page;	/* read from head */
@@ -291,7 +291,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 
 	cpu_buffer->cpu = cpu;
 	cpu_buffer->buffer = buffer;
-	spin_lock_init(&cpu_buffer->lock);
+	cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 	INIT_LIST_HEAD(&cpu_buffer->pages);
 
 	page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
@@ -854,7 +854,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	if (write > BUF_PAGE_SIZE) {
 		struct buffer_page *next_page = tail_page;
 
-		spin_lock_irqsave(&cpu_buffer->lock, flags);
+		local_irq_save(flags);
+		__raw_spin_lock(&cpu_buffer->lock);
 
 		rb_inc_page(cpu_buffer, &next_page);
 
@@ -930,7 +931,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 			rb_set_commit_to_write(cpu_buffer);
 		}
 
-		spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+		__raw_spin_unlock(&cpu_buffer->lock);
+		local_irq_restore(flags);
 
 		/* fail and let the caller try again */
 		return ERR_PTR(-EAGAIN);
@@ -953,7 +955,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	return event;
 
  out_unlock:
-	spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+	__raw_spin_unlock(&cpu_buffer->lock);
+	local_irq_restore(flags);
 	return NULL;
 }
 
@@ -1524,7 +1527,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	struct buffer_page *reader = NULL;
 	unsigned long flags;
 
-	spin_lock_irqsave(&cpu_buffer->lock, flags);
+	local_irq_save(flags);
+	__raw_spin_lock(&cpu_buffer->lock);
 
  again:
 	reader = cpu_buffer->reader_page;
@@ -1574,7 +1578,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	goto again;
 
  out:
-	spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+	__raw_spin_unlock(&cpu_buffer->lock);
+	local_irq_restore(flags);
 
 	return reader;
 }
@@ -1815,9 +1820,11 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
 	atomic_inc(&cpu_buffer->record_disabled);
 	synchronize_sched();
 
-	spin_lock_irqsave(&cpu_buffer->lock, flags);
+	local_irq_save(flags);
+	__raw_spin_lock(&cpu_buffer->lock);
 	ring_buffer_iter_reset(iter);
-	spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+	__raw_spin_unlock(&cpu_buffer->lock);
+	local_irq_restore(flags);
 
 	return iter;
 }
@@ -1903,11 +1910,13 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 	if (!cpu_isset(cpu, buffer->cpumask))
 		return;
 
-	spin_lock_irqsave(&cpu_buffer->lock, flags);
+	local_irq_save(flags);
+	__raw_spin_lock(&cpu_buffer->lock);
 
 	rb_reset_cpu(cpu_buffer);
 
-	spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+	__raw_spin_unlock(&cpu_buffer->lock);
+	local_irq_restore(flags);
 }
 
 /**
-- 
cgit v1.2.3


From cf7f8690e864c6fe11e77202dd847fa60f483418 Mon Sep 17 00:00:00 2001
From: Sripathi Kodi <sripathik@in.ibm.com>
Date: Wed, 5 Nov 2008 18:57:14 +0530
Subject: sched, lockdep: inline double_unlock_balance()

We have a test case which measures the variation in the amount of time
needed to perform a fixed amount of work on the preempt_rt kernel. We
started seeing deterioration in it's performance recently. The test
should never take more than 10 microseconds, but we started 5-10%
failure rate.

Using elimination method, we traced the problem to commit
1b12bbc747560ea68bcc132c3d05699e52271da0 (lockdep: re-annotate
scheduler runqueues).

When LOCKDEP is disabled, this patch only adds an additional function
call to double_unlock_balance(). Hence I inlined double_unlock_balance()
and the problem went away. Here is a patch to make this change.

Signed-off-by: Sripathi Kodi <sripathik@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    | 2 +-
 kernel/sched_rt.c | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e8819bc6f46..ad10d0aae1d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2825,7 +2825,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	return ret;
 }
 
-static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
 	__releases(busiest->lock)
 {
 	spin_unlock(&busiest->lock);
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c7963d5d062..2bdd4442359 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -910,7 +910,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 #define RT_MAX_TRIES 3
 
 static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
-static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
+static inline void double_unlock_balance(struct rq *this_rq,
+						struct rq *busiest);
 
 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 
-- 
cgit v1.2.3


From 6d21cd62516a9697cb7ec33cc52e6b814fb65a13 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 7 Nov 2008 17:03:18 +0800
Subject: sched: clean up SCHED_CPUMASK_ALLOC

Impact: cleanup

The #if/#endif is ugly. Change SCHED_CPUMASK_ALLOC and
SCHED_CPUMASK_FREE to static inline functions.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b24e57a10f6..59db86c915f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7279,13 +7279,21 @@ struct allmasks {
 };
 
 #if	NR_CPUS > 128
-#define	SCHED_CPUMASK_ALLOC		1
-#define	SCHED_CPUMASK_FREE(v)		kfree(v)
-#define	SCHED_CPUMASK_DECLARE(v)	struct allmasks *v
+#define SCHED_CPUMASK_DECLARE(v)	struct allmasks *v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{
+	*masks = kmalloc(sizeof(**masks), GFP_KERNEL);
+}
+static inline void sched_cpumask_free(struct allmasks *masks)
+{
+	kfree(masks);
+}
 #else
-#define	SCHED_CPUMASK_ALLOC		0
-#define	SCHED_CPUMASK_FREE(v)
-#define	SCHED_CPUMASK_DECLARE(v)	struct allmasks _v, *v = &_v
+#define SCHED_CPUMASK_DECLARE(v)	struct allmasks _v, *v = &_v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{ }
+static inline void sched_cpumask_free(struct allmasks *masks)
+{ }
 #endif
 
 #define	SCHED_CPUMASK_VAR(v, a) 	cpumask_t *v = (cpumask_t *) \
@@ -7361,9 +7369,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		return -ENOMEM;
 	}
 
-#if SCHED_CPUMASK_ALLOC
 	/* get space for all scratch cpumask variables */
-	allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
+	sched_cpumask_alloc(&allmasks);
 	if (!allmasks) {
 		printk(KERN_WARNING "Cannot alloc cpumask array\n");
 		kfree(rd);
@@ -7372,7 +7379,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
 		return -ENOMEM;
 	}
-#endif
+
 	tmpmask = (cpumask_t *)allmasks;
 
 
@@ -7626,13 +7633,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		cpu_attach_domain(sd, rd, i);
 	}
 
-	SCHED_CPUMASK_FREE((void *)allmasks);
+	sched_cpumask_free(allmasks);
 	return 0;
 
 #ifdef CONFIG_NUMA
 error:
 	free_sched_groups(cpu_map, tmpmask);
-	SCHED_CPUMASK_FREE((void *)allmasks);
+	sched_cpumask_free(allmasks);
 	kfree(rd);
 	return -ENOMEM;
 #endif
-- 
cgit v1.2.3


From 0183fb1c94b74862b073590fc52c56b7364b7bad Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 7 Nov 2008 22:36:02 -0500
Subject: ftrace: fix set_ftrace_filter

Impact: fix of output of set_ftrace_filter

Commit ftrace: do not show freed records in available_filter_functions

Removed a bit too much from the set_ftrace_filter code, where we now see
all functions in the set_ftrace_filter file even when we set a filter.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 896c71f0f4c..4d2e751bfb1 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -765,6 +765,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 		    ((iter->flags & FTRACE_ITER_FAILURES) &&
 		     !(rec->flags & FTRACE_FL_FAILED)) ||
 
+		    ((iter->flags & FTRACE_ITER_FILTER) &&
+		     !(rec->flags & FTRACE_FL_FILTER)) ||
+
 		    ((iter->flags & FTRACE_ITER_NOTRACE) &&
 		     !(rec->flags & FTRACE_FL_NOTRACE))) {
 			rec = NULL;
-- 
cgit v1.2.3


From 75f5c47da386445ba0c5a8b7e3ca0c906e763369 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 7 Nov 2008 22:36:02 -0500
Subject: ftrace: fix boot trace sched startup

Impact: boot tracer startup modified

The boot tracer calls into some of the schedule tracing private functions
that should not be exported. This patch cleans it up, and makes
way for further changes in the ftrace infrastructure.

This patch adds a api to assign a tracer array to the schedule
context switch tracer.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h              |  2 +-
 kernel/trace/trace_boot.c         |  9 +++++++--
 kernel/trace/trace_sched_switch.c | 15 ++++++++++++++-
 3 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3422489fad5..db12e16137e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -49,7 +49,6 @@ struct ftrace_entry {
 	unsigned long		parent_ip;
 };
 extern struct tracer boot_tracer;
-extern struct tracer sched_switch_trace; /* Used by the boot tracer */
 
 /*
  * Context switch trace entry - which task (and prio) we switched from/to:
@@ -325,6 +324,7 @@ void trace_function(struct trace_array *tr,
 
 void tracing_start_cmdline_record(void);
 void tracing_stop_cmdline_record(void);
+void tracing_cmdline_assign_trace(struct trace_array *tr);
 int register_tracer(struct tracer *type);
 void unregister_tracer(struct tracer *type);
 
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index bd5046c9deb..662cb919890 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -39,7 +39,12 @@ void disable_boot_trace(void)
 
 static void reset_boot_trace(struct trace_array *tr)
 {
-	sched_switch_trace.reset(tr);
+	int cpu;
+
+	tr->time_start = ftrace_now(tr->cpu);
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr, cpu);
 }
 
 static void boot_trace_init(struct trace_array *tr)
@@ -50,7 +55,7 @@ static void boot_trace_init(struct trace_array *tr)
 	for_each_cpu_mask(cpu, cpu_possible_map)
 		tracing_reset(tr, cpu);
 
-	sched_switch_trace.init(tr);
+	tracing_cmdline_assign_trace(tr);
 }
 
 static void boot_trace_ctrl_update(struct trace_array *tr)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 91c699be8c8..fbf05df7134 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -152,6 +152,19 @@ void tracing_stop_cmdline_record(void)
 	tracing_stop_sched_switch();
 }
 
+/**
+ * tracing_cmdline_assign_trace - assign a trace array for ctx switch
+ * @tr: trace array pointer to assign
+ *
+ * Some tracers might want to record the context switches in their
+ * trace. This function lets those tracers assign the trace array
+ * to use.
+ */
+void tracing_cmdline_assign_trace(struct trace_array *tr)
+{
+	ctx_trace = tr;
+}
+
 static void start_sched_trace(struct trace_array *tr)
 {
 	sched_switch_reset(tr);
@@ -197,7 +210,7 @@ static void sched_switch_trace_stop(struct trace_array *tr)
 	tracing_stop_sched_switch();
 }
 
-struct tracer sched_switch_trace __read_mostly =
+static struct tracer sched_switch_trace __read_mostly =
 {
 	.name		= "sched_switch",
 	.init		= sched_switch_trace_init,
-- 
cgit v1.2.3


From e168e0516e476070faa9e8e7b23dfcba79b76d82 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 7 Nov 2008 22:36:02 -0500
Subject: ftrace: fix sched_switch API

Impact: fix for sched_switch that broke dynamic ftrace startup

The commit: tracing/fastboot: use sched switch tracer from boot tracer
broke the API of the sched_switch trace. The use of the
tracing_start/stop_cmdline record is for only recording the cmdline,
NOT recording the schedule switches themselves.

Seeing that the boot tracer broke the API to do something that it
wanted, this patch adds a new interface for the API while
puting back the original interface of the old API.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h              |  4 +++-
 kernel/trace/trace_boot.c         |  6 ++---
 kernel/trace/trace_sched_switch.c | 50 +++++++++++++++++++++++++++++++--------
 3 files changed, 46 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index db12e16137e..25abfc45f08 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -324,7 +324,9 @@ void trace_function(struct trace_array *tr,
 
 void tracing_start_cmdline_record(void);
 void tracing_stop_cmdline_record(void);
-void tracing_cmdline_assign_trace(struct trace_array *tr);
+void tracing_sched_switch_assign_trace(struct trace_array *tr);
+void tracing_stop_sched_switch_record(void);
+void tracing_start_sched_switch_record(void);
 int register_tracer(struct tracer *type);
 void unregister_tracer(struct tracer *type);
 
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 662cb919890..0203c105401 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -28,13 +28,13 @@ void start_boot_trace(void)
 void enable_boot_trace(void)
 {
 	if (pre_initcalls_finished)
-		tracing_start_cmdline_record();
+		tracing_start_sched_switch_record();
 }
 
 void disable_boot_trace(void)
 {
 	if (pre_initcalls_finished)
-		tracing_stop_cmdline_record();
+		tracing_stop_sched_switch_record();
 }
 
 static void reset_boot_trace(struct trace_array *tr)
@@ -55,7 +55,7 @@ static void boot_trace_init(struct trace_array *tr)
 	for_each_cpu_mask(cpu, cpu_possible_map)
 		tracing_reset(tr, cpu);
 
-	tracing_cmdline_assign_trace(tr);
+	tracing_sched_switch_assign_trace(tr);
 }
 
 static void boot_trace_ctrl_update(struct trace_array *tr)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index fbf05df7134..79410db64d6 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -125,20 +125,16 @@ static void tracing_sched_unregister(void)
 static void tracing_start_sched_switch(void)
 {
 	mutex_lock(&sched_register_mutex);
-	if (!(sched_ref++)) {
-		tracer_enabled = 1;
+	if (!(sched_ref++))
 		tracing_sched_register();
-	}
 	mutex_unlock(&sched_register_mutex);
 }
 
 static void tracing_stop_sched_switch(void)
 {
 	mutex_lock(&sched_register_mutex);
-	if (!(--sched_ref)) {
+	if (!(--sched_ref))
 		tracing_sched_unregister();
-		tracer_enabled = 0;
-	}
 	mutex_unlock(&sched_register_mutex);
 }
 
@@ -153,14 +149,48 @@ void tracing_stop_cmdline_record(void)
 }
 
 /**
- * tracing_cmdline_assign_trace - assign a trace array for ctx switch
+ * tracing_start_sched_switch_record - start tracing context switches
+ *
+ * Turns on context switch tracing for a tracer.
+ */
+void tracing_start_sched_switch_record(void)
+{
+	if (unlikely(!ctx_trace)) {
+		WARN_ON(1);
+		return;
+	}
+
+	tracing_start_sched_switch();
+
+	mutex_lock(&sched_register_mutex);
+	tracer_enabled++;
+	mutex_unlock(&sched_register_mutex);
+}
+
+/**
+ * tracing_stop_sched_switch_record - start tracing context switches
+ *
+ * Turns off context switch tracing for a tracer.
+ */
+void tracing_stop_sched_switch_record(void)
+{
+	mutex_lock(&sched_register_mutex);
+	tracer_enabled--;
+	WARN_ON(tracer_enabled < 0);
+	mutex_unlock(&sched_register_mutex);
+
+	tracing_stop_sched_switch();
+}
+
+/**
+ * tracing_sched_switch_assign_trace - assign a trace array for ctx switch
  * @tr: trace array pointer to assign
  *
  * Some tracers might want to record the context switches in their
  * trace. This function lets those tracers assign the trace array
  * to use.
  */
-void tracing_cmdline_assign_trace(struct trace_array *tr)
+void tracing_sched_switch_assign_trace(struct trace_array *tr)
 {
 	ctx_trace = tr;
 }
@@ -168,12 +198,12 @@ void tracing_cmdline_assign_trace(struct trace_array *tr)
 static void start_sched_trace(struct trace_array *tr)
 {
 	sched_switch_reset(tr);
-	tracing_start_cmdline_record();
+	tracing_start_sched_switch_record();
 }
 
 static void stop_sched_trace(struct trace_array *tr)
 {
-	tracing_stop_cmdline_record();
+	tracing_stop_sched_switch_record();
 }
 
 static void sched_switch_trace_init(struct trace_array *tr)
-- 
cgit v1.2.3


From 451931702017951f74624ddc4f7f02e4641b0e20 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 7 Nov 2008 22:36:02 -0500
Subject: ftrace: irqsoff tracer incorrect reset

Impact: fix to irqsoff tracer output

In converting to the new start / stop ftrace handling, the irqsoff
tracer start called the irqsoff reset function. irqsoff tracer is
not the same as the other traces, and it resets the buffers while
searching for the longest latency.

The reset that the irqsoff stop method calls disables the function
tracing. That means that, by starting the tracer, the function
tracer is disabled incorrectly.

This patch simply removes the call to reset which keeps the function
tracing enabled. Reset is not needed for the irqsoff stop method.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_irqsoff.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index a87a20fa3fc..3509086cdc4 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -404,7 +404,6 @@ static void irqsoff_tracer_ctrl_update(struct trace_array *tr)
 
 static void irqsoff_tracer_start(struct trace_array *tr)
 {
-	irqsoff_tracer_reset(tr);
 	tracer_enabled = 1;
 	save_tracer_enabled = 1;
 }
-- 
cgit v1.2.3


From 49833fc232bd6a5076496994d855f601354501d7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 7 Nov 2008 22:36:02 -0500
Subject: ftrace: enable trace_printk by default

Impact: have the ftrace_printk enabled on startup

It is confusing to have to "echo trace_printk > /debug/tracing/iter_ctrl"
after adding ftrace_printk in the kernel.

Currently the trace_printk is set to off by default. ftrace_printk
should only be in open kernel code when used for debugging, and thus
it should be enabled by default.

It may also be used to record data within a tracer, but those ftrace_printks
should be within wrappers that are either enabled by trace_points or
have a variable protecting the code path from being entered when the
tracer is disabled.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d55ccfc8d67..dbbdacfaaf9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -205,7 +205,7 @@ static DEFINE_MUTEX(trace_types_lock);
 static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
 /* trace_flags holds iter_ctrl options */
-unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
+unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK;
 
 /**
  * trace_wake_up - wake up tasks waiting for trace input
-- 
cgit v1.2.3


From bbf5b1a0cecb56de6236db8b01c5bfb7ab8ba8b2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 7 Nov 2008 22:36:02 -0500
Subject: ftrace: remove ctrl_update method

Impact: Remove the ctrl_update tracer method

With the new quick start/stop method of tracing, the ctrl_update
method is out of date.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c              |  2 --
 kernel/trace/trace.h              |  1 -
 kernel/trace/trace_boot.c         |  9 --------
 kernel/trace/trace_functions.c    |  9 --------
 kernel/trace/trace_irqsoff.c      | 11 ---------
 kernel/trace/trace_mmiotrace.c    | 11 +++------
 kernel/trace/trace_nop.c          | 10 --------
 kernel/trace/trace_sched_switch.c | 10 --------
 kernel/trace/trace_sched_wakeup.c |  9 --------
 kernel/trace/trace_selftest.c     | 48 +++++++++++++++++++++------------------
 kernel/trace/trace_sysprof.c      | 10 --------
 11 files changed, 29 insertions(+), 101 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dbbdacfaaf9..9e83188172a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3357,8 +3357,6 @@ __init static int tracer_alloc_buffers(void)
 
 	register_tracer(&nop_trace);
 #ifdef CONFIG_BOOT_TRACER
-	/* We don't want to launch sched_switch tracer yet */
-	global_trace.ctrl = 0;
 	register_tracer(&boot_tracer);
 	current_trace = &boot_tracer;
 	current_trace->init(&global_trace);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 25abfc45f08..e481edaae1c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -244,7 +244,6 @@ struct tracer {
 	ssize_t			(*read)(struct trace_iterator *iter,
 					struct file *filp, char __user *ubuf,
 					size_t cnt, loff_t *ppos);
-	void			(*ctrl_update)(struct trace_array *tr);
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 	int			(*selftest)(struct tracer *trace,
 					    struct trace_array *tr);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 0203c105401..8f71915e8bb 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -58,14 +58,6 @@ static void boot_trace_init(struct trace_array *tr)
 	tracing_sched_switch_assign_trace(tr);
 }
 
-static void boot_trace_ctrl_update(struct trace_array *tr)
-{
-	if (tr->ctrl)
-		enable_boot_trace();
-	else
-		disable_boot_trace();
-}
-
 static enum print_line_t initcall_print_line(struct trace_iterator *iter)
 {
 	int ret;
@@ -102,7 +94,6 @@ struct tracer boot_tracer __read_mostly =
 	.name		= "initcall",
 	.init		= boot_trace_init,
 	.reset		= reset_boot_trace,
-	.ctrl_update	= boot_trace_ctrl_update,
 	.print_line	= initcall_print_line,
 };
 
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 9f1b0de7128..e980b872bef 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -54,14 +54,6 @@ static void function_trace_reset(struct trace_array *tr)
 		stop_function_trace(tr);
 }
 
-static void function_trace_ctrl_update(struct trace_array *tr)
-{
-	if (tr->ctrl)
-		start_function_trace(tr);
-	else
-		stop_function_trace(tr);
-}
-
 static void function_trace_start(struct trace_array *tr)
 {
 	function_reset(tr);
@@ -73,7 +65,6 @@ static struct tracer function_trace __read_mostly =
 	.init	     = function_trace_init,
 	.reset	     = function_trace_reset,
 	.start	     = function_trace_start,
-	.ctrl_update = function_trace_ctrl_update,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_function,
 #endif
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 3509086cdc4..ffdf592a54e 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -394,14 +394,6 @@ static void irqsoff_tracer_reset(struct trace_array *tr)
 		stop_irqsoff_tracer(tr);
 }
 
-static void irqsoff_tracer_ctrl_update(struct trace_array *tr)
-{
-	if (tr->ctrl)
-		start_irqsoff_tracer(tr);
-	else
-		stop_irqsoff_tracer(tr);
-}
-
 static void irqsoff_tracer_start(struct trace_array *tr)
 {
 	tracer_enabled = 1;
@@ -442,7 +434,6 @@ static struct tracer irqsoff_tracer __read_mostly =
 	.stop		= irqsoff_tracer_stop,
 	.open		= irqsoff_tracer_open,
 	.close		= irqsoff_tracer_close,
-	.ctrl_update	= irqsoff_tracer_ctrl_update,
 	.print_max	= 1,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_irqsoff,
@@ -470,7 +461,6 @@ static struct tracer preemptoff_tracer __read_mostly =
 	.stop		= irqsoff_tracer_stop,
 	.open		= irqsoff_tracer_open,
 	.close		= irqsoff_tracer_close,
-	.ctrl_update	= irqsoff_tracer_ctrl_update,
 	.print_max	= 1,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_preemptoff,
@@ -500,7 +490,6 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
 	.stop		= irqsoff_tracer_stop,
 	.open		= irqsoff_tracer_open,
 	.close		= irqsoff_tracer_close,
-	.ctrl_update	= irqsoff_tracer_ctrl_update,
 	.print_max	= 1,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_preemptirqsoff,
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index f28484618ff..fa9354e78b5 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -49,15 +49,10 @@ static void mmio_trace_reset(struct trace_array *tr)
 	mmio_trace_array = NULL;
 }
 
-static void mmio_trace_ctrl_update(struct trace_array *tr)
+static void mmio_trace_start(struct trace_array *tr)
 {
 	pr_debug("in %s\n", __func__);
-	if (tr->ctrl) {
-		mmio_reset_data(tr);
-		enable_mmiotrace();
-	} else {
-		disable_mmiotrace();
-	}
+	mmio_reset_data(tr);
 }
 
 static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
@@ -298,10 +293,10 @@ static struct tracer mmio_tracer __read_mostly =
 	.name		= "mmiotrace",
 	.init		= mmio_trace_init,
 	.reset		= mmio_trace_reset,
+	.start		= mmio_trace_start,
 	.pipe_open	= mmio_pipe_open,
 	.close		= mmio_close,
 	.read		= mmio_read,
-	.ctrl_update	= mmio_trace_ctrl_update,
 	.print_line	= mmio_print_line,
 };
 
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 4592b486251..e3c5fbfcdd3 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -42,21 +42,11 @@ static void nop_trace_reset(struct trace_array *tr)
 		stop_nop_trace(tr);
 }
 
-static void nop_trace_ctrl_update(struct trace_array *tr)
-{
-	/* When starting a new trace, reset the buffers */
-	if (tr->ctrl)
-		start_nop_trace(tr);
-	else
-		stop_nop_trace(tr);
-}
-
 struct tracer nop_trace __read_mostly =
 {
 	.name		= "nop",
 	.init		= nop_trace_init,
 	.reset		= nop_trace_reset,
-	.ctrl_update	= nop_trace_ctrl_update,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest	= trace_selftest_startup_nop,
 #endif
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 79410db64d6..7b73fd11e4a 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -220,15 +220,6 @@ static void sched_switch_trace_reset(struct trace_array *tr)
 		stop_sched_trace(tr);
 }
 
-static void sched_switch_trace_ctrl_update(struct trace_array *tr)
-{
-	/* When starting a new trace, reset the buffers */
-	if (tr->ctrl)
-		start_sched_trace(tr);
-	else
-		stop_sched_trace(tr);
-}
-
 static void sched_switch_trace_start(struct trace_array *tr)
 {
 	sched_switch_reset(tr);
@@ -247,7 +238,6 @@ static struct tracer sched_switch_trace __read_mostly =
 	.reset		= sched_switch_trace_reset,
 	.start		= sched_switch_trace_start,
 	.stop		= sched_switch_trace_stop,
-	.ctrl_update	= sched_switch_trace_ctrl_update,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_sched_switch,
 #endif
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 240577bc8ba..23e54d4e4d9 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -348,14 +348,6 @@ static void wakeup_tracer_reset(struct trace_array *tr)
 	}
 }
 
-static void wakeup_tracer_ctrl_update(struct trace_array *tr)
-{
-	if (tr->ctrl)
-		start_wakeup_tracer(tr);
-	else
-		stop_wakeup_tracer(tr);
-}
-
 static void wakeup_tracer_start(struct trace_array *tr)
 {
 	wakeup_reset(tr);
@@ -393,7 +385,6 @@ static struct tracer wakeup_tracer __read_mostly =
 	.stop		= wakeup_tracer_stop,
 	.open		= wakeup_tracer_open,
 	.close		= wakeup_tracer_close,
-	.ctrl_update	= wakeup_tracer_ctrl_update,
 	.print_max	= 1,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_wakeup,
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 90bc752a758..74693434047 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -134,13 +134,13 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 	msleep(100);
 
 	/* stop the tracing. */
-	tr->ctrl = 0;
-	trace->ctrl_update(tr);
+	tracing_stop();
 	ftrace_enabled = 0;
 
 	/* check the trace buffer */
 	ret = trace_test_buffer(tr, &count);
 	trace->reset(tr);
+	tracing_start();
 
 	/* we should only have one item */
 	if (!ret && count != 1) {
@@ -148,6 +148,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 		ret = -1;
 		goto out;
 	}
+
  out:
 	ftrace_enabled = save_ftrace_enabled;
 	tracer_enabled = save_tracer_enabled;
@@ -185,13 +186,13 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 	/* Sleep for a 1/10 of a second */
 	msleep(100);
 	/* stop the tracing. */
-	tr->ctrl = 0;
-	trace->ctrl_update(tr);
+	tracing_stop();
 	ftrace_enabled = 0;
 
 	/* check the trace buffer */
 	ret = trace_test_buffer(tr, &count);
 	trace->reset(tr);
+	tracing_start();
 
 	if (!ret && !count) {
 		printk(KERN_CONT ".. no entries found ..");
@@ -232,13 +233,13 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
 	udelay(100);
 	local_irq_enable();
 	/* stop the tracing. */
-	tr->ctrl = 0;
-	trace->ctrl_update(tr);
+	tracing_stop();
 	/* check both trace buffers */
 	ret = trace_test_buffer(tr, NULL);
 	if (!ret)
 		ret = trace_test_buffer(&max_tr, &count);
 	trace->reset(tr);
+	tracing_start();
 
 	if (!ret && !count) {
 		printk(KERN_CONT ".. no entries found ..");
@@ -269,13 +270,13 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
 	udelay(100);
 	preempt_enable();
 	/* stop the tracing. */
-	tr->ctrl = 0;
-	trace->ctrl_update(tr);
+	tracing_stop();
 	/* check both trace buffers */
 	ret = trace_test_buffer(tr, NULL);
 	if (!ret)
 		ret = trace_test_buffer(&max_tr, &count);
 	trace->reset(tr);
+	tracing_start();
 
 	if (!ret && !count) {
 		printk(KERN_CONT ".. no entries found ..");
@@ -312,27 +313,30 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	local_irq_enable();
 
 	/* stop the tracing. */
-	tr->ctrl = 0;
-	trace->ctrl_update(tr);
+	tracing_stop();
 	/* check both trace buffers */
 	ret = trace_test_buffer(tr, NULL);
-	if (ret)
+	if (ret) {
+		tracing_start();
 		goto out;
+	}
 
 	ret = trace_test_buffer(&max_tr, &count);
-	if (ret)
+	if (ret) {
+		tracing_start();
 		goto out;
+	}
 
 	if (!ret && !count) {
 		printk(KERN_CONT ".. no entries found ..");
 		ret = -1;
+		tracing_start();
 		goto out;
 	}
 
 	/* do the test by disabling interrupts first this time */
 	tracing_max_latency = 0;
-	tr->ctrl = 1;
-	trace->ctrl_update(tr);
+	tracing_start();
 	preempt_disable();
 	local_irq_disable();
 	udelay(100);
@@ -341,8 +345,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	local_irq_enable();
 
 	/* stop the tracing. */
-	tr->ctrl = 0;
-	trace->ctrl_update(tr);
+	tracing_stop();
 	/* check both trace buffers */
 	ret = trace_test_buffer(tr, NULL);
 	if (ret)
@@ -358,6 +361,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 
  out:
 	trace->reset(tr);
+	tracing_start();
 	tracing_max_latency = save_max;
 
 	return ret;
@@ -448,8 +452,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
 	msleep(100);
 
 	/* stop the tracing. */
-	tr->ctrl = 0;
-	trace->ctrl_update(tr);
+	tracing_stop();
 	/* check both trace buffers */
 	ret = trace_test_buffer(tr, NULL);
 	if (!ret)
@@ -457,6 +460,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
 
 
 	trace->reset(tr);
+	tracing_start();
 
 	tracing_max_latency = save_max;
 
@@ -485,11 +489,11 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
 	/* Sleep for a 1/10 of a second */
 	msleep(100);
 	/* stop the tracing. */
-	tr->ctrl = 0;
-	trace->ctrl_update(tr);
+	tracing_stop();
 	/* check the trace buffer */
 	ret = trace_test_buffer(tr, &count);
 	trace->reset(tr);
+	tracing_start();
 
 	if (!ret && !count) {
 		printk(KERN_CONT ".. no entries found ..");
@@ -513,11 +517,11 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
 	/* Sleep for a 1/10 of a second */
 	msleep(100);
 	/* stop the tracing. */
-	tr->ctrl = 0;
-	trace->ctrl_update(tr);
+	tracing_stop();
 	/* check the trace buffer */
 	ret = trace_test_buffer(tr, &count);
 	trace->reset(tr);
+	tracing_start();
 
 	return ret;
 }
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 9587d3bcba5..8097430edff 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -275,21 +275,11 @@ static void stack_trace_reset(struct trace_array *tr)
 		stop_stack_trace(tr);
 }
 
-static void stack_trace_ctrl_update(struct trace_array *tr)
-{
-	/* When starting a new trace, reset the buffers */
-	if (tr->ctrl)
-		start_stack_trace(tr);
-	else
-		stop_stack_trace(tr);
-}
-
 static struct tracer stack_trace __read_mostly =
 {
 	.name		= "sysprof",
 	.init		= stack_trace_init,
 	.reset		= stack_trace_reset,
-	.ctrl_update	= stack_trace_ctrl_update,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_sysprof,
 #endif
-- 
cgit v1.2.3


From c76f06945be50564f925799ddfb6235ee4c26aa0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 7 Nov 2008 22:36:02 -0500
Subject: ftrace: remove trace array ctrl

Impact: remove obsolete variable in trace_array structure

With the new start / stop method of ftrace, the ctrl variable
in the trace_array structure is now obsolete. Remove it.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c              | 20 +++++---------------
 kernel/trace/trace.h              |  1 -
 kernel/trace/trace_functions.c    |  6 ++----
 kernel/trace/trace_irqsoff.c      |  7 ++-----
 kernel/trace/trace_mmiotrace.c    | 11 +++++------
 kernel/trace/trace_nop.c          |  6 ++----
 kernel/trace/trace_sched_switch.c |  6 ++----
 kernel/trace/trace_sched_wakeup.c | 12 ++++--------
 kernel/trace/trace_selftest.c     |  8 --------
 kernel/trace/trace_sysprof.c      |  6 ++----
 10 files changed, 24 insertions(+), 59 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9e83188172a..58435415b36 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -537,7 +537,6 @@ int register_tracer(struct tracer *type)
 	if (type->selftest) {
 		struct tracer *saved_tracer = current_trace;
 		struct trace_array *tr = &global_trace;
-		int saved_ctrl = tr->ctrl;
 		int i;
 		/*
 		 * Run a selftest on this tracer.
@@ -550,13 +549,11 @@ int register_tracer(struct tracer *type)
 			tracing_reset(tr, i);
 		}
 		current_trace = type;
-		tr->ctrl = 0;
 		/* the test is responsible for initializing and enabling */
 		pr_info("Testing tracer %s: ", type->name);
 		ret = type->selftest(type, tr);
 		/* the test is responsible for resetting too */
 		current_trace = saved_tracer;
-		tr->ctrl = saved_ctrl;
 		if (ret) {
 			printk(KERN_CONT "FAILED!\n");
 			goto out;
@@ -966,7 +963,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
 	int cpu;
 	int pc;
 
-	if (tracing_disabled || !tr->ctrl)
+	if (tracing_disabled)
 		return;
 
 	pc = preempt_count();
@@ -2820,7 +2817,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 	unsigned long val;
 	char buf[64];
 	int ret;
-	struct trace_array *tr = filp->private_data;
 
 	if (cnt >= sizeof(buf))
 		return -EINVAL;
@@ -2840,12 +2836,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 
 	mutex_lock(&trace_types_lock);
 
-	if (tr->ctrl) {
-		cnt = -EBUSY;
-		pr_info("ftrace: please disable tracing"
-			" before modifying buffer size\n");
-		goto out;
-	}
+	tracing_stop();
 
 	if (val != global_trace.entries) {
 		ret = ring_buffer_resize(global_trace.buffer, val);
@@ -2878,6 +2869,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 	if (tracing_disabled)
 		cnt = -ENOMEM;
  out:
+	tracing_start();
 	max_tr.entries = global_trace.entries;
 	mutex_unlock(&trace_types_lock);
 
@@ -2900,9 +2892,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 {
 	char *buf;
 	char *end;
-	struct trace_array *tr = &global_trace;
 
-	if (!tr->ctrl || tracing_disabled)
+	if (tracing_disabled)
 		return -EINVAL;
 
 	if (cnt > TRACE_BUF_SIZE)
@@ -3131,7 +3122,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 	unsigned long flags, irq_flags;
 	int cpu, len = 0, size, pc;
 
-	if (!tr->ctrl || tracing_disabled)
+	if (tracing_disabled)
 		return 0;
 
 	pc = preempt_count();
@@ -3365,7 +3356,6 @@ __init static int tracer_alloc_buffers(void)
 #endif
 
 	/* All seems OK, enable tracing */
-	global_trace.ctrl = 1;
 	tracing_disabled = 0;
 
 	atomic_notifier_chain_register(&panic_notifier_list,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e481edaae1c..cfda9d219e6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -172,7 +172,6 @@ struct trace_iterator;
 struct trace_array {
 	struct ring_buffer	*buffer;
 	unsigned long		entries;
-	long			ctrl;
 	int			cpu;
 	cycle_t			time_start;
 	struct task_struct	*waiter;
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index e980b872bef..8693b7a0a5b 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -44,14 +44,12 @@ static void stop_function_trace(struct trace_array *tr)
 
 static void function_trace_init(struct trace_array *tr)
 {
-	if (tr->ctrl)
-		start_function_trace(tr);
+	start_function_trace(tr);
 }
 
 static void function_trace_reset(struct trace_array *tr)
 {
-	if (tr->ctrl)
-		stop_function_trace(tr);
+	stop_function_trace(tr);
 }
 
 static void function_trace_start(struct trace_array *tr)
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index ffdf592a54e..d919d4eaa7c 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -383,15 +383,12 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
 	irqsoff_trace = tr;
 	/* make sure that the tracer is visible */
 	smp_wmb();
-
-	if (tr->ctrl)
-		start_irqsoff_tracer(tr);
+	start_irqsoff_tracer(tr);
 }
 
 static void irqsoff_tracer_reset(struct trace_array *tr)
 {
-	if (tr->ctrl)
-		stop_irqsoff_tracer(tr);
+	stop_irqsoff_tracer(tr);
 }
 
 static void irqsoff_tracer_start(struct trace_array *tr)
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index fa9354e78b5..51bcf370215 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -34,17 +34,16 @@ static void mmio_trace_init(struct trace_array *tr)
 {
 	pr_debug("in %s\n", __func__);
 	mmio_trace_array = tr;
-	if (tr->ctrl) {
-		mmio_reset_data(tr);
-		enable_mmiotrace();
-	}
+
+	mmio_reset_data(tr);
+	enable_mmiotrace();
 }
 
 static void mmio_trace_reset(struct trace_array *tr)
 {
 	pr_debug("in %s\n", __func__);
-	if (tr->ctrl)
-		disable_mmiotrace();
+
+	disable_mmiotrace();
 	mmio_reset_data(tr);
 	mmio_trace_array = NULL;
 }
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index e3c5fbfcdd3..2ef1d227e7d 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -32,14 +32,12 @@ static void nop_trace_init(struct trace_array *tr)
 	for_each_online_cpu(cpu)
 		tracing_reset(tr, cpu);
 
-	if (tr->ctrl)
-		start_nop_trace(tr);
+	start_nop_trace(tr);
 }
 
 static void nop_trace_reset(struct trace_array *tr)
 {
-	if (tr->ctrl)
-		stop_nop_trace(tr);
+	stop_nop_trace(tr);
 }
 
 struct tracer nop_trace __read_mostly =
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 7b73fd11e4a..be35bdfe2e3 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -209,14 +209,12 @@ static void stop_sched_trace(struct trace_array *tr)
 static void sched_switch_trace_init(struct trace_array *tr)
 {
 	ctx_trace = tr;
-
-	if (tr->ctrl)
-		start_sched_trace(tr);
+	start_sched_trace(tr);
 }
 
 static void sched_switch_trace_reset(struct trace_array *tr)
 {
-	if (tr->ctrl && sched_ref)
+	if (sched_ref)
 		stop_sched_trace(tr);
 }
 
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 23e54d4e4d9..983f2b1478c 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -334,18 +334,14 @@ static void stop_wakeup_tracer(struct trace_array *tr)
 static void wakeup_tracer_init(struct trace_array *tr)
 {
 	wakeup_trace = tr;
-
-	if (tr->ctrl)
-		start_wakeup_tracer(tr);
+	start_wakeup_tracer(tr);
 }
 
 static void wakeup_tracer_reset(struct trace_array *tr)
 {
-	if (tr->ctrl) {
-		stop_wakeup_tracer(tr);
-		/* make sure we put back any tasks we are tracing */
-		wakeup_reset(tr);
-	}
+	stop_wakeup_tracer(tr);
+	/* make sure we put back any tasks we are tracing */
+	wakeup_reset(tr);
 }
 
 static void wakeup_tracer_start(struct trace_array *tr)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 74693434047..ea4e5d3b15d 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -110,7 +110,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 	ftrace_set_filter(func_name, strlen(func_name), 1);
 
 	/* enable tracing */
-	tr->ctrl = 1;
 	trace->init(tr);
 
 	/* Sleep for a 1/10 of a second */
@@ -181,7 +180,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 	ftrace_enabled = 1;
 	tracer_enabled = 1;
 
-	tr->ctrl = 1;
 	trace->init(tr);
 	/* Sleep for a 1/10 of a second */
 	msleep(100);
@@ -224,7 +222,6 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
 	int ret;
 
 	/* start the tracing */
-	tr->ctrl = 1;
 	trace->init(tr);
 	/* reset the max latency */
 	tracing_max_latency = 0;
@@ -261,7 +258,6 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
 	int ret;
 
 	/* start the tracing */
-	tr->ctrl = 1;
 	trace->init(tr);
 	/* reset the max latency */
 	tracing_max_latency = 0;
@@ -298,7 +294,6 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	int ret;
 
 	/* start the tracing */
-	tr->ctrl = 1;
 	trace->init(tr);
 
 	/* reset the max latency */
@@ -427,7 +422,6 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
 	wait_for_completion(&isrt);
 
 	/* start the tracing */
-	tr->ctrl = 1;
 	trace->init(tr);
 	/* reset the max latency */
 	tracing_max_latency = 0;
@@ -484,7 +478,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
 	int ret;
 
 	/* start the tracing */
-	tr->ctrl = 1;
 	trace->init(tr);
 	/* Sleep for a 1/10 of a second */
 	msleep(100);
@@ -512,7 +505,6 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
 	int ret;
 
 	/* start the tracing */
-	tr->ctrl = 1;
 	trace->init(tr);
 	/* Sleep for a 1/10 of a second */
 	msleep(100);
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 8097430edff..05f753422ae 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -265,14 +265,12 @@ static void stack_trace_init(struct trace_array *tr)
 {
 	sysprof_trace = tr;
 
-	if (tr->ctrl)
-		start_stack_trace(tr);
+	start_stack_trace(tr);
 }
 
 static void stack_trace_reset(struct trace_array *tr)
 {
-	if (tr->ctrl)
-		stop_stack_trace(tr);
+	stop_stack_trace(tr);
 }
 
 static struct tracer stack_trace __read_mostly =
-- 
cgit v1.2.3


From 769c48eb2530c5c1a393e2c82063f4f050571d24 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 7 Nov 2008 22:36:02 -0500
Subject: ftrace: force pass of preemptoff selftest

Impact: preemptoff not tested in selftest

Due to the BKL not being preemptable anymore, the selftest of the
preemptoff code can not be tested. It requires that it is called
with preemption enabled, but since the BKL is held, that is no
longer the case.

This patch simply skips those tests if it detects that the context
is not preemptable. The following will now show up in the tests:

Testing tracer preemptoff: can not test ... force PASSED
Testing tracer preemptirqsoff: can not test ... force PASSED

When the BKL is removed, or it becomes preemptable once again, then
the tests will be performed.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_selftest.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index ea4e5d3b15d..0728a105dcc 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -257,6 +257,19 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
 	unsigned long count;
 	int ret;
 
+	/*
+	 * Now that the big kernel lock is no longer preemptable,
+	 * and this is called with the BKL held, it will always
+	 * fail. If preemption is already disabled, simply
+	 * pass the test. When the BKL is removed, or becomes
+	 * preemptible again, we will once again test this,
+	 * so keep it in.
+	 */
+	if (preempt_count()) {
+		printk(KERN_CONT "can not test ... force ");
+		return 0;
+	}
+
 	/* start the tracing */
 	trace->init(tr);
 	/* reset the max latency */
@@ -293,6 +306,19 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	unsigned long count;
 	int ret;
 
+	/*
+	 * Now that the big kernel lock is no longer preemptable,
+	 * and this is called with the BKL held, it will always
+	 * fail. If preemption is already disabled, simply
+	 * pass the test. When the BKL is removed, or becomes
+	 * preemptible again, we will once again test this,
+	 * so keep it in.
+	 */
+	if (preempt_count()) {
+		printk(KERN_CONT "can not test ... force ");
+		return 0;
+	}
+
 	/* start the tracing */
 	trace->init(tr);
 
-- 
cgit v1.2.3


From a309720c876d7ad2e224bfd1982c92ae4364c82e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 7 Nov 2008 22:36:02 -0500
Subject: ftrace: display start of CPU buffer in trace output

Impact: change in trace output

Because the trace buffers are per cpu ring buffers, the start of
the trace can be confusing. If one CPU is very active at the
end of the trace, its history will not go as far back as the
other CPU traces.  This means that output for a particular CPU
may not appear for the first part of a trace.

To help annotate what is happening, and to prevent any more
confusion, this patch adds a line that annotates the start of
a CPU buffer output.

For example:

       automount-3495  [001]   184.596443: dnotify_parent <-vfs_write
[...]
       automount-3495  [001]   184.596449: dput <-path_put
       automount-3496  [002]   184.596450: down_read_trylock <-do_page_fault
[...]
           sshd-3497  [001]   184.597069: up_read <-do_page_fault
          <idle>-0     [000]   184.597074: __exit_idle <-exit_idle
[...]
       automount-3496  [002]   184.597257: filemap_fault <-__do_fault
          <idle>-0     [003]   184.597261: exit_idle <-smp_apic_timer_interrupt

Note, parsers of a trace output should always ignore any lines that
start with a '#'.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 19 +++++++++++++++++++
 kernel/trace/trace.h |  2 ++
 2 files changed, 21 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 58435415b36..f147f198b9a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1478,6 +1478,17 @@ void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
 		trace_seq_putc(s, '\n');
 }
 
+static void test_cpu_buff_start(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+
+	if (cpu_isset(iter->cpu, iter->started))
+		return;
+
+	cpu_set(iter->cpu, iter->started);
+	trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
+}
+
 static enum print_line_t
 print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 {
@@ -1497,6 +1508,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 	if (entry->type == TRACE_CONT)
 		return TRACE_TYPE_HANDLED;
 
+	test_cpu_buff_start(iter);
+
 	next_entry = find_next_entry(iter, NULL, &next_ts);
 	if (!next_entry)
 		next_ts = iter->ts;
@@ -1612,6 +1625,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 	if (entry->type == TRACE_CONT)
 		return TRACE_TYPE_HANDLED;
 
+	test_cpu_buff_start(iter);
+
 	comm = trace_find_cmdline(iter->ent->pid);
 
 	t = ns2usecs(iter->ts);
@@ -2631,6 +2646,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 		return -ENOMEM;
 
 	mutex_lock(&trace_types_lock);
+
+	/* trace pipe does not show start of buffer */
+	cpus_setall(iter->started);
+
 	iter->tr = &global_trace;
 	iter->trace = current_trace;
 	filp->private_data = iter;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cfda9d219e6..978145088fb 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -277,6 +277,8 @@ struct trace_iterator {
 	unsigned long		iter_flags;
 	loff_t			pos;
 	long			idx;
+
+	cpumask_t		started;
 };
 
 int tracing_is_enabled(void);
-- 
cgit v1.2.3


From ae1e9130bfb9ad55eb97ec3fb17a122b7a118f98 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 11 Nov 2008 09:05:16 +0100
Subject: sched: rename SCHED_NO_NO_OMIT_FRAME_POINTER =>
 SCHED_OMIT_FRAME_POINTER

Impact: cleanup, change .config option name

We had this ugly config name for a long time for hysteric raisons.
Rename it to a saner name.

We still cannot get rid of it completely, until /proc/<pid>/stack
usage replaces WCHAN usage for good.

We'll be able to do that in the v2.6.29/v2.6.30 timeframe.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index e1af0397214..46e67a39849 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -91,7 +91,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 
-ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
+ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
 # needed for x86 only.  Why this used to be enabled for all architectures is beyond
 # me.  I suspect most platforms don't need this, but until we know that for sure
-- 
cgit v1.2.3


From 5aa1ba6a6c710e747838a22d798ac97a8b248745 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 10 Nov 2008 23:07:30 -0500
Subject: ftrace: prevent ftrace_special from recursion

Impact: stop ftrace_special from recursion

The ftrace_special is used to help debug areas of the kernel.
Because of this, if it is put in certain locations, the fact that
it allows recursion can become a problem if the kernel developer
using does not realize that.

This patch changes ftrace_special to not allow recursion into itself
to make it more robust.

It also changes from preempt disable interrupts disable to prevent
any loss of trace entries.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0c22fe2d43a..216bbe7547a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -960,6 +960,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
 {
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
+	unsigned long flags;
 	int cpu;
 	int pc;
 
@@ -967,14 +968,15 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
 		return;
 
 	pc = preempt_count();
-	preempt_disable_notrace();
+	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 
-	if (likely(!atomic_read(&data->disabled)))
+	if (likely(atomic_inc_return(&data->disabled) == 1))
 		ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
 
-	preempt_enable_notrace();
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
 }
 
 #ifdef CONFIG_FUNCTION_TRACER
-- 
cgit v1.2.3


From f536aafc5a2e6f0c8f1577a155e6f93db5e469f0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 10 Nov 2008 23:07:30 -0500
Subject: ring-buffer: replace most bug ons with warn on and disable buffer

This patch replaces most of the BUG_ONs in the ring_buffer code with
RB_WARN_ON variants. It adds some more variants as needed for the
replacement. This lets the buffer die nicely and still warn the user.

One BUG_ON remains in the code, and that is because it detects a
bad pointer passed in by the calling function, and not a bug by
the ring buffer code itself.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 65 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 49 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ee9b93d318b..a6b8f9d7ac9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -188,6 +188,7 @@ struct ring_buffer_iter {
 	u64				read_stamp;
 };
 
+/* buffer may be either ring_buffer or ring_buffer_per_cpu */
 #define RB_WARN_ON(buffer, cond)				\
 	do {							\
 		if (unlikely(cond)) {				\
@@ -197,6 +198,15 @@ struct ring_buffer_iter {
 	} while (0)
 
 #define RB_WARN_ON_RET(buffer, cond)				\
+	do {							\
+		if (unlikely(cond)) {				\
+			atomic_inc(&buffer->record_disabled);	\
+			WARN_ON(1);				\
+			return;					\
+		}						\
+	} while (0)
+
+#define RB_WARN_ON_RET_INT(buffer, cond)			\
 	do {							\
 		if (unlikely(cond)) {				\
 			atomic_inc(&buffer->record_disabled);	\
@@ -205,6 +215,15 @@ struct ring_buffer_iter {
 		}						\
 	} while (0)
 
+#define RB_WARN_ON_RET_NULL(buffer, cond)			\
+	do {							\
+		if (unlikely(cond)) {				\
+			atomic_inc(&buffer->record_disabled);	\
+			WARN_ON(1);				\
+			return NULL;				\
+		}						\
+	} while (0)
+
 #define RB_WARN_ON_ONCE(buffer, cond)				\
 	do {							\
 		static int once;				\
@@ -215,6 +234,17 @@ struct ring_buffer_iter {
 		}						\
 	} while (0)
 
+/* buffer must be ring_buffer not per_cpu */
+#define RB_WARN_ON_UNLOCK(buffer, cond)				\
+	do {							\
+		if (unlikely(cond)) {				\
+			mutex_unlock(&buffer->mutex);		\
+			atomic_inc(&buffer->record_disabled);	\
+			WARN_ON(1);				\
+			return -1;				\
+		}						\
+	} while (0)
+
 /**
  * check_pages - integrity check of buffer pages
  * @cpu_buffer: CPU buffer with pages to test
@@ -227,13 +257,13 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
 	struct list_head *head = &cpu_buffer->pages;
 	struct buffer_page *page, *tmp;
 
-	RB_WARN_ON_RET(cpu_buffer, head->next->prev != head);
-	RB_WARN_ON_RET(cpu_buffer, head->prev->next != head);
+	RB_WARN_ON_RET_INT(cpu_buffer, head->next->prev != head);
+	RB_WARN_ON_RET_INT(cpu_buffer, head->prev->next != head);
 
 	list_for_each_entry_safe(page, tmp, head, list) {
-		RB_WARN_ON_RET(cpu_buffer,
+		RB_WARN_ON_RET_INT(cpu_buffer,
 			       page->list.next->prev != &page->list);
-		RB_WARN_ON_RET(cpu_buffer,
+		RB_WARN_ON_RET_INT(cpu_buffer,
 			       page->list.prev->next != &page->list);
 	}
 
@@ -440,13 +470,13 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
 	synchronize_sched();
 
 	for (i = 0; i < nr_pages; i++) {
-		BUG_ON(list_empty(&cpu_buffer->pages));
+		RB_WARN_ON_RET(cpu_buffer, list_empty(&cpu_buffer->pages));
 		p = cpu_buffer->pages.next;
 		page = list_entry(p, struct buffer_page, list);
 		list_del_init(&page->list);
 		free_buffer_page(page);
 	}
-	BUG_ON(list_empty(&cpu_buffer->pages));
+	RB_WARN_ON_RET(cpu_buffer, list_empty(&cpu_buffer->pages));
 
 	rb_reset_cpu(cpu_buffer);
 
@@ -468,7 +498,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
 	synchronize_sched();
 
 	for (i = 0; i < nr_pages; i++) {
-		BUG_ON(list_empty(pages));
+		RB_WARN_ON_RET(cpu_buffer, list_empty(pages));
 		p = pages->next;
 		page = list_entry(p, struct buffer_page, list);
 		list_del_init(&page->list);
@@ -523,7 +553,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 	if (size < buffer_size) {
 
 		/* easy case, just free pages */
-		BUG_ON(nr_pages >= buffer->pages);
+		RB_WARN_ON_UNLOCK(buffer, nr_pages >= buffer->pages);
 
 		rm_pages = buffer->pages - nr_pages;
 
@@ -542,7 +572,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 	 * add these pages to the cpu_buffers. Otherwise we just free
 	 * them all and return -ENOMEM;
 	 */
-	BUG_ON(nr_pages <= buffer->pages);
+	RB_WARN_ON_UNLOCK(buffer, nr_pages <= buffer->pages);
+
 	new_pages = nr_pages - buffer->pages;
 
 	for_each_buffer_cpu(buffer, cpu) {
@@ -565,7 +596,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 		rb_insert_pages(cpu_buffer, &pages, new_pages);
 	}
 
-	BUG_ON(!list_empty(&pages));
+	RB_WARN_ON_UNLOCK(buffer, !list_empty(&pages));
 
  out:
 	buffer->pages = nr_pages;
@@ -653,7 +684,7 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
 	     head += rb_event_length(event)) {
 
 		event = __rb_page_index(cpu_buffer->head_page, head);
-		BUG_ON(rb_null_event(event));
+		RB_WARN_ON_RET(cpu_buffer, rb_null_event(event));
 		/* Only count data entries */
 		if (event->type != RINGBUF_TYPE_DATA)
 			continue;
@@ -940,7 +971,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
 	/* We reserved something on the buffer */
 
-	BUG_ON(write > BUF_PAGE_SIZE);
+	RB_WARN_ON_RET_NULL(cpu_buffer, write > BUF_PAGE_SIZE);
 
 	event = __rb_page_index(tail_page, tail);
 	rb_update_event(event, type, length);
@@ -1621,7 +1652,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
 	reader = rb_get_reader_page(cpu_buffer);
 
 	/* This function should not be called when buffer is empty */
-	BUG_ON(!reader);
+	RB_WARN_ON_RET(cpu_buffer, !reader);
 
 	event = rb_reader_event(cpu_buffer);
 
@@ -1648,7 +1679,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
 	 * Check if we are at the end of the buffer.
 	 */
 	if (iter->head >= rb_page_size(iter->head_page)) {
-		BUG_ON(iter->head_page == cpu_buffer->commit_page);
+		RB_WARN_ON_RET(buffer,
+			       iter->head_page == cpu_buffer->commit_page);
 		rb_inc_iter(iter);
 		return;
 	}
@@ -1661,8 +1693,9 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
 	 * This should not be called to advance the header if we are
 	 * at the tail of the buffer.
 	 */
-	BUG_ON((iter->head_page == cpu_buffer->commit_page) &&
-	       (iter->head + length > rb_commit_index(cpu_buffer)));
+	RB_WARN_ON_RET(cpu_buffer,
+		       (iter->head_page == cpu_buffer->commit_page) &&
+		       (iter->head + length > rb_commit_index(cpu_buffer)));
 
 	rb_update_iter_read_stamp(iter, event);
 
-- 
cgit v1.2.3


From caf4b323b02a16c92fba449952ac6515ddc76d7a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 11 Nov 2008 07:03:45 +0100
Subject: tracing, x86: add low level support for ftrace return tracing

Impact: add infrastructure for function-return tracing

Add low level support for ftrace return tracing.

This plug-in stores return addresses on the thread_info structure of
the current task.

The index of the current return address is initialized when the task
is the first one (init) and when a process forks (the child). It is
not needed when a task does a sys_execve because after this syscall,
it still needs to return on the kernel functions it called.

Note that the code of return_to_handler has been suggested by Steven
Rostedt as almost all of the ideas of improvements in this V3.

For purpose of security, arch/x86/kernel/process_32.c is not traced
because __switch_to() changes the current task during its execution.
That could cause inconsistency in the stored return address of this
function even if I didn't have any crash after testing with tracing on
this function enabled.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 9a3ec66a9d8..af3be57acbb 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -23,6 +23,10 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_sched.o = -mno-spe -pg
 endif
+ifdef CONFIG_FUNCTION_RET_TRACER
+CFLAGS_REMOVE_extable.o = -pg # For __kernel_text_address()
+CFLAGS_REMOVE_module.o = -pg # For __module_text_address()
+endif
 
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
-- 
cgit v1.2.3


From 15e6cb3673ea6277999642802406a764b49391b0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 11 Nov 2008 07:14:25 +0100
Subject: tracing: add a tracer to catch execution time of kernel functions

Impact: add new tracing plugin which can trace full (entry+exit) function calls

This tracer uses the low level function return ftrace plugin to
measure the execution time of the kernel functions.

The first field is the caller of the function, the second is the
measured function, and the last one is the execution time in
nanoseconds.

- v3:

- HAVE_FUNCTION_RET_TRACER have been added. Each arch that support ftrace return
  should enable it.
- ftrace_return_stub becomes ftrace_stub.
- CONFIG_FUNCTION_RET_TRACER depends now on CONFIG_FUNCTION_TRACER
- Return traces printing can be used for other tracers on trace.c
- Adapt to the new tracing API (no more ctrl_update callback)
- Correct the check of "disabled" during insertion.
- Minor changes...

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig                  | 14 ++++++
 kernel/trace/Makefile                 |  1 +
 kernel/trace/ftrace.c                 | 16 +++++++
 kernel/trace/trace.c                  | 65 +++++++++++++++++++++++----
 kernel/trace/trace.h                  | 35 +++++++++++++++
 kernel/trace/trace_functions_return.c | 82 +++++++++++++++++++++++++++++++++++
 6 files changed, 205 insertions(+), 8 deletions(-)
 create mode 100644 kernel/trace/trace_functions_return.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index fc4febc3334..d986216c832 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -9,6 +9,9 @@ config NOP_TRACER
 config HAVE_FUNCTION_TRACER
 	bool
 
+config HAVE_FUNCTION_RET_TRACER
+	bool
+
 config HAVE_FUNCTION_TRACE_MCOUNT_TEST
 	bool
 	help
@@ -54,6 +57,17 @@ config FUNCTION_TRACER
 	  (the bootup default), then the overhead of the instructions is very
 	  small and not measurable even in micro-benchmarks.
 
+config FUNCTION_RET_TRACER
+	bool "Kernel Function return Tracer"
+	depends on !DYNAMIC_FTRACE
+	depends on HAVE_FUNCTION_RET_TRACER
+	depends on FUNCTION_TRACER
+	help
+	  Enable the kernel to trace a function at its return.
+	  It's first purpose is to trace the duration of functions.
+	  This is done by setting the current return address on the thread
+	  info structure of the current task.
+
 config IRQSOFF_TRACER
 	bool "Interrupts-off Latency Tracer"
 	default n
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c8228b1a49e..3e1f361bbc1 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -24,5 +24,6 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o
 obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
+obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4d2e751bfb1..f03fe74ecd6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1484,3 +1484,19 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 	return ret;
 }
 
+#ifdef CONFIG_FUNCTION_RET_TRACER
+trace_function_return_t ftrace_function_return =
+			(trace_function_return_t)ftrace_stub;
+void register_ftrace_return(trace_function_return_t func)
+{
+	ftrace_function_return = func;
+}
+
+void unregister_ftrace_return(void)
+{
+	ftrace_function_return = (trace_function_return_t)ftrace_stub;
+}
+#endif
+
+
+
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 216bbe7547a..a3f7ae9cd8e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -244,13 +244,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
 	return nsecs / 1000;
 }
 
-/*
- * TRACE_ITER_SYM_MASK masks the options in trace_flags that
- * control the output of kernel symbols.
- */
-#define TRACE_ITER_SYM_MASK \
-	(TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
-
 /* These must match the bit postions in trace_iterator_flags */
 static const char *trace_options[] = {
 	"print-parent",
@@ -810,6 +803,35 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
 }
 
+#ifdef CONFIG_FUNCTION_RET_TRACER
+static void __trace_function_return(struct trace_array *tr,
+				struct trace_array_cpu *data,
+				struct ftrace_retfunc *trace,
+				unsigned long flags,
+				int pc)
+{
+	struct ring_buffer_event *event;
+	struct ftrace_ret_entry *entry;
+	unsigned long irq_flags;
+
+	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+		return;
+
+	event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		return;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, flags, pc);
+	entry->ent.type			= TRACE_FN_RET;
+	entry->ip			= trace->func;
+	entry->parent_ip	= trace->ret;
+	entry->rettime		= trace->rettime;
+	entry->calltime		= trace->calltime;
+	ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+}
+#endif
+
 void
 ftrace(struct trace_array *tr, struct trace_array_cpu *data,
        unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -1038,6 +1060,29 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 	raw_local_irq_restore(flags);
 }
 
+#ifdef CONFIG_FUNCTION_RET_TRACER
+void trace_function_return(struct ftrace_retfunc *trace)
+{
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+	int pc;
+
+	raw_local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+	if (likely(disabled == 1)) {
+		pc = preempt_count();
+		__trace_function_return(tr, data, trace, flags, pc);
+	}
+	atomic_dec(&data->disabled);
+	raw_local_irq_restore(flags);
+}
+#endif /* CONFIG_FUNCTION_RET_TRACER */
+
 static struct ftrace_ops trace_ops __read_mostly =
 {
 	.func = function_trace_call,
@@ -1285,7 +1330,7 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
 # define IP_FMT "%016lx"
 #endif
 
-static int
+int
 seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 {
 	int ret;
@@ -1738,6 +1783,10 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 			trace_seq_print_cont(s, iter);
 		break;
 	}
+	case TRACE_FN_RET: {
+		return print_return_function(iter);
+		break;
+	}
 	}
 	return TRACE_TYPE_HANDLED;
 }
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 978145088fb..e40ce0c1469 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -22,6 +22,7 @@ enum trace_type {
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
 	TRACE_BOOT,
+	TRACE_FN_RET,
 
 	__TRACE_LAST_TYPE
 };
@@ -48,6 +49,15 @@ struct ftrace_entry {
 	unsigned long		ip;
 	unsigned long		parent_ip;
 };
+
+/* Function return entry */
+struct ftrace_ret_entry {
+	struct trace_entry	ent;
+	unsigned long		ip;
+	unsigned long		parent_ip;
+	unsigned long long	calltime;
+	unsigned long long	rettime;
+};
 extern struct tracer boot_tracer;
 
 /*
@@ -218,6 +228,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\
 			  TRACE_MMIO_MAP);				\
 		IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT);	\
+		IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET); \
 		__ftrace_bad_type();					\
 	} while (0)
 
@@ -321,6 +332,8 @@ void trace_function(struct trace_array *tr,
 		    unsigned long ip,
 		    unsigned long parent_ip,
 		    unsigned long flags, int pc);
+void
+trace_function_return(struct ftrace_retfunc *trace);
 
 void tracing_start_cmdline_record(void);
 void tracing_stop_cmdline_record(void);
@@ -393,6 +406,10 @@ extern void *head_page(struct trace_array_cpu *data);
 extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
 extern void trace_seq_print_cont(struct trace_seq *s,
 				 struct trace_iterator *iter);
+
+extern int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
+		unsigned long sym_flags);
 extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
 				 size_t cnt);
 extern long ns2usecs(cycle_t nsec);
@@ -400,6 +417,17 @@ extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args);
 
 extern unsigned long trace_flags;
 
+/* Standard output formatting function used for function return traces */
+#ifdef CONFIG_FUNCTION_RET_TRACER
+extern enum print_line_t print_return_function(struct trace_iterator *iter);
+#else
+static inline enum print_line_t
+print_return_function(struct trace_iterator *iter)
+{
+	return TRACE_TYPE_UNHANDLED;
+}
+#endif
+
 /*
  * trace_iterator_flags is an enumeration that defines bit
  * positions into trace_flags that controls the output.
@@ -422,6 +450,13 @@ enum trace_iterator_flags {
 	TRACE_ITER_PREEMPTONLY		= 0x800,
 };
 
+/*
+ * TRACE_ITER_SYM_MASK masks the options in trace_flags that
+ * control the output of kernel symbols.
+ */
+#define TRACE_ITER_SYM_MASK \
+	(TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
+
 extern struct tracer nop_trace;
 
 /**
diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c
new file mode 100644
index 00000000000..7680b21537d
--- /dev/null
+++ b/kernel/trace/trace_functions_return.c
@@ -0,0 +1,82 @@
+/*
+ *
+ * Function return tracer.
+ * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ * Mostly borrowed from function tracer which
+ * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+
+static void start_return_trace(struct trace_array *tr)
+{
+	register_ftrace_return(&trace_function_return);
+}
+
+static void stop_return_trace(struct trace_array *tr)
+{
+	unregister_ftrace_return();
+}
+
+static void return_trace_init(struct trace_array *tr)
+{
+	int cpu;
+	for_each_online_cpu(cpu)
+		tracing_reset(tr, cpu);
+
+	start_return_trace(tr);
+}
+
+static void return_trace_reset(struct trace_array *tr)
+{
+		stop_return_trace(tr);
+}
+
+
+enum print_line_t
+print_return_function(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry = iter->ent;
+	struct ftrace_ret_entry *field;
+	int ret;
+
+	if (entry->type == TRACE_FN_RET) {
+		trace_assign_type(field, entry);
+		ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+		ret = seq_print_ip_sym(s, field->ip,
+					trace_flags & TRACE_ITER_SYM_MASK);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+		ret = trace_seq_printf(s, " (%llu ns)\n",
+					field->rettime - field->calltime);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+		else
+			return TRACE_TYPE_HANDLED;
+	}
+	return TRACE_TYPE_UNHANDLED;
+}
+
+static struct tracer return_trace __read_mostly =
+{
+	.name	     = "return",
+	.init	     = return_trace_init,
+	.reset	     = return_trace_reset,
+	.print_line = print_return_function
+};
+
+static __init int init_return_trace(void)
+{
+	return register_tracer(&return_trace);
+}
+
+device_initcall(init_return_trace);
-- 
cgit v1.2.3


From ff9b48c3598732926fa09afd7f526981c32a48cc Mon Sep 17 00:00:00 2001
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
Date: Mon, 10 Nov 2008 21:34:09 +0530
Subject: sched: include group statistics in /proc/sched_debug

Impact: extend /proc/sched_debug info

Since the statistics of a group entity isn't exported directly from the
kernel, it becomes difficult to obtain some of the group statistics.
For example, the current method to obtain exec time of a group entity
is not always accurate. One has to read the exec times of all
the tasks(/proc/<pid>/sched) in the group and add them. This method
fails (or becomes difficult) if we want to collect stats of a group over
a duration where tasks get created and terminated.

This patch makes it easier to obtain group stats by directly including
them in /proc/sched_debug. Stats like group exec time would help user
programs (like LTP) to accurately measure the group fairness.

An example output of group stats from /proc/sched_debug:

cfs_rq[3]:/3/a/1
  .exec_clock                    : 89.598007
  .MIN_vruntime                  : 0.000001
  .min_vruntime                  : 256300.970506
  .max_vruntime                  : 0.000001
  .spread                        : 0.000000
  .spread0                       : -25373.372248
  .nr_running                    : 0
  .load                          : 0
  .yld_exp_empty                 : 0
  .yld_act_empty                 : 0
  .yld_both_empty                : 0
  .yld_count                     : 4474
  .sched_switch                  : 0
  .sched_count                   : 40507
  .sched_goidle                  : 12686
  .ttwu_count                    : 15114
  .ttwu_local                    : 11950
  .bkl_count                     : 67
  .nr_spread_over                : 0
  .shares                        : 0
  .se->exec_start                : 113676.727170
  .se->vruntime                  : 1592.612714
  .se->sum_exec_runtime          : 89.598007
  .se->wait_start                : 0.000000
  .se->sleep_start               : 0.000000
  .se->block_start               : 0.000000
  .se->sleep_max                 : 0.000000
  .se->block_max                 : 0.000000
  .se->exec_max                  : 1.000282
  .se->slice_max                 : 1.999750
  .se->wait_max                  : 54.981093
  .se->wait_sum                  : 217.610521
  .se->wait_count                : 50
  .se->load.weight               : 2

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Acked-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Acked-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_debug.c | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index d25cefe3f0e..3625a659869 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -53,6 +53,40 @@ static unsigned long nsec_low(unsigned long long nsec)
 
 #define SPLIT_NS(x) nsec_high(x), nsec_low(x)
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void print_cfs_group_stats(struct seq_file *m, int cpu,
+		struct task_group *tg)
+{
+	struct sched_entity *se = tg->se[cpu];
+	if (!se)
+		return;
+
+#define P(F) \
+	SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)
+#define PN(F) \
+	SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+
+	PN(se->exec_start);
+	PN(se->vruntime);
+	PN(se->sum_exec_runtime);
+#ifdef CONFIG_SCHEDSTATS
+	PN(se->wait_start);
+	PN(se->sleep_start);
+	PN(se->block_start);
+	PN(se->sleep_max);
+	PN(se->block_max);
+	PN(se->exec_max);
+	PN(se->slice_max);
+	PN(se->wait_max);
+	PN(se->wait_sum);
+	P(se->wait_count);
+#endif
+	P(se->load.weight);
+#undef PN
+#undef P
+}
+#endif
+
 static void
 print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 {
@@ -181,6 +215,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #ifdef CONFIG_SMP
 	SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->shares);
 #endif
+	print_cfs_group_stats(m, cpu, cfs_rq->tg);
 #endif
 }
 
@@ -261,7 +296,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
 	u64 now = ktime_to_ns(ktime_get());
 	int cpu;
 
-	SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n",
+	SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n",
 		init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
 		init_utsname()->version);
-- 
cgit v1.2.3


From 851f7ff56d9c21272f289dd85fb3f1b6cf7a6e10 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 11 Nov 2008 21:48:14 +1100
Subject: This patch will print cap_permitted and cap_inheritable data in the
 PATH records of any file that has file capabilities set.  Files which do not
 have fcaps set will not have different PATH records.

An example audit record if you run:
setcap "cap_net_admin+pie" /bin/bash
/bin/bash

type=SYSCALL msg=audit(1225741937.363:230): arch=c000003e syscall=59 success=yes exit=0 a0=2119230 a1=210da30 a2=20ee290 a3=8 items=2 ppid=2149 pid=2923 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=3 comm="ping" exe="/bin/ping" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null)
type=EXECVE msg=audit(1225741937.363:230): argc=2 a0="ping" a1="www.google.com"
type=CWD msg=audit(1225741937.363:230):  cwd="/root"
type=PATH msg=audit(1225741937.363:230): item=0 name="/bin/ping" inode=49256 dev=fd:00 mode=0104755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ping_exec_t:s0 cap_fp=0000000000002000 cap_fi=0000000000002000 cap_fe=1 cap_fver=2
type=PATH msg=audit(1225741937.363:230): item=1 name=(null) inode=507915 dev=fd:00 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ld_so_t:s0

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/auditsc.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 77 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index cf5bc2f5f9c..de7e9bcba9a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -65,6 +65,7 @@
 #include <linux/highmem.h>
 #include <linux/syscalls.h>
 #include <linux/inotify.h>
+#include <linux/capability.h>
 
 #include "audit.h"
 
@@ -84,6 +85,15 @@ int audit_n_rules;
 /* determines whether we collect data for signals sent */
 int audit_signals;
 
+struct audit_cap_data {
+	kernel_cap_t		permitted;
+	kernel_cap_t		inheritable;
+	union {
+		unsigned int	fE;		/* effective bit of a file capability */
+		kernel_cap_t	effective;	/* effective set of a process */
+	};
+};
+
 /* When fs/namei.c:getname() is called, we store the pointer in name and
  * we don't let putname() free it (instead we free all of the saved
  * pointers at syscall exit time).
@@ -100,6 +110,8 @@ struct audit_names {
 	gid_t		gid;
 	dev_t		rdev;
 	u32		osid;
+	struct audit_cap_data fcap;
+	unsigned int	fcap_ver;
 };
 
 struct audit_aux_data {
@@ -1171,6 +1183,35 @@ static void audit_log_execve_info(struct audit_context *context,
 	kfree(buf);
 }
 
+static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
+{
+	int i;
+
+	audit_log_format(ab, " %s=", prefix);
+	CAP_FOR_EACH_U32(i) {
+		audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
+	}
+}
+
+static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
+{
+	kernel_cap_t *perm = &name->fcap.permitted;
+	kernel_cap_t *inh = &name->fcap.inheritable;
+	int log = 0;
+
+	if (!cap_isclear(*perm)) {
+		audit_log_cap(ab, "cap_fp", perm);
+		log = 1;
+	}
+	if (!cap_isclear(*inh)) {
+		audit_log_cap(ab, "cap_fi", inh);
+		log = 1;
+	}
+
+	if (log)
+		audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver);
+}
+
 static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
 {
 	int i, call_panic = 0;
@@ -1421,6 +1462,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			}
 		}
 
+		audit_log_fcaps(ab, n);
+
 		audit_log_end(ab);
 	}
 
@@ -1787,8 +1830,36 @@ static int audit_inc_name_count(struct audit_context *context,
 	return 0;
 }
 
+
+static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
+{
+	struct cpu_vfs_cap_data caps;
+	int rc;
+
+	memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
+	memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
+	name->fcap.fE = 0;
+	name->fcap_ver = 0;
+
+	if (!dentry)
+		return 0;
+
+	rc = get_vfs_caps_from_disk(dentry, &caps);
+	if (rc)
+		return rc;
+
+	name->fcap.permitted = caps.permitted;
+	name->fcap.inheritable = caps.inheritable;
+	name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+	name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
+
+	return 0;
+}
+
+
 /* Copy inode data into an audit_names. */
-static void audit_copy_inode(struct audit_names *name, const struct inode *inode)
+static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
+			     const struct inode *inode)
 {
 	name->ino   = inode->i_ino;
 	name->dev   = inode->i_sb->s_dev;
@@ -1797,6 +1868,7 @@ static void audit_copy_inode(struct audit_names *name, const struct inode *inode
 	name->gid   = inode->i_gid;
 	name->rdev  = inode->i_rdev;
 	security_inode_getsecid(inode, &name->osid);
+	audit_copy_fcaps(name, dentry);
 }
 
 /**
@@ -1831,7 +1903,7 @@ void __audit_inode(const char *name, const struct dentry *dentry)
 		context->names[idx].name = NULL;
 	}
 	handle_path(dentry);
-	audit_copy_inode(&context->names[idx], inode);
+	audit_copy_inode(&context->names[idx], dentry, inode);
 }
 
 /**
@@ -1892,7 +1964,7 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
 		if (!strcmp(dname, n->name) ||
 		     !audit_compare_dname_path(dname, n->name, &dirlen)) {
 			if (inode)
-				audit_copy_inode(n, inode);
+				audit_copy_inode(n, NULL, inode);
 			else
 				n->ino = (unsigned long)-1;
 			found_child = n->name;
@@ -1906,7 +1978,7 @@ add_names:
 			return;
 		idx = context->name_count - 1;
 		context->names[idx].name = NULL;
-		audit_copy_inode(&context->names[idx], parent);
+		audit_copy_inode(&context->names[idx], NULL, parent);
 	}
 
 	if (!found_child) {
@@ -1927,7 +1999,7 @@ add_names:
 		}
 
 		if (inode)
-			audit_copy_inode(&context->names[idx], inode);
+			audit_copy_inode(&context->names[idx], NULL, inode);
 		else
 			context->names[idx].ino = (unsigned long)-1;
 	}
-- 
cgit v1.2.3


From 3fc689e96c0c90b6fede5946d6c31075e9464f69 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 11 Nov 2008 21:48:18 +1100
Subject: Any time fcaps or a setuid app under SECURE_NOROOT is used to result
 in a non-zero pE we will crate a new audit record which contains the entire
 set of known information about the executable in question, fP, fI, fE,
 fversion and includes the process's pE, pI, pP.  Before and after the bprm
 capability are applied.  This record type will only be emitted from execve
 syscalls.

an example of making ping use fcaps instead of setuid:

setcap "cat_net_raw+pe" /bin/ping

type=SYSCALL msg=audit(1225742021.015:236): arch=c000003e syscall=59 success=yes exit=0 a0=1457f30 a1=14606b0 a2=1463940 a3=321b770a70 items=2 ppid=2929 pid=2963 auid=0 uid=500 gid=500 euid=500 suid=500 fsuid=500 egid=500 sgid=500 fsgid=500 tty=pts0 ses=3 comm="ping" exe="/bin/ping" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null)
type=UNKNOWN[1321] msg=audit(1225742021.015:236): fver=2 fp=0000000000002000 fi=0000000000000000 fe=1 old_pp=0000000000000000 old_pi=0000000000000000 old_pe=0000000000000000 new_pp=0000000000002000 new_pi=0000000000000000 new_pe=0000000000002000
type=EXECVE msg=audit(1225742021.015:236): argc=2 a0="ping" a1="127.0.0.1"
type=CWD msg=audit(1225742021.015:236):  cwd="/home/test"
type=PATH msg=audit(1225742021.015:236): item=0 name="/bin/ping" inode=49256 dev=fd:00 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ping_exec_t:s0 cap_fp=0000000000002000 cap_fe=1 cap_fver=2
type=PATH msg=audit(1225742021.015:236): item=1 name=(null) inode=507915 dev=fd:00 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ld_so_t:s0

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/auditsc.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index de7e9bcba9a..3229cd4206f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -196,6 +196,14 @@ struct audit_aux_data_pids {
 	int			pid_count;
 };
 
+struct audit_aux_data_bprm_fcaps {
+	struct audit_aux_data	d;
+	struct audit_cap_data	fcap;
+	unsigned int		fcap_ver;
+	struct audit_cap_data	old_pcap;
+	struct audit_cap_data	new_pcap;
+};
+
 struct audit_tree_refs {
 	struct audit_tree_refs *next;
 	struct audit_chunk *c[31];
@@ -1375,6 +1383,20 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
 			break; }
 
+		case AUDIT_BPRM_FCAPS: {
+			struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
+			audit_log_format(ab, "fver=%x", axs->fcap_ver);
+			audit_log_cap(ab, "fp", &axs->fcap.permitted);
+			audit_log_cap(ab, "fi", &axs->fcap.inheritable);
+			audit_log_format(ab, " fe=%d", axs->fcap.fE);
+			audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted);
+			audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable);
+			audit_log_cap(ab, "old_pe", &axs->old_pcap.effective);
+			audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted);
+			audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable);
+			audit_log_cap(ab, "new_pe", &axs->new_pcap.effective);
+			break; }
+
 		}
 		audit_log_end(ab);
 	}
@@ -2501,6 +2523,52 @@ int __audit_signal_info(int sig, struct task_struct *t)
 	return 0;
 }
 
+/**
+ * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps
+ * @bprm pointer to the bprm being processed
+ * @caps the caps read from the disk
+ *
+ * Simply check if the proc already has the caps given by the file and if not
+ * store the priv escalation info for later auditing at the end of the syscall
+ *
+ * this can fail and we don't care.  See the note in audit.h for
+ * audit_log_bprm_fcaps() for my explaination....
+ *
+ * -Eric
+ */
+void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE)
+{
+	struct audit_aux_data_bprm_fcaps *ax;
+	struct audit_context *context = current->audit_context;
+	struct cpu_vfs_cap_data vcaps;
+	struct dentry *dentry;
+
+	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
+	if (!ax)
+		return;
+
+	ax->d.type = AUDIT_BPRM_FCAPS;
+	ax->d.next = context->aux;
+	context->aux = (void *)ax;
+
+	dentry = dget(bprm->file->f_dentry);
+	get_vfs_caps_from_disk(dentry, &vcaps);
+	dput(dentry);
+
+	ax->fcap.permitted = vcaps.permitted;
+	ax->fcap.inheritable = vcaps.inheritable;
+	ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+	ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
+
+	ax->old_pcap.permitted = *pP;
+	ax->old_pcap.inheritable = current->cap_inheritable;
+	ax->old_pcap.effective = *pE;
+
+	ax->new_pcap.permitted = current->cap_permitted;
+	ax->new_pcap.inheritable = current->cap_inheritable;
+	ax->new_pcap.effective = current->cap_effective;
+}
+
 /**
  * audit_core_dumps - record information about processes that end abnormally
  * @signr: signal value
-- 
cgit v1.2.3


From e68b75a027bb94066576139ee33676264f867b87 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 11 Nov 2008 21:48:22 +1100
Subject: When the capset syscall is used it is not possible for audit to
 record the actual capbilities being added/removed.  This patch adds a new
 record type which emits the target pid and the eff, inh, and perm cap sets.

example output if you audit capset syscalls would be:

type=SYSCALL msg=audit(1225743140.465:76): arch=c000003e syscall=126 success=yes exit=0 a0=17f2014 a1=17f201c a2=80000000 a3=7fff2ab7f060 items=0 ppid=2160 pid=2223 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=1 comm="setcap" exe="/usr/sbin/setcap" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null)
type=UNKNOWN[1322] msg=audit(1225743140.465:76): pid=0 cap_pi=ffffffffffffffff cap_pp=ffffffffffffffff cap_pe=ffffffffffffffff

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/auditsc.c    | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/capability.c |  5 +++++
 2 files changed, 53 insertions(+)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3229cd4206f..cef34235b36 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -204,6 +204,12 @@ struct audit_aux_data_bprm_fcaps {
 	struct audit_cap_data	new_pcap;
 };
 
+struct audit_aux_data_capset {
+	struct audit_aux_data	d;
+	pid_t			pid;
+	struct audit_cap_data	cap;
+};
+
 struct audit_tree_refs {
 	struct audit_tree_refs *next;
 	struct audit_chunk *c[31];
@@ -1397,6 +1403,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			audit_log_cap(ab, "new_pe", &axs->new_pcap.effective);
 			break; }
 
+		case AUDIT_CAPSET: {
+			struct audit_aux_data_capset *axs = (void *)aux;
+			audit_log_format(ab, "pid=%d", axs->pid);
+			audit_log_cap(ab, "cap_pi", &axs->cap.inheritable);
+			audit_log_cap(ab, "cap_pp", &axs->cap.permitted);
+			audit_log_cap(ab, "cap_pe", &axs->cap.effective);
+			break; }
+
 		}
 		audit_log_end(ab);
 	}
@@ -2569,6 +2583,40 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_
 	ax->new_pcap.effective = current->cap_effective;
 }
 
+/**
+ * __audit_log_capset - store information about the arguments to the capset syscall
+ * @pid target pid of the capset call
+ * @eff effective cap set
+ * @inh inheritible cap set
+ * @perm permited cap set
+ *
+ * Record the aguments userspace sent to sys_capset for later printing by the
+ * audit system if applicable
+ */
+int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm)
+{
+	struct audit_aux_data_capset *ax;
+	struct audit_context *context = current->audit_context;
+
+	if (likely(!audit_enabled || !context || context->dummy))
+		return 0;
+
+	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
+	if (!ax)
+		return -ENOMEM;
+
+	ax->d.type = AUDIT_CAPSET;
+	ax->d.next = context->aux;
+	context->aux = (void *)ax;
+
+	ax->pid = pid;
+	ax->cap.effective = *eff;
+	ax->cap.inheritable = *eff;
+	ax->cap.permitted = *perm;
+
+	return 0;
+}
+
 /**
  * audit_core_dumps - record information about processes that end abnormally
  * @signr: signal value
diff --git a/kernel/capability.c b/kernel/capability.c
index e13a68535ad..19f9eda8997 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -7,6 +7,7 @@
  * 30 May 2002:	Cleanup, Robert M. Love <rml@tech9.net>
  */
 
+#include <linux/audit.h>
 #include <linux/capability.h>
 #include <linux/mm.h>
 #include <linux/module.h>
@@ -468,6 +469,10 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 		i++;
 	}
 
+	ret = audit_log_capset(pid, &effective, &inheritable, &permitted);
+	if (ret)
+		return ret;
+
 	if (pid && (pid != task_pid_vnr(current)))
 		ret = do_sys_capset_other_tasks(pid, &effective, &inheritable,
 						&permitted);
-- 
cgit v1.2.3


From 637d32dc720897616e8a1a4f9e9609e29d431800 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 29 Oct 2008 15:42:12 +1100
Subject: Capabilities: BUG when an invalid capability is requested

If an invalid (large) capability is requested the capabilities system
may panic as it is dereferencing an array of fixed (short) length.  Its
possible (and actually often happens) that the capability system
accidentally stumbled into a valid memory region but it also regularly
happens that it hits invalid memory and BUGs.  If such an operation does
get past cap_capable then the selinux system is sure to have problems as
it already does a (simple) validity check and BUG.  This is known to
happen by the broken and buggy firegl driver.

This patch cleanly checks all capable calls and BUG if a call is for an
invalid capability.  This will likely break the firegl driver for some
situations, but it is the right thing to do.  Garbage into a security
system gets you killed/bugged

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Andrew G. Morgan <morgan@kernel.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/capability.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index 19f9eda8997..adb262f83de 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -514,6 +514,11 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
  */
 int capable(int cap)
 {
+	if (unlikely(!cap_valid(cap))) {
+		printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
+		BUG();
+	}
+
 	if (has_capability(current, cap)) {
 		current->flags |= PF_SUPERPRIV;
 		return 1;
-- 
cgit v1.2.3


From 934352f214b3251eb0793c1209d346595a661d80 Mon Sep 17 00:00:00 2001
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
Date: Mon, 10 Nov 2008 20:41:13 +0530
Subject: sched: add hierarchical accounting to cpu accounting controller

Impact: improve CPU time accounting of tasks under the cpu accounting controller

Add hierarchical accounting to cpu accounting controller and include
cpuacct documentation.

Currently, while charging the task's cputime to its accounting group,
the accounting group hierarchy isn't updated. This patch charges the cputime
of a task to its accounting group and all its parent accounting groups.

Reported-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Reviewed-by: Paul Menage <menage@google.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 59db86c915f..ebaf432365f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9196,11 +9196,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
  * (balbir@in.ibm.com).
  */
 
-/* track cpu usage of a group of tasks */
+/* track cpu usage of a group of tasks and its child groups */
 struct cpuacct {
 	struct cgroup_subsys_state css;
 	/* cpuusage holds pointer to a u64-type object on every cpu */
 	u64 *cpuusage;
+	struct cpuacct *parent;
 };
 
 struct cgroup_subsys cpuacct_subsys;
@@ -9234,6 +9235,9 @@ static struct cgroup_subsys_state *cpuacct_create(
 		return ERR_PTR(-ENOMEM);
 	}
 
+	if (cgrp->parent)
+		ca->parent = cgroup_ca(cgrp->parent);
+
 	return &ca->css;
 }
 
@@ -9313,14 +9317,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
 	struct cpuacct *ca;
+	int cpu;
 
 	if (!cpuacct_subsys.active)
 		return;
 
+	cpu = task_cpu(tsk);
 	ca = task_ca(tsk);
-	if (ca) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
 
+	for (; ca; ca = ca->parent) {
+		u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 		*cpuusage += cputime;
 	}
 }
-- 
cgit v1.2.3


From f83c9d0fe42a7544b4d4ffcebb2e6716fcfd95c0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 11 Nov 2008 18:47:44 +0100
Subject: ring-buffer: add reader lock

Impact: serialize reader accesses to individual CPU ring buffers

The code in the ring buffer expects only one reader at a time, but currently
it puts that requirement on the caller. This is not strong enough, and this
patch adds a "reader_lock" that serializes the access to the reader API
of the ring buffer.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 109 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 78 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a6b8f9d7ac9..17c2ccebb56 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -154,6 +154,7 @@ static inline int test_time_stamp(u64 delta)
 struct ring_buffer_per_cpu {
 	int				cpu;
 	struct ring_buffer		*buffer;
+	spinlock_t			reader_lock; /* serialize readers */
 	raw_spinlock_t			lock;
 	struct lock_class_key		lock_key;
 	struct list_head		pages;
@@ -321,6 +322,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 
 	cpu_buffer->cpu = cpu;
 	cpu_buffer->buffer = buffer;
+	spin_lock_init(&cpu_buffer->reader_lock);
 	cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 	INIT_LIST_HEAD(&cpu_buffer->pages);
 
@@ -1476,6 +1478,9 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
 void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
 	/* Iterator usage is expected to have record disabled */
 	if (list_empty(&cpu_buffer->reader_page->list)) {
@@ -1489,6 +1494,8 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
 		iter->read_stamp = cpu_buffer->read_stamp;
 	else
 		iter->read_stamp = iter->head_page->time_stamp;
+
+	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 }
 
 /**
@@ -1707,17 +1714,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
 		rb_advance_iter(iter);
 }
 
-/**
- * ring_buffer_peek - peek at the next event to be read
- * @buffer: The ring buffer to read
- * @cpu: The cpu to peak at
- * @ts: The timestamp counter of this event.
- *
- * This will return the event that will be read next, but does
- * not consume the data.
- */
-struct ring_buffer_event *
-ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+static struct ring_buffer_event *
+rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_event *event;
@@ -1779,16 +1777,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	return NULL;
 }
 
-/**
- * ring_buffer_iter_peek - peek at the next event to be read
- * @iter: The ring buffer iterator
- * @ts: The timestamp counter of this event.
- *
- * This will return the event that will be read next, but does
- * not increment the iterator.
- */
-struct ring_buffer_event *
-ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+static struct ring_buffer_event *
+rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 {
 	struct ring_buffer *buffer;
 	struct ring_buffer_per_cpu *cpu_buffer;
@@ -1849,6 +1839,51 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 	return NULL;
 }
 
+/**
+ * ring_buffer_peek - peek at the next event to be read
+ * @buffer: The ring buffer to read
+ * @cpu: The cpu to peak at
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not consume the data.
+ */
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+	struct ring_buffer_event *event;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	event = rb_buffer_peek(buffer, cpu, ts);
+	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	return event;
+}
+
+/**
+ * ring_buffer_iter_peek - peek at the next event to be read
+ * @iter: The ring buffer iterator
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+	struct ring_buffer_event *event;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	event = rb_iter_peek(iter, ts);
+	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	return event;
+}
+
 /**
  * ring_buffer_consume - return an event and consume it
  * @buffer: The ring buffer to get the next event from
@@ -1860,19 +1895,24 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 struct ring_buffer_event *
 ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
 {
-	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 	struct ring_buffer_event *event;
+	unsigned long flags;
 
 	if (!cpu_isset(cpu, buffer->cpumask))
 		return NULL;
 
-	event = ring_buffer_peek(buffer, cpu, ts);
+	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+	event = rb_buffer_peek(buffer, cpu, ts);
 	if (!event)
-		return NULL;
+		goto out;
 
-	cpu_buffer = buffer->buffers[cpu];
 	rb_advance_reader(cpu_buffer);
 
+ out:
+	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
 	return event;
 }
 
@@ -1909,11 +1949,11 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
 	atomic_inc(&cpu_buffer->record_disabled);
 	synchronize_sched();
 
-	local_irq_save(flags);
+	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 	__raw_spin_lock(&cpu_buffer->lock);
 	ring_buffer_iter_reset(iter);
 	__raw_spin_unlock(&cpu_buffer->lock);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
 	return iter;
 }
@@ -1945,12 +1985,17 @@ struct ring_buffer_event *
 ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
 {
 	struct ring_buffer_event *event;
+	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+	unsigned long flags;
 
-	event = ring_buffer_iter_peek(iter, ts);
+	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	event = rb_iter_peek(iter, ts);
 	if (!event)
-		return NULL;
+		goto out;
 
 	rb_advance_iter(iter);
+ out:
+	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
 	return event;
 }
@@ -1999,13 +2044,15 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 	if (!cpu_isset(cpu, buffer->cpumask))
 		return;
 
-	local_irq_save(flags);
+	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
 	__raw_spin_lock(&cpu_buffer->lock);
 
 	rb_reset_cpu(cpu_buffer);
 
 	__raw_spin_unlock(&cpu_buffer->lock);
-	local_irq_restore(flags);
+
+	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 }
 
 /**
-- 
cgit v1.2.3


From 3e89c7bb92fc92bb964734341487798c8d497bae Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 11 Nov 2008 15:28:41 -0500
Subject: ring-buffer: clean up warn ons

Impact: Restructure WARN_ONs in ring_buffer.c

The current WARN_ON macros in ring_buffer.c are quite ugly.

This patch cleans them up and uses a single RB_WARN_ON that returns
the value of the condition. This allows the caller to abort the
function if the condition is true.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 143 ++++++++++++++++++---------------------------
 1 file changed, 57 insertions(+), 86 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 17c2ccebb56..8c5cacb25a1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -191,60 +191,14 @@ struct ring_buffer_iter {
 
 /* buffer may be either ring_buffer or ring_buffer_per_cpu */
 #define RB_WARN_ON(buffer, cond)				\
-	do {							\
-		if (unlikely(cond)) {				\
+	({							\
+		int _____ret = unlikely(cond);			\
+		if (_____ret) {					\
 			atomic_inc(&buffer->record_disabled);	\
 			WARN_ON(1);				\
 		}						\
-	} while (0)
-
-#define RB_WARN_ON_RET(buffer, cond)				\
-	do {							\
-		if (unlikely(cond)) {				\
-			atomic_inc(&buffer->record_disabled);	\
-			WARN_ON(1);				\
-			return;					\
-		}						\
-	} while (0)
-
-#define RB_WARN_ON_RET_INT(buffer, cond)			\
-	do {							\
-		if (unlikely(cond)) {				\
-			atomic_inc(&buffer->record_disabled);	\
-			WARN_ON(1);				\
-			return -1;				\
-		}						\
-	} while (0)
-
-#define RB_WARN_ON_RET_NULL(buffer, cond)			\
-	do {							\
-		if (unlikely(cond)) {				\
-			atomic_inc(&buffer->record_disabled);	\
-			WARN_ON(1);				\
-			return NULL;				\
-		}						\
-	} while (0)
-
-#define RB_WARN_ON_ONCE(buffer, cond)				\
-	do {							\
-		static int once;				\
-		if (unlikely(cond) && !once) {			\
-			once++;					\
-			atomic_inc(&buffer->record_disabled);	\
-			WARN_ON(1);				\
-		}						\
-	} while (0)
-
-/* buffer must be ring_buffer not per_cpu */
-#define RB_WARN_ON_UNLOCK(buffer, cond)				\
-	do {							\
-		if (unlikely(cond)) {				\
-			mutex_unlock(&buffer->mutex);		\
-			atomic_inc(&buffer->record_disabled);	\
-			WARN_ON(1);				\
-			return -1;				\
-		}						\
-	} while (0)
+		_____ret;					\
+	})
 
 /**
  * check_pages - integrity check of buffer pages
@@ -258,14 +212,18 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
 	struct list_head *head = &cpu_buffer->pages;
 	struct buffer_page *page, *tmp;
 
-	RB_WARN_ON_RET_INT(cpu_buffer, head->next->prev != head);
-	RB_WARN_ON_RET_INT(cpu_buffer, head->prev->next != head);
+	if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
+		return -1;
+	if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
+		return -1;
 
 	list_for_each_entry_safe(page, tmp, head, list) {
-		RB_WARN_ON_RET_INT(cpu_buffer,
-			       page->list.next->prev != &page->list);
-		RB_WARN_ON_RET_INT(cpu_buffer,
-			       page->list.prev->next != &page->list);
+		if (RB_WARN_ON(cpu_buffer,
+			       page->list.next->prev != &page->list))
+			return -1;
+		if (RB_WARN_ON(cpu_buffer,
+			       page->list.prev->next != &page->list))
+			return -1;
 	}
 
 	return 0;
@@ -472,13 +430,15 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
 	synchronize_sched();
 
 	for (i = 0; i < nr_pages; i++) {
-		RB_WARN_ON_RET(cpu_buffer, list_empty(&cpu_buffer->pages));
+		if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
+			return;
 		p = cpu_buffer->pages.next;
 		page = list_entry(p, struct buffer_page, list);
 		list_del_init(&page->list);
 		free_buffer_page(page);
 	}
-	RB_WARN_ON_RET(cpu_buffer, list_empty(&cpu_buffer->pages));
+	if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
+		return;
 
 	rb_reset_cpu(cpu_buffer);
 
@@ -500,7 +460,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
 	synchronize_sched();
 
 	for (i = 0; i < nr_pages; i++) {
-		RB_WARN_ON_RET(cpu_buffer, list_empty(pages));
+		if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
+			return;
 		p = pages->next;
 		page = list_entry(p, struct buffer_page, list);
 		list_del_init(&page->list);
@@ -555,7 +516,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 	if (size < buffer_size) {
 
 		/* easy case, just free pages */
-		RB_WARN_ON_UNLOCK(buffer, nr_pages >= buffer->pages);
+		if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) {
+			mutex_unlock(&buffer->mutex);
+			return -1;
+		}
 
 		rm_pages = buffer->pages - nr_pages;
 
@@ -574,7 +538,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 	 * add these pages to the cpu_buffers. Otherwise we just free
 	 * them all and return -ENOMEM;
 	 */
-	RB_WARN_ON_UNLOCK(buffer, nr_pages <= buffer->pages);
+	if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) {
+		mutex_unlock(&buffer->mutex);
+		return -1;
+	}
 
 	new_pages = nr_pages - buffer->pages;
 
@@ -598,7 +565,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 		rb_insert_pages(cpu_buffer, &pages, new_pages);
 	}
 
-	RB_WARN_ON_UNLOCK(buffer, !list_empty(&pages));
+	if (RB_WARN_ON(buffer, !list_empty(&pages))) {
+		mutex_unlock(&buffer->mutex);
+		return -1;
+	}
 
  out:
 	buffer->pages = nr_pages;
@@ -686,7 +656,8 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
 	     head += rb_event_length(event)) {
 
 		event = __rb_page_index(cpu_buffer->head_page, head);
-		RB_WARN_ON_RET(cpu_buffer, rb_null_event(event));
+		if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
+			return;
 		/* Only count data entries */
 		if (event->type != RINGBUF_TYPE_DATA)
 			continue;
@@ -739,8 +710,9 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
 	addr &= PAGE_MASK;
 
 	while (cpu_buffer->commit_page->page != (void *)addr) {
-		RB_WARN_ON(cpu_buffer,
-			   cpu_buffer->commit_page == cpu_buffer->tail_page);
+		if (RB_WARN_ON(cpu_buffer,
+			  cpu_buffer->commit_page == cpu_buffer->tail_page))
+			return;
 		cpu_buffer->commit_page->commit =
 			cpu_buffer->commit_page->write;
 		rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
@@ -896,7 +868,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		reader_page = cpu_buffer->reader_page;
 
 		/* we grabbed the lock before incrementing */
-		RB_WARN_ON(cpu_buffer, next_page == reader_page);
+		if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
+			goto out_unlock;
 
 		/*
 		 * If for some reason, we had an interrupt storm that made
@@ -973,7 +946,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
 	/* We reserved something on the buffer */
 
-	RB_WARN_ON_RET_NULL(cpu_buffer, write > BUF_PAGE_SIZE);
+	if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
+		return NULL;
 
 	event = __rb_page_index(tail_page, tail);
 	rb_update_event(event, type, length);
@@ -1072,10 +1046,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	 * storm or we have something buggy.
 	 * Bail!
 	 */
-	if (unlikely(++nr_loops > 1000)) {
-		RB_WARN_ON(cpu_buffer, 1);
+	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
 		return NULL;
-	}
 
 	ts = ring_buffer_time_stamp(cpu_buffer->cpu);
 
@@ -1591,8 +1563,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	 * a case where we will loop three times. There should be no
 	 * reason to loop four times (that I know of).
 	 */
-	if (unlikely(++nr_loops > 3)) {
-		RB_WARN_ON(cpu_buffer, 1);
+	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) {
 		reader = NULL;
 		goto out;
 	}
@@ -1604,8 +1575,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 		goto out;
 
 	/* Never should we have an index greater than the size */
-	RB_WARN_ON(cpu_buffer,
-		   cpu_buffer->reader_page->read > rb_page_size(reader));
+	if (RB_WARN_ON(cpu_buffer,
+		       cpu_buffer->reader_page->read > rb_page_size(reader)))
+		goto out;
 
 	/* check if we caught up to the tail */
 	reader = NULL;
@@ -1659,7 +1631,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
 	reader = rb_get_reader_page(cpu_buffer);
 
 	/* This function should not be called when buffer is empty */
-	RB_WARN_ON_RET(cpu_buffer, !reader);
+	if (RB_WARN_ON(cpu_buffer, !reader))
+		return;
 
 	event = rb_reader_event(cpu_buffer);
 
@@ -1686,8 +1659,9 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
 	 * Check if we are at the end of the buffer.
 	 */
 	if (iter->head >= rb_page_size(iter->head_page)) {
-		RB_WARN_ON_RET(buffer,
-			       iter->head_page == cpu_buffer->commit_page);
+		if (RB_WARN_ON(buffer,
+			       iter->head_page == cpu_buffer->commit_page))
+			return;
 		rb_inc_iter(iter);
 		return;
 	}
@@ -1700,9 +1674,10 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
 	 * This should not be called to advance the header if we are
 	 * at the tail of the buffer.
 	 */
-	RB_WARN_ON_RET(cpu_buffer,
+	if (RB_WARN_ON(cpu_buffer,
 		       (iter->head_page == cpu_buffer->commit_page) &&
-		       (iter->head + length > rb_commit_index(cpu_buffer)));
+		       (iter->head + length > rb_commit_index(cpu_buffer))))
+		return;
 
 	rb_update_iter_read_stamp(iter, event);
 
@@ -1736,10 +1711,8 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	 * can have.  Nesting 10 deep of interrupts is clearly
 	 * an anomaly.
 	 */
-	if (unlikely(++nr_loops > 10)) {
-		RB_WARN_ON(cpu_buffer, 1);
+	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
 		return NULL;
-	}
 
 	reader = rb_get_reader_page(cpu_buffer);
 	if (!reader)
@@ -1800,10 +1773,8 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 	 * can have. Nesting 10 deep of interrupts is clearly
 	 * an anomaly.
 	 */
-	if (unlikely(++nr_loops > 10)) {
-		RB_WARN_ON(cpu_buffer, 1);
+	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
 		return NULL;
-	}
 
 	if (rb_per_cpu_empty(cpu_buffer))
 		return NULL;
-- 
cgit v1.2.3


From 3f5ec13696fd4a33bde42f385406cbb1d3cc96fd Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 11 Nov 2008 23:21:31 +0100
Subject: tracing/fastboot: move boot tracer structs and funcs into their own
 header.

Impact: Cleanups on the boot tracer and ftrace

This patch bring some cleanups about the boot tracer headers. The
functions and structures of this tracer have nothing related to ftrace
and should have so their own header file.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e40ce0c1469..f69a5199596 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -8,6 +8,7 @@
 #include <linux/ring_buffer.h>
 #include <linux/mmiotrace.h>
 #include <linux/ftrace.h>
+#include <trace/boot.h>
 
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
-- 
cgit v1.2.3


From 74239072830ef3f1398edeb1bc1076fc330fd4a2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 11 Nov 2008 23:24:42 +0100
Subject: tracing/fastboot: Use the ring-buffer timestamp for initcall entries

Impact: Split the boot tracer entries in two parts: call and return

Now that we are using the sched tracer from the boot tracer, we want
to use the same timestamp than the ring-buffer to have consistent time
captures between sched events and initcall events.

So we get rid of the old time capture by the boot tracer and split the
initcall events in two parts: call and return. This way we have the
ring buffer timestamp of both.

An example trace:

[   27.904149584] calling  net_ns_init+0x0/0x1c0 @ 1
[   27.904429624] initcall net_ns_init+0x0/0x1c0 returned 0 after 0 msecs
[   27.904575926] calling  reboot_init+0x0/0x20 @ 1
[   27.904655399] initcall reboot_init+0x0/0x20 returned 0 after 0 msecs
[   27.904800228] calling  sysctl_init+0x0/0x30 @ 1
[   27.905142914] initcall sysctl_init+0x0/0x30 returned 0 after 0 msecs
[   27.905287211] calling  ksysfs_init+0x0/0xb0 @ 1
 ##### CPU 0 buffer started ####
            init-1     [000]    27.905395:      1:120:R   + [001]    11:115:S
 ##### CPU 1 buffer started ####
          <idle>-0     [001]    27.905425:      0:140:R ==> [001]    11:115:R
            init-1     [000]    27.905426:      1:120:D ==> [000]     0:140:R
          <idle>-0     [000]    27.905431:      0:140:R   + [000]     4:115:S
          <idle>-0     [000]    27.905451:      0:140:R ==> [000]     4:115:R
     ksoftirqd/0-4     [000]    27.905456:      4:115:S ==> [000]     0:140:R
           udevd-11    [001]    27.905458:     11:115:R   + [001]    14:115:R
          <idle>-0     [000]    27.905459:      0:140:R   + [000]     4:115:S
          <idle>-0     [000]    27.905462:      0:140:R ==> [000]     4:115:R
           udevd-11    [001]    27.905462:     11:115:R ==> [001]    14:115:R
     ksoftirqd/0-4     [000]    27.905467:      4:115:S ==> [000]     0:140:R
          <idle>-0     [000]    27.905470:      0:140:R   + [000]     4:115:S
          <idle>-0     [000]    27.905473:      0:140:R ==> [000]     4:115:R
     ksoftirqd/0-4     [000]    27.905476:      4:115:S ==> [000]     0:140:R
          <idle>-0     [000]    27.905479:      0:140:R   + [000]     4:115:S
          <idle>-0     [000]    27.905482:      0:140:R ==> [000]     4:115:R
     ksoftirqd/0-4     [000]    27.905486:      4:115:S ==> [000]     0:140:R
           udevd-14    [001]    27.905499:     14:120:X ==> [001]    11:115:R
           udevd-11    [001]    27.905506:     11:115:R   + [000]     1:120:D
          <idle>-0     [000]    27.905515:      0:140:R ==> [000]     1:120:R
           udevd-11    [001]    27.905517:     11:115:S ==> [001]     0:140:R
[   27.905557107] initcall ksysfs_init+0x0/0xb0 returned 0 after 3906 msecs
[   27.905705736] calling  init_jiffies_clocksource+0x0/0x10 @ 1
[   27.905779239] initcall init_jiffies_clocksource+0x0/0x10 returned 0 after 0 msecs
[   27.906769814] calling  pm_init+0x0/0x30 @ 1
[   27.906853627] initcall pm_init+0x0/0x30 returned 0 after 0 msecs
[   27.906997803] calling  pm_disk_init+0x0/0x20 @ 1
[   27.907076946] initcall pm_disk_init+0x0/0x20 returned 0 after 0 msecs
[   27.907222556] calling  swsusp_header_init+0x0/0x30 @ 1
[   27.907294325] initcall swsusp_header_init+0x0/0x30 returned 0 after 0 msecs
[   27.907439620] calling  stop_machine_init+0x0/0x50 @ 1
            init-1     [000]    27.907485:      1:120:R   + [000]     2:115:S
            init-1     [000]    27.907490:      1:120:D ==> [000]     2:115:R
        kthreadd-2     [000]    27.907507:      2:115:R   + [001]    15:115:R
          <idle>-0     [001]    27.907517:      0:140:R ==> [001]    15:115:R
        kthreadd-2     [000]    27.907517:      2:115:D ==> [000]     0:140:R
          <idle>-0     [000]    27.907521:      0:140:R   + [000]     4:115:S
          <idle>-0     [000]    27.907524:      0:140:R ==> [000]     4:115:R
           udevd-15    [001]    27.907527:     15:115:D   + [000]     2:115:D
     ksoftirqd/0-4     [000]    27.907537:      4:115:S ==> [000]     2:115:R
           udevd-15    [001]    27.907537:     15:115:D ==> [001]     0:140:R
        kthreadd-2     [000]    27.907546:      2:115:R   + [000]     1:120:D
        kthreadd-2     [000]    27.907550:      2:115:S ==> [000]     1:120:R
            init-1     [000]    27.907584:      1:120:R   + [000]    15:  0:D
            init-1     [000]    27.907589:      1:120:R   + [000]     2:115:S
            init-1     [000]    27.907593:      1:120:D ==> [000]    15:  0:R
           udevd-15    [000]    27.907601:     15:  0:S ==> [000]     2:115:R
 ##### CPU 0 buffer started ####
        kthreadd-2     [000]    27.907616:      2:115:R   + [001]    16:115:R
 ##### CPU 1 buffer started ####
          <idle>-0     [001]    27.907620:      0:140:R ==> [001]    16:115:R
        kthreadd-2     [000]    27.907621:      2:115:D ==> [000]     0:140:R
           udevd-16    [001]    27.907625:     16:115:D   + [000]     2:115:D
          <idle>-0     [000]    27.907628:      0:140:R   + [000]     4:115:S
           udevd-16    [001]    27.907629:     16:115:D ==> [001]     0:140:R
          <idle>-0     [000]    27.907631:      0:140:R ==> [000]     4:115:R
     ksoftirqd/0-4     [000]    27.907636:      4:115:S ==> [000]     2:115:R
        kthreadd-2     [000]    27.907644:      2:115:R   + [000]     1:120:D
        kthreadd-2     [000]    27.907647:      2:115:S ==> [000]     1:120:R
            init-1     [000]    27.907657:      1:120:R   + [001]    16:  0:D
          <idle>-0     [001]    27.907666:      0:140:R ==> [001]    16:  0:R
[   27.907703862] initcall stop_machine_init+0x0/0x50 returned 0 after 0 msecs
[   27.907850704] calling  filelock_init+0x0/0x30 @ 1
[   27.907926573] initcall filelock_init+0x0/0x30 returned 0 after 0 msecs
[   27.908071327] calling  init_script_binfmt+0x0/0x10 @ 1
[   27.908165195] initcall init_script_binfmt+0x0/0x10 returned 0 after 0 msecs
[   27.908309461] calling  init_elf_binfmt+0x0/0x10 @ 1

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h      |  17 +++++--
 kernel/trace/trace_boot.c | 123 +++++++++++++++++++++++++++++++++++-----------
 2 files changed, 105 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f69a5199596..b5f91f198fd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -22,7 +22,8 @@ enum trace_type {
 	TRACE_SPECIAL,
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
-	TRACE_BOOT,
+	TRACE_BOOT_CALL,
+	TRACE_BOOT_RET,
 	TRACE_FN_RET,
 
 	__TRACE_LAST_TYPE
@@ -123,9 +124,14 @@ struct trace_mmiotrace_map {
 	struct mmiotrace_map	map;
 };
 
-struct trace_boot {
+struct trace_boot_call {
 	struct trace_entry	ent;
-	struct boot_trace	initcall;
+	struct boot_trace_call boot_call;
+};
+
+struct trace_boot_ret {
+	struct trace_entry	ent;
+	struct boot_trace_ret boot_ret;
 };
 
 /*
@@ -228,8 +234,9 @@ extern void __ftrace_bad_type(void);
 			  TRACE_MMIO_RW);				\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\
 			  TRACE_MMIO_MAP);				\
-		IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT);	\
-		IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET); \
+		IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
+		IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
+		IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\
 		__ftrace_bad_type();					\
 	} while (0)
 
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 8f71915e8bb..cb333b7fd11 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -58,35 +58,71 @@ static void boot_trace_init(struct trace_array *tr)
 	tracing_sched_switch_assign_trace(tr);
 }
 
-static enum print_line_t initcall_print_line(struct trace_iterator *iter)
+static enum print_line_t
+initcall_call_print_line(struct trace_iterator *iter)
 {
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
+	struct trace_boot_call *field;
+	struct boot_trace_call *call;
+	u64 ts;
+	unsigned long nsec_rem;
 	int ret;
+
+	trace_assign_type(field, entry);
+	call = &field->boot_call;
+	ts = iter->ts;
+	nsec_rem = do_div(ts, 1000000000);
+
+	ret = trace_seq_printf(s, "[%5ld.%09ld] calling  %s @ %i\n",
+			(unsigned long)ts, nsec_rem, call->func, call->caller);
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+	else
+		return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+initcall_ret_print_line(struct trace_iterator *iter)
+{
 	struct trace_entry *entry = iter->ent;
-	struct trace_boot *field = (struct trace_boot *)entry;
-	struct boot_trace *it = &field->initcall;
 	struct trace_seq *s = &iter->seq;
-	struct timespec calltime = ktime_to_timespec(it->calltime);
-	struct timespec rettime = ktime_to_timespec(it->rettime);
-
-	if (entry->type == TRACE_BOOT) {
-		ret = trace_seq_printf(s, "[%5ld.%09ld] calling  %s @ %i\n",
-					  calltime.tv_sec,
-					  calltime.tv_nsec,
-					  it->func, it->caller);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-
-		ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
-					  "returned %d after %lld msecs\n",
-					  rettime.tv_sec,
-					  rettime.tv_nsec,
-					  it->func, it->result, it->duration);
-
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
+	struct trace_boot_ret *field;
+	struct boot_trace_ret *init_ret;
+	u64 ts;
+	unsigned long nsec_rem;
+	int ret;
+
+	trace_assign_type(field, entry);
+	init_ret = &field->boot_ret;
+	ts = iter->ts;
+	nsec_rem = do_div(ts, 1000000000);
+
+	ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
+			"returned %d after %llu msecs\n",
+			(unsigned long) ts,
+			nsec_rem,
+			init_ret->func, init_ret->result, init_ret->duration);
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+	else
 		return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t initcall_print_line(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+
+	switch (entry->type) {
+	case TRACE_BOOT_CALL:
+		return initcall_call_print_line(iter);
+	case TRACE_BOOT_RET:
+		return initcall_ret_print_line(iter);
+	default:
+		return TRACE_TYPE_UNHANDLED;
 	}
-	return TRACE_TYPE_UNHANDLED;
 }
 
 struct tracer boot_tracer __read_mostly =
@@ -97,11 +133,10 @@ struct tracer boot_tracer __read_mostly =
 	.print_line	= initcall_print_line,
 };
 
-void trace_boot(struct boot_trace *it, initcall_t fn)
+void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
 {
 	struct ring_buffer_event *event;
-	struct trace_boot *entry;
-	struct trace_array_cpu *data;
+	struct trace_boot_call *entry;
 	unsigned long irq_flags;
 	struct trace_array *tr = boot_trace;
 
@@ -111,9 +146,37 @@ void trace_boot(struct boot_trace *it, initcall_t fn)
 	/* Get its name now since this function could
 	 * disappear because it is in the .init section.
 	 */
-	sprint_symbol(it->func, (unsigned long)fn);
+	sprint_symbol(bt->func, (unsigned long)fn);
+	preempt_disable();
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+	entry->ent.type = TRACE_BOOT_CALL;
+	entry->boot_call = *bt;
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+	trace_wake_up();
+
+ out:
+	preempt_enable();
+}
+
+void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
+{
+	struct ring_buffer_event *event;
+	struct trace_boot_ret *entry;
+	unsigned long irq_flags;
+	struct trace_array *tr = boot_trace;
+
+	if (!pre_initcalls_finished)
+		return;
+
+	sprint_symbol(bt->func, (unsigned long)fn);
 	preempt_disable();
-	data = tr->data[smp_processor_id()];
 
 	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
 					 &irq_flags);
@@ -121,8 +184,8 @@ void trace_boot(struct boot_trace *it, initcall_t fn)
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, 0);
-	entry->ent.type = TRACE_BOOT;
-	entry->initcall = *it;
+	entry->ent.type = TRACE_BOOT_RET;
+	entry->boot_ret = *bt;
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
 
 	trace_wake_up();
-- 
cgit v1.2.3


From 642edba5f5c545772b89907cf96134c73d6073c7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 12 Nov 2008 00:01:26 -0500
Subject: ring-buffer: fix deadlock from reader_lock in read_start

Impact: deadlock fix in ring_buffer_read_start

The ring_buffer_iter_reset was called from ring_buffer_read_start
where both grabbed the reader_lock.

This patch separates out the internals of ring_buffer_iter_reset
to its own function so that both APIs may grab the reader_lock.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c04c433fbc5..86dc353f89b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1475,19 +1475,9 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
 	return overruns;
 }
 
-/**
- * ring_buffer_iter_reset - reset an iterator
- * @iter: The iterator to reset
- *
- * Resets the iterator, so that it will start from the beginning
- * again.
- */
-void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
+static void rb_iter_reset(struct ring_buffer_iter *iter)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
-	unsigned long flags;
-
-	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
 	/* Iterator usage is expected to have record disabled */
 	if (list_empty(&cpu_buffer->reader_page->list)) {
@@ -1501,7 +1491,22 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
 		iter->read_stamp = cpu_buffer->read_stamp;
 	else
 		iter->read_stamp = iter->head_page->time_stamp;
+}
 
+/**
+ * ring_buffer_iter_reset - reset an iterator
+ * @iter: The iterator to reset
+ *
+ * Resets the iterator, so that it will start from the beginning
+ * again.
+ */
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	rb_iter_reset(iter);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 }
 
@@ -1957,7 +1962,7 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
 
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 	__raw_spin_lock(&cpu_buffer->lock);
-	ring_buffer_iter_reset(iter);
+	rb_iter_reset(iter);
 	__raw_spin_unlock(&cpu_buffer->lock);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-- 
cgit v1.2.3


From 1f0d69a9fc815db82f15722bf05227190b1d714d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 12 Nov 2008 00:14:39 -0500
Subject: tracing: profile likely and unlikely annotations

Impact: new unlikely/likely profiler

Andrew Morton recently suggested having an in-kernel way to profile
likely and unlikely macros. This patch achieves that goal.

When configured, every(*) likely and unlikely macro gets a counter attached
to it. When the condition is hit, the hit and misses of that condition
are recorded. These numbers can later be retrieved by:

  /debugfs/tracing/profile_likely    - All likely markers
  /debugfs/tracing/profile_unlikely  - All unlikely markers.

# cat /debug/tracing/profile_unlikely | head
 correct incorrect  %        Function                  File              Line
 ------- ---------  -        --------                  ----              ----
    2167        0   0 do_arch_prctl                  process_64.c         832
       0        0   0 do_arch_prctl                  process_64.c         804
    2670        0   0 IS_ERR                         err.h                34
   71230     5693   7 __switch_to                    process_64.c         673
   76919        0   0 __switch_to                    process_64.c         639
   43184    33743  43 __switch_to                    process_64.c         624
   12740    64181  83 __switch_to                    process_64.c         594
   12740    64174  83 __switch_to                    process_64.c         590

# cat /debug/tracing/profile_unlikely | \
  awk '{ if ($3 > 25) print $0; }' |head -20
   44963    35259  43 __switch_to                    process_64.c         624
   12762    67454  84 __switch_to                    process_64.c         594
   12762    67447  84 __switch_to                    process_64.c         590
    1478      595  28 syscall_get_error              syscall.h            51
       0     2821 100 syscall_trace_leave            ptrace.c             1567
       0        1 100 native_smp_prepare_cpus        smpboot.c            1237
   86338   265881  75 calc_delta_fair                sched_fair.c         408
  210410   108540  34 calc_delta_mine                sched.c              1267
       0    54550 100 sched_info_queued              sched_stats.h        222
   51899    66435  56 pick_next_task_fair            sched_fair.c         1422
       6       10  62 yield_task_fair                sched_fair.c         982
    7325     2692  26 rt_policy                      sched.c              144
       0     1270 100 pre_schedule_rt                sched_rt.c           1261
    1268    48073  97 pick_next_task_rt              sched_rt.c           884
       0    45181 100 sched_info_dequeued            sched_stats.h        177
       0       15 100 sched_move_task                sched.c              8700
       0       15 100 sched_move_task                sched.c              8690
   53167    33217  38 schedule                       sched.c              4457
       0    80208 100 sched_info_switch              sched_stats.h        270
   30585    49631  61 context_switch                 sched.c              2619

# cat /debug/tracing/profile_likely | awk '{ if ($3 > 25) print $0; }'
   39900    36577  47 pick_next_task                 sched.c              4397
   20824    15233  42 switch_mm                      mmu_context_64.h     18
       0        7 100 __cancel_work_timer            workqueue.c          560
     617    66484  99 clocksource_adjust             timekeeping.c        456
       0   346340 100 audit_syscall_exit             auditsc.c            1570
      38   347350  99 audit_get_context              auditsc.c            732
       0   345244 100 audit_syscall_entry            auditsc.c            1541
      38     1017  96 audit_free                     auditsc.c            1446
       0     1090 100 audit_alloc                    auditsc.c            862
    2618     1090  29 audit_alloc                    auditsc.c            858
       0        6 100 move_masked_irq                migration.c          9
       1      198  99 probe_sched_wakeup             trace_sched_switch.c 58
       2        2  50 probe_wakeup                   trace_sched_wakeup.c 227
       0        2 100 probe_wakeup_sched_switch      trace_sched_wakeup.c 144
    4514     2090  31 __grab_cache_page              filemap.c            2149
   12882   228786  94 mapping_unevictable            pagemap.h            50
       4       11  73 __flush_cpu_slab               slub.c               1466
  627757   330451  34 slab_free                      slub.c               1731
    2959    61245  95 dentry_lru_del_init            dcache.c             153
     946     1217  56 load_elf_binary                binfmt_elf.c         904
     102       82  44 disk_put_part                  genhd.h              206
       1        1  50 dst_gc_task                    dst.c                82
       0       19 100 tcp_mss_split_point            tcp_output.c         1126

As you can see by the above, there's a bit of work to do in rethinking
the use of some unlikelys and likelys. Note: the unlikely case had 71 hits
that were more than 25%.

Note:  After submitting my first version of this patch, Andrew Morton
  showed me a version written by Daniel Walker, where I picked up
  the following ideas from:

  1)  Using __builtin_constant_p to avoid profiling fixed values.
  2)  Using __FILE__ instead of instruction pointers.
  3)  Using the preprocessor to stop all profiling of likely
       annotations from vsyscall_64.c.

Thanks to Andrew Morton, Arjan van de Ven, Theodore Tso and Ingo Molnar
for their feed back on this patch.

(*) Not ever unlikely is recorded, those that are used by vsyscalls
 (a few of them) had to have profiling disabled.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Theodore Tso <tytso@mit.edu>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig          |  16 +++++
 kernel/trace/Makefile         |   1 +
 kernel/trace/trace_unlikely.c | 164 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 181 insertions(+)
 create mode 100644 kernel/trace/trace_unlikely.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d986216c832..a604f24c755 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -159,6 +159,22 @@ config BOOT_TRACER
 	    selected, because the self-tests are an initcall as well and that
 	    would invalidate the boot trace. )
 
+config TRACE_UNLIKELY_PROFILE
+	bool "Trace likely/unlikely profiler"
+	depends on DEBUG_KERNEL
+	select TRACING
+	help
+	  This tracer profiles all the the likely and unlikely macros
+	  in the kernel. It will display the results in:
+
+	  /debugfs/tracing/profile_likely
+	  /debugfs/tracing/profile_unlikely
+
+	  Note: this will add a significant overhead, only turn this
+	  on if you need to profile the system's use of these macros.
+
+	  Say N if unsure.
+
 config STACK_TRACER
 	bool "Trace max stack"
 	depends on HAVE_FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 3e1f361bbc1..98e70ee2798 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -25,5 +25,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o
+obj-$(CONFIG_TRACE_UNLIKELY_PROFILE) += trace_unlikely.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c
new file mode 100644
index 00000000000..94932696069
--- /dev/null
+++ b/kernel/trace/trace_unlikely.c
@@ -0,0 +1,164 @@
+/*
+ * unlikely profiler
+ *
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/hash.h>
+#include <linux/fs.h>
+#include <asm/local.h>
+#include "trace.h"
+
+void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect)
+{
+	/* FIXME: Make this atomic! */
+	if (val == expect)
+		f->correct++;
+	else
+		f->incorrect++;
+}
+EXPORT_SYMBOL(ftrace_likely_update);
+
+struct ftrace_pointer {
+	void		*start;
+	void		*stop;
+};
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ftrace_pointer *f = m->private;
+	struct ftrace_likely_data *p = v;
+
+	(*pos)++;
+
+	if (v == (void *)1)
+		return f->start;
+
+	++p;
+
+	if ((void *)p >= (void *)f->stop)
+		return NULL;
+
+	return p;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	void *t = (void *)1;
+	loff_t l = 0;
+
+	for (; t && l < *pos; t = t_next(m, t, &l))
+		;
+
+	return t;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+	struct ftrace_likely_data *p = v;
+	const char *f;
+	unsigned long percent;
+
+	if (v == (void *)1) {
+		seq_printf(m, " correct incorrect  %% "
+			      "       Function                "
+			      "  File              Line\n"
+			      " ------- ---------  - "
+			      "       --------                "
+			      "  ----              ----\n");
+		return 0;
+	}
+
+	/* Only print the file, not the path */
+	f = p->file + strlen(p->file);
+	while (f >= p->file && *f != '/')
+		f--;
+	f++;
+
+	if (p->correct) {
+		percent = p->incorrect * 100;
+		percent /= p->correct + p->incorrect;
+	} else
+		percent = p->incorrect ? 100 : 0;
+
+	seq_printf(m, "%8lu %8lu %3lu ", p->correct, p->incorrect, percent);
+	seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
+	return 0;
+}
+
+static struct seq_operations tracing_likely_seq_ops = {
+	.start		= t_start,
+	.next		= t_next,
+	.stop		= t_stop,
+	.show		= t_show,
+};
+
+static int tracing_likely_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = seq_open(file, &tracing_likely_seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = (void *)inode->i_private;
+	}
+
+	return ret;
+}
+
+static struct file_operations tracing_likely_fops = {
+	.open		= tracing_likely_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+
+extern unsigned long __start_likely_profile[];
+extern unsigned long __stop_likely_profile[];
+extern unsigned long __start_unlikely_profile[];
+extern unsigned long __stop_unlikely_profile[];
+
+static struct ftrace_pointer ftrace_likely_pos = {
+	.start			= __start_likely_profile,
+	.stop			= __stop_likely_profile,
+};
+
+static struct ftrace_pointer ftrace_unlikely_pos = {
+	.start			= __start_unlikely_profile,
+	.stop			= __stop_unlikely_profile,
+};
+
+static __init int ftrace_unlikely_init(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+
+	entry = debugfs_create_file("profile_likely", 0444, d_tracer,
+				    &ftrace_likely_pos,
+				    &tracing_likely_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'profile_likely' entry\n");
+
+	entry = debugfs_create_file("profile_unlikely", 0444, d_tracer,
+				    &ftrace_unlikely_pos,
+				    &tracing_likely_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs"
+			   " 'profile_unlikely' entry\n");
+
+	return 0;
+}
+
+device_initcall(ftrace_unlikely_init);
-- 
cgit v1.2.3


From 52f232cb720a7babb752849cbc2cab2d24021209 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 12 Nov 2008 00:14:40 -0500
Subject: tracing: likely/unlikely branch annotation tracer

Impact: new likely/unlikely branch tracer

This patch adds a way to record the instances of the likely() and unlikely()
branch condition annotations.

When "unlikely" is set in /debugfs/tracing/iter_ctrl the unlikely conditions
will be added to any of the ftrace tracers. The change takes effect when
a new tracer is passed into the current_tracer file.

For example:

 bash-3471  [003]   357.014755: [INCORRECT] sched_info_dequeued:sched_stats.h:177
 bash-3471  [003]   357.014756: [correct] update_curr:sched_fair.c:489
 bash-3471  [003]   357.014758: [correct] calc_delta_fair:sched_fair.c:411
 bash-3471  [003]   357.014759: [correct] account_group_exec_runtime:sched_stats.h:356
 bash-3471  [003]   357.014761: [correct] update_curr:sched_fair.c:489
 bash-3471  [003]   357.014763: [INCORRECT] calc_delta_fair:sched_fair.c:411
 bash-3471  [003]   357.014765: [correct] calc_delta_mine:sched.c:1279

Which shows the normal tracer heading, as well as whether the condition was
correct "[correct]" or was mistaken "[INCORRECT]", followed by the function,
file name and line number.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig          |  22 ++++++++
 kernel/trace/Makefile         |   6 +++
 kernel/trace/trace.c          |  29 +++++++++++
 kernel/trace/trace.h          |  39 +++++++++++++++
 kernel/trace/trace_unlikely.c | 114 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 210 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a604f24c755..8abcaf821be 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -175,6 +175,28 @@ config TRACE_UNLIKELY_PROFILE
 
 	  Say N if unsure.
 
+config TRACING_UNLIKELY
+	bool
+	help
+	  Selected by tracers that will trace the likely and unlikely
+	  conditions. This prevents the tracers themselves from being
+	  profiled. Profiling the tracing infrastructure can only happen
+	  when the likelys and unlikelys are not being traced.
+
+config UNLIKELY_TRACER
+	bool "Trace likely/unlikely instances"
+	depends on TRACE_UNLIKELY_PROFILE
+	select TRACING_UNLIKELY
+	help
+	  This traces the events of likely and unlikely condition
+	  calls in the kernel.  The difference between this and the
+	  "Trace likely/unlikely profiler" is that this is not a
+	  histogram of the callers, but actually places the calling
+	  events into a running trace buffer to see when and where the
+	  events happened, as well as their results.
+
+	  Say N if unsure.
+
 config STACK_TRACER
 	bool "Trace max stack"
 	depends on HAVE_FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 98e70ee2798..c938d03516c 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -10,6 +10,12 @@ CFLAGS_trace_selftest_dynamic.o = -pg
 obj-y += trace_selftest_dynamic.o
 endif
 
+# If unlikely tracing is enabled, do not trace these files
+ifdef CONFIG_TRACING_UNLIKELY
+KBUILD_CFLAGS += '-Dlikely(x)=likely_notrace(x)'
+KBUILD_CFLAGS += '-Dunlikely(x)=unlikely_notrace(x)'
+endif
+
 obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a3f7ae9cd8e..83d38634bc9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -258,6 +258,9 @@ static const char *trace_options[] = {
 	"sched-tree",
 	"ftrace_printk",
 	"ftrace_preempt",
+#ifdef CONFIG_UNLIKELY_TRACER
+	"unlikely",
+#endif
 	NULL
 };
 
@@ -1648,6 +1651,18 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 			trace_seq_print_cont(s, iter);
 		break;
 	}
+	case TRACE_UNLIKELY: {
+		struct trace_unlikely *field;
+
+		trace_assign_type(field, entry);
+
+		trace_seq_printf(s, "[%s] %s:%s:%d\n",
+				 field->correct ? "correct" : "INCORRECT",
+				 field->func,
+				 field->file,
+				 field->line);
+		break;
+	}
 	default:
 		trace_seq_printf(s, "Unknown type %d\n", entry->type);
 	}
@@ -1787,6 +1802,18 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 		return print_return_function(iter);
 		break;
 	}
+	case TRACE_UNLIKELY: {
+		struct trace_unlikely *field;
+
+		trace_assign_type(field, entry);
+
+		trace_seq_printf(s, "[%s] %s:%s:%d\n",
+				 field->correct ? "correct" : "INCORRECT",
+				 field->func,
+				 field->file,
+				 field->line);
+		break;
+	}
 	}
 	return TRACE_TYPE_HANDLED;
 }
@@ -2592,6 +2619,7 @@ static int tracing_set_tracer(char *buf)
 	if (t == current_trace)
 		goto out;
 
+	trace_unlikely_disable();
 	if (current_trace && current_trace->reset)
 		current_trace->reset(tr);
 
@@ -2599,6 +2627,7 @@ static int tracing_set_tracer(char *buf)
 	if (t->init)
 		t->init(tr);
 
+	trace_unlikely_enable(tr);
  out:
 	mutex_unlock(&trace_types_lock);
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b5f91f198fd..9635aa2c4fc 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -22,6 +22,7 @@ enum trace_type {
 	TRACE_SPECIAL,
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
+	TRACE_UNLIKELY,
 	TRACE_BOOT_CALL,
 	TRACE_BOOT_RET,
 	TRACE_FN_RET,
@@ -134,6 +135,16 @@ struct trace_boot_ret {
 	struct boot_trace_ret boot_ret;
 };
 
+#define TRACE_FUNC_SIZE 30
+#define TRACE_FILE_SIZE 20
+struct trace_unlikely {
+	struct trace_entry	ent;
+	unsigned	        line;
+	char			func[TRACE_FUNC_SIZE+1];
+	char			file[TRACE_FILE_SIZE+1];
+	char			correct;
+};
+
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
@@ -236,6 +247,7 @@ extern void __ftrace_bad_type(void);
 			  TRACE_MMIO_MAP);				\
 		IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
 		IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
+		IF_ASSIGN(var, ent, struct trace_unlikely, TRACE_UNLIKELY); \
 		IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\
 		__ftrace_bad_type();					\
 	} while (0)
@@ -456,6 +468,9 @@ enum trace_iterator_flags {
 	TRACE_ITER_SCHED_TREE		= 0x200,
 	TRACE_ITER_PRINTK		= 0x400,
 	TRACE_ITER_PREEMPTONLY		= 0x800,
+#ifdef CONFIG_UNLIKELY_TRACER
+	TRACE_ITER_UNLIKELY		= 0x1000,
+#endif
 };
 
 /*
@@ -515,4 +530,28 @@ static inline void ftrace_preempt_enable(int resched)
 		preempt_enable_notrace();
 }
 
+#ifdef CONFIG_UNLIKELY_TRACER
+extern int enable_unlikely_tracing(struct trace_array *tr);
+extern void disable_unlikely_tracing(void);
+static inline int trace_unlikely_enable(struct trace_array *tr)
+{
+	if (trace_flags & TRACE_ITER_UNLIKELY)
+		return enable_unlikely_tracing(tr);
+	return 0;
+}
+static inline void trace_unlikely_disable(void)
+{
+	/* due to races, always disable */
+	disable_unlikely_tracing();
+}
+#else
+static inline int trace_unlikely_enable(struct trace_array *tr)
+{
+	return 0;
+}
+static inline void trace_unlikely_disable(void)
+{
+}
+#endif /* CONFIG_UNLIKELY_TRACER */
+
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c
index 94932696069..7290e0e7b4e 100644
--- a/kernel/trace/trace_unlikely.c
+++ b/kernel/trace/trace_unlikely.c
@@ -15,8 +15,122 @@
 #include <asm/local.h>
 #include "trace.h"
 
+#ifdef CONFIG_UNLIKELY_TRACER
+
+static int unlikely_tracing_enabled __read_mostly;
+static DEFINE_MUTEX(unlikely_tracing_mutex);
+static struct trace_array *unlikely_tracer;
+
+static void
+probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
+{
+	struct trace_array *tr = unlikely_tracer;
+	struct ring_buffer_event *event;
+	struct trace_unlikely *entry;
+	unsigned long flags, irq_flags;
+	int cpu, pc;
+	const char *p;
+
+	/*
+	 * I would love to save just the ftrace_likely_data pointer, but
+	 * this code can also be used by modules. Ugly things can happen
+	 * if the module is unloaded, and then we go and read the
+	 * pointer.  This is slower, but much safer.
+	 */
+
+	if (unlikely(!tr))
+		return;
+
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
+		goto out;
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		goto out;
+
+	pc = preempt_count();
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, flags, pc);
+	entry->ent.type		= TRACE_UNLIKELY;
+
+	/* Strip off the path, only save the file */
+	p = f->file + strlen(f->file);
+	while (p >= f->file && *p != '/')
+		p--;
+	p++;
+
+	strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
+	strncpy(entry->file, p, TRACE_FILE_SIZE);
+	entry->func[TRACE_FUNC_SIZE] = 0;
+	entry->file[TRACE_FILE_SIZE] = 0;
+	entry->line = f->line;
+	entry->correct = val == expect;
+
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+ out:
+	atomic_dec(&tr->data[cpu]->disabled);
+	local_irq_restore(flags);
+}
+
+static inline
+void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
+{
+	if (!unlikely_tracing_enabled)
+		return;
+
+	probe_likely_condition(f, val, expect);
+}
+
+int enable_unlikely_tracing(struct trace_array *tr)
+{
+	int ret = 0;
+
+	mutex_lock(&unlikely_tracing_mutex);
+	unlikely_tracer = tr;
+	/*
+	 * Must be seen before enabling. The reader is a condition
+	 * where we do not need a matching rmb()
+	 */
+	smp_wmb();
+	unlikely_tracing_enabled++;
+	mutex_unlock(&unlikely_tracing_mutex);
+
+	return ret;
+}
+
+void disable_unlikely_tracing(void)
+{
+	mutex_lock(&unlikely_tracing_mutex);
+
+	if (!unlikely_tracing_enabled)
+		goto out_unlock;
+
+	unlikely_tracing_enabled--;
+
+ out_unlock:
+	mutex_unlock(&unlikely_tracing_mutex);
+}
+#else
+static inline
+void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
+{
+}
+#endif /* CONFIG_UNLIKELY_TRACER */
+
 void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect)
 {
+	/*
+	 * I would love to have a trace point here instead, but the
+	 * trace point code is so inundated with unlikely and likely
+	 * conditions that the recursive nightmare that exists is too
+	 * much to try to get working. At least for now.
+	 */
+	trace_likely_condition(f, val, expect);
+
 	/* FIXME: Make this atomic! */
 	if (val == expect)
 		f->correct++;
-- 
cgit v1.2.3


From f88c4ae9f8c3939bee4337c75c7a673b5de7a8a7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 12 Nov 2008 11:55:41 +0100
Subject: tracing: branch tracer, tweak output

Impact: modify the tracer output, to make it a bit easier to read

Change the output from:

>  bash-3471  [003]   357.014755: [INCORRECT] sched_info_dequeued:sched_stats.h:177
>  bash-3471  [003]   357.014756: [correct] update_curr:sched_fair.c:489
>  bash-3471  [003]   357.014758: [correct] calc_delta_fair:sched_fair.c:411

to:

>  bash-3471  [003]   357.014755: [ MISS ] sched_info_dequeued:sched_stats.h:177
>  bash-3471  [003]   357.014756: [ .... ] update_curr:sched_fair.c:489
>  bash-3471  [003]   357.014758: [ .... ] calc_delta_fair:sched_fair.c:411

it's good to have fields aligned vertically, and the only important
information is a prediction miss, so display only that information.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 83d38634bc9..728a46ec6b0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1657,7 +1657,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 		trace_assign_type(field, entry);
 
 		trace_seq_printf(s, "[%s] %s:%s:%d\n",
-				 field->correct ? "correct" : "INCORRECT",
+				 field->correct ? " .... " : " MISS ",
 				 field->func,
 				 field->file,
 				 field->line);
@@ -1808,7 +1808,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 		trace_assign_type(field, entry);
 
 		trace_seq_printf(s, "[%s] %s:%s:%d\n",
-				 field->correct ? "correct" : "INCORRECT",
+				 field->correct ? " .... " : " MISS ",
 				 field->func,
 				 field->file,
 				 field->line);
-- 
cgit v1.2.3


From 68d119f0a66f7e3663304343b072e56a2693446b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 12 Nov 2008 14:09:30 +0100
Subject: tracing: finetune branch-tracer output

Steve suggested the to change the output from this:

>  bash-3471  [003]   357.014755: [ MISS ] sched_info_dequeued:sched_stats.h:177
>  bash-3471  [003]   357.014756: [ .... ] update_curr:sched_fair.c:489
>  bash-3471  [003]   357.014758: [ .... ] calc_delta_fair:sched_fair.c:411

to this:

>  bash-3471  [003]   357.014755: [ MISS ] sched_info_dequeued:sched_stats.h:177
>  bash-3471  [003]   357.014756: [  ok  ] update_curr:sched_fair.c:489
>  bash-3471  [003]   357.014758: [  ok  ] calc_delta_fair:sched_fair.c:411

as it makes it clearer to the user what it means exactly.

Acked-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 728a46ec6b0..d842db14a59 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1657,7 +1657,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 		trace_assign_type(field, entry);
 
 		trace_seq_printf(s, "[%s] %s:%s:%d\n",
-				 field->correct ? " .... " : " MISS ",
+				 field->correct ? "  ok  " : " MISS ",
 				 field->func,
 				 field->file,
 				 field->line);
@@ -1808,7 +1808,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 		trace_assign_type(field, entry);
 
 		trace_seq_printf(s, "[%s] %s:%s:%d\n",
-				 field->correct ? " .... " : " MISS ",
+				 field->correct ? "  ok  " : " MISS ",
 				 field->func,
 				 field->file,
 				 field->line);
-- 
cgit v1.2.3


From 2ed84eeb8808cf3c9f039213ca137ffd7d753f0e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 12 Nov 2008 15:24:24 -0500
Subject: trace: rename unlikely profiler to branch profiler

Impact: name change of unlikely tracer and profiler

Ingo Molnar suggested changing the config from UNLIKELY_PROFILE
to BRANCH_PROFILING. I never did like the "unlikely" name so I
went one step farther, and renamed all the unlikely configurations
to a "BRANCH" variant.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig          | 10 +++++-----
 kernel/trace/Makefile         |  7 +++----
 kernel/trace/trace.c          |  2 +-
 kernel/trace/trace.h          |  6 +++---
 kernel/trace/trace_unlikely.c |  4 ++--
 5 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8abcaf821be..9c89526b6b7 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -159,7 +159,7 @@ config BOOT_TRACER
 	    selected, because the self-tests are an initcall as well and that
 	    would invalidate the boot trace. )
 
-config TRACE_UNLIKELY_PROFILE
+config TRACE_BRANCH_PROFILING
 	bool "Trace likely/unlikely profiler"
 	depends on DEBUG_KERNEL
 	select TRACING
@@ -175,7 +175,7 @@ config TRACE_UNLIKELY_PROFILE
 
 	  Say N if unsure.
 
-config TRACING_UNLIKELY
+config TRACING_BRANCHES
 	bool
 	help
 	  Selected by tracers that will trace the likely and unlikely
@@ -183,10 +183,10 @@ config TRACING_UNLIKELY
 	  profiled. Profiling the tracing infrastructure can only happen
 	  when the likelys and unlikelys are not being traced.
 
-config UNLIKELY_TRACER
+config BRANCH_TRACER
 	bool "Trace likely/unlikely instances"
-	depends on TRACE_UNLIKELY_PROFILE
-	select TRACING_UNLIKELY
+	depends on TRACE_BRANCH_PROFILING
+	select TRACING_BRANCHES
 	help
 	  This traces the events of likely and unlikely condition
 	  calls in the kernel.  The difference between this and the
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c938d03516c..0087df7ba44 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -11,9 +11,8 @@ obj-y += trace_selftest_dynamic.o
 endif
 
 # If unlikely tracing is enabled, do not trace these files
-ifdef CONFIG_TRACING_UNLIKELY
-KBUILD_CFLAGS += '-Dlikely(x)=likely_notrace(x)'
-KBUILD_CFLAGS += '-Dunlikely(x)=unlikely_notrace(x)'
+ifdef CONFIG_TRACING_BRANCHES
+KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
 endif
 
 obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
@@ -31,6 +30,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o
-obj-$(CONFIG_TRACE_UNLIKELY_PROFILE) += trace_unlikely.o
+obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_unlikely.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d842db14a59..bad59d32a4a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -258,7 +258,7 @@ static const char *trace_options[] = {
 	"sched-tree",
 	"ftrace_printk",
 	"ftrace_preempt",
-#ifdef CONFIG_UNLIKELY_TRACER
+#ifdef CONFIG_BRANCH_TRACER
 	"unlikely",
 #endif
 	NULL
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9635aa2c4fc..dccae631294 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -468,7 +468,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_SCHED_TREE		= 0x200,
 	TRACE_ITER_PRINTK		= 0x400,
 	TRACE_ITER_PREEMPTONLY		= 0x800,
-#ifdef CONFIG_UNLIKELY_TRACER
+#ifdef CONFIG_BRANCH_TRACER
 	TRACE_ITER_UNLIKELY		= 0x1000,
 #endif
 };
@@ -530,7 +530,7 @@ static inline void ftrace_preempt_enable(int resched)
 		preempt_enable_notrace();
 }
 
-#ifdef CONFIG_UNLIKELY_TRACER
+#ifdef CONFIG_BRANCH_TRACER
 extern int enable_unlikely_tracing(struct trace_array *tr);
 extern void disable_unlikely_tracing(void);
 static inline int trace_unlikely_enable(struct trace_array *tr)
@@ -552,6 +552,6 @@ static inline int trace_unlikely_enable(struct trace_array *tr)
 static inline void trace_unlikely_disable(void)
 {
 }
-#endif /* CONFIG_UNLIKELY_TRACER */
+#endif /* CONFIG_BRANCH_TRACER */
 
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c
index 7290e0e7b4e..856eb3b7f69 100644
--- a/kernel/trace/trace_unlikely.c
+++ b/kernel/trace/trace_unlikely.c
@@ -15,7 +15,7 @@
 #include <asm/local.h>
 #include "trace.h"
 
-#ifdef CONFIG_UNLIKELY_TRACER
+#ifdef CONFIG_BRANCH_TRACER
 
 static int unlikely_tracing_enabled __read_mostly;
 static DEFINE_MUTEX(unlikely_tracing_mutex);
@@ -119,7 +119,7 @@ static inline
 void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
 {
 }
-#endif /* CONFIG_UNLIKELY_TRACER */
+#endif /* CONFIG_BRANCH_TRACER */
 
 void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect)
 {
-- 
cgit v1.2.3


From 9f029e83e968e5661d7be045bbcb620dbb909938 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 12 Nov 2008 15:24:24 -0500
Subject: ftrace: rename unlikely iter_ctrl to branch

Impact: rename of iter_ctrl unlikely to branch

The unlikely name is ugly. This patch converts the iter_ctrl command
"unlikely" and "nounlikely" to "branch" and "nobranch" respectively.

It also renames a lot of internal functions to use "branch" instead
of "unlikely".

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c          | 14 ++++++------
 kernel/trace/trace.h          | 26 +++++++++++-----------
 kernel/trace/trace_unlikely.c | 50 +++++++++++++++++++++----------------------
 3 files changed, 45 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bad59d32a4a..4bf070bb527 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -259,7 +259,7 @@ static const char *trace_options[] = {
 	"ftrace_printk",
 	"ftrace_preempt",
 #ifdef CONFIG_BRANCH_TRACER
-	"unlikely",
+	"branch",
 #endif
 	NULL
 };
@@ -1651,8 +1651,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 			trace_seq_print_cont(s, iter);
 		break;
 	}
-	case TRACE_UNLIKELY: {
-		struct trace_unlikely *field;
+	case TRACE_BRANCH: {
+		struct trace_branch *field;
 
 		trace_assign_type(field, entry);
 
@@ -1802,8 +1802,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 		return print_return_function(iter);
 		break;
 	}
-	case TRACE_UNLIKELY: {
-		struct trace_unlikely *field;
+	case TRACE_BRANCH: {
+		struct trace_branch *field;
 
 		trace_assign_type(field, entry);
 
@@ -2619,7 +2619,7 @@ static int tracing_set_tracer(char *buf)
 	if (t == current_trace)
 		goto out;
 
-	trace_unlikely_disable();
+	trace_branch_disable();
 	if (current_trace && current_trace->reset)
 		current_trace->reset(tr);
 
@@ -2627,7 +2627,7 @@ static int tracing_set_tracer(char *buf)
 	if (t->init)
 		t->init(tr);
 
-	trace_unlikely_enable(tr);
+	trace_branch_enable(tr);
  out:
 	mutex_unlock(&trace_types_lock);
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index dccae631294..7fbf37b2745 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -22,7 +22,7 @@ enum trace_type {
 	TRACE_SPECIAL,
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
-	TRACE_UNLIKELY,
+	TRACE_BRANCH,
 	TRACE_BOOT_CALL,
 	TRACE_BOOT_RET,
 	TRACE_FN_RET,
@@ -137,7 +137,7 @@ struct trace_boot_ret {
 
 #define TRACE_FUNC_SIZE 30
 #define TRACE_FILE_SIZE 20
-struct trace_unlikely {
+struct trace_branch {
 	struct trace_entry	ent;
 	unsigned	        line;
 	char			func[TRACE_FUNC_SIZE+1];
@@ -247,7 +247,7 @@ extern void __ftrace_bad_type(void);
 			  TRACE_MMIO_MAP);				\
 		IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
 		IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
-		IF_ASSIGN(var, ent, struct trace_unlikely, TRACE_UNLIKELY); \
+		IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
 		IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\
 		__ftrace_bad_type();					\
 	} while (0)
@@ -469,7 +469,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_PRINTK		= 0x400,
 	TRACE_ITER_PREEMPTONLY		= 0x800,
 #ifdef CONFIG_BRANCH_TRACER
-	TRACE_ITER_UNLIKELY		= 0x1000,
+	TRACE_ITER_BRANCH		= 0x1000,
 #endif
 };
 
@@ -531,25 +531,25 @@ static inline void ftrace_preempt_enable(int resched)
 }
 
 #ifdef CONFIG_BRANCH_TRACER
-extern int enable_unlikely_tracing(struct trace_array *tr);
-extern void disable_unlikely_tracing(void);
-static inline int trace_unlikely_enable(struct trace_array *tr)
+extern int enable_branch_tracing(struct trace_array *tr);
+extern void disable_branch_tracing(void);
+static inline int trace_branch_enable(struct trace_array *tr)
 {
-	if (trace_flags & TRACE_ITER_UNLIKELY)
-		return enable_unlikely_tracing(tr);
+	if (trace_flags & TRACE_ITER_BRANCH)
+		return enable_branch_tracing(tr);
 	return 0;
 }
-static inline void trace_unlikely_disable(void)
+static inline void trace_branch_disable(void)
 {
 	/* due to races, always disable */
-	disable_unlikely_tracing();
+	disable_branch_tracing();
 }
 #else
-static inline int trace_unlikely_enable(struct trace_array *tr)
+static inline int trace_branch_enable(struct trace_array *tr)
 {
 	return 0;
 }
-static inline void trace_unlikely_disable(void)
+static inline void trace_branch_disable(void)
 {
 }
 #endif /* CONFIG_BRANCH_TRACER */
diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c
index 856eb3b7f69..e5d5969853a 100644
--- a/kernel/trace/trace_unlikely.c
+++ b/kernel/trace/trace_unlikely.c
@@ -17,16 +17,16 @@
 
 #ifdef CONFIG_BRANCH_TRACER
 
-static int unlikely_tracing_enabled __read_mostly;
-static DEFINE_MUTEX(unlikely_tracing_mutex);
-static struct trace_array *unlikely_tracer;
+static int branch_tracing_enabled __read_mostly;
+static DEFINE_MUTEX(branch_tracing_mutex);
+static struct trace_array *branch_tracer;
 
 static void
-probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
+probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 {
-	struct trace_array *tr = unlikely_tracer;
+	struct trace_array *tr = branch_tracer;
 	struct ring_buffer_event *event;
-	struct trace_unlikely *entry;
+	struct trace_branch *entry;
 	unsigned long flags, irq_flags;
 	int cpu, pc;
 	const char *p;
@@ -54,7 +54,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
 	pc = preempt_count();
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type		= TRACE_UNLIKELY;
+	entry->ent.type		= TRACE_BRANCH;
 
 	/* Strip off the path, only save the file */
 	p = f->file + strlen(f->file);
@@ -77,51 +77,51 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
 }
 
 static inline
-void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
+void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 {
-	if (!unlikely_tracing_enabled)
+	if (!branch_tracing_enabled)
 		return;
 
 	probe_likely_condition(f, val, expect);
 }
 
-int enable_unlikely_tracing(struct trace_array *tr)
+int enable_branch_tracing(struct trace_array *tr)
 {
 	int ret = 0;
 
-	mutex_lock(&unlikely_tracing_mutex);
-	unlikely_tracer = tr;
+	mutex_lock(&branch_tracing_mutex);
+	branch_tracer = tr;
 	/*
 	 * Must be seen before enabling. The reader is a condition
 	 * where we do not need a matching rmb()
 	 */
 	smp_wmb();
-	unlikely_tracing_enabled++;
-	mutex_unlock(&unlikely_tracing_mutex);
+	branch_tracing_enabled++;
+	mutex_unlock(&branch_tracing_mutex);
 
 	return ret;
 }
 
-void disable_unlikely_tracing(void)
+void disable_branch_tracing(void)
 {
-	mutex_lock(&unlikely_tracing_mutex);
+	mutex_lock(&branch_tracing_mutex);
 
-	if (!unlikely_tracing_enabled)
+	if (!branch_tracing_enabled)
 		goto out_unlock;
 
-	unlikely_tracing_enabled--;
+	branch_tracing_enabled--;
 
  out_unlock:
-	mutex_unlock(&unlikely_tracing_mutex);
+	mutex_unlock(&branch_tracing_mutex);
 }
 #else
 static inline
-void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
+void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 {
 }
 #endif /* CONFIG_BRANCH_TRACER */
 
-void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect)
+void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
 {
 	/*
 	 * I would love to have a trace point here instead, but the
@@ -148,7 +148,7 @@ static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct ftrace_pointer *f = m->private;
-	struct ftrace_likely_data *p = v;
+	struct ftrace_branch_data *p = v;
 
 	(*pos)++;
 
@@ -180,7 +180,7 @@ static void t_stop(struct seq_file *m, void *p)
 
 static int t_show(struct seq_file *m, void *v)
 {
-	struct ftrace_likely_data *p = v;
+	struct ftrace_branch_data *p = v;
 	const char *f;
 	unsigned long percent;
 
@@ -252,7 +252,7 @@ static struct ftrace_pointer ftrace_unlikely_pos = {
 	.stop			= __stop_unlikely_profile,
 };
 
-static __init int ftrace_unlikely_init(void)
+static __init int ftrace_branch_init(void)
 {
 	struct dentry *d_tracer;
 	struct dentry *entry;
@@ -275,4 +275,4 @@ static __init int ftrace_unlikely_init(void)
 	return 0;
 }
 
-device_initcall(ftrace_unlikely_init);
+device_initcall(ftrace_branch_init);
-- 
cgit v1.2.3


From 80e5ea4506791af206266c5921c97f11d3b17866 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 12 Nov 2008 15:24:24 -0500
Subject: ftrace: add tracer called branch

Impact: added new branch tracer

Currently the tracing of branch profiling (unlikelys and likelys hit)
is only activated by the iter_ctrl. This patch adds a tracer called
"branch" that will just trace the branch profiling. The advantage
of adding this tracer is that it can be added to the ftrace selftests
on startup.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h          |  2 ++
 kernel/trace/trace_selftest.c | 23 +++++++++++++++++++++++
 kernel/trace/trace_unlikely.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 67 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7fbf37b2745..9e015f5bea1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -420,6 +420,8 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,
 					       struct trace_array *tr);
 extern int trace_selftest_startup_sysprof(struct tracer *trace,
 					       struct trace_array *tr);
+extern int trace_selftest_startup_branch(struct tracer *trace,
+					 struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 extern void *head_page(struct trace_array_cpu *data);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 0728a105dcc..24e6e075e6d 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -13,6 +13,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_STACK:
 	case TRACE_PRINT:
 	case TRACE_SPECIAL:
+	case TRACE_BRANCH:
 		return 1;
 	}
 	return 0;
@@ -544,3 +545,25 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
 	return ret;
 }
 #endif /* CONFIG_SYSPROF_TRACER */
+
+#ifdef CONFIG_BRANCH_TRACER
+int
+trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	trace->init(tr);
+	/* Sleep for a 1/10 of a second */
+	msleep(100);
+	/* stop the tracing. */
+	tracing_stop();
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+	tracing_start();
+
+	return ret;
+}
+#endif /* CONFIG_BRANCH_TRACER */
diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c
index e5d5969853a..85265553918 100644
--- a/kernel/trace/trace_unlikely.c
+++ b/kernel/trace/trace_unlikely.c
@@ -114,6 +114,48 @@ void disable_branch_tracing(void)
  out_unlock:
 	mutex_unlock(&branch_tracing_mutex);
 }
+
+static void start_branch_trace(struct trace_array *tr)
+{
+	enable_branch_tracing(tr);
+}
+
+static void stop_branch_trace(struct trace_array *tr)
+{
+	disable_branch_tracing();
+}
+
+static void branch_trace_init(struct trace_array *tr)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr, cpu);
+
+	start_branch_trace(tr);
+}
+
+static void branch_trace_reset(struct trace_array *tr)
+{
+	stop_branch_trace(tr);
+}
+
+struct tracer branch_trace __read_mostly =
+{
+	.name		= "branch",
+	.init		= branch_trace_init,
+	.reset		= branch_trace_reset,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest	= trace_selftest_startup_branch,
+#endif
+};
+
+__init static int init_branch_trace(void)
+{
+	return register_tracer(&branch_trace);
+}
+
+device_initcall(init_branch_trace);
 #else
 static inline
 void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
-- 
cgit v1.2.3


From 94b80ffd650b22e1fd493ccf6bad7efda4b8ea85 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 12 Nov 2008 16:18:45 -0500
Subject: ftrace: rename trace_unlikely.c file

Impact: File name change of trace_unlikely.c

The "unlikely" name for the tracer is quite ugly. We renamed all the
parts of it to "branch" and now it is time to rename the file too.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Makefile         |   2 +-
 kernel/trace/trace_branch.c   | 320 ++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_unlikely.c | 320 ------------------------------------------
 3 files changed, 321 insertions(+), 321 deletions(-)
 create mode 100644 kernel/trace/trace_branch.c
 delete mode 100644 kernel/trace/trace_unlikely.c

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 0087df7ba44..1a8c9259dc6 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -30,6 +30,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o
-obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_unlikely.o
+obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
new file mode 100644
index 00000000000..85265553918
--- /dev/null
+++ b/kernel/trace/trace_branch.c
@@ -0,0 +1,320 @@
+/*
+ * unlikely profiler
+ *
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/hash.h>
+#include <linux/fs.h>
+#include <asm/local.h>
+#include "trace.h"
+
+#ifdef CONFIG_BRANCH_TRACER
+
+static int branch_tracing_enabled __read_mostly;
+static DEFINE_MUTEX(branch_tracing_mutex);
+static struct trace_array *branch_tracer;
+
+static void
+probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+{
+	struct trace_array *tr = branch_tracer;
+	struct ring_buffer_event *event;
+	struct trace_branch *entry;
+	unsigned long flags, irq_flags;
+	int cpu, pc;
+	const char *p;
+
+	/*
+	 * I would love to save just the ftrace_likely_data pointer, but
+	 * this code can also be used by modules. Ugly things can happen
+	 * if the module is unloaded, and then we go and read the
+	 * pointer.  This is slower, but much safer.
+	 */
+
+	if (unlikely(!tr))
+		return;
+
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
+		goto out;
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		goto out;
+
+	pc = preempt_count();
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, flags, pc);
+	entry->ent.type		= TRACE_BRANCH;
+
+	/* Strip off the path, only save the file */
+	p = f->file + strlen(f->file);
+	while (p >= f->file && *p != '/')
+		p--;
+	p++;
+
+	strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
+	strncpy(entry->file, p, TRACE_FILE_SIZE);
+	entry->func[TRACE_FUNC_SIZE] = 0;
+	entry->file[TRACE_FILE_SIZE] = 0;
+	entry->line = f->line;
+	entry->correct = val == expect;
+
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+ out:
+	atomic_dec(&tr->data[cpu]->disabled);
+	local_irq_restore(flags);
+}
+
+static inline
+void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+{
+	if (!branch_tracing_enabled)
+		return;
+
+	probe_likely_condition(f, val, expect);
+}
+
+int enable_branch_tracing(struct trace_array *tr)
+{
+	int ret = 0;
+
+	mutex_lock(&branch_tracing_mutex);
+	branch_tracer = tr;
+	/*
+	 * Must be seen before enabling. The reader is a condition
+	 * where we do not need a matching rmb()
+	 */
+	smp_wmb();
+	branch_tracing_enabled++;
+	mutex_unlock(&branch_tracing_mutex);
+
+	return ret;
+}
+
+void disable_branch_tracing(void)
+{
+	mutex_lock(&branch_tracing_mutex);
+
+	if (!branch_tracing_enabled)
+		goto out_unlock;
+
+	branch_tracing_enabled--;
+
+ out_unlock:
+	mutex_unlock(&branch_tracing_mutex);
+}
+
+static void start_branch_trace(struct trace_array *tr)
+{
+	enable_branch_tracing(tr);
+}
+
+static void stop_branch_trace(struct trace_array *tr)
+{
+	disable_branch_tracing();
+}
+
+static void branch_trace_init(struct trace_array *tr)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr, cpu);
+
+	start_branch_trace(tr);
+}
+
+static void branch_trace_reset(struct trace_array *tr)
+{
+	stop_branch_trace(tr);
+}
+
+struct tracer branch_trace __read_mostly =
+{
+	.name		= "branch",
+	.init		= branch_trace_init,
+	.reset		= branch_trace_reset,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest	= trace_selftest_startup_branch,
+#endif
+};
+
+__init static int init_branch_trace(void)
+{
+	return register_tracer(&branch_trace);
+}
+
+device_initcall(init_branch_trace);
+#else
+static inline
+void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+{
+}
+#endif /* CONFIG_BRANCH_TRACER */
+
+void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
+{
+	/*
+	 * I would love to have a trace point here instead, but the
+	 * trace point code is so inundated with unlikely and likely
+	 * conditions that the recursive nightmare that exists is too
+	 * much to try to get working. At least for now.
+	 */
+	trace_likely_condition(f, val, expect);
+
+	/* FIXME: Make this atomic! */
+	if (val == expect)
+		f->correct++;
+	else
+		f->incorrect++;
+}
+EXPORT_SYMBOL(ftrace_likely_update);
+
+struct ftrace_pointer {
+	void		*start;
+	void		*stop;
+};
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ftrace_pointer *f = m->private;
+	struct ftrace_branch_data *p = v;
+
+	(*pos)++;
+
+	if (v == (void *)1)
+		return f->start;
+
+	++p;
+
+	if ((void *)p >= (void *)f->stop)
+		return NULL;
+
+	return p;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	void *t = (void *)1;
+	loff_t l = 0;
+
+	for (; t && l < *pos; t = t_next(m, t, &l))
+		;
+
+	return t;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+	struct ftrace_branch_data *p = v;
+	const char *f;
+	unsigned long percent;
+
+	if (v == (void *)1) {
+		seq_printf(m, " correct incorrect  %% "
+			      "       Function                "
+			      "  File              Line\n"
+			      " ------- ---------  - "
+			      "       --------                "
+			      "  ----              ----\n");
+		return 0;
+	}
+
+	/* Only print the file, not the path */
+	f = p->file + strlen(p->file);
+	while (f >= p->file && *f != '/')
+		f--;
+	f++;
+
+	if (p->correct) {
+		percent = p->incorrect * 100;
+		percent /= p->correct + p->incorrect;
+	} else
+		percent = p->incorrect ? 100 : 0;
+
+	seq_printf(m, "%8lu %8lu %3lu ", p->correct, p->incorrect, percent);
+	seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
+	return 0;
+}
+
+static struct seq_operations tracing_likely_seq_ops = {
+	.start		= t_start,
+	.next		= t_next,
+	.stop		= t_stop,
+	.show		= t_show,
+};
+
+static int tracing_likely_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = seq_open(file, &tracing_likely_seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = (void *)inode->i_private;
+	}
+
+	return ret;
+}
+
+static struct file_operations tracing_likely_fops = {
+	.open		= tracing_likely_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+
+extern unsigned long __start_likely_profile[];
+extern unsigned long __stop_likely_profile[];
+extern unsigned long __start_unlikely_profile[];
+extern unsigned long __stop_unlikely_profile[];
+
+static struct ftrace_pointer ftrace_likely_pos = {
+	.start			= __start_likely_profile,
+	.stop			= __stop_likely_profile,
+};
+
+static struct ftrace_pointer ftrace_unlikely_pos = {
+	.start			= __start_unlikely_profile,
+	.stop			= __stop_unlikely_profile,
+};
+
+static __init int ftrace_branch_init(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+
+	entry = debugfs_create_file("profile_likely", 0444, d_tracer,
+				    &ftrace_likely_pos,
+				    &tracing_likely_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'profile_likely' entry\n");
+
+	entry = debugfs_create_file("profile_unlikely", 0444, d_tracer,
+				    &ftrace_unlikely_pos,
+				    &tracing_likely_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs"
+			   " 'profile_unlikely' entry\n");
+
+	return 0;
+}
+
+device_initcall(ftrace_branch_init);
diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c
deleted file mode 100644
index 85265553918..00000000000
--- a/kernel/trace/trace_unlikely.c
+++ /dev/null
@@ -1,320 +0,0 @@
-/*
- * unlikely profiler
- *
- * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
- */
-#include <linux/kallsyms.h>
-#include <linux/seq_file.h>
-#include <linux/spinlock.h>
-#include <linux/debugfs.h>
-#include <linux/uaccess.h>
-#include <linux/module.h>
-#include <linux/ftrace.h>
-#include <linux/hash.h>
-#include <linux/fs.h>
-#include <asm/local.h>
-#include "trace.h"
-
-#ifdef CONFIG_BRANCH_TRACER
-
-static int branch_tracing_enabled __read_mostly;
-static DEFINE_MUTEX(branch_tracing_mutex);
-static struct trace_array *branch_tracer;
-
-static void
-probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
-{
-	struct trace_array *tr = branch_tracer;
-	struct ring_buffer_event *event;
-	struct trace_branch *entry;
-	unsigned long flags, irq_flags;
-	int cpu, pc;
-	const char *p;
-
-	/*
-	 * I would love to save just the ftrace_likely_data pointer, but
-	 * this code can also be used by modules. Ugly things can happen
-	 * if the module is unloaded, and then we go and read the
-	 * pointer.  This is slower, but much safer.
-	 */
-
-	if (unlikely(!tr))
-		return;
-
-	local_irq_save(flags);
-	cpu = raw_smp_processor_id();
-	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
-		goto out;
-
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
-	if (!event)
-		goto out;
-
-	pc = preempt_count();
-	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type		= TRACE_BRANCH;
-
-	/* Strip off the path, only save the file */
-	p = f->file + strlen(f->file);
-	while (p >= f->file && *p != '/')
-		p--;
-	p++;
-
-	strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
-	strncpy(entry->file, p, TRACE_FILE_SIZE);
-	entry->func[TRACE_FUNC_SIZE] = 0;
-	entry->file[TRACE_FILE_SIZE] = 0;
-	entry->line = f->line;
-	entry->correct = val == expect;
-
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
-
- out:
-	atomic_dec(&tr->data[cpu]->disabled);
-	local_irq_restore(flags);
-}
-
-static inline
-void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
-{
-	if (!branch_tracing_enabled)
-		return;
-
-	probe_likely_condition(f, val, expect);
-}
-
-int enable_branch_tracing(struct trace_array *tr)
-{
-	int ret = 0;
-
-	mutex_lock(&branch_tracing_mutex);
-	branch_tracer = tr;
-	/*
-	 * Must be seen before enabling. The reader is a condition
-	 * where we do not need a matching rmb()
-	 */
-	smp_wmb();
-	branch_tracing_enabled++;
-	mutex_unlock(&branch_tracing_mutex);
-
-	return ret;
-}
-
-void disable_branch_tracing(void)
-{
-	mutex_lock(&branch_tracing_mutex);
-
-	if (!branch_tracing_enabled)
-		goto out_unlock;
-
-	branch_tracing_enabled--;
-
- out_unlock:
-	mutex_unlock(&branch_tracing_mutex);
-}
-
-static void start_branch_trace(struct trace_array *tr)
-{
-	enable_branch_tracing(tr);
-}
-
-static void stop_branch_trace(struct trace_array *tr)
-{
-	disable_branch_tracing();
-}
-
-static void branch_trace_init(struct trace_array *tr)
-{
-	int cpu;
-
-	for_each_online_cpu(cpu)
-		tracing_reset(tr, cpu);
-
-	start_branch_trace(tr);
-}
-
-static void branch_trace_reset(struct trace_array *tr)
-{
-	stop_branch_trace(tr);
-}
-
-struct tracer branch_trace __read_mostly =
-{
-	.name		= "branch",
-	.init		= branch_trace_init,
-	.reset		= branch_trace_reset,
-#ifdef CONFIG_FTRACE_SELFTEST
-	.selftest	= trace_selftest_startup_branch,
-#endif
-};
-
-__init static int init_branch_trace(void)
-{
-	return register_tracer(&branch_trace);
-}
-
-device_initcall(init_branch_trace);
-#else
-static inline
-void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
-{
-}
-#endif /* CONFIG_BRANCH_TRACER */
-
-void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
-{
-	/*
-	 * I would love to have a trace point here instead, but the
-	 * trace point code is so inundated with unlikely and likely
-	 * conditions that the recursive nightmare that exists is too
-	 * much to try to get working. At least for now.
-	 */
-	trace_likely_condition(f, val, expect);
-
-	/* FIXME: Make this atomic! */
-	if (val == expect)
-		f->correct++;
-	else
-		f->incorrect++;
-}
-EXPORT_SYMBOL(ftrace_likely_update);
-
-struct ftrace_pointer {
-	void		*start;
-	void		*stop;
-};
-
-static void *
-t_next(struct seq_file *m, void *v, loff_t *pos)
-{
-	struct ftrace_pointer *f = m->private;
-	struct ftrace_branch_data *p = v;
-
-	(*pos)++;
-
-	if (v == (void *)1)
-		return f->start;
-
-	++p;
-
-	if ((void *)p >= (void *)f->stop)
-		return NULL;
-
-	return p;
-}
-
-static void *t_start(struct seq_file *m, loff_t *pos)
-{
-	void *t = (void *)1;
-	loff_t l = 0;
-
-	for (; t && l < *pos; t = t_next(m, t, &l))
-		;
-
-	return t;
-}
-
-static void t_stop(struct seq_file *m, void *p)
-{
-}
-
-static int t_show(struct seq_file *m, void *v)
-{
-	struct ftrace_branch_data *p = v;
-	const char *f;
-	unsigned long percent;
-
-	if (v == (void *)1) {
-		seq_printf(m, " correct incorrect  %% "
-			      "       Function                "
-			      "  File              Line\n"
-			      " ------- ---------  - "
-			      "       --------                "
-			      "  ----              ----\n");
-		return 0;
-	}
-
-	/* Only print the file, not the path */
-	f = p->file + strlen(p->file);
-	while (f >= p->file && *f != '/')
-		f--;
-	f++;
-
-	if (p->correct) {
-		percent = p->incorrect * 100;
-		percent /= p->correct + p->incorrect;
-	} else
-		percent = p->incorrect ? 100 : 0;
-
-	seq_printf(m, "%8lu %8lu %3lu ", p->correct, p->incorrect, percent);
-	seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
-	return 0;
-}
-
-static struct seq_operations tracing_likely_seq_ops = {
-	.start		= t_start,
-	.next		= t_next,
-	.stop		= t_stop,
-	.show		= t_show,
-};
-
-static int tracing_likely_open(struct inode *inode, struct file *file)
-{
-	int ret;
-
-	ret = seq_open(file, &tracing_likely_seq_ops);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-		m->private = (void *)inode->i_private;
-	}
-
-	return ret;
-}
-
-static struct file_operations tracing_likely_fops = {
-	.open		= tracing_likely_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-};
-
-extern unsigned long __start_likely_profile[];
-extern unsigned long __stop_likely_profile[];
-extern unsigned long __start_unlikely_profile[];
-extern unsigned long __stop_unlikely_profile[];
-
-static struct ftrace_pointer ftrace_likely_pos = {
-	.start			= __start_likely_profile,
-	.stop			= __stop_likely_profile,
-};
-
-static struct ftrace_pointer ftrace_unlikely_pos = {
-	.start			= __start_unlikely_profile,
-	.stop			= __stop_unlikely_profile,
-};
-
-static __init int ftrace_branch_init(void)
-{
-	struct dentry *d_tracer;
-	struct dentry *entry;
-
-	d_tracer = tracing_init_dentry();
-
-	entry = debugfs_create_file("profile_likely", 0444, d_tracer,
-				    &ftrace_likely_pos,
-				    &tracing_likely_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'profile_likely' entry\n");
-
-	entry = debugfs_create_file("profile_unlikely", 0444, d_tracer,
-				    &ftrace_unlikely_pos,
-				    &tracing_likely_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs"
-			   " 'profile_unlikely' entry\n");
-
-	return 0;
-}
-
-device_initcall(ftrace_branch_init);
-- 
cgit v1.2.3


From a94c80e78bc9f4493ffc25a02d5d7bcd93c399d0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 12 Nov 2008 17:52:36 -0500
Subject: ftrace: rename trace_entries to buffer_size_kb

Impact: rename of debugfs file trace_entries to buffer_size_kb

The original ftrace had fixed size entries, and the number of entries
was shown and modified via the file called trace_entries. By converting
to the unified trace buffer, we now allow for variable size entries
which makes the meaning of trace_entries pointless.

Since trace_size might be confused to the size of the trace, this patch
names it "buffer_size_kb" (thanks to Arjan van de Ven for this idea).

[ mingo@elte.hu: changed from buffer_size to buffer_size_kb ]

( Note, the units are still bytes - the next patch changes that,
  to keep the wide rename patch separate from the unit-change patch. )

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4bf070bb527..b42d42056fa 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3198,11 +3198,11 @@ static __init int tracer_init_debugfs(void)
 		pr_warning("Could not create debugfs "
 			   "'trace_pipe' entry\n");
 
-	entry = debugfs_create_file("trace_entries", 0644, d_tracer,
+	entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
 				    &global_trace, &tracing_entries_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs "
-			   "'trace_entries' entry\n");
+			   "'buffer_size_kb' entry\n");
 
 	entry = debugfs_create_file("trace_marker", 0220, d_tracer,
 				    NULL, &tracing_mark_fops);
-- 
cgit v1.2.3


From 1696b2b0f44a8d42f3e6b1ea90c21790871c04d9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 13 Nov 2008 00:09:35 -0500
Subject: ftrace: show buffer size in kilobytes

Impact: change the units of buffer_size_kb to kilobytes

This patch changes the units of the buffer_size_kb file to kilobytes.
Reading and writing to the file uses kilobytes as units. To help
users to know what units are used, the output of the file now
looks like:

  # cat /debug/tracing/buffer_size_kb
  1408

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b42d42056fa..d664aae2e10 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2905,7 +2905,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
 	char buf[64];
 	int r;
 
-	r = sprintf(buf, "%lu\n", tr->entries);
+	r = sprintf(buf, "%lu\n", tr->entries >> 10);
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
@@ -2945,6 +2945,9 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 			atomic_inc(&max_tr.data[cpu]->disabled);
 	}
 
+	/* value is in KB */
+	val <<= 10;
+
 	if (val != global_trace.entries) {
 		ret = ring_buffer_resize(global_trace.buffer, val);
 		if (ret < 0) {
-- 
cgit v1.2.3


From ee6bce52276c0717ed3e63296e5d9465d339e923 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 12 Nov 2008 17:52:37 -0500
Subject: ftrace: rename iter_ctrl to trace_options

Impact: rename file /debug/tracing/iter_ctrl to /debug/tracing/trace_options

The original ftrace had a file called "iter_ctrl" that would control
the way the output was iterated. But this file grew into a catch all
for different trace options. This patch renames the file from iter_ctrl
to trace_options to reflect this change.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d664aae2e10..240423a9d1a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -204,7 +204,7 @@ static DEFINE_MUTEX(trace_types_lock);
 /* trace_wait is a waitqueue for tasks blocked on trace_poll */
 static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
-/* trace_flags holds iter_ctrl options */
+/* trace_flags holds trace_options default values */
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK;
 
 /**
@@ -2411,7 +2411,7 @@ static struct file_operations tracing_cpumask_fops = {
 };
 
 static ssize_t
-tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
+tracing_trace_options_read(struct file *filp, char __user *ubuf,
 		       size_t cnt, loff_t *ppos)
 {
 	char *buf;
@@ -2448,7 +2448,7 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
 }
 
 static ssize_t
-tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
+tracing_trace_options_write(struct file *filp, const char __user *ubuf,
 			size_t cnt, loff_t *ppos)
 {
 	char buf[64];
@@ -2493,8 +2493,8 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
 
 static struct file_operations tracing_iter_fops = {
 	.open		= tracing_open_generic,
-	.read		= tracing_iter_ctrl_read,
-	.write		= tracing_iter_ctrl_write,
+	.read		= tracing_trace_options_read,
+	.write		= tracing_trace_options_write,
 };
 
 static const char readme_msg[] =
@@ -2508,9 +2508,9 @@ static const char readme_msg[] =
 	"# echo sched_switch > /debug/tracing/current_tracer\n"
 	"# cat /debug/tracing/current_tracer\n"
 	"sched_switch\n"
-	"# cat /debug/tracing/iter_ctrl\n"
+	"# cat /debug/tracing/trace_options\n"
 	"noprint-parent nosym-offset nosym-addr noverbose\n"
-	"# echo print-parent > /debug/tracing/iter_ctrl\n"
+	"# echo print-parent > /debug/tracing/trace_options\n"
 	"# echo 1 > /debug/tracing/tracing_enabled\n"
 	"# cat /debug/tracing/trace > /tmp/trace.txt\n"
 	"echo 0 > /debug/tracing/tracing_enabled\n"
@@ -3148,10 +3148,10 @@ static __init int tracer_init_debugfs(void)
 	if (!entry)
 		pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
 
-	entry = debugfs_create_file("iter_ctrl", 0644, d_tracer,
+	entry = debugfs_create_file("trace_options", 0644, d_tracer,
 				    NULL, &tracing_iter_fops);
 	if (!entry)
-		pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
+		pr_warning("Could not create debugfs 'trace_options' entry\n");
 
 	entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
 				    NULL, &tracing_cpumask_fops);
-- 
cgit v1.2.3


From 12ef7d448613ead2babd41c3ebfa1fe03c20edef Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 12 Nov 2008 17:52:38 -0500
Subject: ftrace: CPU buffer start annotation clean ups

Impact: better handling of CPU buffer start annotation

Because of the confusion with the per CPU buffers wrapping where
one CPU might be more active at the end of the trace than the other
CPUs causing that one CPU to have a shorter history. Kernel
developers were confused by the "missing" data of that one CPU
at the beginning of the trace output. An annotation was added to
the trace output to show that the buffer had started:

 # tracer: function
 #
 #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
 #              | |       |          |         |
 ##### CPU 3 buffer started ####
          <idle>-0     [003]   158.192959: smp_apic_timer_interrupt
 [...]
           <idle>-0     [003]   161.556520: default_idle
 ##### CPU 1 buffer started ####
           <idle>-0     [001]   161.592494: hrtimer_force_reprogram
 [etc]

But this annotation gets a bit messy when tracers do not fill the
buffers. This patch does a couple of things:

 One) it adds a flag to trace_options to disable these annotations

 Two) it does not annotate if the tracer did not overflow its buffer.

This makes the output much cleaner.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 16 +++++++++++++++-
 kernel/trace/trace.h |  1 +
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 240423a9d1a..4a904623e05 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -205,7 +205,8 @@ static DEFINE_MUTEX(trace_types_lock);
 static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
 /* trace_flags holds trace_options default values */
-unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK;
+unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
+	TRACE_ITER_ANNOTATE;
 
 /**
  * trace_wake_up - wake up tasks waiting for trace input
@@ -261,6 +262,7 @@ static const char *trace_options[] = {
 #ifdef CONFIG_BRANCH_TRACER
 	"branch",
 #endif
+	"annotate",
 	NULL
 };
 
@@ -1113,6 +1115,7 @@ void tracing_stop_function_trace(void)
 
 enum trace_file_type {
 	TRACE_FILE_LAT_FMT	= 1,
+	TRACE_FILE_ANNOTATE	= 2,
 };
 
 static void trace_iterator_increment(struct trace_iterator *iter, int cpu)
@@ -1532,6 +1535,12 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
 
+	if (!(trace_flags & TRACE_ITER_ANNOTATE))
+		return;
+
+	if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
+		return;
+
 	if (cpu_isset(iter->cpu, iter->started))
 		return;
 
@@ -2132,6 +2141,11 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 	iter->trace = current_trace;
 	iter->pos = -1;
 
+	/* Annotate start of buffers if we had overruns */
+	if (ring_buffer_overruns(iter->tr->buffer))
+		iter->iter_flags |= TRACE_FILE_ANNOTATE;
+
+
 	for_each_tracing_cpu(cpu) {
 
 		iter->buffer_iter[cpu] =
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9e015f5bea1..790ea8c0e1f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -473,6 +473,7 @@ enum trace_iterator_flags {
 #ifdef CONFIG_BRANCH_TRACER
 	TRACE_ITER_BRANCH		= 0x1000,
 #endif
+	TRACE_ITER_ANNOTATE		= 0x2000,
 };
 
 /*
-- 
cgit v1.2.3


From b3535c6390f27d04273e4eee0bc687f171fbf5f4 Mon Sep 17 00:00:00 2001
From: walimis <walimisdev@gmail.com>
Date: Fri, 14 Nov 2008 00:21:02 +0800
Subject: ftrace: remove unnecessary if condition of
 __unregister_ftrace_function

Because it has goto out before ftrace_list == &ftrace_list_end,
that's to say, we never meet this condition.

Signed-off-by: walimis <walimisdev@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index beb21a51e1e..54cb9a7d15e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -179,8 +179,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 
 	if (ftrace_enabled) {
 		/* If we only have one func left, then call that directly */
-		if (ftrace_list == &ftrace_list_end ||
-		    ftrace_list->next == &ftrace_list_end)
+		if (ftrace_list->next == &ftrace_list_end)
 			ftrace_trace_function = ftrace_list->func;
 	}
 
-- 
cgit v1.2.3


From 76aac0e9a17742e60d408be1a706e9aaad370891 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:12 +1100
Subject: CRED: Wrap task credential accesses in the core kernel

Wrap access to task credentials so that they can be separated more easily from
the task_struct during the introduction of COW creds.

Change most current->(|e|s|fs)[ug]id to current_(|e|s|fs)[ug]id().

Change some task->e?[ug]id to task_e?[ug]id().  In some places it makes more
sense to use RCU directly rather than a convenient wrapper; these will be
addressed by later patches.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-audit@redhat.com
Cc: containers@lists.linux-foundation.org
Cc: linux-mm@kvack.org
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/acct.c           |  7 +++----
 kernel/auditsc.c        |  6 ++++--
 kernel/cgroup.c         |  9 +++++----
 kernel/futex.c          |  8 +++++---
 kernel/futex_compat.c   |  3 ++-
 kernel/ptrace.c         | 15 +++++++++------
 kernel/sched.c          | 11 +++++++----
 kernel/signal.c         | 15 +++++++++------
 kernel/sys.c            | 16 ++++++++--------
 kernel/sysctl.c         |  2 +-
 kernel/timer.c          |  8 ++++----
 kernel/user_namespace.c |  2 +-
 12 files changed, 58 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index f6006a60df5..d57b7cbb98b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -530,15 +530,14 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	do_div(elapsed, AHZ);
 	ac.ac_btime = get_seconds() - elapsed;
 	/* we really need to bite the bullet and change layout */
-	ac.ac_uid = current->uid;
-	ac.ac_gid = current->gid;
+	current_uid_gid(&ac.ac_uid, &ac.ac_gid);
 #if ACCT_VERSION==2
 	ac.ac_ahz = AHZ;
 #endif
 #if ACCT_VERSION==1 || ACCT_VERSION==2
 	/* backward-compatible 16 bit fields */
-	ac.ac_uid16 = current->uid;
-	ac.ac_gid16 = current->gid;
+	ac.ac_uid16 = ac.ac_uid;
+	ac.ac_gid16 = ac.ac_gid;
 #endif
 #if ACCT_VERSION==3
 	ac.ac_pid = task_tgid_nr_ns(current, ns);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index cef34235b36..9c7e47ae457 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2628,7 +2628,8 @@ void audit_core_dumps(long signr)
 {
 	struct audit_buffer *ab;
 	u32 sid;
-	uid_t auid = audit_get_loginuid(current);
+	uid_t auid = audit_get_loginuid(current), uid;
+	gid_t gid;
 	unsigned int sessionid = audit_get_sessionid(current);
 
 	if (!audit_enabled)
@@ -2638,8 +2639,9 @@ void audit_core_dumps(long signr)
 		return;
 
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
+	current_uid_gid(&uid, &gid);
 	audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
-			auid, current->uid, current->gid, sessionid);
+			 auid, uid, gid, sessionid);
 	security_task_getsecid(current, &sid);
 	if (sid) {
 		char *ctx = NULL;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 35eebd5510c..78f9b310c4f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -571,8 +571,8 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
 
 	if (inode) {
 		inode->i_mode = mode;
-		inode->i_uid = current->fsuid;
-		inode->i_gid = current->fsgid;
+		inode->i_uid = current_fsuid();
+		inode->i_gid = current_fsgid();
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
@@ -1279,6 +1279,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
 {
 	struct task_struct *tsk;
+	uid_t euid;
 	int ret;
 
 	if (pid) {
@@ -1291,8 +1292,8 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
 		get_task_struct(tsk);
 		rcu_read_unlock();
 
-		if ((current->euid) && (current->euid != tsk->uid)
-		    && (current->euid != tsk->suid)) {
+		euid = current_euid();
+		if (euid && euid != tsk->uid && euid != tsk->suid) {
 			put_task_struct(tsk);
 			return -EACCES;
 		}
diff --git a/kernel/futex.c b/kernel/futex.c
index 8af10027514..e06962132aa 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -439,10 +439,11 @@ static void free_pi_state(struct futex_pi_state *pi_state)
 static struct task_struct * futex_find_get_task(pid_t pid)
 {
 	struct task_struct *p;
+	uid_t euid = current_euid();
 
 	rcu_read_lock();
 	p = find_task_by_vpid(pid);
-	if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
+	if (!p || (euid != p->euid && euid != p->uid))
 		p = ERR_PTR(-ESRCH);
 	else
 		get_task_struct(p);
@@ -1829,6 +1830,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
 {
 	struct robust_list_head __user *head;
 	unsigned long ret;
+	uid_t euid = current_euid();
 
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
@@ -1844,8 +1846,8 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
 		if (!p)
 			goto err_unlock;
 		ret = -EPERM;
-		if ((current->euid != p->euid) && (current->euid != p->uid) &&
-				!capable(CAP_SYS_PTRACE))
+		if (euid != p->euid && euid != p->uid &&
+		    !capable(CAP_SYS_PTRACE))
 			goto err_unlock;
 		head = p->robust_list;
 		rcu_read_unlock();
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 04ac3a9e42c..3254d4e41e8 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -135,6 +135,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
 {
 	struct compat_robust_list_head __user *head;
 	unsigned long ret;
+	uid_t euid = current_euid();
 
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
@@ -150,7 +151,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
 		if (!p)
 			goto err_unlock;
 		ret = -EPERM;
-		if ((current->euid != p->euid) && (current->euid != p->uid) &&
+		if (euid != p->euid && euid != p->uid &&
 				!capable(CAP_SYS_PTRACE))
 			goto err_unlock;
 		head = p->compat_robust_list;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1e68e4c39e2..937f6b5b200 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -123,16 +123,19 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 	 * because setting up the necessary parent/child relationship
 	 * or halting the specified task is impossible.
 	 */
+	uid_t uid;
+	gid_t gid;
 	int dumpable = 0;
 	/* Don't let security modules deny introspection */
 	if (task == current)
 		return 0;
-	if (((current->uid != task->euid) ||
-	     (current->uid != task->suid) ||
-	     (current->uid != task->uid) ||
-	     (current->gid != task->egid) ||
-	     (current->gid != task->sgid) ||
-	     (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
+	current_uid_gid(&uid, &gid);
+	if ((uid != task->euid ||
+	     uid != task->suid ||
+	     uid != task->uid  ||
+	     gid != task->egid ||
+	     gid != task->sgid ||
+	     gid != task->gid) && !capable(CAP_SYS_PTRACE))
 		return -EPERM;
 	smp_rmb();
 	if (task->mm)
diff --git a/kernel/sched.c b/kernel/sched.c
index e8819bc6f46..c3b8b1fcde0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5128,6 +5128,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
 	unsigned long flags;
 	const struct sched_class *prev_class = p->sched_class;
 	struct rq *rq;
+	uid_t euid;
 
 	/* may grab non-irq protected spin_locks */
 	BUG_ON(in_interrupt());
@@ -5180,8 +5181,9 @@ recheck:
 			return -EPERM;
 
 		/* can't change other user's priorities */
-		if ((current->euid != p->euid) &&
-		    (current->euid != p->uid))
+		euid = current_euid();
+		if (euid != p->euid &&
+		    euid != p->uid)
 			return -EPERM;
 	}
 
@@ -5392,6 +5394,7 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
 	cpumask_t cpus_allowed;
 	cpumask_t new_mask = *in_mask;
 	struct task_struct *p;
+	uid_t euid;
 	int retval;
 
 	get_online_cpus();
@@ -5412,9 +5415,9 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
 	get_task_struct(p);
 	read_unlock(&tasklist_lock);
 
+	euid = current_euid();
 	retval = -EPERM;
-	if ((current->euid != p->euid) && (current->euid != p->uid) &&
-			!capable(CAP_SYS_NICE))
+	if (euid != p->euid && euid != p->uid && !capable(CAP_SYS_NICE))
 		goto out_unlock;
 
 	retval = security_task_setscheduler(p, 0, NULL);
diff --git a/kernel/signal.c b/kernel/signal.c
index 4530fc65445..167b535fe1a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -567,6 +567,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
 				 struct task_struct *t)
 {
 	struct pid *sid;
+	uid_t uid, euid;
 	int error;
 
 	if (!valid_signal(sig))
@@ -579,8 +580,10 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	if (error)
 		return error;
 
-	if ((current->euid ^ t->suid) && (current->euid ^ t->uid) &&
-	    (current->uid  ^ t->suid) && (current->uid  ^ t->uid) &&
+	uid = current_uid();
+	euid = current_euid();
+	if ((euid ^ t->suid) && (euid ^ t->uid) &&
+	    (uid  ^ t->suid) && (uid  ^ t->uid) &&
 	    !capable(CAP_KILL)) {
 		switch (sig) {
 		case SIGCONT:
@@ -844,7 +847,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			q->info.si_errno = 0;
 			q->info.si_code = SI_USER;
 			q->info.si_pid = task_pid_vnr(current);
-			q->info.si_uid = current->uid;
+			q->info.si_uid = current_uid();
 			break;
 		case (unsigned long) SEND_SIG_PRIV:
 			q->info.si_signo = sig;
@@ -1598,7 +1601,7 @@ void ptrace_notify(int exit_code)
 	info.si_signo = SIGTRAP;
 	info.si_code = exit_code;
 	info.si_pid = task_pid_vnr(current);
-	info.si_uid = current->uid;
+	info.si_uid = current_uid();
 
 	/* Let the debugger run.  */
 	spin_lock_irq(&current->sighand->siglock);
@@ -2211,7 +2214,7 @@ sys_kill(pid_t pid, int sig)
 	info.si_errno = 0;
 	info.si_code = SI_USER;
 	info.si_pid = task_tgid_vnr(current);
-	info.si_uid = current->uid;
+	info.si_uid = current_uid();
 
 	return kill_something_info(sig, &info, pid);
 }
@@ -2228,7 +2231,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
 	info.si_errno = 0;
 	info.si_code = SI_TKILL;
 	info.si_pid = task_tgid_vnr(current);
-	info.si_uid = current->uid;
+	info.si_uid = current_uid();
 
 	rcu_read_lock();
 	p = find_task_by_vpid(pid);
diff --git a/kernel/sys.c b/kernel/sys.c
index 31deba8f7d1..ed5c29c748a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -114,10 +114,10 @@ void (*pm_power_off_prepare)(void);
 
 static int set_one_prio(struct task_struct *p, int niceval, int error)
 {
+	uid_t euid = current_euid();
 	int no_nice;
 
-	if (p->uid != current->euid &&
-		p->euid != current->euid && !capable(CAP_SYS_NICE)) {
+	if (p->uid != euid && p->euid != euid && !capable(CAP_SYS_NICE)) {
 		error = -EPERM;
 		goto out;
 	}
@@ -176,16 +176,16 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
 		case PRIO_USER:
 			user = current->user;
 			if (!who)
-				who = current->uid;
+				who = current_uid();
 			else
-				if ((who != current->uid) && !(user = find_user(who)))
+				if (who != current_uid() && !(user = find_user(who)))
 					goto out_unlock;	/* No processes for this user */
 
 			do_each_thread(g, p)
 				if (p->uid == who)
 					error = set_one_prio(p, niceval, error);
 			while_each_thread(g, p);
-			if (who != current->uid)
+			if (who != current_uid())
 				free_uid(user);		/* For find_user() */
 			break;
 	}
@@ -238,9 +238,9 @@ asmlinkage long sys_getpriority(int which, int who)
 		case PRIO_USER:
 			user = current->user;
 			if (!who)
-				who = current->uid;
+				who = current_uid();
 			else
-				if ((who != current->uid) && !(user = find_user(who)))
+				if (who != current_uid() && !(user = find_user(who)))
 					goto out_unlock;	/* No processes for this user */
 
 			do_each_thread(g, p)
@@ -250,7 +250,7 @@ asmlinkage long sys_getpriority(int which, int who)
 						retval = niceval;
 				}
 			while_each_thread(g, p);
-			if (who != current->uid)
+			if (who != current_uid())
 				free_uid(user);		/* for find_user() */
 			break;
 	}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9d048fa2d90..511031381c3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1641,7 +1641,7 @@ out:
 
 static int test_perm(int mode, int op)
 {
-	if (!current->euid)
+	if (!current_euid())
 		mode >>= 6;
 	else if (in_egroup_p(0))
 		mode >>= 3;
diff --git a/kernel/timer.c b/kernel/timer.c
index 56becf373c5..b54e4646cee 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1123,25 +1123,25 @@ asmlinkage long sys_getppid(void)
 asmlinkage long sys_getuid(void)
 {
 	/* Only we change this so SMP safe */
-	return current->uid;
+	return current_uid();
 }
 
 asmlinkage long sys_geteuid(void)
 {
 	/* Only we change this so SMP safe */
-	return current->euid;
+	return current_euid();
 }
 
 asmlinkage long sys_getgid(void)
 {
 	/* Only we change this so SMP safe */
-	return current->gid;
+	return current_gid();
 }
 
 asmlinkage long sys_getegid(void)
 {
 	/* Only we change this so SMP safe */
-	return  current->egid;
+	return  current_egid();
 }
 
 #endif
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 532858fa5b8..f82730adea0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -38,7 +38,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
 	}
 
 	/* Reset current->user with a new one */
-	new_user = alloc_uid(ns, current->uid);
+	new_user = alloc_uid(ns, current_uid());
 	if (!new_user) {
 		free_uid(ns->root_user);
 		kfree(ns);
-- 
cgit v1.2.3


From 8bbf4976b59fc9fc2861e79cab7beb3f6d647640 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:14 +1100
Subject: KEYS: Alter use of key instantiation link-to-keyring argument

Alter the use of the key instantiation and negation functions' link-to-keyring
arguments.  Currently this specifies a keyring in the target process to link
the key into, creating the keyring if it doesn't exist.  This, however, can be
a problem for copy-on-write credentials as it means that the instantiating
process can alter the credentials of the requesting process.

This patch alters the behaviour such that:

 (1) If keyctl_instantiate_key() or keyctl_negate_key() are given a specific
     keyring by ID (ringid >= 0), then that keyring will be used.

 (2) If keyctl_instantiate_key() or keyctl_negate_key() are given one of the
     special constants that refer to the requesting process's keyrings
     (KEY_SPEC_*_KEYRING, all <= 0), then:

     (a) If sys_request_key() was given a keyring to use (destringid) then the
     	 key will be attached to that keyring.

     (b) If sys_request_key() was given a NULL keyring, then the key being
     	 instantiated will be attached to the default keyring as set by
     	 keyctl_set_reqkey_keyring().

 (3) No extra link will be made.

Decision point (1) follows current behaviour, and allows those instantiators
who've searched for a specifically named keyring in the requestor's keyring so
as to partition the keys by type to still have their named keyrings.

Decision point (2) allows the requestor to make sure that the key or keys that
get produced by request_key() go where they want, whilst allowing the
instantiator to request that the key is retained.  This is mainly useful for
situations where the instantiator makes a secondary request, the key for which
should be retained by the initial requestor:

	+-----------+        +--------------+        +--------------+
	|           |        |              |        |              |
	| Requestor |------->| Instantiator |------->| Instantiator |
	|           |        |              |        |              |
	+-----------+        +--------------+        +--------------+
	           request_key()           request_key()

This might be useful, for example, in Kerberos, where the requestor requests a
ticket, and then the ticket instantiator requests the TGT, which someone else
then has to go and fetch.  The TGT, however, should be retained in the
keyrings of the requestor, not the first instantiator.  To make this explict
an extra special keyring constant is also added.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/kmod.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 3d3c3ea3a02..f044f8f5770 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -140,7 +140,7 @@ static int ____call_usermodehelper(void *data)
 	/* Unblock all signals and set the session keyring. */
 	new_session = key_get(sub_info->ring);
 	spin_lock_irq(&current->sighand->siglock);
-	old_session = __install_session_keyring(current, new_session);
+	old_session = __install_session_keyring(new_session);
 	flush_signal_handlers(current, 1);
 	sigemptyset(&current->blocked);
 	recalc_sigpending();
-- 
cgit v1.2.3


From 1cdcbec1a3372c0c49c59d292e708fd07b509f18 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:14 +1100
Subject: CRED: Neuter sys_capset()

Take away the ability for sys_capset() to affect processes other than current.

This means that current will not need to lock its own credentials when reading
them against interference by other processes.

This has effectively been the case for a while anyway, since:

 (1) Without LSM enabled, sys_capset() is disallowed.

 (2) With file-based capabilities, sys_capset() is neutered.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Andrew G. Morgan <morgan@kernel.org>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/capability.c | 227 ++++++----------------------------------------------
 1 file changed, 23 insertions(+), 204 deletions(-)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index adb262f83de..58b00519624 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -127,160 +127,6 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
 	return 0;
 }
 
-#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
-
-/*
- * Without filesystem capability support, we nominally support one process
- * setting the capabilities of another
- */
-static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
-				     kernel_cap_t *pIp, kernel_cap_t *pPp)
-{
-	struct task_struct *target;
-	int ret;
-
-	spin_lock(&task_capability_lock);
-	read_lock(&tasklist_lock);
-
-	if (pid && pid != task_pid_vnr(current)) {
-		target = find_task_by_vpid(pid);
-		if (!target) {
-			ret = -ESRCH;
-			goto out;
-		}
-	} else
-		target = current;
-
-	ret = security_capget(target, pEp, pIp, pPp);
-
-out:
-	read_unlock(&tasklist_lock);
-	spin_unlock(&task_capability_lock);
-
-	return ret;
-}
-
-/*
- * cap_set_pg - set capabilities for all processes in a given process
- * group.  We call this holding task_capability_lock and tasklist_lock.
- */
-static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
-			     kernel_cap_t *inheritable,
-			     kernel_cap_t *permitted)
-{
-	struct task_struct *g, *target;
-	int ret = -EPERM;
-	int found = 0;
-	struct pid *pgrp;
-
-	spin_lock(&task_capability_lock);
-	read_lock(&tasklist_lock);
-
-	pgrp = find_vpid(pgrp_nr);
-	do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
-		target = g;
-		while_each_thread(g, target) {
-			if (!security_capset_check(target, effective,
-						   inheritable, permitted)) {
-				security_capset_set(target, effective,
-						    inheritable, permitted);
-				ret = 0;
-			}
-			found = 1;
-		}
-	} while_each_pid_task(pgrp, PIDTYPE_PGID, g);
-
-	read_unlock(&tasklist_lock);
-	spin_unlock(&task_capability_lock);
-
-	if (!found)
-		ret = 0;
-	return ret;
-}
-
-/*
- * cap_set_all - set capabilities for all processes other than init
- * and self.  We call this holding task_capability_lock and tasklist_lock.
- */
-static inline int cap_set_all(kernel_cap_t *effective,
-			      kernel_cap_t *inheritable,
-			      kernel_cap_t *permitted)
-{
-	struct task_struct *g, *target;
-	int ret = -EPERM;
-	int found = 0;
-
-	spin_lock(&task_capability_lock);
-	read_lock(&tasklist_lock);
-
-	do_each_thread(g, target) {
-		if (target == current
-		    || is_container_init(target->group_leader))
-			continue;
-		found = 1;
-		if (security_capset_check(target, effective, inheritable,
-					  permitted))
-			continue;
-		ret = 0;
-		security_capset_set(target, effective, inheritable, permitted);
-	} while_each_thread(g, target);
-
-	read_unlock(&tasklist_lock);
-	spin_unlock(&task_capability_lock);
-
-	if (!found)
-		ret = 0;
-
-	return ret;
-}
-
-/*
- * Given the target pid does not refer to the current process we
- * need more elaborate support... (This support is not present when
- * filesystem capabilities are configured.)
- */
-static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective,
-					    kernel_cap_t *inheritable,
-					    kernel_cap_t *permitted)
-{
-	struct task_struct *target;
-	int ret;
-
-	if (!capable(CAP_SETPCAP))
-		return -EPERM;
-
-	if (pid == -1)	          /* all procs other than current and init */
-		return cap_set_all(effective, inheritable, permitted);
-
-	else if (pid < 0)                    /* all procs in process group */
-		return cap_set_pg(-pid, effective, inheritable, permitted);
-
-	/* target != current */
-	spin_lock(&task_capability_lock);
-	read_lock(&tasklist_lock);
-
-	target = find_task_by_vpid(pid);
-	if (!target)
-		ret = -ESRCH;
-	else {
-		ret = security_capset_check(target, effective, inheritable,
-					    permitted);
-
-		/* having verified that the proposed changes are legal,
-		   we now put them into effect. */
-		if (!ret)
-			security_capset_set(target, effective, inheritable,
-					    permitted);
-	}
-
-	read_unlock(&tasklist_lock);
-	spin_unlock(&task_capability_lock);
-
-	return ret;
-}
-
-#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */
-
 /*
  * If we have configured with filesystem capability support, then the
  * only thing that can change the capabilities of the current process
@@ -314,22 +160,6 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
 	return ret;
 }
 
-/*
- * With filesystem capability support configured, the kernel does not
- * permit the changing of capabilities in one process by another
- * process. (CAP_SETPCAP has much less broad semantics when configured
- * this way.)
- */
-static inline int do_sys_capset_other_tasks(pid_t pid,
-					    kernel_cap_t *effective,
-					    kernel_cap_t *inheritable,
-					    kernel_cap_t *permitted)
-{
-	return -EPERM;
-}
-
-#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */
-
 /*
  * Atomically modify the effective capabilities returning the original
  * value. No permission check is performed here - it is assumed that the
@@ -424,16 +254,14 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
  * @data: pointer to struct that contains the effective, permitted,
  *	and inheritable capabilities
  *
- * Set capabilities for a given process, all processes, or all
- * processes in a given process group.
+ * Set capabilities for the current process only.  The ability to any other
+ * process(es) has been deprecated and removed.
  *
  * The restrictions on setting capabilities are specified as:
  *
- * [pid is for the 'target' task.  'current' is the calling task.]
- *
- * I: any raised capabilities must be a subset of the (old current) permitted
- * P: any raised capabilities must be a subset of the (old current) permitted
- * E: must be set to a subset of (new target) permitted
+ * I: any raised capabilities must be a subset of the old permitted
+ * P: any raised capabilities must be a subset of the old permitted
+ * E: must be set to a subset of new permitted
  *
  * Returns 0 on success and < 0 on error.
  */
@@ -452,10 +280,13 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	if (get_user(pid, &header->pid))
 		return -EFAULT;
 
+	/* may only affect current now */
+	if (pid != 0 && pid != task_pid_vnr(current))
+		return -EPERM;
+
 	if (copy_from_user(&kdata, data, tocopy
-			   * sizeof(struct __user_cap_data_struct))) {
+			   * sizeof(struct __user_cap_data_struct)))
 		return -EFAULT;
-	}
 
 	for (i = 0; i < tocopy; i++) {
 		effective.cap[i] = kdata[i].effective;
@@ -473,32 +304,20 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	if (ret)
 		return ret;
 
-	if (pid && (pid != task_pid_vnr(current)))
-		ret = do_sys_capset_other_tasks(pid, &effective, &inheritable,
-						&permitted);
-	else {
-		/*
-		 * This lock is required even when filesystem
-		 * capability support is configured - it protects the
-		 * sys_capget() call from returning incorrect data in
-		 * the case that the targeted process is not the
-		 * current one.
-		 */
-		spin_lock(&task_capability_lock);
-
-		ret = security_capset_check(current, &effective, &inheritable,
-					    &permitted);
-		/*
-		 * Having verified that the proposed changes are
-		 * legal, we now put them into effect.
-		 */
-		if (!ret)
-			security_capset_set(current, &effective, &inheritable,
-					    &permitted);
-		spin_unlock(&task_capability_lock);
-	}
-
+	/* This lock is required even when filesystem capability support is
+	 * configured - it protects the sys_capget() call from returning
+	 * incorrect data in the case that the targeted process is not the
+	 * current one.
+	 */
+	spin_lock(&task_capability_lock);
 
+	ret = security_capset_check(&effective, &inheritable, &permitted);
+	/* Having verified that the proposed changes are legal, we now put them
+	 * into effect.
+	 */
+	if (!ret)
+		security_capset_set(&effective, &inheritable, &permitted);
+	spin_unlock(&task_capability_lock);
 	return ret;
 }
 
-- 
cgit v1.2.3


From b6dff3ec5e116e3af6f537d4caedcad6b9e5082a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:16 +1100
Subject: CRED: Separate task security context from task_struct

Separate the task security context from task_struct.  At this point, the
security data is temporarily embedded in the task_struct with two pointers
pointing to it.

Note that the Alpha arch is altered as it refers to (E)UID and (E)GID in
entry.S via asm-offsets.

With comment fixes Signed-off-by: Marc Dionne <marc.c.dionne@gmail.com>

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/auditsc.c      |  52 +++++-----
 kernel/capability.c   |   4 +-
 kernel/cgroup.c       |   4 +-
 kernel/exit.c         |  10 +-
 kernel/fork.c         |  24 ++---
 kernel/futex.c        |   6 +-
 kernel/futex_compat.c |   5 +-
 kernel/ptrace.c       |  19 ++--
 kernel/sched.c        |  10 +-
 kernel/signal.c       |  16 +--
 kernel/sys.c          | 266 ++++++++++++++++++++++++++++----------------------
 kernel/trace/trace.c  |   2 +-
 kernel/tsacct.c       |   4 +-
 kernel/uid16.c        |  28 +++---
 kernel/user.c         |   4 +-
 15 files changed, 250 insertions(+), 204 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9c7e47ae457..2febf5165fa 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -447,6 +447,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 			      struct audit_names *name,
 			      enum audit_state *state)
 {
+	struct cred *cred = tsk->cred;
 	int i, j, need_sid = 1;
 	u32 sid;
 
@@ -466,28 +467,28 @@ static int audit_filter_rules(struct task_struct *tsk,
 			}
 			break;
 		case AUDIT_UID:
-			result = audit_comparator(tsk->uid, f->op, f->val);
+			result = audit_comparator(cred->uid, f->op, f->val);
 			break;
 		case AUDIT_EUID:
-			result = audit_comparator(tsk->euid, f->op, f->val);
+			result = audit_comparator(cred->euid, f->op, f->val);
 			break;
 		case AUDIT_SUID:
-			result = audit_comparator(tsk->suid, f->op, f->val);
+			result = audit_comparator(cred->suid, f->op, f->val);
 			break;
 		case AUDIT_FSUID:
-			result = audit_comparator(tsk->fsuid, f->op, f->val);
+			result = audit_comparator(cred->fsuid, f->op, f->val);
 			break;
 		case AUDIT_GID:
-			result = audit_comparator(tsk->gid, f->op, f->val);
+			result = audit_comparator(cred->gid, f->op, f->val);
 			break;
 		case AUDIT_EGID:
-			result = audit_comparator(tsk->egid, f->op, f->val);
+			result = audit_comparator(cred->egid, f->op, f->val);
 			break;
 		case AUDIT_SGID:
-			result = audit_comparator(tsk->sgid, f->op, f->val);
+			result = audit_comparator(cred->sgid, f->op, f->val);
 			break;
 		case AUDIT_FSGID:
-			result = audit_comparator(tsk->fsgid, f->op, f->val);
+			result = audit_comparator(cred->fsgid, f->op, f->val);
 			break;
 		case AUDIT_PERS:
 			result = audit_comparator(tsk->personality, f->op, f->val);
@@ -1228,6 +1229,7 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
 
 static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
 {
+	struct cred *cred = tsk->cred;
 	int i, call_panic = 0;
 	struct audit_buffer *ab;
 	struct audit_aux_data *aux;
@@ -1237,14 +1239,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	context->pid = tsk->pid;
 	if (!context->ppid)
 		context->ppid = sys_getppid();
-	context->uid = tsk->uid;
-	context->gid = tsk->gid;
-	context->euid = tsk->euid;
-	context->suid = tsk->suid;
-	context->fsuid = tsk->fsuid;
-	context->egid = tsk->egid;
-	context->sgid = tsk->sgid;
-	context->fsgid = tsk->fsgid;
+	context->uid = cred->uid;
+	context->gid = cred->gid;
+	context->euid = cred->euid;
+	context->suid = cred->suid;
+	context->fsuid = cred->fsuid;
+	context->egid = cred->egid;
+	context->sgid = cred->sgid;
+	context->fsgid = cred->fsgid;
 	context->personality = tsk->personality;
 
 	ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
@@ -2086,7 +2088,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
 			audit_log_format(ab, "login pid=%d uid=%u "
 				"old auid=%u new auid=%u"
 				" old ses=%u new ses=%u",
-				task->pid, task->uid,
+				task->pid, task->cred->uid,
 				task->loginuid, loginuid,
 				task->sessionid, sessionid);
 			audit_log_end(ab);
@@ -2469,7 +2471,7 @@ void __audit_ptrace(struct task_struct *t)
 
 	context->target_pid = t->pid;
 	context->target_auid = audit_get_loginuid(t);
-	context->target_uid = t->uid;
+	context->target_uid = t->cred->uid;
 	context->target_sessionid = audit_get_sessionid(t);
 	security_task_getsecid(t, &context->target_sid);
 	memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
@@ -2495,7 +2497,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
 			if (tsk->loginuid != -1)
 				audit_sig_uid = tsk->loginuid;
 			else
-				audit_sig_uid = tsk->uid;
+				audit_sig_uid = tsk->cred->uid;
 			security_task_getsecid(tsk, &audit_sig_sid);
 		}
 		if (!audit_signals || audit_dummy_context())
@@ -2507,7 +2509,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
 	if (!ctx->target_pid) {
 		ctx->target_pid = t->tgid;
 		ctx->target_auid = audit_get_loginuid(t);
-		ctx->target_uid = t->uid;
+		ctx->target_uid = t->cred->uid;
 		ctx->target_sessionid = audit_get_sessionid(t);
 		security_task_getsecid(t, &ctx->target_sid);
 		memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
@@ -2528,7 +2530,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
 
 	axp->target_pid[axp->pid_count] = t->tgid;
 	axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
-	axp->target_uid[axp->pid_count] = t->uid;
+	axp->target_uid[axp->pid_count] = t->cred->uid;
 	axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
 	security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
 	memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
@@ -2575,12 +2577,12 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_
 	ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
 
 	ax->old_pcap.permitted = *pP;
-	ax->old_pcap.inheritable = current->cap_inheritable;
+	ax->old_pcap.inheritable = current->cred->cap_inheritable;
 	ax->old_pcap.effective = *pE;
 
-	ax->new_pcap.permitted = current->cap_permitted;
-	ax->new_pcap.inheritable = current->cap_inheritable;
-	ax->new_pcap.effective = current->cap_effective;
+	ax->new_pcap.permitted = current->cred->cap_permitted;
+	ax->new_pcap.inheritable = current->cred->cap_inheritable;
+	ax->new_pcap.effective = current->cred->cap_effective;
 }
 
 /**
diff --git a/kernel/capability.c b/kernel/capability.c
index 58b00519624..a404b980b1b 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -171,8 +171,8 @@ kernel_cap_t cap_set_effective(const kernel_cap_t pE_new)
 
 	spin_lock(&task_capability_lock);
 
-	pE_old = current->cap_effective;
-	current->cap_effective = pE_new;
+	pE_old = current->cred->cap_effective;
+	current->cred->cap_effective = pE_new;
 
 	spin_unlock(&task_capability_lock);
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 78f9b310c4f..e210526e640 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1293,7 +1293,9 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
 		rcu_read_unlock();
 
 		euid = current_euid();
-		if (euid && euid != tsk->uid && euid != tsk->suid) {
+		if (euid &&
+		    euid != tsk->cred->uid &&
+		    euid != tsk->cred->suid) {
 			put_task_struct(tsk);
 			return -EACCES;
 		}
diff --git a/kernel/exit.c b/kernel/exit.c
index 80137a5d946..e0f6e1892fb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -160,7 +160,7 @@ void release_task(struct task_struct * p)
 	int zap_leader;
 repeat:
 	tracehook_prepare_release_task(p);
-	atomic_dec(&p->user->processes);
+	atomic_dec(&p->cred->user->processes);
 	proc_flush_task(p);
 	write_lock_irq(&tasklist_lock);
 	tracehook_finish_release_task(p);
@@ -1272,7 +1272,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
 		return 0;
 
 	if (unlikely(options & WNOWAIT)) {
-		uid_t uid = p->uid;
+		uid_t uid = p->cred->uid;
 		int exit_code = p->exit_code;
 		int why, status;
 
@@ -1393,7 +1393,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	if (!retval && infop)
 		retval = put_user(pid, &infop->si_pid);
 	if (!retval && infop)
-		retval = put_user(p->uid, &infop->si_uid);
+		retval = put_user(p->cred->uid, &infop->si_uid);
 	if (!retval)
 		retval = pid;
 
@@ -1458,7 +1458,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
 	if (!unlikely(options & WNOWAIT))
 		p->exit_code = 0;
 
-	uid = p->uid;
+	uid = p->cred->uid;
 unlock_sig:
 	spin_unlock_irq(&p->sighand->siglock);
 	if (!exit_code)
@@ -1535,7 +1535,7 @@ static int wait_task_continued(struct task_struct *p, int options,
 	spin_unlock_irq(&p->sighand->siglock);
 
 	pid = task_pid_vnr(p);
-	uid = p->uid;
+	uid = p->cred->uid;
 	get_task_struct(p);
 	read_unlock(&tasklist_lock);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index f6083561dfe..81fdc773390 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -147,8 +147,8 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(tsk == current);
 
 	security_task_free(tsk);
-	free_uid(tsk->user);
-	put_group_info(tsk->group_info);
+	free_uid(tsk->__temp_cred.user);
+	put_group_info(tsk->__temp_cred.group_info);
 	delayacct_tsk_free(tsk);
 
 	if (!profile_handoff_task(tsk))
@@ -969,17 +969,18 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
+	p->cred = &p->__temp_cred;
 	retval = -EAGAIN;
-	if (atomic_read(&p->user->processes) >=
+	if (atomic_read(&p->cred->user->processes) >=
 			p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
 		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
-		    p->user != current->nsproxy->user_ns->root_user)
+		    p->cred->user != current->nsproxy->user_ns->root_user)
 			goto bad_fork_free;
 	}
 
-	atomic_inc(&p->user->__count);
-	atomic_inc(&p->user->processes);
-	get_group_info(p->group_info);
+	atomic_inc(&p->cred->user->__count);
+	atomic_inc(&p->cred->user->processes);
+	get_group_info(p->cred->group_info);
 
 	/*
 	 * If multiple threads are within copy_process(), then this check
@@ -1035,9 +1036,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->real_start_time = p->start_time;
 	monotonic_to_bootbased(&p->real_start_time);
 #ifdef CONFIG_SECURITY
-	p->security = NULL;
+	p->cred->security = NULL;
 #endif
-	p->cap_bset = current->cap_bset;
 	p->io_context = NULL;
 	p->audit_context = NULL;
 	cgroup_fork(p);
@@ -1298,9 +1298,9 @@ bad_fork_cleanup_cgroup:
 bad_fork_cleanup_put_domain:
 	module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
-	put_group_info(p->group_info);
-	atomic_dec(&p->user->processes);
-	free_uid(p->user);
+	put_group_info(p->cred->group_info);
+	atomic_dec(&p->cred->user->processes);
+	free_uid(p->cred->user);
 bad_fork_free:
 	free_task(p);
 fork_out:
diff --git a/kernel/futex.c b/kernel/futex.c
index e06962132aa..28421d8210b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -443,7 +443,8 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 
 	rcu_read_lock();
 	p = find_task_by_vpid(pid);
-	if (!p || (euid != p->euid && euid != p->uid))
+	if (!p || (euid != p->cred->euid &&
+		   euid != p->cred->uid))
 		p = ERR_PTR(-ESRCH);
 	else
 		get_task_struct(p);
@@ -1846,7 +1847,8 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
 		if (!p)
 			goto err_unlock;
 		ret = -EPERM;
-		if (euid != p->euid && euid != p->uid &&
+		if (euid != p->cred->euid &&
+		    euid != p->cred->uid &&
 		    !capable(CAP_SYS_PTRACE))
 			goto err_unlock;
 		head = p->robust_list;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 3254d4e41e8..2c3fd5ed34f 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -151,8 +151,9 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
 		if (!p)
 			goto err_unlock;
 		ret = -EPERM;
-		if (euid != p->euid && euid != p->uid &&
-				!capable(CAP_SYS_PTRACE))
+		if (euid != p->cred->euid &&
+		    euid != p->cred->uid &&
+		    !capable(CAP_SYS_PTRACE))
 			goto err_unlock;
 		head = p->compat_robust_list;
 		read_unlock(&tasklist_lock);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 937f6b5b200..49849d12dd1 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -115,6 +115,8 @@ int ptrace_check_attach(struct task_struct *child, int kill)
 
 int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 {
+	struct cred *cred = current->cred, *tcred = task->cred;
+
 	/* May we inspect the given task?
 	 * This check is used both for attaching with ptrace
 	 * and for allowing access to sensitive information in /proc.
@@ -123,19 +125,18 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 	 * because setting up the necessary parent/child relationship
 	 * or halting the specified task is impossible.
 	 */
-	uid_t uid;
-	gid_t gid;
+	uid_t uid = cred->uid;
+	gid_t gid = cred->gid;
 	int dumpable = 0;
 	/* Don't let security modules deny introspection */
 	if (task == current)
 		return 0;
-	current_uid_gid(&uid, &gid);
-	if ((uid != task->euid ||
-	     uid != task->suid ||
-	     uid != task->uid  ||
-	     gid != task->egid ||
-	     gid != task->sgid ||
-	     gid != task->gid) && !capable(CAP_SYS_PTRACE))
+	if ((uid != tcred->euid ||
+	     uid != tcred->suid ||
+	     uid != tcred->uid  ||
+	     gid != tcred->egid ||
+	     gid != tcred->sgid ||
+	     gid != tcred->gid) && !capable(CAP_SYS_PTRACE))
 		return -EPERM;
 	smp_rmb();
 	if (task->mm)
diff --git a/kernel/sched.c b/kernel/sched.c
index c3b8b1fcde0..733c59e645a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -345,7 +345,7 @@ static inline struct task_group *task_group(struct task_struct *p)
 	struct task_group *tg;
 
 #ifdef CONFIG_USER_SCHED
-	tg = p->user->tg;
+	tg = p->cred->user->tg;
 #elif defined(CONFIG_CGROUP_SCHED)
 	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
 				struct task_group, css);
@@ -5182,8 +5182,8 @@ recheck:
 
 		/* can't change other user's priorities */
 		euid = current_euid();
-		if (euid != p->euid &&
-		    euid != p->uid)
+		if (euid != p->cred->euid &&
+		    euid != p->cred->uid)
 			return -EPERM;
 	}
 
@@ -5417,7 +5417,9 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
 
 	euid = current_euid();
 	retval = -EPERM;
-	if (euid != p->euid && euid != p->uid && !capable(CAP_SYS_NICE))
+	if (euid != p->cred->euid &&
+	    euid != p->cred->uid &&
+	    !capable(CAP_SYS_NICE))
 		goto out_unlock;
 
 	retval = security_task_setscheduler(p, 0, NULL);
diff --git a/kernel/signal.c b/kernel/signal.c
index 167b535fe1a..80e8a6489f9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -187,7 +187,7 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 	 * In order to avoid problems with "switch_user()", we want to make
 	 * sure that the compiler doesn't re-load "t->user"
 	 */
-	user = t->user;
+	user = t->cred->user;
 	barrier();
 	atomic_inc(&user->sigpending);
 	if (override_rlimit ||
@@ -582,8 +582,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
 
 	uid = current_uid();
 	euid = current_euid();
-	if ((euid ^ t->suid) && (euid ^ t->uid) &&
-	    (uid  ^ t->suid) && (uid  ^ t->uid) &&
+	if ((euid ^ t->cred->suid) && (euid ^ t->cred->uid) &&
+	    (uid  ^ t->cred->suid) && (uid  ^ t->cred->uid) &&
 	    !capable(CAP_KILL)) {
 		switch (sig) {
 		case SIGCONT:
@@ -1100,8 +1100,8 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
 		goto out_unlock;
 	}
 	if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
-	    && (euid != p->suid) && (euid != p->uid)
-	    && (uid != p->suid) && (uid != p->uid)) {
+	    && (euid != p->cred->suid) && (euid != p->cred->uid)
+	    && (uid != p->cred->suid) && (uid != p->cred->uid)) {
 		ret = -EPERM;
 		goto out_unlock;
 	}
@@ -1374,7 +1374,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
 	rcu_read_unlock();
 
-	info.si_uid = tsk->uid;
+	info.si_uid = tsk->cred->uid;
 
 	thread_group_cputime(tsk, &cputime);
 	info.si_utime = cputime_to_jiffies(cputime.utime);
@@ -1445,7 +1445,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
 	info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
 	rcu_read_unlock();
 
-	info.si_uid = tsk->uid;
+	info.si_uid = tsk->cred->uid;
 
 	info.si_utime = cputime_to_clock_t(tsk->utime);
 	info.si_stime = cputime_to_clock_t(tsk->stime);
@@ -1713,7 +1713,7 @@ static int ptrace_signal(int signr, siginfo_t *info,
 		info->si_errno = 0;
 		info->si_code = SI_USER;
 		info->si_pid = task_pid_vnr(current->parent);
-		info->si_uid = current->parent->uid;
+		info->si_uid = current->parent->cred->uid;
 	}
 
 	/* If the (new) signal is now blocked, requeue it.  */
diff --git a/kernel/sys.c b/kernel/sys.c
index ed5c29c748a..5d81f07c015 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -117,7 +117,9 @@ static int set_one_prio(struct task_struct *p, int niceval, int error)
 	uid_t euid = current_euid();
 	int no_nice;
 
-	if (p->uid != euid && p->euid != euid && !capable(CAP_SYS_NICE)) {
+	if (p->cred->uid  != euid &&
+	    p->cred->euid != euid &&
+	    !capable(CAP_SYS_NICE)) {
 		error = -EPERM;
 		goto out;
 	}
@@ -174,7 +176,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
-			user = current->user;
+			user = current->cred->user;
 			if (!who)
 				who = current_uid();
 			else
@@ -182,7 +184,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
 					goto out_unlock;	/* No processes for this user */
 
 			do_each_thread(g, p)
-				if (p->uid == who)
+				if (p->cred->uid == who)
 					error = set_one_prio(p, niceval, error);
 			while_each_thread(g, p);
 			if (who != current_uid())
@@ -236,7 +238,7 @@ asmlinkage long sys_getpriority(int which, int who)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
-			user = current->user;
+			user = current->cred->user;
 			if (!who)
 				who = current_uid();
 			else
@@ -244,7 +246,7 @@ asmlinkage long sys_getpriority(int which, int who)
 					goto out_unlock;	/* No processes for this user */
 
 			do_each_thread(g, p)
-				if (p->uid == who) {
+				if (p->cred->uid == who) {
 					niceval = 20 - task_nice(p);
 					if (niceval > retval)
 						retval = niceval;
@@ -472,8 +474,9 @@ void ctrl_alt_del(void)
  */
 asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
 {
-	int old_rgid = current->gid;
-	int old_egid = current->egid;
+	struct cred *cred = current->cred;
+	int old_rgid = cred->gid;
+	int old_egid = cred->egid;
 	int new_rgid = old_rgid;
 	int new_egid = old_egid;
 	int retval;
@@ -484,7 +487,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
 
 	if (rgid != (gid_t) -1) {
 		if ((old_rgid == rgid) ||
-		    (current->egid==rgid) ||
+		    (cred->egid == rgid) ||
 		    capable(CAP_SETGID))
 			new_rgid = rgid;
 		else
@@ -492,8 +495,8 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
 	}
 	if (egid != (gid_t) -1) {
 		if ((old_rgid == egid) ||
-		    (current->egid == egid) ||
-		    (current->sgid == egid) ||
+		    (cred->egid == egid) ||
+		    (cred->sgid == egid) ||
 		    capable(CAP_SETGID))
 			new_egid = egid;
 		else
@@ -505,10 +508,10 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
 	}
 	if (rgid != (gid_t) -1 ||
 	    (egid != (gid_t) -1 && egid != old_rgid))
-		current->sgid = new_egid;
-	current->fsgid = new_egid;
-	current->egid = new_egid;
-	current->gid = new_rgid;
+		cred->sgid = new_egid;
+	cred->fsgid = new_egid;
+	cred->egid = new_egid;
+	cred->gid = new_rgid;
 	key_fsgid_changed(current);
 	proc_id_connector(current, PROC_EVENT_GID);
 	return 0;
@@ -521,7 +524,8 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
  */
 asmlinkage long sys_setgid(gid_t gid)
 {
-	int old_egid = current->egid;
+	struct cred *cred = current->cred;
+	int old_egid = cred->egid;
 	int retval;
 
 	retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
@@ -533,13 +537,13 @@ asmlinkage long sys_setgid(gid_t gid)
 			set_dumpable(current->mm, suid_dumpable);
 			smp_wmb();
 		}
-		current->gid = current->egid = current->sgid = current->fsgid = gid;
-	} else if ((gid == current->gid) || (gid == current->sgid)) {
+		cred->gid = cred->egid = cred->sgid = cred->fsgid = gid;
+	} else if ((gid == cred->gid) || (gid == cred->sgid)) {
 		if (old_egid != gid) {
 			set_dumpable(current->mm, suid_dumpable);
 			smp_wmb();
 		}
-		current->egid = current->fsgid = gid;
+		cred->egid = cred->fsgid = gid;
 	}
 	else
 		return -EPERM;
@@ -570,7 +574,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
 		set_dumpable(current->mm, suid_dumpable);
 		smp_wmb();
 	}
-	current->uid = new_ruid;
+	current->cred->uid = new_ruid;
 	return 0;
 }
 
@@ -591,6 +595,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
  */
 asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 {
+	struct cred *cred = current->cred;
 	int old_ruid, old_euid, old_suid, new_ruid, new_euid;
 	int retval;
 
@@ -598,14 +603,14 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 	if (retval)
 		return retval;
 
-	new_ruid = old_ruid = current->uid;
-	new_euid = old_euid = current->euid;
-	old_suid = current->suid;
+	new_ruid = old_ruid = cred->uid;
+	new_euid = old_euid = cred->euid;
+	old_suid = cred->suid;
 
 	if (ruid != (uid_t) -1) {
 		new_ruid = ruid;
 		if ((old_ruid != ruid) &&
-		    (current->euid != ruid) &&
+		    (cred->euid != ruid) &&
 		    !capable(CAP_SETUID))
 			return -EPERM;
 	}
@@ -613,8 +618,8 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 	if (euid != (uid_t) -1) {
 		new_euid = euid;
 		if ((old_ruid != euid) &&
-		    (current->euid != euid) &&
-		    (current->suid != euid) &&
+		    (cred->euid != euid) &&
+		    (cred->suid != euid) &&
 		    !capable(CAP_SETUID))
 			return -EPERM;
 	}
@@ -626,11 +631,11 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 		set_dumpable(current->mm, suid_dumpable);
 		smp_wmb();
 	}
-	current->fsuid = current->euid = new_euid;
+	cred->fsuid = cred->euid = new_euid;
 	if (ruid != (uid_t) -1 ||
 	    (euid != (uid_t) -1 && euid != old_ruid))
-		current->suid = current->euid;
-	current->fsuid = current->euid;
+		cred->suid = cred->euid;
+	cred->fsuid = cred->euid;
 
 	key_fsuid_changed(current);
 	proc_id_connector(current, PROC_EVENT_UID);
@@ -653,7 +658,8 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
  */
 asmlinkage long sys_setuid(uid_t uid)
 {
-	int old_euid = current->euid;
+	struct cred *cred = current->cred;
+	int old_euid = cred->euid;
 	int old_ruid, old_suid, new_suid;
 	int retval;
 
@@ -661,23 +667,23 @@ asmlinkage long sys_setuid(uid_t uid)
 	if (retval)
 		return retval;
 
-	old_ruid = current->uid;
-	old_suid = current->suid;
+	old_ruid = cred->uid;
+	old_suid = cred->suid;
 	new_suid = old_suid;
 	
 	if (capable(CAP_SETUID)) {
 		if (uid != old_ruid && set_user(uid, old_euid != uid) < 0)
 			return -EAGAIN;
 		new_suid = uid;
-	} else if ((uid != current->uid) && (uid != new_suid))
+	} else if ((uid != cred->uid) && (uid != new_suid))
 		return -EPERM;
 
 	if (old_euid != uid) {
 		set_dumpable(current->mm, suid_dumpable);
 		smp_wmb();
 	}
-	current->fsuid = current->euid = uid;
-	current->suid = new_suid;
+	cred->fsuid = cred->euid = uid;
+	cred->suid = new_suid;
 
 	key_fsuid_changed(current);
 	proc_id_connector(current, PROC_EVENT_UID);
@@ -692,9 +698,10 @@ asmlinkage long sys_setuid(uid_t uid)
  */
 asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 {
-	int old_ruid = current->uid;
-	int old_euid = current->euid;
-	int old_suid = current->suid;
+	struct cred *cred = current->cred;
+	int old_ruid = cred->uid;
+	int old_euid = cred->euid;
+	int old_suid = cred->suid;
 	int retval;
 
 	retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
@@ -702,30 +709,31 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 		return retval;
 
 	if (!capable(CAP_SETUID)) {
-		if ((ruid != (uid_t) -1) && (ruid != current->uid) &&
-		    (ruid != current->euid) && (ruid != current->suid))
+		if ((ruid != (uid_t) -1) && (ruid != cred->uid) &&
+		    (ruid != cred->euid) && (ruid != cred->suid))
 			return -EPERM;
-		if ((euid != (uid_t) -1) && (euid != current->uid) &&
-		    (euid != current->euid) && (euid != current->suid))
+		if ((euid != (uid_t) -1) && (euid != cred->uid) &&
+		    (euid != cred->euid) && (euid != cred->suid))
 			return -EPERM;
-		if ((suid != (uid_t) -1) && (suid != current->uid) &&
-		    (suid != current->euid) && (suid != current->suid))
+		if ((suid != (uid_t) -1) && (suid != cred->uid) &&
+		    (suid != cred->euid) && (suid != cred->suid))
 			return -EPERM;
 	}
 	if (ruid != (uid_t) -1) {
-		if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0)
+		if (ruid != cred->uid &&
+		    set_user(ruid, euid != cred->euid) < 0)
 			return -EAGAIN;
 	}
 	if (euid != (uid_t) -1) {
-		if (euid != current->euid) {
+		if (euid != cred->euid) {
 			set_dumpable(current->mm, suid_dumpable);
 			smp_wmb();
 		}
-		current->euid = euid;
+		cred->euid = euid;
 	}
-	current->fsuid = current->euid;
+	cred->fsuid = cred->euid;
 	if (suid != (uid_t) -1)
-		current->suid = suid;
+		cred->suid = suid;
 
 	key_fsuid_changed(current);
 	proc_id_connector(current, PROC_EVENT_UID);
@@ -735,11 +743,12 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 
 asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid)
 {
+	struct cred *cred = current->cred;
 	int retval;
 
-	if (!(retval = put_user(current->uid, ruid)) &&
-	    !(retval = put_user(current->euid, euid)))
-		retval = put_user(current->suid, suid);
+	if (!(retval = put_user(cred->uid, ruid)) &&
+	    !(retval = put_user(cred->euid, euid)))
+		retval = put_user(cred->suid, suid);
 
 	return retval;
 }
@@ -749,6 +758,7 @@ asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __us
  */
 asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 {
+	struct cred *cred = current->cred;
 	int retval;
 
 	retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
@@ -756,28 +766,28 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 		return retval;
 
 	if (!capable(CAP_SETGID)) {
-		if ((rgid != (gid_t) -1) && (rgid != current->gid) &&
-		    (rgid != current->egid) && (rgid != current->sgid))
+		if ((rgid != (gid_t) -1) && (rgid != cred->gid) &&
+		    (rgid != cred->egid) && (rgid != cred->sgid))
 			return -EPERM;
-		if ((egid != (gid_t) -1) && (egid != current->gid) &&
-		    (egid != current->egid) && (egid != current->sgid))
+		if ((egid != (gid_t) -1) && (egid != cred->gid) &&
+		    (egid != cred->egid) && (egid != cred->sgid))
 			return -EPERM;
-		if ((sgid != (gid_t) -1) && (sgid != current->gid) &&
-		    (sgid != current->egid) && (sgid != current->sgid))
+		if ((sgid != (gid_t) -1) && (sgid != cred->gid) &&
+		    (sgid != cred->egid) && (sgid != cred->sgid))
 			return -EPERM;
 	}
 	if (egid != (gid_t) -1) {
-		if (egid != current->egid) {
+		if (egid != cred->egid) {
 			set_dumpable(current->mm, suid_dumpable);
 			smp_wmb();
 		}
-		current->egid = egid;
+		cred->egid = egid;
 	}
-	current->fsgid = current->egid;
+	cred->fsgid = cred->egid;
 	if (rgid != (gid_t) -1)
-		current->gid = rgid;
+		cred->gid = rgid;
 	if (sgid != (gid_t) -1)
-		current->sgid = sgid;
+		cred->sgid = sgid;
 
 	key_fsgid_changed(current);
 	proc_id_connector(current, PROC_EVENT_GID);
@@ -786,11 +796,12 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 
 asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid)
 {
+	struct cred *cred = current->cred;
 	int retval;
 
-	if (!(retval = put_user(current->gid, rgid)) &&
-	    !(retval = put_user(current->egid, egid)))
-		retval = put_user(current->sgid, sgid);
+	if (!(retval = put_user(cred->gid, rgid)) &&
+	    !(retval = put_user(cred->egid, egid)))
+		retval = put_user(cred->sgid, sgid);
 
 	return retval;
 }
@@ -804,20 +815,21 @@ asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __us
  */
 asmlinkage long sys_setfsuid(uid_t uid)
 {
+	struct cred *cred = current->cred;
 	int old_fsuid;
 
-	old_fsuid = current->fsuid;
+	old_fsuid = cred->fsuid;
 	if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS))
 		return old_fsuid;
 
-	if (uid == current->uid || uid == current->euid ||
-	    uid == current->suid || uid == current->fsuid || 
+	if (uid == cred->uid || uid == cred->euid ||
+	    uid == cred->suid || uid == cred->fsuid ||
 	    capable(CAP_SETUID)) {
 		if (uid != old_fsuid) {
 			set_dumpable(current->mm, suid_dumpable);
 			smp_wmb();
 		}
-		current->fsuid = uid;
+		cred->fsuid = uid;
 	}
 
 	key_fsuid_changed(current);
@@ -833,20 +845,21 @@ asmlinkage long sys_setfsuid(uid_t uid)
  */
 asmlinkage long sys_setfsgid(gid_t gid)
 {
+	struct cred *cred = current->cred;
 	int old_fsgid;
 
-	old_fsgid = current->fsgid;
+	old_fsgid = cred->fsgid;
 	if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
 		return old_fsgid;
 
-	if (gid == current->gid || gid == current->egid ||
-	    gid == current->sgid || gid == current->fsgid || 
+	if (gid == cred->gid || gid == cred->egid ||
+	    gid == cred->sgid || gid == cred->fsgid ||
 	    capable(CAP_SETGID)) {
 		if (gid != old_fsgid) {
 			set_dumpable(current->mm, suid_dumpable);
 			smp_wmb();
 		}
-		current->fsgid = gid;
+		cred->fsgid = gid;
 		key_fsgid_changed(current);
 		proc_id_connector(current, PROC_EVENT_GID);
 	}
@@ -1208,8 +1221,15 @@ int groups_search(struct group_info *group_info, gid_t grp)
 	return 0;
 }
 
-/* validate and set current->group_info */
-int set_current_groups(struct group_info *group_info)
+/**
+ * set_groups - Change a group subscription in a security record
+ * @sec: The security record to alter
+ * @group_info: The group list to impose
+ *
+ * Validate a group subscription and, if valid, impose it upon a task security
+ * record.
+ */
+int set_groups(struct cred *cred, struct group_info *group_info)
 {
 	int retval;
 	struct group_info *old_info;
@@ -1221,20 +1241,34 @@ int set_current_groups(struct group_info *group_info)
 	groups_sort(group_info);
 	get_group_info(group_info);
 
-	task_lock(current);
-	old_info = current->group_info;
-	current->group_info = group_info;
-	task_unlock(current);
+	spin_lock(&cred->lock);
+	old_info = cred->group_info;
+	cred->group_info = group_info;
+	spin_unlock(&cred->lock);
 
 	put_group_info(old_info);
-
 	return 0;
 }
 
+EXPORT_SYMBOL(set_groups);
+
+/**
+ * set_current_groups - Change current's group subscription
+ * @group_info: The group list to impose
+ *
+ * Validate a group subscription and, if valid, impose it upon current's task
+ * security record.
+ */
+int set_current_groups(struct group_info *group_info)
+{
+	return set_groups(current->cred, group_info);
+}
+
 EXPORT_SYMBOL(set_current_groups);
 
 asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
 {
+	struct cred *cred = current->cred;
 	int i = 0;
 
 	/*
@@ -1246,13 +1280,13 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
 		return -EINVAL;
 
 	/* no need to grab task_lock here; it cannot change */
-	i = current->group_info->ngroups;
+	i = cred->group_info->ngroups;
 	if (gidsetsize) {
 		if (i > gidsetsize) {
 			i = -EINVAL;
 			goto out;
 		}
-		if (groups_to_user(grouplist, current->group_info)) {
+		if (groups_to_user(grouplist, cred->group_info)) {
 			i = -EFAULT;
 			goto out;
 		}
@@ -1296,9 +1330,10 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist)
  */
 int in_group_p(gid_t grp)
 {
+	struct cred *cred = current->cred;
 	int retval = 1;
-	if (grp != current->fsgid)
-		retval = groups_search(current->group_info, grp);
+	if (grp != cred->fsgid)
+		retval = groups_search(cred->group_info, grp);
 	return retval;
 }
 
@@ -1306,9 +1341,10 @@ EXPORT_SYMBOL(in_group_p);
 
 int in_egroup_p(gid_t grp)
 {
+	struct cred *cred = current->cred;
 	int retval = 1;
-	if (grp != current->egid)
-		retval = groups_search(current->group_info, grp);
+	if (grp != cred->egid)
+		retval = groups_search(cred->group_info, grp);
 	return retval;
 }
 
@@ -1624,7 +1660,9 @@ asmlinkage long sys_umask(int mask)
 asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			  unsigned long arg4, unsigned long arg5)
 {
-	long error = 0;
+	struct task_struct *me = current;
+	unsigned char comm[sizeof(me->comm)];
+	long error;
 
 	if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error))
 		return error;
@@ -1635,39 +1673,41 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 				error = -EINVAL;
 				break;
 			}
-			current->pdeath_signal = arg2;
+			me->pdeath_signal = arg2;
+			error = 0;
 			break;
 		case PR_GET_PDEATHSIG:
-			error = put_user(current->pdeath_signal, (int __user *)arg2);
+			error = put_user(me->pdeath_signal, (int __user *)arg2);
 			break;
 		case PR_GET_DUMPABLE:
-			error = get_dumpable(current->mm);
+			error = get_dumpable(me->mm);
 			break;
 		case PR_SET_DUMPABLE:
 			if (arg2 < 0 || arg2 > 1) {
 				error = -EINVAL;
 				break;
 			}
-			set_dumpable(current->mm, arg2);
+			set_dumpable(me->mm, arg2);
+			error = 0;
 			break;
 
 		case PR_SET_UNALIGN:
-			error = SET_UNALIGN_CTL(current, arg2);
+			error = SET_UNALIGN_CTL(me, arg2);
 			break;
 		case PR_GET_UNALIGN:
-			error = GET_UNALIGN_CTL(current, arg2);
+			error = GET_UNALIGN_CTL(me, arg2);
 			break;
 		case PR_SET_FPEMU:
-			error = SET_FPEMU_CTL(current, arg2);
+			error = SET_FPEMU_CTL(me, arg2);
 			break;
 		case PR_GET_FPEMU:
-			error = GET_FPEMU_CTL(current, arg2);
+			error = GET_FPEMU_CTL(me, arg2);
 			break;
 		case PR_SET_FPEXC:
-			error = SET_FPEXC_CTL(current, arg2);
+			error = SET_FPEXC_CTL(me, arg2);
 			break;
 		case PR_GET_FPEXC:
-			error = GET_FPEXC_CTL(current, arg2);
+			error = GET_FPEXC_CTL(me, arg2);
 			break;
 		case PR_GET_TIMING:
 			error = PR_TIMING_STATISTICAL;
@@ -1675,33 +1715,28 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 		case PR_SET_TIMING:
 			if (arg2 != PR_TIMING_STATISTICAL)
 				error = -EINVAL;
+			else
+				error = 0;
 			break;
 
-		case PR_SET_NAME: {
-			struct task_struct *me = current;
-			unsigned char ncomm[sizeof(me->comm)];
-
-			ncomm[sizeof(me->comm)-1] = 0;
-			if (strncpy_from_user(ncomm, (char __user *)arg2,
-						sizeof(me->comm)-1) < 0)
+		case PR_SET_NAME:
+			comm[sizeof(me->comm)-1] = 0;
+			if (strncpy_from_user(comm, (char __user *)arg2,
+					      sizeof(me->comm) - 1) < 0)
 				return -EFAULT;
-			set_task_comm(me, ncomm);
+			set_task_comm(me, comm);
 			return 0;
-		}
-		case PR_GET_NAME: {
-			struct task_struct *me = current;
-			unsigned char tcomm[sizeof(me->comm)];
-
-			get_task_comm(tcomm, me);
-			if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm)))
+		case PR_GET_NAME:
+			get_task_comm(comm, me);
+			if (copy_to_user((char __user *)arg2, comm,
+					 sizeof(comm)))
 				return -EFAULT;
 			return 0;
-		}
 		case PR_GET_ENDIAN:
-			error = GET_ENDIAN(current, arg2);
+			error = GET_ENDIAN(me, arg2);
 			break;
 		case PR_SET_ENDIAN:
-			error = SET_ENDIAN(current, arg2);
+			error = SET_ENDIAN(me, arg2);
 			break;
 
 		case PR_GET_SECCOMP:
@@ -1725,6 +1760,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 					current->default_timer_slack_ns;
 			else
 				current->timer_slack_ns = arg2;
+			error = 0;
 			break;
 		default:
 			error = -EINVAL;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9f3b478f917..5c97c5b4ea8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -246,7 +246,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 
 	memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
 	data->pid = tsk->pid;
-	data->uid = tsk->uid;
+	data->uid = task_uid(tsk);
 	data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
 	data->policy = tsk->policy;
 	data->rt_priority = tsk->rt_priority;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 8ebcd8532df..6d1ed07bf31 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -53,8 +53,8 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 		stats->ac_flag |= AXSIG;
 	stats->ac_nice	 = task_nice(tsk);
 	stats->ac_sched	 = tsk->policy;
-	stats->ac_uid	 = tsk->uid;
-	stats->ac_gid	 = tsk->gid;
+	stats->ac_uid	 = tsk->cred->uid;
+	stats->ac_gid	 = tsk->cred->gid;
 	stats->ac_pid	 = tsk->pid;
 	rcu_read_lock();
 	stats->ac_ppid	 = pid_alive(tsk) ?
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 3e41c1673e2..71f07fc39fe 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -86,9 +86,9 @@ asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid,
 {
 	int retval;
 
-	if (!(retval = put_user(high2lowuid(current->uid), ruid)) &&
-	    !(retval = put_user(high2lowuid(current->euid), euid)))
-		retval = put_user(high2lowuid(current->suid), suid);
+	if (!(retval = put_user(high2lowuid(current->cred->uid), ruid)) &&
+	    !(retval = put_user(high2lowuid(current->cred->euid), euid)))
+		retval = put_user(high2lowuid(current->cred->suid), suid);
 
 	return retval;
 }
@@ -106,9 +106,9 @@ asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid,
 {
 	int retval;
 
-	if (!(retval = put_user(high2lowgid(current->gid), rgid)) &&
-	    !(retval = put_user(high2lowgid(current->egid), egid)))
-		retval = put_user(high2lowgid(current->sgid), sgid);
+	if (!(retval = put_user(high2lowgid(current->cred->gid), rgid)) &&
+	    !(retval = put_user(high2lowgid(current->cred->egid), egid)))
+		retval = put_user(high2lowgid(current->cred->sgid), sgid);
 
 	return retval;
 }
@@ -166,20 +166,20 @@ asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist)
 	if (gidsetsize < 0)
 		return -EINVAL;
 
-	get_group_info(current->group_info);
-	i = current->group_info->ngroups;
+	get_group_info(current->cred->group_info);
+	i = current->cred->group_info->ngroups;
 	if (gidsetsize) {
 		if (i > gidsetsize) {
 			i = -EINVAL;
 			goto out;
 		}
-		if (groups16_to_user(grouplist, current->group_info)) {
+		if (groups16_to_user(grouplist, current->cred->group_info)) {
 			i = -EFAULT;
 			goto out;
 		}
 	}
 out:
-	put_group_info(current->group_info);
+	put_group_info(current->cred->group_info);
 	return i;
 }
 
@@ -210,20 +210,20 @@ asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist)
 
 asmlinkage long sys_getuid16(void)
 {
-	return high2lowuid(current->uid);
+	return high2lowuid(current->cred->uid);
 }
 
 asmlinkage long sys_geteuid16(void)
 {
-	return high2lowuid(current->euid);
+	return high2lowuid(current->cred->euid);
 }
 
 asmlinkage long sys_getgid16(void)
 {
-	return high2lowgid(current->gid);
+	return high2lowgid(current->cred->gid);
 }
 
 asmlinkage long sys_getegid16(void)
 {
-	return high2lowgid(current->egid);
+	return high2lowgid(current->cred->egid);
 }
diff --git a/kernel/user.c b/kernel/user.c
index 39d6159fae4..104d22ac84d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -457,11 +457,11 @@ void switch_uid(struct user_struct *new_user)
 	 * cheaply with the new uid cache, so if it matters
 	 * we should be checking for it.  -DaveM
 	 */
-	old_user = current->user;
+	old_user = current->cred->user;
 	atomic_inc(&new_user->processes);
 	atomic_dec(&old_user->processes);
 	switch_uid_keyring(new_user);
-	current->user = new_user;
+	current->cred->user = new_user;
 	sched_switch_user(current);
 
 	/*
-- 
cgit v1.2.3


From f1752eec6145c97163dbce62d17cf5d928e28a27 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:17 +1100
Subject: CRED: Detach the credentials from task_struct

Detach the credentials from task_struct, duplicating them in copy_process()
and releasing them in __put_task_struct().

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/Makefile |  2 +-
 kernel/cred.c   | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/fork.c   | 24 ++++-----------
 3 files changed, 103 insertions(+), 19 deletions(-)
 create mode 100644 kernel/cred.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 9a3ec66a9d8..5a6a612c302 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o
+	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o
 
 CFLAGS_REMOVE_sched.o = -mno-spe
 
diff --git a/kernel/cred.c b/kernel/cred.c
new file mode 100644
index 00000000000..833244a7cb0
--- /dev/null
+++ b/kernel/cred.c
@@ -0,0 +1,96 @@
+/* Task credentials management
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+#include <linux/module.h>
+#include <linux/cred.h>
+#include <linux/sched.h>
+#include <linux/key.h>
+#include <linux/keyctl.h>
+#include <linux/init_task.h>
+#include <linux/security.h>
+
+/*
+ * The initial credentials for the initial task
+ */
+struct cred init_cred = {
+	.usage			= ATOMIC_INIT(3),
+	.securebits		= SECUREBITS_DEFAULT,
+	.cap_inheritable	= CAP_INIT_INH_SET,
+	.cap_permitted		= CAP_FULL_SET,
+	.cap_effective		= CAP_INIT_EFF_SET,
+	.cap_bset		= CAP_INIT_BSET,
+	.user			= INIT_USER,
+	.group_info		= &init_groups,
+};
+
+/*
+ * The RCU callback to actually dispose of a set of credentials
+ */
+static void put_cred_rcu(struct rcu_head *rcu)
+{
+	struct cred *cred = container_of(rcu, struct cred, rcu);
+
+	BUG_ON(atomic_read(&cred->usage) != 0);
+
+	key_put(cred->thread_keyring);
+	key_put(cred->request_key_auth);
+	put_group_info(cred->group_info);
+	free_uid(cred->user);
+	security_cred_free(cred);
+	kfree(cred);
+}
+
+/**
+ * __put_cred - Destroy a set of credentials
+ * @sec: The record to release
+ *
+ * Destroy a set of credentials on which no references remain.
+ */
+void __put_cred(struct cred *cred)
+{
+	call_rcu(&cred->rcu, put_cred_rcu);
+}
+EXPORT_SYMBOL(__put_cred);
+
+/*
+ * Copy credentials for the new process created by fork()
+ */
+int copy_creds(struct task_struct *p, unsigned long clone_flags)
+{
+	struct cred *pcred;
+	int ret;
+
+	pcred = kmemdup(p->cred, sizeof(*p->cred), GFP_KERNEL);
+	if (!pcred)
+		return -ENOMEM;
+
+#ifdef CONFIG_SECURITY
+	pcred->security = NULL;
+#endif
+
+	ret = security_cred_alloc(pcred);
+	if (ret < 0) {
+		kfree(pcred);
+		return ret;
+	}
+
+	atomic_set(&pcred->usage, 1);
+	get_group_info(pcred->group_info);
+	get_uid(pcred->user);
+	key_get(pcred->thread_keyring);
+	key_get(pcred->request_key_auth);
+
+	atomic_inc(&pcred->user->processes);
+
+	/* RCU assignment is unneeded here as no-one can have accessed this
+	 * pointer yet, barring us */
+	p->cred = pcred;
+	return 0;
+}
diff --git a/kernel/fork.c b/kernel/fork.c
index 81fdc773390..c932e283ddf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -146,9 +146,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
-	security_task_free(tsk);
-	free_uid(tsk->__temp_cred.user);
-	put_group_info(tsk->__temp_cred.group_info);
+	put_cred(tsk->cred);
 	delayacct_tsk_free(tsk);
 
 	if (!profile_handoff_task(tsk))
@@ -969,7 +967,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
-	p->cred = &p->__temp_cred;
 	retval = -EAGAIN;
 	if (atomic_read(&p->cred->user->processes) >=
 			p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
@@ -978,9 +975,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 			goto bad_fork_free;
 	}
 
-	atomic_inc(&p->cred->user->__count);
-	atomic_inc(&p->cred->user->processes);
-	get_group_info(p->cred->group_info);
+	retval = copy_creds(p, clone_flags);
+	if (retval < 0)
+		goto bad_fork_free;
 
 	/*
 	 * If multiple threads are within copy_process(), then this check
@@ -1035,9 +1032,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	do_posix_clock_monotonic_gettime(&p->start_time);
 	p->real_start_time = p->start_time;
 	monotonic_to_bootbased(&p->real_start_time);
-#ifdef CONFIG_SECURITY
-	p->cred->security = NULL;
-#endif
 	p->io_context = NULL;
 	p->audit_context = NULL;
 	cgroup_fork(p);
@@ -1082,10 +1076,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
 
-	if ((retval = security_task_alloc(p)))
-		goto bad_fork_cleanup_policy;
 	if ((retval = audit_alloc(p)))
-		goto bad_fork_cleanup_security;
+		goto bad_fork_cleanup_policy;
 	/* copy all the process information */
 	if ((retval = copy_semundo(clone_flags, p)))
 		goto bad_fork_cleanup_audit;
@@ -1284,8 +1276,6 @@ bad_fork_cleanup_semundo:
 	exit_sem(p);
 bad_fork_cleanup_audit:
 	audit_free(p);
-bad_fork_cleanup_security:
-	security_task_free(p);
 bad_fork_cleanup_policy:
 #ifdef CONFIG_NUMA
 	mpol_put(p->mempolicy);
@@ -1298,9 +1288,7 @@ bad_fork_cleanup_cgroup:
 bad_fork_cleanup_put_domain:
 	module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
-	put_group_info(p->cred->group_info);
-	atomic_dec(&p->cred->user->processes);
-	free_uid(p->cred->user);
+	put_cred(p->cred);
 bad_fork_free:
 	free_task(p);
 fork_out:
-- 
cgit v1.2.3


From 86a264abe542cfececb4df129bc45a0338d8cdb9 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:18 +1100
Subject: CRED: Wrap current->cred and a few other accessors

Wrap current->cred and a few other accessors to hide their actual
implementation.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/sys.c   | 59 +++++++++++++++++++++++++++++-----------------------------
 kernel/uid16.c | 31 +++++++++++++++---------------
 2 files changed, 45 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 5d81f07c015..c4d6b59553e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -143,6 +143,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
 {
 	struct task_struct *g, *p;
 	struct user_struct *user;
+	const struct cred *cred = current_cred();
 	int error = -EINVAL;
 	struct pid *pgrp;
 
@@ -176,18 +177,18 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
-			user = current->cred->user;
+			user = cred->user;
 			if (!who)
-				who = current_uid();
-			else
-				if (who != current_uid() && !(user = find_user(who)))
-					goto out_unlock;	/* No processes for this user */
+				who = cred->uid;
+			else if ((who != cred->uid) &&
+				 !(user = find_user(who)))
+				goto out_unlock;	/* No processes for this user */
 
 			do_each_thread(g, p)
-				if (p->cred->uid == who)
+				if (__task_cred(p)->uid == who)
 					error = set_one_prio(p, niceval, error);
 			while_each_thread(g, p);
-			if (who != current_uid())
+			if (who != cred->uid)
 				free_uid(user);		/* For find_user() */
 			break;
 	}
@@ -207,6 +208,7 @@ asmlinkage long sys_getpriority(int which, int who)
 {
 	struct task_struct *g, *p;
 	struct user_struct *user;
+	const struct cred *cred = current_cred();
 	long niceval, retval = -ESRCH;
 	struct pid *pgrp;
 
@@ -238,21 +240,21 @@ asmlinkage long sys_getpriority(int which, int who)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
-			user = current->cred->user;
+			user = (struct user_struct *) cred->user;
 			if (!who)
-				who = current_uid();
-			else
-				if (who != current_uid() && !(user = find_user(who)))
-					goto out_unlock;	/* No processes for this user */
+				who = cred->uid;
+			else if ((who != cred->uid) &&
+				 !(user = find_user(who)))
+				goto out_unlock;	/* No processes for this user */
 
 			do_each_thread(g, p)
-				if (p->cred->uid == who) {
+				if (__task_cred(p)->uid == who) {
 					niceval = 20 - task_nice(p);
 					if (niceval > retval)
 						retval = niceval;
 				}
 			while_each_thread(g, p);
-			if (who != current_uid())
+			if (who != cred->uid)
 				free_uid(user);		/* for find_user() */
 			break;
 	}
@@ -743,11 +745,11 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 
 asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid)
 {
-	struct cred *cred = current->cred;
+	const struct cred *cred = current_cred();
 	int retval;
 
-	if (!(retval = put_user(cred->uid, ruid)) &&
-	    !(retval = put_user(cred->euid, euid)))
+	if (!(retval   = put_user(cred->uid,  ruid)) &&
+	    !(retval   = put_user(cred->euid, euid)))
 		retval = put_user(cred->suid, suid);
 
 	return retval;
@@ -796,11 +798,11 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 
 asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid)
 {
-	struct cred *cred = current->cred;
+	const struct cred *cred = current_cred();
 	int retval;
 
-	if (!(retval = put_user(cred->gid, rgid)) &&
-	    !(retval = put_user(cred->egid, egid)))
+	if (!(retval   = put_user(cred->gid,  rgid)) &&
+	    !(retval   = put_user(cred->egid, egid)))
 		retval = put_user(cred->sgid, sgid);
 
 	return retval;
@@ -1199,7 +1201,7 @@ static void groups_sort(struct group_info *group_info)
 }
 
 /* a simple bsearch */
-int groups_search(struct group_info *group_info, gid_t grp)
+int groups_search(const struct group_info *group_info, gid_t grp)
 {
 	unsigned int left, right;
 
@@ -1268,13 +1270,8 @@ EXPORT_SYMBOL(set_current_groups);
 
 asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
 {
-	struct cred *cred = current->cred;
-	int i = 0;
-
-	/*
-	 *	SMP: Nobody else can change our grouplist. Thus we are
-	 *	safe.
-	 */
+	const struct cred *cred = current_cred();
+	int i;
 
 	if (gidsetsize < 0)
 		return -EINVAL;
@@ -1330,8 +1327,9 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist)
  */
 int in_group_p(gid_t grp)
 {
-	struct cred *cred = current->cred;
+	const struct cred *cred = current_cred();
 	int retval = 1;
+
 	if (grp != cred->fsgid)
 		retval = groups_search(cred->group_info, grp);
 	return retval;
@@ -1341,8 +1339,9 @@ EXPORT_SYMBOL(in_group_p);
 
 int in_egroup_p(gid_t grp)
 {
-	struct cred *cred = current->cred;
+	const struct cred *cred = current_cred();
 	int retval = 1;
+
 	if (grp != cred->egid)
 		retval = groups_search(cred->group_info, grp);
 	return retval;
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 71f07fc39fe..2460c3199b5 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -84,11 +84,12 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
 
 asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid)
 {
+	const struct cred *cred = current_cred();
 	int retval;
 
-	if (!(retval = put_user(high2lowuid(current->cred->uid), ruid)) &&
-	    !(retval = put_user(high2lowuid(current->cred->euid), euid)))
-		retval = put_user(high2lowuid(current->cred->suid), suid);
+	if (!(retval   = put_user(high2lowuid(cred->uid),  ruid)) &&
+	    !(retval   = put_user(high2lowuid(cred->euid), euid)))
+		retval = put_user(high2lowuid(cred->suid), suid);
 
 	return retval;
 }
@@ -104,11 +105,12 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
 
 asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid)
 {
+	const struct cred *cred = current_cred();
 	int retval;
 
-	if (!(retval = put_user(high2lowgid(current->cred->gid), rgid)) &&
-	    !(retval = put_user(high2lowgid(current->cred->egid), egid)))
-		retval = put_user(high2lowgid(current->cred->sgid), sgid);
+	if (!(retval   = put_user(high2lowgid(cred->gid),  rgid)) &&
+	    !(retval   = put_user(high2lowgid(cred->egid), egid)))
+		retval = put_user(high2lowgid(cred->sgid), sgid);
 
 	return retval;
 }
@@ -161,25 +163,24 @@ static int groups16_from_user(struct group_info *group_info,
 
 asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist)
 {
-	int i = 0;
+	const struct cred *cred = current_cred();
+	int i;
 
 	if (gidsetsize < 0)
 		return -EINVAL;
 
-	get_group_info(current->cred->group_info);
-	i = current->cred->group_info->ngroups;
+	i = cred->group_info->ngroups;
 	if (gidsetsize) {
 		if (i > gidsetsize) {
 			i = -EINVAL;
 			goto out;
 		}
-		if (groups16_to_user(grouplist, current->cred->group_info)) {
+		if (groups16_to_user(grouplist, cred->group_info)) {
 			i = -EFAULT;
 			goto out;
 		}
 	}
 out:
-	put_group_info(current->cred->group_info);
 	return i;
 }
 
@@ -210,20 +211,20 @@ asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist)
 
 asmlinkage long sys_getuid16(void)
 {
-	return high2lowuid(current->cred->uid);
+	return high2lowuid(current_uid());
 }
 
 asmlinkage long sys_geteuid16(void)
 {
-	return high2lowuid(current->cred->euid);
+	return high2lowuid(current_euid());
 }
 
 asmlinkage long sys_getgid16(void)
 {
-	return high2lowgid(current->cred->gid);
+	return high2lowgid(current_gid());
 }
 
 asmlinkage long sys_getegid16(void)
 {
-	return high2lowgid(current->cred->egid);
+	return high2lowgid(current_egid());
 }
-- 
cgit v1.2.3


From c69e8d9c01db2adc503464993c358901c9af9de4 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:19 +1100
Subject: CRED: Use RCU to access another task's creds and to release a task's
 own creds

Use RCU to access another task's creds and to release a task's own creds.
This means that it will be possible for the credentials of a task to be
replaced without another task (a) requiring a full lock to read them, and (b)
seeing deallocated memory.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/auditsc.c      | 33 +++++++++++++++++++--------------
 kernel/cgroup.c       | 16 ++++++++--------
 kernel/exit.c         | 14 +++++++++-----
 kernel/futex.c        | 22 ++++++++++++++--------
 kernel/futex_compat.c |  7 ++++---
 kernel/ptrace.c       | 22 +++++++++++++---------
 kernel/sched.c        | 31 +++++++++++++++++++++----------
 kernel/signal.c       | 49 +++++++++++++++++++++++++++++++------------------
 kernel/sys.c          | 11 +++++++----
 kernel/tsacct.c       |  6 ++++--
 10 files changed, 130 insertions(+), 81 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2febf5165fa..ae8ef88ade3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -447,7 +447,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 			      struct audit_names *name,
 			      enum audit_state *state)
 {
-	struct cred *cred = tsk->cred;
+	const struct cred *cred = get_task_cred(tsk);
 	int i, j, need_sid = 1;
 	u32 sid;
 
@@ -642,8 +642,10 @@ static int audit_filter_rules(struct task_struct *tsk,
 			break;
 		}
 
-		if (!result)
+		if (!result) {
+			put_cred(cred);
 			return 0;
+		}
 	}
 	if (rule->filterkey && ctx)
 		ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
@@ -651,6 +653,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
 	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break;
 	}
+	put_cred(cred);
 	return 1;
 }
 
@@ -1229,7 +1232,7 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
 
 static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
 {
-	struct cred *cred = tsk->cred;
+	const struct cred *cred;
 	int i, call_panic = 0;
 	struct audit_buffer *ab;
 	struct audit_aux_data *aux;
@@ -1239,13 +1242,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	context->pid = tsk->pid;
 	if (!context->ppid)
 		context->ppid = sys_getppid();
-	context->uid = cred->uid;
-	context->gid = cred->gid;
-	context->euid = cred->euid;
-	context->suid = cred->suid;
+	cred = current_cred();
+	context->uid   = cred->uid;
+	context->gid   = cred->gid;
+	context->euid  = cred->euid;
+	context->suid  = cred->suid;
 	context->fsuid = cred->fsuid;
-	context->egid = cred->egid;
-	context->sgid = cred->sgid;
+	context->egid  = cred->egid;
+	context->sgid  = cred->sgid;
 	context->fsgid = cred->fsgid;
 	context->personality = tsk->personality;
 
@@ -2088,7 +2092,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
 			audit_log_format(ab, "login pid=%d uid=%u "
 				"old auid=%u new auid=%u"
 				" old ses=%u new ses=%u",
-				task->pid, task->cred->uid,
+				task->pid, task_uid(task),
 				task->loginuid, loginuid,
 				task->sessionid, sessionid);
 			audit_log_end(ab);
@@ -2471,7 +2475,7 @@ void __audit_ptrace(struct task_struct *t)
 
 	context->target_pid = t->pid;
 	context->target_auid = audit_get_loginuid(t);
-	context->target_uid = t->cred->uid;
+	context->target_uid = task_uid(t);
 	context->target_sessionid = audit_get_sessionid(t);
 	security_task_getsecid(t, &context->target_sid);
 	memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
@@ -2490,6 +2494,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
 	struct audit_aux_data_pids *axp;
 	struct task_struct *tsk = current;
 	struct audit_context *ctx = tsk->audit_context;
+	uid_t uid = current_uid(), t_uid = task_uid(t);
 
 	if (audit_pid && t->tgid == audit_pid) {
 		if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
@@ -2497,7 +2502,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
 			if (tsk->loginuid != -1)
 				audit_sig_uid = tsk->loginuid;
 			else
-				audit_sig_uid = tsk->cred->uid;
+				audit_sig_uid = uid;
 			security_task_getsecid(tsk, &audit_sig_sid);
 		}
 		if (!audit_signals || audit_dummy_context())
@@ -2509,7 +2514,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
 	if (!ctx->target_pid) {
 		ctx->target_pid = t->tgid;
 		ctx->target_auid = audit_get_loginuid(t);
-		ctx->target_uid = t->cred->uid;
+		ctx->target_uid = t_uid;
 		ctx->target_sessionid = audit_get_sessionid(t);
 		security_task_getsecid(t, &ctx->target_sid);
 		memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
@@ -2530,7 +2535,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
 
 	axp->target_pid[axp->pid_count] = t->tgid;
 	axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
-	axp->target_uid[axp->pid_count] = t->cred->uid;
+	axp->target_uid[axp->pid_count] = t_uid;
 	axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
 	security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
 	memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e210526e640..a512a75a556 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1279,7 +1279,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
 {
 	struct task_struct *tsk;
-	uid_t euid;
+	const struct cred *cred = current_cred(), *tcred;
 	int ret;
 
 	if (pid) {
@@ -1289,16 +1289,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
 			rcu_read_unlock();
 			return -ESRCH;
 		}
-		get_task_struct(tsk);
-		rcu_read_unlock();
 
-		euid = current_euid();
-		if (euid &&
-		    euid != tsk->cred->uid &&
-		    euid != tsk->cred->suid) {
-			put_task_struct(tsk);
+		tcred = __task_cred(tsk);
+		if (cred->euid &&
+		    cred->euid != tcred->uid &&
+		    cred->euid != tcred->suid) {
+			rcu_read_unlock();
 			return -EACCES;
 		}
+		get_task_struct(tsk);
+		rcu_read_unlock();
 	} else {
 		tsk = current;
 		get_task_struct(tsk);
diff --git a/kernel/exit.c b/kernel/exit.c
index e0f6e1892fb..bbc22530f2c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -160,7 +160,10 @@ void release_task(struct task_struct * p)
 	int zap_leader;
 repeat:
 	tracehook_prepare_release_task(p);
-	atomic_dec(&p->cred->user->processes);
+	/* don't need to get the RCU readlock here - the process is dead and
+	 * can't be modifying its own credentials */
+	atomic_dec(&__task_cred(p)->user->processes);
+
 	proc_flush_task(p);
 	write_lock_irq(&tasklist_lock);
 	tracehook_finish_release_task(p);
@@ -1267,12 +1270,12 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	unsigned long state;
 	int retval, status, traced;
 	pid_t pid = task_pid_vnr(p);
+	uid_t uid = __task_cred(p)->uid;
 
 	if (!likely(options & WEXITED))
 		return 0;
 
 	if (unlikely(options & WNOWAIT)) {
-		uid_t uid = p->cred->uid;
 		int exit_code = p->exit_code;
 		int why, status;
 
@@ -1393,7 +1396,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	if (!retval && infop)
 		retval = put_user(pid, &infop->si_pid);
 	if (!retval && infop)
-		retval = put_user(p->cred->uid, &infop->si_uid);
+		retval = put_user(uid, &infop->si_uid);
 	if (!retval)
 		retval = pid;
 
@@ -1458,7 +1461,8 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
 	if (!unlikely(options & WNOWAIT))
 		p->exit_code = 0;
 
-	uid = p->cred->uid;
+	/* don't need the RCU readlock here as we're holding a spinlock */
+	uid = __task_cred(p)->uid;
 unlock_sig:
 	spin_unlock_irq(&p->sighand->siglock);
 	if (!exit_code)
@@ -1532,10 +1536,10 @@ static int wait_task_continued(struct task_struct *p, int options,
 	}
 	if (!unlikely(options & WNOWAIT))
 		p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
+	uid = __task_cred(p)->uid;
 	spin_unlock_irq(&p->sighand->siglock);
 
 	pid = task_pid_vnr(p);
-	uid = p->cred->uid;
 	get_task_struct(p);
 	read_unlock(&tasklist_lock);
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 28421d8210b..4fe790e89d0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -439,15 +439,20 @@ static void free_pi_state(struct futex_pi_state *pi_state)
 static struct task_struct * futex_find_get_task(pid_t pid)
 {
 	struct task_struct *p;
-	uid_t euid = current_euid();
+	const struct cred *cred = current_cred(), *pcred;
 
 	rcu_read_lock();
 	p = find_task_by_vpid(pid);
-	if (!p || (euid != p->cred->euid &&
-		   euid != p->cred->uid))
+	if (!p) {
 		p = ERR_PTR(-ESRCH);
-	else
-		get_task_struct(p);
+	} else {
+		pcred = __task_cred(p);
+		if (cred->euid != pcred->euid &&
+		    cred->euid != pcred->uid)
+			p = ERR_PTR(-ESRCH);
+		else
+			get_task_struct(p);
+	}
 
 	rcu_read_unlock();
 
@@ -1831,7 +1836,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
 {
 	struct robust_list_head __user *head;
 	unsigned long ret;
-	uid_t euid = current_euid();
+	const struct cred *cred = current_cred(), *pcred;
 
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
@@ -1847,8 +1852,9 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
 		if (!p)
 			goto err_unlock;
 		ret = -EPERM;
-		if (euid != p->cred->euid &&
-		    euid != p->cred->uid &&
+		pcred = __task_cred(p);
+		if (cred->euid != pcred->euid &&
+		    cred->euid != pcred->uid &&
 		    !capable(CAP_SYS_PTRACE))
 			goto err_unlock;
 		head = p->robust_list;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 2c3fd5ed34f..d607a5b9ee2 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -135,7 +135,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
 {
 	struct compat_robust_list_head __user *head;
 	unsigned long ret;
-	uid_t euid = current_euid();
+	const struct cred *cred = current_cred(), *pcred;
 
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
@@ -151,8 +151,9 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
 		if (!p)
 			goto err_unlock;
 		ret = -EPERM;
-		if (euid != p->cred->euid &&
-		    euid != p->cred->uid &&
+		pcred = __task_cred(p);
+		if (cred->euid != pcred->euid &&
+		    cred->euid != pcred->uid &&
 		    !capable(CAP_SYS_PTRACE))
 			goto err_unlock;
 		head = p->compat_robust_list;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 49849d12dd1..b9d5f4e4f6a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -115,7 +115,7 @@ int ptrace_check_attach(struct task_struct *child, int kill)
 
 int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 {
-	struct cred *cred = current->cred, *tcred = task->cred;
+	const struct cred *cred = current_cred(), *tcred;
 
 	/* May we inspect the given task?
 	 * This check is used both for attaching with ptrace
@@ -125,19 +125,23 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 	 * because setting up the necessary parent/child relationship
 	 * or halting the specified task is impossible.
 	 */
-	uid_t uid = cred->uid;
-	gid_t gid = cred->gid;
 	int dumpable = 0;
 	/* Don't let security modules deny introspection */
 	if (task == current)
 		return 0;
-	if ((uid != tcred->euid ||
-	     uid != tcred->suid ||
-	     uid != tcred->uid  ||
-	     gid != tcred->egid ||
-	     gid != tcred->sgid ||
-	     gid != tcred->gid) && !capable(CAP_SYS_PTRACE))
+	rcu_read_lock();
+	tcred = __task_cred(task);
+	if ((cred->uid != tcred->euid ||
+	     cred->uid != tcred->suid ||
+	     cred->uid != tcred->uid  ||
+	     cred->gid != tcred->egid ||
+	     cred->gid != tcred->sgid ||
+	     cred->gid != tcred->gid) &&
+	    !capable(CAP_SYS_PTRACE)) {
+		rcu_read_unlock();
 		return -EPERM;
+	}
+	rcu_read_unlock();
 	smp_rmb();
 	if (task->mm)
 		dumpable = get_dumpable(task->mm);
diff --git a/kernel/sched.c b/kernel/sched.c
index 733c59e645a..92992e287b1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -345,7 +345,9 @@ static inline struct task_group *task_group(struct task_struct *p)
 	struct task_group *tg;
 
 #ifdef CONFIG_USER_SCHED
-	tg = p->cred->user->tg;
+	rcu_read_lock();
+	tg = __task_cred(p)->user->tg;
+	rcu_read_unlock();
 #elif defined(CONFIG_CGROUP_SCHED)
 	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
 				struct task_group, css);
@@ -5121,6 +5123,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 	set_load_weight(p);
 }
 
+/*
+ * check the target process has a UID that matches the current process's
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+	const struct cred *cred = current_cred(), *pcred;
+	bool match;
+
+	rcu_read_lock();
+	pcred = __task_cred(p);
+	match = (cred->euid == pcred->euid ||
+		 cred->euid == pcred->uid);
+	rcu_read_unlock();
+	return match;
+}
+
 static int __sched_setscheduler(struct task_struct *p, int policy,
 				struct sched_param *param, bool user)
 {
@@ -5128,7 +5146,6 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
 	unsigned long flags;
 	const struct sched_class *prev_class = p->sched_class;
 	struct rq *rq;
-	uid_t euid;
 
 	/* may grab non-irq protected spin_locks */
 	BUG_ON(in_interrupt());
@@ -5181,9 +5198,7 @@ recheck:
 			return -EPERM;
 
 		/* can't change other user's priorities */
-		euid = current_euid();
-		if (euid != p->cred->euid &&
-		    euid != p->cred->uid)
+		if (!check_same_owner(p))
 			return -EPERM;
 	}
 
@@ -5394,7 +5409,6 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
 	cpumask_t cpus_allowed;
 	cpumask_t new_mask = *in_mask;
 	struct task_struct *p;
-	uid_t euid;
 	int retval;
 
 	get_online_cpus();
@@ -5415,11 +5429,8 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
 	get_task_struct(p);
 	read_unlock(&tasklist_lock);
 
-	euid = current_euid();
 	retval = -EPERM;
-	if (euid != p->cred->euid &&
-	    euid != p->cred->uid &&
-	    !capable(CAP_SYS_NICE))
+	if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
 		goto out_unlock;
 
 	retval = security_task_setscheduler(p, 0, NULL);
diff --git a/kernel/signal.c b/kernel/signal.c
index 80e8a6489f9..84989124baf 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -177,6 +177,11 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
 	return sig;
 }
 
+/*
+ * allocate a new signal queue record
+ * - this may be called without locks if and only if t == current, otherwise an
+ *   appopriate lock must be held to protect t's user_struct
+ */
 static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 					 int override_rlimit)
 {
@@ -184,11 +189,12 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 	struct user_struct *user;
 
 	/*
-	 * In order to avoid problems with "switch_user()", we want to make
-	 * sure that the compiler doesn't re-load "t->user"
+	 * We won't get problems with the target's UID changing under us
+	 * because changing it requires RCU be used, and if t != current, the
+	 * caller must be holding the RCU readlock (by way of a spinlock) and
+	 * we use RCU protection here
 	 */
-	user = t->cred->user;
-	barrier();
+	user = __task_cred(t)->user;
 	atomic_inc(&user->sigpending);
 	if (override_rlimit ||
 	    atomic_read(&user->sigpending) <=
@@ -562,12 +568,13 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
 
 /*
  * Bad permissions for sending the signal
+ * - the caller must hold at least the RCU read lock
  */
 static int check_kill_permission(int sig, struct siginfo *info,
 				 struct task_struct *t)
 {
+	const struct cred *cred = current_cred(), *tcred;
 	struct pid *sid;
-	uid_t uid, euid;
 	int error;
 
 	if (!valid_signal(sig))
@@ -580,10 +587,11 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	if (error)
 		return error;
 
-	uid = current_uid();
-	euid = current_euid();
-	if ((euid ^ t->cred->suid) && (euid ^ t->cred->uid) &&
-	    (uid  ^ t->cred->suid) && (uid  ^ t->cred->uid) &&
+	tcred = __task_cred(t);
+	if ((cred->euid ^ tcred->suid) &&
+	    (cred->euid ^ tcred->uid) &&
+	    (cred->uid  ^ tcred->suid) &&
+	    (cred->uid  ^ tcred->uid) &&
 	    !capable(CAP_KILL)) {
 		switch (sig) {
 		case SIGCONT:
@@ -1011,6 +1019,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
 	return sighand;
 }
 
+/*
+ * send signal info to all the members of a group
+ * - the caller must hold the RCU read lock at least
+ */
 int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 {
 	unsigned long flags;
@@ -1032,8 +1044,8 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 /*
  * __kill_pgrp_info() sends a signal to a process group: this is what the tty
  * control characters do (^C, ^Z etc)
+ * - the caller must hold at least a readlock on tasklist_lock
  */
-
 int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
 {
 	struct task_struct *p = NULL;
@@ -1089,6 +1101,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
 {
 	int ret = -EINVAL;
 	struct task_struct *p;
+	const struct cred *pcred;
 
 	if (!valid_signal(sig))
 		return ret;
@@ -1099,9 +1112,11 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
 		ret = -ESRCH;
 		goto out_unlock;
 	}
-	if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
-	    && (euid != p->cred->suid) && (euid != p->cred->uid)
-	    && (uid != p->cred->suid) && (uid != p->cred->uid)) {
+	pcred = __task_cred(p);
+	if ((info == SEND_SIG_NOINFO ||
+	     (!is_si_special(info) && SI_FROMUSER(info))) &&
+	    euid != pcred->suid && euid != pcred->uid &&
+	    uid  != pcred->suid && uid  != pcred->uid) {
 		ret = -EPERM;
 		goto out_unlock;
 	}
@@ -1372,10 +1387,9 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	 */
 	rcu_read_lock();
 	info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
+	info.si_uid = __task_cred(tsk)->uid;
 	rcu_read_unlock();
 
-	info.si_uid = tsk->cred->uid;
-
 	thread_group_cputime(tsk, &cputime);
 	info.si_utime = cputime_to_jiffies(cputime.utime);
 	info.si_stime = cputime_to_jiffies(cputime.stime);
@@ -1443,10 +1457,9 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
 	 */
 	rcu_read_lock();
 	info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
+	info.si_uid = __task_cred(tsk)->uid;
 	rcu_read_unlock();
 
-	info.si_uid = tsk->cred->uid;
-
 	info.si_utime = cputime_to_clock_t(tsk->utime);
 	info.si_stime = cputime_to_clock_t(tsk->stime);
 
@@ -1713,7 +1726,7 @@ static int ptrace_signal(int signr, siginfo_t *info,
 		info->si_errno = 0;
 		info->si_code = SI_USER;
 		info->si_pid = task_pid_vnr(current->parent);
-		info->si_uid = current->parent->cred->uid;
+		info->si_uid = task_uid(current->parent);
 	}
 
 	/* If the (new) signal is now blocked, requeue it.  */
diff --git a/kernel/sys.c b/kernel/sys.c
index c4d6b59553e..ccc9eb736d3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -112,14 +112,17 @@ EXPORT_SYMBOL(cad_pid);
 
 void (*pm_power_off_prepare)(void);
 
+/*
+ * set the priority of a task
+ * - the caller must hold the RCU read lock
+ */
 static int set_one_prio(struct task_struct *p, int niceval, int error)
 {
-	uid_t euid = current_euid();
+	const struct cred *cred = current_cred(), *pcred = __task_cred(p);
 	int no_nice;
 
-	if (p->cred->uid  != euid &&
-	    p->cred->euid != euid &&
-	    !capable(CAP_SYS_NICE)) {
+	if (pcred->uid  != cred->euid &&
+	    pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
 		error = -EPERM;
 		goto out;
 	}
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 6d1ed07bf31..2dc06ab3571 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -27,6 +27,7 @@
  */
 void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 {
+	const struct cred *tcred;
 	struct timespec uptime, ts;
 	u64 ac_etime;
 
@@ -53,10 +54,11 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 		stats->ac_flag |= AXSIG;
 	stats->ac_nice	 = task_nice(tsk);
 	stats->ac_sched	 = tsk->policy;
-	stats->ac_uid	 = tsk->cred->uid;
-	stats->ac_gid	 = tsk->cred->gid;
 	stats->ac_pid	 = tsk->pid;
 	rcu_read_lock();
+	tcred = __task_cred(tsk);
+	stats->ac_uid	 = tcred->uid;
+	stats->ac_gid	 = tcred->gid;
 	stats->ac_ppid	 = pid_alive(tsk) ?
 				rcu_dereference(tsk->real_parent)->tgid : 0;
 	rcu_read_unlock();
-- 
cgit v1.2.3


From bb952bb98a7e479262c7eb25d5592545a3af147d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:20 +1100
Subject: CRED: Separate per-task-group keyrings from signal_struct

Separate per-task-group keyrings from signal_struct and dangle their anchor
from the cred struct rather than the signal_struct.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/cred.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/fork.c |  7 -------
 2 files changed, 63 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index 833244a7cb0..ac73e361768 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -16,6 +16,17 @@
 #include <linux/init_task.h>
 #include <linux/security.h>
 
+/*
+ * The common credentials for the initial task's thread group
+ */
+#ifdef CONFIG_KEYS
+static struct thread_group_cred init_tgcred = {
+	.usage	= ATOMIC_INIT(2),
+	.tgid	= 0,
+	.lock	= SPIN_LOCK_UNLOCKED,
+};
+#endif
+
 /*
  * The initial credentials for the initial task
  */
@@ -28,8 +39,41 @@ struct cred init_cred = {
 	.cap_bset		= CAP_INIT_BSET,
 	.user			= INIT_USER,
 	.group_info		= &init_groups,
+#ifdef CONFIG_KEYS
+	.tgcred			= &init_tgcred,
+#endif
 };
 
+/*
+ * Dispose of the shared task group credentials
+ */
+#ifdef CONFIG_KEYS
+static void release_tgcred_rcu(struct rcu_head *rcu)
+{
+	struct thread_group_cred *tgcred =
+		container_of(rcu, struct thread_group_cred, rcu);
+
+	BUG_ON(atomic_read(&tgcred->usage) != 0);
+
+	key_put(tgcred->session_keyring);
+	key_put(tgcred->process_keyring);
+	kfree(tgcred);
+}
+#endif
+
+/*
+ * Release a set of thread group credentials.
+ */
+static void release_tgcred(struct cred *cred)
+{
+#ifdef CONFIG_KEYS
+	struct thread_group_cred *tgcred = cred->tgcred;
+
+	if (atomic_dec_and_test(&tgcred->usage))
+		call_rcu(&tgcred->rcu, release_tgcred_rcu);
+#endif
+}
+
 /*
  * The RCU callback to actually dispose of a set of credentials
  */
@@ -41,6 +85,7 @@ static void put_cred_rcu(struct rcu_head *rcu)
 
 	key_put(cred->thread_keyring);
 	key_put(cred->request_key_auth);
+	release_tgcred(cred);
 	put_group_info(cred->group_info);
 	free_uid(cred->user);
 	security_cred_free(cred);
@@ -71,12 +116,30 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	if (!pcred)
 		return -ENOMEM;
 
+#ifdef CONFIG_KEYS
+	if (clone_flags & CLONE_THREAD) {
+		atomic_inc(&pcred->tgcred->usage);
+	} else {
+		pcred->tgcred = kmalloc(sizeof(struct cred), GFP_KERNEL);
+		if (!pcred->tgcred) {
+			kfree(pcred);
+			return -ENOMEM;
+		}
+		atomic_set(&pcred->tgcred->usage, 1);
+		spin_lock_init(&pcred->tgcred->lock);
+		pcred->tgcred->process_keyring = NULL;
+		pcred->tgcred->session_keyring =
+			key_get(p->cred->tgcred->session_keyring);
+	}
+#endif
+
 #ifdef CONFIG_SECURITY
 	pcred->security = NULL;
 #endif
 
 	ret = security_cred_alloc(pcred);
 	if (ret < 0) {
+		release_tgcred(pcred);
 		kfree(pcred);
 		return ret;
 	}
diff --git a/kernel/fork.c b/kernel/fork.c
index c932e283ddf..ded1972672a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -802,12 +802,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	if (!sig)
 		return -ENOMEM;
 
-	ret = copy_thread_group_keys(tsk);
-	if (ret < 0) {
-		kmem_cache_free(signal_cachep, sig);
-		return ret;
-	}
-
 	atomic_set(&sig->count, 1);
 	atomic_set(&sig->live, 1);
 	init_waitqueue_head(&sig->wait_chldexit);
@@ -852,7 +846,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 void __cleanup_signal(struct signal_struct *sig)
 {
 	thread_group_cputime_free(sig);
-	exit_thread_group_keys(sig);
 	tty_kref_put(sig->tty);
 	kmem_cache_free(signal_cachep, sig);
 }
-- 
cgit v1.2.3


From 6cc88bc45ce8043171089c9592da223dfab91823 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:21 +1100
Subject: CRED: Rename is_single_threaded() to is_wq_single_threaded()

Rename is_single_threaded() to is_wq_single_threaded() so that a new
is_single_threaded() can be created that refers to tasks rather than
waitqueues.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/workqueue.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f928f2a87b9..f12ab5c4dec 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -84,21 +84,21 @@ static cpumask_t cpu_singlethread_map __read_mostly;
 static cpumask_t cpu_populated_map __read_mostly;
 
 /* If it's single threaded, it isn't in the list of workqueues. */
-static inline int is_single_threaded(struct workqueue_struct *wq)
+static inline int is_wq_single_threaded(struct workqueue_struct *wq)
 {
 	return wq->singlethread;
 }
 
 static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq)
 {
-	return is_single_threaded(wq)
+	return is_wq_single_threaded(wq)
 		? &cpu_singlethread_map : &cpu_populated_map;
 }
 
 static
 struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
 {
-	if (unlikely(is_single_threaded(wq)))
+	if (unlikely(is_wq_single_threaded(wq)))
 		cpu = singlethread_cpu;
 	return per_cpu_ptr(wq->cpu_wq, cpu);
 }
@@ -769,7 +769,7 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 {
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 	struct workqueue_struct *wq = cwq->wq;
-	const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d";
+	const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d";
 	struct task_struct *p;
 
 	p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
-- 
cgit v1.2.3


From d84f4f992cbd76e8f39c488cf0c5d123843923b1 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:23 +1100
Subject: CRED: Inaugurate COW credentials

Inaugurate copy-on-write credentials management.  This uses RCU to manage the
credentials pointer in the task_struct with respect to accesses by other tasks.
A process may only modify its own credentials, and so does not need locking to
access or modify its own credentials.

A mutex (cred_replace_mutex) is added to the task_struct to control the effect
of PTRACE_ATTACHED on credential calculations, particularly with respect to
execve().

With this patch, the contents of an active credentials struct may not be
changed directly; rather a new set of credentials must be prepared, modified
and committed using something like the following sequence of events:

	struct cred *new = prepare_creds();
	int ret = blah(new);
	if (ret < 0) {
		abort_creds(new);
		return ret;
	}
	return commit_creds(new);

There are some exceptions to this rule: the keyrings pointed to by the active
credentials may be instantiated - keyrings violate the COW rule as managing
COW keyrings is tricky, given that it is possible for a task to directly alter
the keys in a keyring in use by another task.

To help enforce this, various pointers to sets of credentials, such as those in
the task_struct, are declared const.  The purpose of this is compile-time
discouragement of altering credentials through those pointers.  Once a set of
credentials has been made public through one of these pointers, it may not be
modified, except under special circumstances:

  (1) Its reference count may incremented and decremented.

  (2) The keyrings to which it points may be modified, but not replaced.

The only safe way to modify anything else is to create a replacement and commit
using the functions described in Documentation/credentials.txt (which will be
added by a later patch).

This patch and the preceding patches have been tested with the LTP SELinux
testsuite.

This patch makes several logical sets of alteration:

 (1) execve().

     This now prepares and commits credentials in various places in the
     security code rather than altering the current creds directly.

 (2) Temporary credential overrides.

     do_coredump() and sys_faccessat() now prepare their own credentials and
     temporarily override the ones currently on the acting thread, whilst
     preventing interference from other threads by holding cred_replace_mutex
     on the thread being dumped.

     This will be replaced in a future patch by something that hands down the
     credentials directly to the functions being called, rather than altering
     the task's objective credentials.

 (3) LSM interface.

     A number of functions have been changed, added or removed:

     (*) security_capset_check(), ->capset_check()
     (*) security_capset_set(), ->capset_set()

     	 Removed in favour of security_capset().

     (*) security_capset(), ->capset()

     	 New.  This is passed a pointer to the new creds, a pointer to the old
     	 creds and the proposed capability sets.  It should fill in the new
     	 creds or return an error.  All pointers, barring the pointer to the
     	 new creds, are now const.

     (*) security_bprm_apply_creds(), ->bprm_apply_creds()

     	 Changed; now returns a value, which will cause the process to be
     	 killed if it's an error.

     (*) security_task_alloc(), ->task_alloc_security()

     	 Removed in favour of security_prepare_creds().

     (*) security_cred_free(), ->cred_free()

     	 New.  Free security data attached to cred->security.

     (*) security_prepare_creds(), ->cred_prepare()

     	 New. Duplicate any security data attached to cred->security.

     (*) security_commit_creds(), ->cred_commit()

     	 New. Apply any security effects for the upcoming installation of new
     	 security by commit_creds().

     (*) security_task_post_setuid(), ->task_post_setuid()

     	 Removed in favour of security_task_fix_setuid().

     (*) security_task_fix_setuid(), ->task_fix_setuid()

     	 Fix up the proposed new credentials for setuid().  This is used by
     	 cap_set_fix_setuid() to implicitly adjust capabilities in line with
     	 setuid() changes.  Changes are made to the new credentials, rather
     	 than the task itself as in security_task_post_setuid().

     (*) security_task_reparent_to_init(), ->task_reparent_to_init()

     	 Removed.  Instead the task being reparented to init is referred
     	 directly to init's credentials.

	 NOTE!  This results in the loss of some state: SELinux's osid no
	 longer records the sid of the thread that forked it.

     (*) security_key_alloc(), ->key_alloc()
     (*) security_key_permission(), ->key_permission()

     	 Changed.  These now take cred pointers rather than task pointers to
     	 refer to the security context.

 (4) sys_capset().

     This has been simplified and uses less locking.  The LSM functions it
     calls have been merged.

 (5) reparent_to_kthreadd().

     This gives the current thread the same credentials as init by simply using
     commit_thread() to point that way.

 (6) __sigqueue_alloc() and switch_uid()

     __sigqueue_alloc() can't stop the target task from changing its creds
     beneath it, so this function gets a reference to the currently applicable
     user_struct which it then passes into the sigqueue struct it returns if
     successful.

     switch_uid() is now called from commit_creds(), and possibly should be
     folded into that.  commit_creds() should take care of protecting
     __sigqueue_alloc().

 (7) [sg]et[ug]id() and co and [sg]et_current_groups.

     The set functions now all use prepare_creds(), commit_creds() and
     abort_creds() to build and check a new set of credentials before applying
     it.

     security_task_set[ug]id() is called inside the prepared section.  This
     guarantees that nothing else will affect the creds until we've finished.

     The calling of set_dumpable() has been moved into commit_creds().

     Much of the functionality of set_user() has been moved into
     commit_creds().

     The get functions all simply access the data directly.

 (8) security_task_prctl() and cap_task_prctl().

     security_task_prctl() has been modified to return -ENOSYS if it doesn't
     want to handle a function, or otherwise return the return value directly
     rather than through an argument.

     Additionally, cap_task_prctl() now prepares a new set of credentials, even
     if it doesn't end up using it.

 (9) Keyrings.

     A number of changes have been made to the keyrings code:

     (a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have
     	 all been dropped and built in to the credentials functions directly.
     	 They may want separating out again later.

     (b) key_alloc() and search_process_keyrings() now take a cred pointer
     	 rather than a task pointer to specify the security context.

     (c) copy_creds() gives a new thread within the same thread group a new
     	 thread keyring if its parent had one, otherwise it discards the thread
     	 keyring.

     (d) The authorisation key now points directly to the credentials to extend
     	 the search into rather pointing to the task that carries them.

     (e) Installing thread, process or session keyrings causes a new set of
     	 credentials to be created, even though it's not strictly necessary for
     	 process or session keyrings (they're shared).

(10) Usermode helper.

     The usermode helper code now carries a cred struct pointer in its
     subprocess_info struct instead of a new session keyring pointer.  This set
     of credentials is derived from init_cred and installed on the new process
     after it has been cloned.

     call_usermodehelper_setup() allocates the new credentials and
     call_usermodehelper_freeinfo() discards them if they haven't been used.  A
     special cred function (prepare_usermodeinfo_creds()) is provided
     specifically for call_usermodehelper_setup() to call.

     call_usermodehelper_setkeys() adjusts the credentials to sport the
     supplied keyring as the new session keyring.

(11) SELinux.

     SELinux has a number of changes, in addition to those to support the LSM
     interface changes mentioned above:

     (a) selinux_setprocattr() no longer does its check for whether the
     	 current ptracer can access processes with the new SID inside the lock
     	 that covers getting the ptracer's SID.  Whilst this lock ensures that
     	 the check is done with the ptracer pinned, the result is only valid
     	 until the lock is released, so there's no point doing it inside the
     	 lock.

(12) is_single_threaded().

     This function has been extracted from selinux_setprocattr() and put into
     a file of its own in the lib/ directory as join_session_keyring() now
     wants to use it too.

     The code in SELinux just checked to see whether a task shared mm_structs
     with other tasks (CLONE_VM), but that isn't good enough.  We really want
     to know if they're part of the same thread group (CLONE_THREAD).

(13) nfsd.

     The NFS server daemon now has to use the COW credentials to set the
     credentials it is going to use.  It really needs to pass the credentials
     down to the functions it calls, but it can't do that until other patches
     in this series have been applied.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/auditsc.c        |  42 ++---
 kernel/capability.c     |  78 +++------
 kernel/cred-internals.h |  21 +++
 kernel/cred.c           | 321 ++++++++++++++++++++++++++++++----
 kernel/exit.c           |   9 +-
 kernel/fork.c           |   7 +-
 kernel/kmod.c           |  30 +++-
 kernel/ptrace.c         |   9 +
 kernel/signal.c         |  10 +-
 kernel/sys.c            | 450 ++++++++++++++++++++++++++----------------------
 kernel/user.c           |  37 +---
 kernel/user_namespace.c |  12 +-
 12 files changed, 652 insertions(+), 374 deletions(-)
 create mode 100644 kernel/cred-internals.h

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ae8ef88ade3..bc1e2d854bf 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2546,18 +2546,17 @@ int __audit_signal_info(int sig, struct task_struct *t)
 
 /**
  * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps
- * @bprm pointer to the bprm being processed
- * @caps the caps read from the disk
+ * @bprm: pointer to the bprm being processed
+ * @new: the proposed new credentials
+ * @old: the old credentials
  *
  * Simply check if the proc already has the caps given by the file and if not
  * store the priv escalation info for later auditing at the end of the syscall
  *
- * this can fail and we don't care.  See the note in audit.h for
- * audit_log_bprm_fcaps() for my explaination....
- *
  * -Eric
  */
-void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE)
+int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
+			   const struct cred *new, const struct cred *old)
 {
 	struct audit_aux_data_bprm_fcaps *ax;
 	struct audit_context *context = current->audit_context;
@@ -2566,7 +2565,7 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_
 
 	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
 	if (!ax)
-		return;
+		return -ENOMEM;
 
 	ax->d.type = AUDIT_BPRM_FCAPS;
 	ax->d.next = context->aux;
@@ -2581,26 +2580,27 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_
 	ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
 	ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
 
-	ax->old_pcap.permitted = *pP;
-	ax->old_pcap.inheritable = current->cred->cap_inheritable;
-	ax->old_pcap.effective = *pE;
+	ax->old_pcap.permitted   = old->cap_permitted;
+	ax->old_pcap.inheritable = old->cap_inheritable;
+	ax->old_pcap.effective   = old->cap_effective;
 
-	ax->new_pcap.permitted = current->cred->cap_permitted;
-	ax->new_pcap.inheritable = current->cred->cap_inheritable;
-	ax->new_pcap.effective = current->cred->cap_effective;
+	ax->new_pcap.permitted   = new->cap_permitted;
+	ax->new_pcap.inheritable = new->cap_inheritable;
+	ax->new_pcap.effective   = new->cap_effective;
+	return 0;
 }
 
 /**
  * __audit_log_capset - store information about the arguments to the capset syscall
- * @pid target pid of the capset call
- * @eff effective cap set
- * @inh inheritible cap set
- * @perm permited cap set
+ * @pid: target pid of the capset call
+ * @new: the new credentials
+ * @old: the old (current) credentials
  *
  * Record the aguments userspace sent to sys_capset for later printing by the
  * audit system if applicable
  */
-int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm)
+int __audit_log_capset(pid_t pid,
+		       const struct cred *new, const struct cred *old)
 {
 	struct audit_aux_data_capset *ax;
 	struct audit_context *context = current->audit_context;
@@ -2617,9 +2617,9 @@ int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_c
 	context->aux = (void *)ax;
 
 	ax->pid = pid;
-	ax->cap.effective = *eff;
-	ax->cap.inheritable = *eff;
-	ax->cap.permitted = *perm;
+	ax->cap.effective   = new->cap_effective;
+	ax->cap.inheritable = new->cap_effective;
+	ax->cap.permitted   = new->cap_permitted;
 
 	return 0;
 }
diff --git a/kernel/capability.c b/kernel/capability.c
index a404b980b1b..36b4b4daebe 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -15,12 +15,7 @@
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
 #include <asm/uaccess.h>
-
-/*
- * This lock protects task->cap_* for all tasks including current.
- * Locking rule: acquire this prior to tasklist_lock.
- */
-static DEFINE_SPINLOCK(task_capability_lock);
+#include "cred-internals.h"
 
 /*
  * Leveraged for setting/resetting capabilities
@@ -128,12 +123,11 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
 }
 
 /*
- * If we have configured with filesystem capability support, then the
- * only thing that can change the capabilities of the current process
- * is the current process. As such, we can't be in this code at the
- * same time as we are in the process of setting capabilities in this
- * process. The net result is that we can limit our use of locks to
- * when we are reading the caps of another process.
+ * The only thing that can change the capabilities of the current
+ * process is the current process. As such, we can't be in this code
+ * at the same time as we are in the process of setting capabilities
+ * in this process. The net result is that we can limit our use of
+ * locks to when we are reading the caps of another process.
  */
 static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
 				     kernel_cap_t *pIp, kernel_cap_t *pPp)
@@ -143,7 +137,6 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
 	if (pid && (pid != task_pid_vnr(current))) {
 		struct task_struct *target;
 
-		spin_lock(&task_capability_lock);
 		read_lock(&tasklist_lock);
 
 		target = find_task_by_vpid(pid);
@@ -153,34 +146,12 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
 			ret = security_capget(target, pEp, pIp, pPp);
 
 		read_unlock(&tasklist_lock);
-		spin_unlock(&task_capability_lock);
 	} else
 		ret = security_capget(current, pEp, pIp, pPp);
 
 	return ret;
 }
 
-/*
- * Atomically modify the effective capabilities returning the original
- * value. No permission check is performed here - it is assumed that the
- * caller is permitted to set the desired effective capabilities.
- */
-kernel_cap_t cap_set_effective(const kernel_cap_t pE_new)
-{
-	kernel_cap_t pE_old;
-
-	spin_lock(&task_capability_lock);
-
-	pE_old = current->cred->cap_effective;
-	current->cred->cap_effective = pE_new;
-
-	spin_unlock(&task_capability_lock);
-
-	return pE_old;
-}
-
-EXPORT_SYMBOL(cap_set_effective);
-
 /**
  * sys_capget - get the capabilities of a given process.
  * @header: pointer to struct that contains capability version and
@@ -208,7 +179,6 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
 		return -EINVAL;
 
 	ret = cap_get_target_pid(pid, &pE, &pI, &pP);
-
 	if (!ret) {
 		struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
 		unsigned i;
@@ -270,6 +240,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
 	unsigned i, tocopy;
 	kernel_cap_t inheritable, permitted, effective;
+	struct cred *new;
 	int ret;
 	pid_t pid;
 
@@ -284,8 +255,8 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	if (pid != 0 && pid != task_pid_vnr(current))
 		return -EPERM;
 
-	if (copy_from_user(&kdata, data, tocopy
-			   * sizeof(struct __user_cap_data_struct)))
+	if (copy_from_user(&kdata, data,
+			   tocopy * sizeof(struct __user_cap_data_struct)))
 		return -EFAULT;
 
 	for (i = 0; i < tocopy; i++) {
@@ -300,24 +271,23 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 		i++;
 	}
 
-	ret = audit_log_capset(pid, &effective, &inheritable, &permitted);
-	if (ret)
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	ret = security_capset(new, current_cred(),
+			      &effective, &inheritable, &permitted);
+	if (ret < 0)
+		goto error;
+
+	ret = audit_log_capset(pid, new, current_cred());
+	if (ret < 0)
 		return ret;
 
-	/* This lock is required even when filesystem capability support is
-	 * configured - it protects the sys_capget() call from returning
-	 * incorrect data in the case that the targeted process is not the
-	 * current one.
-	 */
-	spin_lock(&task_capability_lock);
-
-	ret = security_capset_check(&effective, &inheritable, &permitted);
-	/* Having verified that the proposed changes are legal, we now put them
-	 * into effect.
-	 */
-	if (!ret)
-		security_capset_set(&effective, &inheritable, &permitted);
-	spin_unlock(&task_capability_lock);
+	return commit_creds(new);
+
+error:
+	abort_creds(new);
 	return ret;
 }
 
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
new file mode 100644
index 00000000000..2dc4fc2d0bf
--- /dev/null
+++ b/kernel/cred-internals.h
@@ -0,0 +1,21 @@
+/* Internal credentials stuff
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+/*
+ * user.c
+ */
+static inline void sched_switch_user(struct task_struct *p)
+{
+#ifdef CONFIG_USER_SCHED
+	sched_move_task(p);
+#endif	/* CONFIG_USER_SCHED */
+}
+
diff --git a/kernel/cred.c b/kernel/cred.c
index ac73e361768..cb6b5eda978 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -15,6 +15,10 @@
 #include <linux/keyctl.h>
 #include <linux/init_task.h>
 #include <linux/security.h>
+#include <linux/cn_proc.h>
+#include "cred-internals.h"
+
+static struct kmem_cache *cred_jar;
 
 /*
  * The common credentials for the initial task's thread group
@@ -64,7 +68,7 @@ static void release_tgcred_rcu(struct rcu_head *rcu)
 /*
  * Release a set of thread group credentials.
  */
-static void release_tgcred(struct cred *cred)
+void release_tgcred(struct cred *cred)
 {
 #ifdef CONFIG_KEYS
 	struct thread_group_cred *tgcred = cred->tgcred;
@@ -81,79 +85,322 @@ static void put_cred_rcu(struct rcu_head *rcu)
 {
 	struct cred *cred = container_of(rcu, struct cred, rcu);
 
-	BUG_ON(atomic_read(&cred->usage) != 0);
+	if (atomic_read(&cred->usage) != 0)
+		panic("CRED: put_cred_rcu() sees %p with usage %d\n",
+		      cred, atomic_read(&cred->usage));
 
+	security_cred_free(cred);
 	key_put(cred->thread_keyring);
 	key_put(cred->request_key_auth);
 	release_tgcred(cred);
 	put_group_info(cred->group_info);
 	free_uid(cred->user);
-	security_cred_free(cred);
-	kfree(cred);
+	kmem_cache_free(cred_jar, cred);
 }
 
 /**
  * __put_cred - Destroy a set of credentials
- * @sec: The record to release
+ * @cred: The record to release
  *
  * Destroy a set of credentials on which no references remain.
  */
 void __put_cred(struct cred *cred)
 {
+	BUG_ON(atomic_read(&cred->usage) != 0);
+
 	call_rcu(&cred->rcu, put_cred_rcu);
 }
 EXPORT_SYMBOL(__put_cred);
 
+/**
+ * prepare_creds - Prepare a new set of credentials for modification
+ *
+ * Prepare a new set of task credentials for modification.  A task's creds
+ * shouldn't generally be modified directly, therefore this function is used to
+ * prepare a new copy, which the caller then modifies and then commits by
+ * calling commit_creds().
+ *
+ * Returns a pointer to the new creds-to-be if successful, NULL otherwise.
+ *
+ * Call commit_creds() or abort_creds() to clean up.
+ */
+struct cred *prepare_creds(void)
+{
+	struct task_struct *task = current;
+	const struct cred *old;
+	struct cred *new;
+
+	BUG_ON(atomic_read(&task->cred->usage) < 1);
+
+	new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	old = task->cred;
+	memcpy(new, old, sizeof(struct cred));
+
+	atomic_set(&new->usage, 1);
+	get_group_info(new->group_info);
+	get_uid(new->user);
+
+#ifdef CONFIG_KEYS
+	key_get(new->thread_keyring);
+	key_get(new->request_key_auth);
+	atomic_inc(&new->tgcred->usage);
+#endif
+
+#ifdef CONFIG_SECURITY
+	new->security = NULL;
+#endif
+
+	if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
+		goto error;
+	return new;
+
+error:
+	abort_creds(new);
+	return NULL;
+}
+EXPORT_SYMBOL(prepare_creds);
+
+/*
+ * prepare new credentials for the usermode helper dispatcher
+ */
+struct cred *prepare_usermodehelper_creds(void)
+{
+#ifdef CONFIG_KEYS
+	struct thread_group_cred *tgcred = NULL;
+#endif
+	struct cred *new;
+
+#ifdef CONFIG_KEYS
+	tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC);
+	if (!tgcred)
+		return NULL;
+#endif
+
+	new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
+	if (!new)
+		return NULL;
+
+	memcpy(new, &init_cred, sizeof(struct cred));
+
+	atomic_set(&new->usage, 1);
+	get_group_info(new->group_info);
+	get_uid(new->user);
+
+#ifdef CONFIG_KEYS
+	new->thread_keyring = NULL;
+	new->request_key_auth = NULL;
+	new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT;
+
+	atomic_set(&tgcred->usage, 1);
+	spin_lock_init(&tgcred->lock);
+	new->tgcred = tgcred;
+#endif
+
+#ifdef CONFIG_SECURITY
+	new->security = NULL;
+#endif
+	if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
+		goto error;
+
+	BUG_ON(atomic_read(&new->usage) != 1);
+	return new;
+
+error:
+	put_cred(new);
+	return NULL;
+}
+
 /*
  * Copy credentials for the new process created by fork()
+ *
+ * We share if we can, but under some circumstances we have to generate a new
+ * set.
  */
 int copy_creds(struct task_struct *p, unsigned long clone_flags)
 {
-	struct cred *pcred;
-	int ret;
+#ifdef CONFIG_KEYS
+	struct thread_group_cred *tgcred;
+#endif
+	struct cred *new;
+
+	mutex_init(&p->cred_exec_mutex);
 
-	pcred = kmemdup(p->cred, sizeof(*p->cred), GFP_KERNEL);
-	if (!pcred)
+	if (
+#ifdef CONFIG_KEYS
+		!p->cred->thread_keyring &&
+#endif
+		clone_flags & CLONE_THREAD
+	    ) {
+		get_cred(p->cred);
+		atomic_inc(&p->cred->user->processes);
+		return 0;
+	}
+
+	new = prepare_creds();
+	if (!new)
 		return -ENOMEM;
 
 #ifdef CONFIG_KEYS
-	if (clone_flags & CLONE_THREAD) {
-		atomic_inc(&pcred->tgcred->usage);
-	} else {
-		pcred->tgcred = kmalloc(sizeof(struct cred), GFP_KERNEL);
-		if (!pcred->tgcred) {
-			kfree(pcred);
+	/* new threads get their own thread keyrings if their parent already
+	 * had one */
+	if (new->thread_keyring) {
+		key_put(new->thread_keyring);
+		new->thread_keyring = NULL;
+		if (clone_flags & CLONE_THREAD)
+			install_thread_keyring_to_cred(new);
+	}
+
+	/* we share the process and session keyrings between all the threads in
+	 * a process - this is slightly icky as we violate COW credentials a
+	 * bit */
+	if (!(clone_flags & CLONE_THREAD)) {
+		tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
+		if (!tgcred) {
+			put_cred(new);
 			return -ENOMEM;
 		}
-		atomic_set(&pcred->tgcred->usage, 1);
-		spin_lock_init(&pcred->tgcred->lock);
-		pcred->tgcred->process_keyring = NULL;
-		pcred->tgcred->session_keyring =
-			key_get(p->cred->tgcred->session_keyring);
+		atomic_set(&tgcred->usage, 1);
+		spin_lock_init(&tgcred->lock);
+		tgcred->process_keyring = NULL;
+		tgcred->session_keyring = key_get(new->tgcred->session_keyring);
+
+		release_tgcred(new);
+		new->tgcred = tgcred;
 	}
 #endif
 
-#ifdef CONFIG_SECURITY
-	pcred->security = NULL;
-#endif
+	atomic_inc(&new->user->processes);
+	p->cred = new;
+	return 0;
+}
 
-	ret = security_cred_alloc(pcred);
-	if (ret < 0) {
-		release_tgcred(pcred);
-		kfree(pcred);
-		return ret;
+/**
+ * commit_creds - Install new credentials upon the current task
+ * @new: The credentials to be assigned
+ *
+ * Install a new set of credentials to the current task, using RCU to replace
+ * the old set.
+ *
+ * This function eats the caller's reference to the new credentials.
+ *
+ * Always returns 0 thus allowing this function to be tail-called at the end
+ * of, say, sys_setgid().
+ */
+int commit_creds(struct cred *new)
+{
+	struct task_struct *task = current;
+	const struct cred *old;
+
+	BUG_ON(atomic_read(&new->usage) < 1);
+	BUG_ON(atomic_read(&task->cred->usage) < 1);
+
+	old = task->cred;
+	security_commit_creds(new, old);
+
+	/* dumpability changes */
+	if (old->euid != new->euid ||
+	    old->egid != new->egid ||
+	    old->fsuid != new->fsuid ||
+	    old->fsgid != new->fsgid ||
+	    !cap_issubset(new->cap_permitted, old->cap_permitted)) {
+		set_dumpable(task->mm, suid_dumpable);
+		task->pdeath_signal = 0;
+		smp_wmb();
 	}
 
-	atomic_set(&pcred->usage, 1);
-	get_group_info(pcred->group_info);
-	get_uid(pcred->user);
-	key_get(pcred->thread_keyring);
-	key_get(pcred->request_key_auth);
+	/* alter the thread keyring */
+	if (new->fsuid != old->fsuid)
+		key_fsuid_changed(task);
+	if (new->fsgid != old->fsgid)
+		key_fsgid_changed(task);
+
+	/* do it
+	 * - What if a process setreuid()'s and this brings the
+	 *   new uid over his NPROC rlimit?  We can check this now
+	 *   cheaply with the new uid cache, so if it matters
+	 *   we should be checking for it.  -DaveM
+	 */
+	if (new->user != old->user)
+		atomic_inc(&new->user->processes);
+	rcu_assign_pointer(task->cred, new);
+	if (new->user != old->user)
+		atomic_dec(&old->user->processes);
+
+	sched_switch_user(task);
+
+	/* send notifications */
+	if (new->uid   != old->uid  ||
+	    new->euid  != old->euid ||
+	    new->suid  != old->suid ||
+	    new->fsuid != old->fsuid)
+		proc_id_connector(task, PROC_EVENT_UID);
 
-	atomic_inc(&pcred->user->processes);
+	if (new->gid   != old->gid  ||
+	    new->egid  != old->egid ||
+	    new->sgid  != old->sgid ||
+	    new->fsgid != old->fsgid)
+		proc_id_connector(task, PROC_EVENT_GID);
 
-	/* RCU assignment is unneeded here as no-one can have accessed this
-	 * pointer yet, barring us */
-	p->cred = pcred;
+	put_cred(old);
 	return 0;
 }
+EXPORT_SYMBOL(commit_creds);
+
+/**
+ * abort_creds - Discard a set of credentials and unlock the current task
+ * @new: The credentials that were going to be applied
+ *
+ * Discard a set of credentials that were under construction and unlock the
+ * current task.
+ */
+void abort_creds(struct cred *new)
+{
+	BUG_ON(atomic_read(&new->usage) < 1);
+	put_cred(new);
+}
+EXPORT_SYMBOL(abort_creds);
+
+/**
+ * override_creds - Temporarily override the current process's credentials
+ * @new: The credentials to be assigned
+ *
+ * Install a set of temporary override credentials on the current process,
+ * returning the old set for later reversion.
+ */
+const struct cred *override_creds(const struct cred *new)
+{
+	const struct cred *old = current->cred;
+
+	rcu_assign_pointer(current->cred, get_cred(new));
+	return old;
+}
+EXPORT_SYMBOL(override_creds);
+
+/**
+ * revert_creds - Revert a temporary credentials override
+ * @old: The credentials to be restored
+ *
+ * Revert a temporary set of override credentials to an old set, discarding the
+ * override set.
+ */
+void revert_creds(const struct cred *old)
+{
+	const struct cred *override = current->cred;
+
+	rcu_assign_pointer(current->cred, old);
+	put_cred(override);
+}
+EXPORT_SYMBOL(revert_creds);
+
+/*
+ * initialise the credentials stuff
+ */
+void __init cred_init(void)
+{
+	/* allocate a slab in which we can store credentials */
+	cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
+				     0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+}
diff --git a/kernel/exit.c b/kernel/exit.c
index bbc22530f2c..c0711da1548 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -47,12 +47,14 @@
 #include <linux/blkdev.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/tracehook.h>
+#include <linux/init_task.h>
 #include <trace/sched.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
+#include "cred-internals.h"
 
 static void exit_mm(struct task_struct * tsk);
 
@@ -338,12 +340,12 @@ static void reparent_to_kthreadd(void)
 	/* cpus_allowed? */
 	/* rt_priority? */
 	/* signals? */
-	security_task_reparent_to_init(current);
 	memcpy(current->signal->rlim, init_task.signal->rlim,
 	       sizeof(current->signal->rlim));
-	atomic_inc(&(INIT_USER->__count));
+
+	atomic_inc(&init_cred.usage);
+	commit_creds(&init_cred);
 	write_unlock_irq(&tasklist_lock);
-	switch_uid(INIT_USER);
 }
 
 void __set_special_pids(struct pid *pid)
@@ -1085,7 +1087,6 @@ NORET_TYPE void do_exit(long code)
 	check_stack_usage();
 	exit_thread();
 	cgroup_exit(tsk, 1);
-	exit_keys(tsk);
 
 	if (group_dead && tsk->signal->leader)
 		disassociate_ctty(1);
diff --git a/kernel/fork.c b/kernel/fork.c
index ded1972672a..82a7948a664 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1084,10 +1084,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto bad_fork_cleanup_sighand;
 	if ((retval = copy_mm(clone_flags, p)))
 		goto bad_fork_cleanup_signal;
-	if ((retval = copy_keys(clone_flags, p)))
-		goto bad_fork_cleanup_mm;
 	if ((retval = copy_namespaces(clone_flags, p)))
-		goto bad_fork_cleanup_keys;
+		goto bad_fork_cleanup_mm;
 	if ((retval = copy_io(clone_flags, p)))
 		goto bad_fork_cleanup_namespaces;
 	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
@@ -1252,8 +1250,6 @@ bad_fork_cleanup_io:
 	put_io_context(p->io_context);
 bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
-bad_fork_cleanup_keys:
-	exit_keys(p);
 bad_fork_cleanup_mm:
 	if (p->mm)
 		mmput(p->mm);
@@ -1281,6 +1277,7 @@ bad_fork_cleanup_cgroup:
 bad_fork_cleanup_put_domain:
 	module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
+	atomic_dec(&p->cred->user->processes);
 	put_cred(p->cred);
 bad_fork_free:
 	free_task(p);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index f044f8f5770..b46dbb90866 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -118,10 +118,10 @@ EXPORT_SYMBOL(request_module);
 struct subprocess_info {
 	struct work_struct work;
 	struct completion *complete;
+	struct cred *cred;
 	char *path;
 	char **argv;
 	char **envp;
-	struct key *ring;
 	enum umh_wait wait;
 	int retval;
 	struct file *stdin;
@@ -134,19 +134,20 @@ struct subprocess_info {
 static int ____call_usermodehelper(void *data)
 {
 	struct subprocess_info *sub_info = data;
-	struct key *new_session, *old_session;
 	int retval;
 
-	/* Unblock all signals and set the session keyring. */
-	new_session = key_get(sub_info->ring);
+	BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
+
+	/* Unblock all signals */
 	spin_lock_irq(&current->sighand->siglock);
-	old_session = __install_session_keyring(new_session);
 	flush_signal_handlers(current, 1);
 	sigemptyset(&current->blocked);
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
 
-	key_put(old_session);
+	/* Install the credentials */
+	commit_creds(sub_info->cred);
+	sub_info->cred = NULL;
 
 	/* Install input pipe when needed */
 	if (sub_info->stdin) {
@@ -185,6 +186,8 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info)
 {
 	if (info->cleanup)
 		(*info->cleanup)(info->argv, info->envp);
+	if (info->cred)
+		put_cred(info->cred);
 	kfree(info);
 }
 EXPORT_SYMBOL(call_usermodehelper_freeinfo);
@@ -240,6 +243,8 @@ static void __call_usermodehelper(struct work_struct *work)
 	pid_t pid;
 	enum umh_wait wait = sub_info->wait;
 
+	BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
+
 	/* CLONE_VFORK: wait until the usermode helper has execve'd
 	 * successfully We need the data structures to stay around
 	 * until that is done.  */
@@ -362,6 +367,9 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
 	sub_info->path = path;
 	sub_info->argv = argv;
 	sub_info->envp = envp;
+	sub_info->cred = prepare_usermodehelper_creds();
+	if (!sub_info->cred)
+		return NULL;
 
   out:
 	return sub_info;
@@ -376,7 +384,13 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
 void call_usermodehelper_setkeys(struct subprocess_info *info,
 				 struct key *session_keyring)
 {
-	info->ring = session_keyring;
+#ifdef CONFIG_KEYS
+	struct thread_group_cred *tgcred = info->cred->tgcred;
+	key_put(tgcred->session_keyring);
+	tgcred->session_keyring = key_get(session_keyring);
+#else
+	BUG();
+#endif
 }
 EXPORT_SYMBOL(call_usermodehelper_setkeys);
 
@@ -444,6 +458,8 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
 	DECLARE_COMPLETION_ONSTACK(done);
 	int retval = 0;
 
+	BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
+
 	helper_lock();
 	if (sub_info->path[0] == '\0')
 		goto out;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index b9d5f4e4f6a..f764b880695 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -171,6 +171,14 @@ int ptrace_attach(struct task_struct *task)
 	if (same_thread_group(task, current))
 		goto out;
 
+	/* Protect exec's credential calculations against our interference;
+	 * SUID, SGID and LSM creds get determined differently under ptrace.
+	 */
+	retval = mutex_lock_interruptible(&current->cred_exec_mutex);
+	if (retval  < 0)
+		goto out;
+
+	retval = -EPERM;
 repeat:
 	/*
 	 * Nasty, nasty.
@@ -210,6 +218,7 @@ repeat:
 bad:
 	write_unlock_irqrestore(&tasklist_lock, flags);
 	task_unlock(task);
+	mutex_unlock(&current->cred_exec_mutex);
 out:
 	return retval;
 }
diff --git a/kernel/signal.c b/kernel/signal.c
index 84989124baf..2a64304ed54 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -180,7 +180,7 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
 /*
  * allocate a new signal queue record
  * - this may be called without locks if and only if t == current, otherwise an
- *   appopriate lock must be held to protect t's user_struct
+ *   appopriate lock must be held to stop the target task from exiting
  */
 static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 					 int override_rlimit)
@@ -194,7 +194,7 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 	 * caller must be holding the RCU readlock (by way of a spinlock) and
 	 * we use RCU protection here
 	 */
-	user = __task_cred(t)->user;
+	user = get_uid(__task_cred(t)->user);
 	atomic_inc(&user->sigpending);
 	if (override_rlimit ||
 	    atomic_read(&user->sigpending) <=
@@ -202,12 +202,14 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 		q = kmem_cache_alloc(sigqueue_cachep, flags);
 	if (unlikely(q == NULL)) {
 		atomic_dec(&user->sigpending);
+		free_uid(user);
 	} else {
 		INIT_LIST_HEAD(&q->list);
 		q->flags = 0;
-		q->user = get_uid(user);
+		q->user = user;
 	}
-	return(q);
+
+	return q;
 }
 
 static void __sigqueue_free(struct sigqueue *q)
diff --git a/kernel/sys.c b/kernel/sys.c
index ccc9eb736d3..ab735040468 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -180,7 +180,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
-			user = cred->user;
+			user = (struct user_struct *) cred->user;
 			if (!who)
 				who = cred->uid;
 			else if ((who != cred->uid) &&
@@ -479,47 +479,48 @@ void ctrl_alt_del(void)
  */
 asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
 {
-	struct cred *cred = current->cred;
-	int old_rgid = cred->gid;
-	int old_egid = cred->egid;
-	int new_rgid = old_rgid;
-	int new_egid = old_egid;
+	const struct cred *old;
+	struct cred *new;
 	int retval;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	old = current_cred();
+
 	retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
 	if (retval)
-		return retval;
+		goto error;
 
+	retval = -EPERM;
 	if (rgid != (gid_t) -1) {
-		if ((old_rgid == rgid) ||
-		    (cred->egid == rgid) ||
+		if (old->gid == rgid ||
+		    old->egid == rgid ||
 		    capable(CAP_SETGID))
-			new_rgid = rgid;
+			new->gid = rgid;
 		else
-			return -EPERM;
+			goto error;
 	}
 	if (egid != (gid_t) -1) {
-		if ((old_rgid == egid) ||
-		    (cred->egid == egid) ||
-		    (cred->sgid == egid) ||
+		if (old->gid == egid ||
+		    old->egid == egid ||
+		    old->sgid == egid ||
 		    capable(CAP_SETGID))
-			new_egid = egid;
+			new->egid = egid;
 		else
-			return -EPERM;
-	}
-	if (new_egid != old_egid) {
-		set_dumpable(current->mm, suid_dumpable);
-		smp_wmb();
+			goto error;
 	}
+
 	if (rgid != (gid_t) -1 ||
-	    (egid != (gid_t) -1 && egid != old_rgid))
-		cred->sgid = new_egid;
-	cred->fsgid = new_egid;
-	cred->egid = new_egid;
-	cred->gid = new_rgid;
-	key_fsgid_changed(current);
-	proc_id_connector(current, PROC_EVENT_GID);
-	return 0;
+	    (egid != (gid_t) -1 && egid != old->gid))
+		new->sgid = new->egid;
+	new->fsgid = new->egid;
+
+	return commit_creds(new);
+
+error:
+	abort_creds(new);
+	return retval;
 }
 
 /*
@@ -529,40 +530,42 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
  */
 asmlinkage long sys_setgid(gid_t gid)
 {
-	struct cred *cred = current->cred;
-	int old_egid = cred->egid;
+	const struct cred *old;
+	struct cred *new;
 	int retval;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	old = current_cred();
+
 	retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
 	if (retval)
-		return retval;
+		goto error;
 
-	if (capable(CAP_SETGID)) {
-		if (old_egid != gid) {
-			set_dumpable(current->mm, suid_dumpable);
-			smp_wmb();
-		}
-		cred->gid = cred->egid = cred->sgid = cred->fsgid = gid;
-	} else if ((gid == cred->gid) || (gid == cred->sgid)) {
-		if (old_egid != gid) {
-			set_dumpable(current->mm, suid_dumpable);
-			smp_wmb();
-		}
-		cred->egid = cred->fsgid = gid;
-	}
+	retval = -EPERM;
+	if (capable(CAP_SETGID))
+		new->gid = new->egid = new->sgid = new->fsgid = gid;
+	else if (gid == old->gid || gid == old->sgid)
+		new->egid = new->fsgid = gid;
 	else
-		return -EPERM;
+		goto error;
 
-	key_fsgid_changed(current);
-	proc_id_connector(current, PROC_EVENT_GID);
-	return 0;
+	return commit_creds(new);
+
+error:
+	abort_creds(new);
+	return retval;
 }
   
-static int set_user(uid_t new_ruid, int dumpclear)
+/*
+ * change the user struct in a credentials set to match the new UID
+ */
+static int set_user(struct cred *new)
 {
 	struct user_struct *new_user;
 
-	new_user = alloc_uid(current->nsproxy->user_ns, new_ruid);
+	new_user = alloc_uid(current->nsproxy->user_ns, new->uid);
 	if (!new_user)
 		return -EAGAIN;
 
@@ -573,13 +576,8 @@ static int set_user(uid_t new_ruid, int dumpclear)
 		return -EAGAIN;
 	}
 
-	switch_uid(new_user);
-
-	if (dumpclear) {
-		set_dumpable(current->mm, suid_dumpable);
-		smp_wmb();
-	}
-	current->cred->uid = new_ruid;
+	free_uid(new->user);
+	new->user = new_user;
 	return 0;
 }
 
@@ -600,55 +598,56 @@ static int set_user(uid_t new_ruid, int dumpclear)
  */
 asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 {
-	struct cred *cred = current->cred;
-	int old_ruid, old_euid, old_suid, new_ruid, new_euid;
+	const struct cred *old;
+	struct cred *new;
 	int retval;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	old = current_cred();
+
 	retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
 	if (retval)
-		return retval;
-
-	new_ruid = old_ruid = cred->uid;
-	new_euid = old_euid = cred->euid;
-	old_suid = cred->suid;
+		goto error;
 
+	retval = -EPERM;
 	if (ruid != (uid_t) -1) {
-		new_ruid = ruid;
-		if ((old_ruid != ruid) &&
-		    (cred->euid != ruid) &&
+		new->uid = ruid;
+		if (old->uid != ruid &&
+		    old->euid != ruid &&
 		    !capable(CAP_SETUID))
-			return -EPERM;
+			goto error;
 	}
 
 	if (euid != (uid_t) -1) {
-		new_euid = euid;
-		if ((old_ruid != euid) &&
-		    (cred->euid != euid) &&
-		    (cred->suid != euid) &&
+		new->euid = euid;
+		if (old->uid != euid &&
+		    old->euid != euid &&
+		    old->suid != euid &&
 		    !capable(CAP_SETUID))
-			return -EPERM;
+			goto error;
 	}
 
-	if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0)
-		return -EAGAIN;
+	retval = -EAGAIN;
+	if (new->uid != old->uid && set_user(new) < 0)
+		goto error;
 
-	if (new_euid != old_euid) {
-		set_dumpable(current->mm, suid_dumpable);
-		smp_wmb();
-	}
-	cred->fsuid = cred->euid = new_euid;
 	if (ruid != (uid_t) -1 ||
-	    (euid != (uid_t) -1 && euid != old_ruid))
-		cred->suid = cred->euid;
-	cred->fsuid = cred->euid;
-
-	key_fsuid_changed(current);
-	proc_id_connector(current, PROC_EVENT_UID);
+	    (euid != (uid_t) -1 && euid != old->uid))
+		new->suid = new->euid;
+	new->fsuid = new->euid;
 
-	return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE);
-}
+	retval = security_task_fix_setuid(new, old, LSM_SETID_RE);
+	if (retval < 0)
+		goto error;
 
+	return commit_creds(new);
 
+error:
+	abort_creds(new);
+	return retval;
+}
 		
 /*
  * setuid() is implemented like SysV with SAVED_IDS 
@@ -663,37 +662,41 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
  */
 asmlinkage long sys_setuid(uid_t uid)
 {
-	struct cred *cred = current->cred;
-	int old_euid = cred->euid;
-	int old_ruid, old_suid, new_suid;
+	const struct cred *old;
+	struct cred *new;
 	int retval;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	old = current_cred();
+
 	retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
 	if (retval)
-		return retval;
+		goto error;
 
-	old_ruid = cred->uid;
-	old_suid = cred->suid;
-	new_suid = old_suid;
-	
+	retval = -EPERM;
 	if (capable(CAP_SETUID)) {
-		if (uid != old_ruid && set_user(uid, old_euid != uid) < 0)
-			return -EAGAIN;
-		new_suid = uid;
-	} else if ((uid != cred->uid) && (uid != new_suid))
-		return -EPERM;
-
-	if (old_euid != uid) {
-		set_dumpable(current->mm, suid_dumpable);
-		smp_wmb();
+		new->suid = new->uid = uid;
+		if (uid != old->uid && set_user(new) < 0) {
+			retval = -EAGAIN;
+			goto error;
+		}
+	} else if (uid != old->uid && uid != new->suid) {
+		goto error;
 	}
-	cred->fsuid = cred->euid = uid;
-	cred->suid = new_suid;
 
-	key_fsuid_changed(current);
-	proc_id_connector(current, PROC_EVENT_UID);
+	new->fsuid = new->euid = uid;
+
+	retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
+	if (retval < 0)
+		goto error;
+
+	return commit_creds(new);
 
-	return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID);
+error:
+	abort_creds(new);
+	return retval;
 }
 
 
@@ -703,47 +706,53 @@ asmlinkage long sys_setuid(uid_t uid)
  */
 asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 {
-	struct cred *cred = current->cred;
-	int old_ruid = cred->uid;
-	int old_euid = cred->euid;
-	int old_suid = cred->suid;
+	const struct cred *old;
+	struct cred *new;
 	int retval;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
 	retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
 	if (retval)
-		return retval;
+		goto error;
+	old = current_cred();
 
+	retval = -EPERM;
 	if (!capable(CAP_SETUID)) {
-		if ((ruid != (uid_t) -1) && (ruid != cred->uid) &&
-		    (ruid != cred->euid) && (ruid != cred->suid))
-			return -EPERM;
-		if ((euid != (uid_t) -1) && (euid != cred->uid) &&
-		    (euid != cred->euid) && (euid != cred->suid))
-			return -EPERM;
-		if ((suid != (uid_t) -1) && (suid != cred->uid) &&
-		    (suid != cred->euid) && (suid != cred->suid))
-			return -EPERM;
+		if (ruid != (uid_t) -1 && ruid != old->uid &&
+		    ruid != old->euid  && ruid != old->suid)
+			goto error;
+		if (euid != (uid_t) -1 && euid != old->uid &&
+		    euid != old->euid  && euid != old->suid)
+			goto error;
+		if (suid != (uid_t) -1 && suid != old->uid &&
+		    suid != old->euid  && suid != old->suid)
+			goto error;
 	}
+
+	retval = -EAGAIN;
 	if (ruid != (uid_t) -1) {
-		if (ruid != cred->uid &&
-		    set_user(ruid, euid != cred->euid) < 0)
-			return -EAGAIN;
+		new->uid = ruid;
+		if (ruid != old->uid && set_user(new) < 0)
+			goto error;
 	}
-	if (euid != (uid_t) -1) {
-		if (euid != cred->euid) {
-			set_dumpable(current->mm, suid_dumpable);
-			smp_wmb();
-		}
-		cred->euid = euid;
-	}
-	cred->fsuid = cred->euid;
+	if (euid != (uid_t) -1)
+		new->euid = euid;
 	if (suid != (uid_t) -1)
-		cred->suid = suid;
+		new->suid = suid;
+	new->fsuid = new->euid;
 
-	key_fsuid_changed(current);
-	proc_id_connector(current, PROC_EVENT_UID);
+	retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
+	if (retval < 0)
+		goto error;
 
-	return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES);
+	return commit_creds(new);
+
+error:
+	abort_creds(new);
+	return retval;
 }
 
 asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid)
@@ -763,40 +772,45 @@ asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __us
  */
 asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 {
-	struct cred *cred = current->cred;
+	const struct cred *old;
+	struct cred *new;
 	int retval;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	old = current_cred();
+
 	retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
 	if (retval)
-		return retval;
+		goto error;
 
+	retval = -EPERM;
 	if (!capable(CAP_SETGID)) {
-		if ((rgid != (gid_t) -1) && (rgid != cred->gid) &&
-		    (rgid != cred->egid) && (rgid != cred->sgid))
-			return -EPERM;
-		if ((egid != (gid_t) -1) && (egid != cred->gid) &&
-		    (egid != cred->egid) && (egid != cred->sgid))
-			return -EPERM;
-		if ((sgid != (gid_t) -1) && (sgid != cred->gid) &&
-		    (sgid != cred->egid) && (sgid != cred->sgid))
-			return -EPERM;
+		if (rgid != (gid_t) -1 && rgid != old->gid &&
+		    rgid != old->egid  && rgid != old->sgid)
+			goto error;
+		if (egid != (gid_t) -1 && egid != old->gid &&
+		    egid != old->egid  && egid != old->sgid)
+			goto error;
+		if (sgid != (gid_t) -1 && sgid != old->gid &&
+		    sgid != old->egid  && sgid != old->sgid)
+			goto error;
 	}
-	if (egid != (gid_t) -1) {
-		if (egid != cred->egid) {
-			set_dumpable(current->mm, suid_dumpable);
-			smp_wmb();
-		}
-		cred->egid = egid;
-	}
-	cred->fsgid = cred->egid;
+
 	if (rgid != (gid_t) -1)
-		cred->gid = rgid;
+		new->gid = rgid;
+	if (egid != (gid_t) -1)
+		new->egid = egid;
 	if (sgid != (gid_t) -1)
-		cred->sgid = sgid;
+		new->sgid = sgid;
+	new->fsgid = new->egid;
 
-	key_fsgid_changed(current);
-	proc_id_connector(current, PROC_EVENT_GID);
-	return 0;
+	return commit_creds(new);
+
+error:
+	abort_creds(new);
+	return retval;
 }
 
 asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid)
@@ -820,28 +834,35 @@ asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __us
  */
 asmlinkage long sys_setfsuid(uid_t uid)
 {
-	struct cred *cred = current->cred;
-	int old_fsuid;
+	const struct cred *old;
+	struct cred *new;
+	uid_t old_fsuid;
+
+	new = prepare_creds();
+	if (!new)
+		return current_fsuid();
+	old = current_cred();
+	old_fsuid = old->fsuid;
 
-	old_fsuid = cred->fsuid;
-	if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS))
-		return old_fsuid;
+	if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
+		goto error;
 
-	if (uid == cred->uid || uid == cred->euid ||
-	    uid == cred->suid || uid == cred->fsuid ||
+	if (uid == old->uid  || uid == old->euid  ||
+	    uid == old->suid || uid == old->fsuid ||
 	    capable(CAP_SETUID)) {
 		if (uid != old_fsuid) {
-			set_dumpable(current->mm, suid_dumpable);
-			smp_wmb();
+			new->fsuid = uid;
+			if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
+				goto change_okay;
 		}
-		cred->fsuid = uid;
 	}
 
-	key_fsuid_changed(current);
-	proc_id_connector(current, PROC_EVENT_UID);
-
-	security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS);
+error:
+	abort_creds(new);
+	return old_fsuid;
 
+change_okay:
+	commit_creds(new);
 	return old_fsuid;
 }
 
@@ -850,24 +871,34 @@ asmlinkage long sys_setfsuid(uid_t uid)
  */
 asmlinkage long sys_setfsgid(gid_t gid)
 {
-	struct cred *cred = current->cred;
-	int old_fsgid;
+	const struct cred *old;
+	struct cred *new;
+	gid_t old_fsgid;
+
+	new = prepare_creds();
+	if (!new)
+		return current_fsgid();
+	old = current_cred();
+	old_fsgid = old->fsgid;
 
-	old_fsgid = cred->fsgid;
 	if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
-		return old_fsgid;
+		goto error;
 
-	if (gid == cred->gid || gid == cred->egid ||
-	    gid == cred->sgid || gid == cred->fsgid ||
+	if (gid == old->gid  || gid == old->egid  ||
+	    gid == old->sgid || gid == old->fsgid ||
 	    capable(CAP_SETGID)) {
 		if (gid != old_fsgid) {
-			set_dumpable(current->mm, suid_dumpable);
-			smp_wmb();
+			new->fsgid = gid;
+			goto change_okay;
 		}
-		cred->fsgid = gid;
-		key_fsgid_changed(current);
-		proc_id_connector(current, PROC_EVENT_GID);
 	}
+
+error:
+	abort_creds(new);
+	return old_fsgid;
+
+change_okay:
+	commit_creds(new);
 	return old_fsgid;
 }
 
@@ -1136,7 +1167,7 @@ EXPORT_SYMBOL(groups_free);
 
 /* export the group_info to a user-space array */
 static int groups_to_user(gid_t __user *grouplist,
-    struct group_info *group_info)
+			  const struct group_info *group_info)
 {
 	int i;
 	unsigned int count = group_info->ngroups;
@@ -1227,31 +1258,25 @@ int groups_search(const struct group_info *group_info, gid_t grp)
 }
 
 /**
- * set_groups - Change a group subscription in a security record
- * @sec: The security record to alter
- * @group_info: The group list to impose
+ * set_groups - Change a group subscription in a set of credentials
+ * @new: The newly prepared set of credentials to alter
+ * @group_info: The group list to install
  *
- * Validate a group subscription and, if valid, impose it upon a task security
- * record.
+ * Validate a group subscription and, if valid, insert it into a set
+ * of credentials.
  */
-int set_groups(struct cred *cred, struct group_info *group_info)
+int set_groups(struct cred *new, struct group_info *group_info)
 {
 	int retval;
-	struct group_info *old_info;
 
 	retval = security_task_setgroups(group_info);
 	if (retval)
 		return retval;
 
+	put_group_info(new->group_info);
 	groups_sort(group_info);
 	get_group_info(group_info);
-
-	spin_lock(&cred->lock);
-	old_info = cred->group_info;
-	cred->group_info = group_info;
-	spin_unlock(&cred->lock);
-
-	put_group_info(old_info);
+	new->group_info = group_info;
 	return 0;
 }
 
@@ -1266,7 +1291,20 @@ EXPORT_SYMBOL(set_groups);
  */
 int set_current_groups(struct group_info *group_info)
 {
-	return set_groups(current->cred, group_info);
+	struct cred *new;
+	int ret;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	ret = set_groups(new, group_info);
+	if (ret < 0) {
+		abort_creds(new);
+		return ret;
+	}
+
+	return commit_creds(new);
 }
 
 EXPORT_SYMBOL(set_current_groups);
@@ -1666,9 +1704,11 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 	unsigned char comm[sizeof(me->comm)];
 	long error;
 
-	if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error))
+	error = security_task_prctl(option, arg2, arg3, arg4, arg5);
+	if (error != -ENOSYS)
 		return error;
 
+	error = 0;
 	switch (option) {
 		case PR_SET_PDEATHSIG:
 			if (!valid_signal(arg2)) {
diff --git a/kernel/user.c b/kernel/user.c
index 104d22ac84d..d476307dd4b 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,6 +16,7 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/user_namespace.h>
+#include "cred-internals.h"
 
 struct user_namespace init_user_ns = {
 	.kref = {
@@ -104,16 +105,10 @@ static int sched_create_user(struct user_struct *up)
 	return rc;
 }
 
-static void sched_switch_user(struct task_struct *p)
-{
-	sched_move_task(p);
-}
-
 #else	/* CONFIG_USER_SCHED */
 
 static void sched_destroy_user(struct user_struct *up) { }
 static int sched_create_user(struct user_struct *up) { return 0; }
-static void sched_switch_user(struct task_struct *p) { }
 
 #endif	/* CONFIG_USER_SCHED */
 
@@ -448,36 +443,6 @@ out_unlock:
 	return NULL;
 }
 
-void switch_uid(struct user_struct *new_user)
-{
-	struct user_struct *old_user;
-
-	/* What if a process setreuid()'s and this brings the
-	 * new uid over his NPROC rlimit?  We can check this now
-	 * cheaply with the new uid cache, so if it matters
-	 * we should be checking for it.  -DaveM
-	 */
-	old_user = current->cred->user;
-	atomic_inc(&new_user->processes);
-	atomic_dec(&old_user->processes);
-	switch_uid_keyring(new_user);
-	current->cred->user = new_user;
-	sched_switch_user(current);
-
-	/*
-	 * We need to synchronize with __sigqueue_alloc()
-	 * doing a get_uid(p->user).. If that saw the old
-	 * user value, we need to wait until it has exited
-	 * its critical region before we can free the old
-	 * structure.
-	 */
-	smp_mb();
-	spin_unlock_wait(&current->sighand->siglock);
-
-	free_uid(old_user);
-	suid_keys(current);
-}
-
 #ifdef CONFIG_USER_NS
 void release_uids(struct user_namespace *ns)
 {
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index f82730adea0..0d9c51d6733 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -19,6 +19,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
 {
 	struct user_namespace *ns;
 	struct user_struct *new_user;
+	struct cred *new;
 	int n;
 
 	ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
@@ -45,7 +46,16 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
 		return ERR_PTR(-ENOMEM);
 	}
 
-	switch_uid(new_user);
+	/* Install the new user */
+	new = prepare_creds();
+	if (!new) {
+		free_uid(new_user);
+		free_uid(ns->root_user);
+		kfree(ns);
+	}
+	free_uid(new->user);
+	new->user = new_user;
+	commit_creds(new);
 	return ns;
 }
 
-- 
cgit v1.2.3


From a6f76f23d297f70e2a6b3ec607f7aeeea9e37e8d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:24 +1100
Subject: CRED: Make execve() take advantage of copy-on-write credentials

Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.

This patch and the preceding patches have been tested with the LTP SELinux
testsuite.

This patch makes several logical sets of alteration:

 (1) execve().

     The credential bits from struct linux_binprm are, for the most part,
     replaced with a single credentials pointer (bprm->cred).  This means that
     all the creds can be calculated in advance and then applied at the point
     of no return with no possibility of failure.

     I would like to replace bprm->cap_effective with:

	cap_isclear(bprm->cap_effective)

     but this seems impossible due to special behaviour for processes of pid 1
     (they always retain their parent's capability masks where normally they'd
     be changed - see cap_bprm_set_creds()).

     The following sequence of events now happens:

     (a) At the start of do_execve, the current task's cred_exec_mutex is
     	 locked to prevent PTRACE_ATTACH from obsoleting the calculation of
     	 creds that we make.

     (a) prepare_exec_creds() is then called to make a copy of the current
     	 task's credentials and prepare it.  This copy is then assigned to
     	 bprm->cred.

  	 This renders security_bprm_alloc() and security_bprm_free()
     	 unnecessary, and so they've been removed.

     (b) The determination of unsafe execution is now performed immediately
     	 after (a) rather than later on in the code.  The result is stored in
     	 bprm->unsafe for future reference.

     (c) prepare_binprm() is called, possibly multiple times.

     	 (i) This applies the result of set[ug]id binaries to the new creds
     	     attached to bprm->cred.  Personality bit clearance is recorded,
     	     but now deferred on the basis that the exec procedure may yet
     	     fail.

         (ii) This then calls the new security_bprm_set_creds().  This should
	     calculate the new LSM and capability credentials into *bprm->cred.

	     This folds together security_bprm_set() and parts of
	     security_bprm_apply_creds() (these two have been removed).
	     Anything that might fail must be done at this point.

         (iii) bprm->cred_prepared is set to 1.

	     bprm->cred_prepared is 0 on the first pass of the security
	     calculations, and 1 on all subsequent passes.  This allows SELinux
	     in (ii) to base its calculations only on the initial script and
	     not on the interpreter.

     (d) flush_old_exec() is called to commit the task to execution.  This
     	 performs the following steps with regard to credentials:

	 (i) Clear pdeath_signal and set dumpable on certain circumstances that
	     may not be covered by commit_creds().

         (ii) Clear any bits in current->personality that were deferred from
             (c.i).

     (e) install_exec_creds() [compute_creds() as was] is called to install the
     	 new credentials.  This performs the following steps with regard to
     	 credentials:

         (i) Calls security_bprm_committing_creds() to apply any security
             requirements, such as flushing unauthorised files in SELinux, that
             must be done before the credentials are changed.

	     This is made up of bits of security_bprm_apply_creds() and
	     security_bprm_post_apply_creds(), both of which have been removed.
	     This function is not allowed to fail; anything that might fail
	     must have been done in (c.ii).

         (ii) Calls commit_creds() to apply the new credentials in a single
             assignment (more or less).  Possibly pdeath_signal and dumpable
             should be part of struct creds.

	 (iii) Unlocks the task's cred_replace_mutex, thus allowing
	     PTRACE_ATTACH to take place.

         (iv) Clears The bprm->cred pointer as the credentials it was holding
             are now immutable.

         (v) Calls security_bprm_committed_creds() to apply any security
             alterations that must be done after the creds have been changed.
             SELinux uses this to flush signals and signal handlers.

     (f) If an error occurs before (d.i), bprm_free() will call abort_creds()
     	 to destroy the proposed new credentials and will then unlock
     	 cred_replace_mutex.  No changes to the credentials will have been
     	 made.

 (2) LSM interface.

     A number of functions have been changed, added or removed:

     (*) security_bprm_alloc(), ->bprm_alloc_security()
     (*) security_bprm_free(), ->bprm_free_security()

     	 Removed in favour of preparing new credentials and modifying those.

     (*) security_bprm_apply_creds(), ->bprm_apply_creds()
     (*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()

     	 Removed; split between security_bprm_set_creds(),
     	 security_bprm_committing_creds() and security_bprm_committed_creds().

     (*) security_bprm_set(), ->bprm_set_security()

     	 Removed; folded into security_bprm_set_creds().

     (*) security_bprm_set_creds(), ->bprm_set_creds()

     	 New.  The new credentials in bprm->creds should be checked and set up
     	 as appropriate.  bprm->cred_prepared is 0 on the first call, 1 on the
     	 second and subsequent calls.

     (*) security_bprm_committing_creds(), ->bprm_committing_creds()
     (*) security_bprm_committed_creds(), ->bprm_committed_creds()

     	 New.  Apply the security effects of the new credentials.  This
     	 includes closing unauthorised files in SELinux.  This function may not
     	 fail.  When the former is called, the creds haven't yet been applied
     	 to the process; when the latter is called, they have.

 	 The former may access bprm->cred, the latter may not.

 (3) SELinux.

     SELinux has a number of changes, in addition to those to support the LSM
     interface changes mentioned above:

     (a) The bprm_security_struct struct has been removed in favour of using
     	 the credentials-under-construction approach.

     (c) flush_unauthorized_files() now takes a cred pointer and passes it on
     	 to inode_has_perm(), file_has_perm() and dentry_open().

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/cred.c | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index cb6b5eda978..e6fcdd67b2e 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -68,7 +68,7 @@ static void release_tgcred_rcu(struct rcu_head *rcu)
 /*
  * Release a set of thread group credentials.
  */
-void release_tgcred(struct cred *cred)
+static void release_tgcred(struct cred *cred)
 {
 #ifdef CONFIG_KEYS
 	struct thread_group_cred *tgcred = cred->tgcred;
@@ -163,6 +163,50 @@ error:
 }
 EXPORT_SYMBOL(prepare_creds);
 
+/*
+ * Prepare credentials for current to perform an execve()
+ * - The caller must hold current->cred_exec_mutex
+ */
+struct cred *prepare_exec_creds(void)
+{
+	struct thread_group_cred *tgcred = NULL;
+	struct cred *new;
+
+#ifdef CONFIG_KEYS
+	tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
+	if (!tgcred)
+		return NULL;
+#endif
+
+	new = prepare_creds();
+	if (!new) {
+		kfree(tgcred);
+		return new;
+	}
+
+#ifdef CONFIG_KEYS
+	/* newly exec'd tasks don't get a thread keyring */
+	key_put(new->thread_keyring);
+	new->thread_keyring = NULL;
+
+	/* create a new per-thread-group creds for all this set of threads to
+	 * share */
+	memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
+
+	atomic_set(&tgcred->usage, 1);
+	spin_lock_init(&tgcred->lock);
+
+	/* inherit the session keyring; new process keyring */
+	key_get(tgcred->session_keyring);
+	tgcred->process_keyring = NULL;
+
+	release_tgcred(new);
+	new->tgcred = tgcred;
+#endif
+
+	return new;
+}
+
 /*
  * prepare new credentials for the usermode helper dispatcher
  */
-- 
cgit v1.2.3


From 98870ab0a5a3f1822aee681d2997017e1c87d026 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:26 +1100
Subject: CRED: Documentation

Document credentials and the new credentials API.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/cred.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index e6fcdd67b2e..b8bd2f99d8c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,4 +1,4 @@
-/* Task credentials management
+/* Task credentials management - see Documentation/credentials.txt
  *
  * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
-- 
cgit v1.2.3


From 3b11a1decef07c19443d24ae926982bc8ec9f4c0 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:26 +1100
Subject: CRED: Differentiate objective and effective subjective credentials on
 a task

Differentiate the objective and real subjective credentials from the effective
subjective credentials on a task by introducing a second credentials pointer
into the task_struct.

task_struct::real_cred then refers to the objective and apparent real
subjective credentials of a task, as perceived by the other tasks in the
system.

task_struct::cred then refers to the effective subjective credentials of a
task, as used by that task when it's actually running.  These are not visible
to the other tasks in the system.

__task_cred(task) then refers to the objective/real credentials of the task in
question.

current_cred() refers to the effective subjective credentials of the current
task.

prepare_creds() uses the objective creds as a base and commit_creds() changes
both pointers in the task_struct (indeed commit_creds() requires them to be the
same).

override_creds() and revert_creds() change the subjective creds pointer only,
and the former returns the old subjective creds.  These are used by NFSD,
faccessat() and do_coredump(), and will by used by CacheFiles.

In SELinux, current_has_perm() is provided as an alternative to
task_has_perm().  This uses the effective subjective context of current,
whereas task_has_perm() uses the objective/real context of the subject.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/cred.c | 38 ++++++++++++++++++++++++++------------
 kernel/fork.c |  6 ++++--
 2 files changed, 30 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index b8bd2f99d8c..f3ca1066061 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -35,7 +35,7 @@ static struct thread_group_cred init_tgcred = {
  * The initial credentials for the initial task
  */
 struct cred init_cred = {
-	.usage			= ATOMIC_INIT(3),
+	.usage			= ATOMIC_INIT(4),
 	.securebits		= SECUREBITS_DEFAULT,
 	.cap_inheritable	= CAP_INIT_INH_SET,
 	.cap_permitted		= CAP_FULL_SET,
@@ -120,6 +120,8 @@ EXPORT_SYMBOL(__put_cred);
  * prepare a new copy, which the caller then modifies and then commits by
  * calling commit_creds().
  *
+ * Preparation involves making a copy of the objective creds for modification.
+ *
  * Returns a pointer to the new creds-to-be if successful, NULL otherwise.
  *
  * Call commit_creds() or abort_creds() to clean up.
@@ -130,7 +132,7 @@ struct cred *prepare_creds(void)
 	const struct cred *old;
 	struct cred *new;
 
-	BUG_ON(atomic_read(&task->cred->usage) < 1);
+	BUG_ON(atomic_read(&task->real_cred->usage) < 1);
 
 	new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
 	if (!new)
@@ -262,6 +264,9 @@ error:
  *
  * We share if we can, but under some circumstances we have to generate a new
  * set.
+ *
+ * The new process gets the current process's subjective credentials as its
+ * objective and subjective credentials
  */
 int copy_creds(struct task_struct *p, unsigned long clone_flags)
 {
@@ -278,6 +283,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 #endif
 		clone_flags & CLONE_THREAD
 	    ) {
+		p->real_cred = get_cred(p->cred);
 		get_cred(p->cred);
 		atomic_inc(&p->cred->user->processes);
 		return 0;
@@ -317,7 +323,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 #endif
 
 	atomic_inc(&new->user->processes);
-	p->cred = new;
+	p->cred = p->real_cred = get_cred(new);
 	return 0;
 }
 
@@ -326,7 +332,9 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
  * @new: The credentials to be assigned
  *
  * Install a new set of credentials to the current task, using RCU to replace
- * the old set.
+ * the old set.  Both the objective and the subjective credentials pointers are
+ * updated.  This function may not be called if the subjective credentials are
+ * in an overridden state.
  *
  * This function eats the caller's reference to the new credentials.
  *
@@ -338,12 +346,15 @@ int commit_creds(struct cred *new)
 	struct task_struct *task = current;
 	const struct cred *old;
 
+	BUG_ON(task->cred != task->real_cred);
+	BUG_ON(atomic_read(&task->real_cred->usage) < 2);
 	BUG_ON(atomic_read(&new->usage) < 1);
-	BUG_ON(atomic_read(&task->cred->usage) < 1);
 
-	old = task->cred;
+	old = task->real_cred;
 	security_commit_creds(new, old);
 
+	get_cred(new); /* we will require a ref for the subj creds too */
+
 	/* dumpability changes */
 	if (old->euid != new->euid ||
 	    old->egid != new->egid ||
@@ -369,6 +380,7 @@ int commit_creds(struct cred *new)
 	 */
 	if (new->user != old->user)
 		atomic_inc(&new->user->processes);
+	rcu_assign_pointer(task->real_cred, new);
 	rcu_assign_pointer(task->cred, new);
 	if (new->user != old->user)
 		atomic_dec(&old->user->processes);
@@ -388,6 +400,8 @@ int commit_creds(struct cred *new)
 	    new->fsgid != old->fsgid)
 		proc_id_connector(task, PROC_EVENT_GID);
 
+	/* release the old obj and subj refs both */
+	put_cred(old);
 	put_cred(old);
 	return 0;
 }
@@ -408,11 +422,11 @@ void abort_creds(struct cred *new)
 EXPORT_SYMBOL(abort_creds);
 
 /**
- * override_creds - Temporarily override the current process's credentials
+ * override_creds - Override the current process's subjective credentials
  * @new: The credentials to be assigned
  *
- * Install a set of temporary override credentials on the current process,
- * returning the old set for later reversion.
+ * Install a set of temporary override subjective credentials on the current
+ * process, returning the old set for later reversion.
  */
 const struct cred *override_creds(const struct cred *new)
 {
@@ -424,11 +438,11 @@ const struct cred *override_creds(const struct cred *new)
 EXPORT_SYMBOL(override_creds);
 
 /**
- * revert_creds - Revert a temporary credentials override
+ * revert_creds - Revert a temporary subjective credentials override
  * @old: The credentials to be restored
  *
- * Revert a temporary set of override credentials to an old set, discarding the
- * override set.
+ * Revert a temporary set of override subjective credentials to an old set,
+ * discarding the override set.
  */
 void revert_creds(const struct cred *old)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index 82a7948a664..af0d0f04585 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -146,6 +146,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	put_cred(tsk->real_cred);
 	put_cred(tsk->cred);
 	delayacct_tsk_free(tsk);
 
@@ -961,10 +962,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
 	retval = -EAGAIN;
-	if (atomic_read(&p->cred->user->processes) >=
+	if (atomic_read(&p->real_cred->user->processes) >=
 			p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
 		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
-		    p->cred->user != current->nsproxy->user_ns->root_user)
+		    p->real_cred->user != current->nsproxy->user_ns->root_user)
 			goto bad_fork_free;
 	}
 
@@ -1278,6 +1279,7 @@ bad_fork_cleanup_put_domain:
 	module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
 	atomic_dec(&p->cred->user->processes);
+	put_cred(p->real_cred);
 	put_cred(p->cred);
 bad_fork_free:
 	free_task(p);
-- 
cgit v1.2.3


From 3a3b7ce9336952ea7b9564d976d068a238976c9d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:28 +1100
Subject: CRED: Allow kernel services to override LSM settings for task actions

Allow kernel services to override LSM settings appropriate to the actions
performed by a task by duplicating a set of credentials, modifying it and then
using task_struct::cred to point to it when performing operations on behalf of
a task.

This is used, for example, by CacheFiles which has to transparently access the
cache on behalf of a process that thinks it is doing, say, NFS accesses with a
potentially inappropriate (with respect to accessing the cache) set of
credentials.

This patch provides two LSM hooks for modifying a task security record:

 (*) security_kernel_act_as() which allows modification of the security datum
     with which a task acts on other objects (most notably files).

 (*) security_kernel_create_files_as() which allows modification of the
     security datum that is used to initialise the security data on a file that
     a task creates.

The patch also provides four new credentials handling functions, which wrap the
LSM functions:

 (1) prepare_kernel_cred()

     Prepare a set of credentials for a kernel service to use, based either on
     a daemon's credentials or on init_cred.  All the keyrings are cleared.

 (2) set_security_override()

     Set the LSM security ID in a set of credentials to a specific security
     context, assuming permission from the LSM policy.

 (3) set_security_override_from_ctx()

     As (2), but takes the security context as a string.

 (4) set_create_files_as()

     Set the file creation LSM security ID in a set of credentials to be the
     same as that on a particular inode.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com> [Smack changes]
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/cred.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index f3ca1066061..13697ca2bb3 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -462,3 +462,116 @@ void __init cred_init(void)
 	cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
 				     0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 }
+
+/**
+ * prepare_kernel_cred - Prepare a set of credentials for a kernel service
+ * @daemon: A userspace daemon to be used as a reference
+ *
+ * Prepare a set of credentials for a kernel service.  This can then be used to
+ * override a task's own credentials so that work can be done on behalf of that
+ * task that requires a different subjective context.
+ *
+ * @daemon is used to provide a base for the security record, but can be NULL.
+ * If @daemon is supplied, then the security data will be derived from that;
+ * otherwise they'll be set to 0 and no groups, full capabilities and no keys.
+ *
+ * The caller may change these controls afterwards if desired.
+ *
+ * Returns the new credentials or NULL if out of memory.
+ *
+ * Does not take, and does not return holding current->cred_replace_mutex.
+ */
+struct cred *prepare_kernel_cred(struct task_struct *daemon)
+{
+	const struct cred *old;
+	struct cred *new;
+
+	new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	if (daemon)
+		old = get_task_cred(daemon);
+	else
+		old = get_cred(&init_cred);
+
+	get_uid(new->user);
+	get_group_info(new->group_info);
+
+#ifdef CONFIG_KEYS
+	atomic_inc(&init_tgcred.usage);
+	new->tgcred = &init_tgcred;
+	new->request_key_auth = NULL;
+	new->thread_keyring = NULL;
+	new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+#endif
+
+#ifdef CONFIG_SECURITY
+	new->security = NULL;
+#endif
+	if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
+		goto error;
+
+	atomic_set(&new->usage, 1);
+	put_cred(old);
+	return new;
+
+error:
+	put_cred(new);
+	return NULL;
+}
+EXPORT_SYMBOL(prepare_kernel_cred);
+
+/**
+ * set_security_override - Set the security ID in a set of credentials
+ * @new: The credentials to alter
+ * @secid: The LSM security ID to set
+ *
+ * Set the LSM security ID in a set of credentials so that the subjective
+ * security is overridden when an alternative set of credentials is used.
+ */
+int set_security_override(struct cred *new, u32 secid)
+{
+	return security_kernel_act_as(new, secid);
+}
+EXPORT_SYMBOL(set_security_override);
+
+/**
+ * set_security_override_from_ctx - Set the security ID in a set of credentials
+ * @new: The credentials to alter
+ * @secctx: The LSM security context to generate the security ID from.
+ *
+ * Set the LSM security ID in a set of credentials so that the subjective
+ * security is overridden when an alternative set of credentials is used.  The
+ * security ID is specified in string form as a security context to be
+ * interpreted by the LSM.
+ */
+int set_security_override_from_ctx(struct cred *new, const char *secctx)
+{
+	u32 secid;
+	int ret;
+
+	ret = security_secctx_to_secid(secctx, strlen(secctx), &secid);
+	if (ret < 0)
+		return ret;
+
+	return set_security_override(new, secid);
+}
+EXPORT_SYMBOL(set_security_override_from_ctx);
+
+/**
+ * set_create_files_as - Set the LSM file create context in a set of credentials
+ * @new: The credentials to alter
+ * @inode: The inode to take the context from
+ *
+ * Change the LSM file creation context in a set of credentials to be the same
+ * as the object context of the specified inode, so that the new inodes have
+ * the same MAC context as that inode.
+ */
+int set_create_files_as(struct cred *new, struct inode *inode)
+{
+	new->fsuid = inode->i_uid;
+	new->fsgid = inode->i_gid;
+	return security_kernel_create_files_as(new, inode);
+}
+EXPORT_SYMBOL(set_create_files_as);
-- 
cgit v1.2.3


From f3c7ac40a99f4044b843e6e2c4f46ab2d354c563 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 14 Nov 2008 16:21:19 -0800
Subject: ftrace: remove condition from ftrace_record_ip

Impact: let module functions be recorded when dyn ftrace not enabled

When dynamic ftrace had a daemon and a hash to record the locations
of mcount callers at run time, the recording needed to stop when
ftrace was disabled. But now that the recording is done at compile time
and the ftrace_record_ip is only called at boot up and when a module
is loaded, we no longer need to check if ftrace_enabled is set.
In fact, this breaks module load if it is not set because we skip
over module functions.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 54cb9a7d15e..3160254f6c7 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -334,7 +334,7 @@ ftrace_record_ip(unsigned long ip)
 {
 	struct dyn_ftrace *rec;
 
-	if (!ftrace_enabled || ftrace_disabled)
+	if (ftrace_disabled)
 		return NULL;
 
 	rec = ftrace_alloc_dyn_node(ip);
-- 
cgit v1.2.3


From b17e8a37a13d0e87165054714434534bb7e69f2d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 14 Nov 2008 16:21:19 -0800
Subject: ftrace: disable ftrace on anomalies in trace start and stop

Impact: robust feature to disable ftrace on start or stop tracing on error

Currently only the initial conversion to nops will disable ftrace
on an anomaly. But if an anomaly happens on start or stopping of the
tracer, it will silently fail.

This patch adds a check there too, to disable ftrace and warn if the
conversion fails.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 81 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 44 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3160254f6c7..d5bd21f3952 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -348,6 +348,47 @@ ftrace_record_ip(unsigned long ip)
 	return rec;
 }
 
+static void print_ip_ins(const char *fmt, unsigned char *p)
+{
+	int i;
+
+	printk(KERN_CONT "%s", fmt);
+
+	for (i = 0; i < MCOUNT_INSN_SIZE; i++)
+		printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
+}
+
+static void ftrace_bug(int failed, unsigned long ip,
+		       unsigned char *expected,
+		       unsigned char *replace)
+{
+	switch (failed) {
+	case -EFAULT:
+		FTRACE_WARN_ON_ONCE(1);
+		pr_info("ftrace faulted on modifying ");
+		print_ip_sym(ip);
+		break;
+	case -EINVAL:
+		FTRACE_WARN_ON_ONCE(1);
+		pr_info("ftrace failed to modify ");
+		print_ip_sym(ip);
+		print_ip_ins(" expected: ", expected);
+		print_ip_ins(" actual: ", (unsigned char *)ip);
+		print_ip_ins(" replace: ", replace);
+		printk(KERN_CONT "\n");
+		break;
+	case -EPERM:
+		FTRACE_WARN_ON_ONCE(1);
+		pr_info("ftrace faulted on writing ");
+		print_ip_sym(ip);
+		break;
+	default:
+		FTRACE_WARN_ON_ONCE(1);
+		pr_info("ftrace faulted on unknown error ");
+		print_ip_sym(ip);
+	}
+}
+
 #define FTRACE_ADDR ((long)(ftrace_caller))
 
 static int
@@ -465,22 +506,13 @@ static void ftrace_replace_code(int enable)
 				if ((system_state == SYSTEM_BOOTING) ||
 				    !core_kernel_text(rec->ip)) {
 					ftrace_free_rec(rec);
-				}
+				} else
+					ftrace_bug(failed, rec->ip, old, new);
 			}
 		}
 	}
 }
 
-static void print_ip_ins(const char *fmt, unsigned char *p)
-{
-	int i;
-
-	printk(KERN_CONT "%s", fmt);
-
-	for (i = 0; i < MCOUNT_INSN_SIZE; i++)
-		printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
-}
-
 static int
 ftrace_code_disable(struct dyn_ftrace *rec)
 {
@@ -495,32 +527,7 @@ ftrace_code_disable(struct dyn_ftrace *rec)
 
 	ret = ftrace_modify_code(ip, call, nop);
 	if (ret) {
-		switch (ret) {
-		case -EFAULT:
-			FTRACE_WARN_ON_ONCE(1);
-			pr_info("ftrace faulted on modifying ");
-			print_ip_sym(ip);
-			break;
-		case -EINVAL:
-			FTRACE_WARN_ON_ONCE(1);
-			pr_info("ftrace failed to modify ");
-			print_ip_sym(ip);
-			print_ip_ins(" expected: ", call);
-			print_ip_ins(" actual: ", (unsigned char *)ip);
-			print_ip_ins(" replace: ", nop);
-			printk(KERN_CONT "\n");
-			break;
-		case -EPERM:
-			FTRACE_WARN_ON_ONCE(1);
-			pr_info("ftrace faulted on writing ");
-			print_ip_sym(ip);
-			break;
-		default:
-			FTRACE_WARN_ON_ONCE(1);
-			pr_info("ftrace faulted on unknown error ");
-			print_ip_sym(ip);
-		}
-
+		ftrace_bug(ret, ip, call, nop);
 		rec->flags |= FTRACE_FL_FAILED;
 		return 0;
 	}
-- 
cgit v1.2.3


From 918c115410c6cc57033835b6a401e57697f9ea4f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 14 Nov 2008 16:21:19 -0800
Subject: ftrace: do not process freed records

Impact: keep from converting freed records

When the tracer is started or stopped, it converts all code pointed
to by the saved records into callers to ftrace or nops. When modules
are unloaded, their records are freed, but they still exist within
the record pages.

This patch changes the code to skip over freed records.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d5bd21f3952..3940c71ac2a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -488,8 +488,12 @@ static void ftrace_replace_code(int enable)
 		for (i = 0; i < pg->index; i++) {
 			rec = &pg->records[i];
 
-			/* don't modify code that has already faulted */
-			if (rec->flags & FTRACE_FL_FAILED)
+			/*
+			 * Skip over free records and records that have
+			 * failed.
+			 */
+			if (rec->flags & FTRACE_FL_FREE ||
+			    rec->flags & FTRACE_FL_FAILED)
 				continue;
 
 			/* ignore updates to this record's mcount site */
-- 
cgit v1.2.3


From d51ad7ac48f991c4a8834485727efa99a691cb87 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 15 Nov 2008 15:48:29 -0500
Subject: ftrace: replace raw_local_irq_save with local_irq_save

Impact: fix lockdep disabling itself when function tracing is enabled

The raw_local_irq_saves used in ftrace is causing problems with
lockdep. (it thinks the irq flags are out of sync and disables
itself with a warning)

The raw ops here are not needed, and the normal local_irq_save is fine.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c          | 4 ++--
 kernel/trace/trace_selftest.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4a904623e05..dff4bee591b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1051,7 +1051,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 	 * Need to use raw, since this must be called before the
 	 * recursive protection is performed.
 	 */
-	raw_local_irq_save(flags);
+	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
@@ -1062,7 +1062,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 	}
 
 	atomic_dec(&data->disabled);
-	raw_local_irq_restore(flags);
+	local_irq_restore(flags);
 }
 
 #ifdef CONFIG_FUNCTION_RET_TRACER
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 24e6e075e6d..5cb64ea061b 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -52,7 +52,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
 	int cpu, ret = 0;
 
 	/* Don't allow flipping of max traces now */
-	raw_local_irq_save(flags);
+	local_irq_save(flags);
 	__raw_spin_lock(&ftrace_max_lock);
 
 	cnt = ring_buffer_entries(tr->buffer);
@@ -63,7 +63,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
 			break;
 	}
 	__raw_spin_unlock(&ftrace_max_lock);
-	raw_local_irq_restore(flags);
+	local_irq_restore(flags);
 
 	if (count)
 		*count = cnt;
-- 
cgit v1.2.3


From 31e889098a80ceb3e9e3c555d522b2686a6663c6 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 14 Nov 2008 16:21:19 -0800
Subject: ftrace: pass module struct to arch dynamic ftrace functions

Impact: allow archs more flexibility on dynamic ftrace implementations

Dynamic ftrace has largly been developed on x86. Since x86 does not
have the same limitations as other architectures, the ftrace interaction
between the generic code and the architecture specific code was not
flexible enough to handle some of the issues that other architectures
have.

Most notably, module trampolines. Due to the limited branch distance
that archs make in calling kernel core code from modules, the module
load code must create a trampoline to jump to what will make the
larger jump into core kernel code.

The problem arises when this happens to a call to mcount. Ftrace checks
all code before modifying it and makes sure the current code is what
it expects. Right now, there is not enough information to handle modifying
module trampolines.

This patch changes the API between generic dynamic ftrace code and
the arch dependent code. There is now two functions for modifying code:

  ftrace_make_nop(mod, rec, addr) - convert the code at rec->ip into
       a nop, where the original text is calling addr. (mod is the
       module struct if called by module init)

  ftrace_make_caller(rec, addr) - convert the code rec->ip that should
       be a nop into a caller to addr.

The record "rec" now has a new field called "arch" where the architecture
can add any special attributes to each call site record.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/module.c       |  2 +-
 kernel/trace/ftrace.c | 62 ++++++++++++++++++++-------------------------------
 2 files changed, 25 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 1f4cc00e0c2..69791274e89 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2201,7 +2201,7 @@ static noinline struct module *load_module(void __user *umod,
 	/* sechdrs[0].sh_size is always zero */
 	mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
 			    sizeof(*mseg), &num_mcount);
-	ftrace_init_module(mseg, mseg + num_mcount);
+	ftrace_init_module(mod, mseg, mseg + num_mcount);
 
 	err = module_finalize(hdr, sechdrs, mod);
 	if (err < 0)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3940c71ac2a..e9a5fbfce08 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -358,9 +358,7 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
 		printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
 }
 
-static void ftrace_bug(int failed, unsigned long ip,
-		       unsigned char *expected,
-		       unsigned char *replace)
+static void ftrace_bug(int failed, unsigned long ip)
 {
 	switch (failed) {
 	case -EFAULT:
@@ -372,9 +370,7 @@ static void ftrace_bug(int failed, unsigned long ip,
 		FTRACE_WARN_ON_ONCE(1);
 		pr_info("ftrace failed to modify ");
 		print_ip_sym(ip);
-		print_ip_ins(" expected: ", expected);
 		print_ip_ins(" actual: ", (unsigned char *)ip);
-		print_ip_ins(" replace: ", replace);
 		printk(KERN_CONT "\n");
 		break;
 	case -EPERM:
@@ -392,8 +388,7 @@ static void ftrace_bug(int failed, unsigned long ip,
 #define FTRACE_ADDR ((long)(ftrace_caller))
 
 static int
-__ftrace_replace_code(struct dyn_ftrace *rec,
-		      unsigned char *old, unsigned char *new, int enable)
+__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 {
 	unsigned long ip, fl;
 
@@ -435,12 +430,10 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
 		 * otherwise enable it!
 		 */
 		if (fl & FTRACE_FL_ENABLED) {
-			/* swap new and old */
-			new = old;
-			old = ftrace_call_replace(ip, FTRACE_ADDR);
+			enable = 0;
 			rec->flags &= ~FTRACE_FL_ENABLED;
 		} else {
-			new = ftrace_call_replace(ip, FTRACE_ADDR);
+			enable = 1;
 			rec->flags |= FTRACE_FL_ENABLED;
 		}
 	} else {
@@ -453,10 +446,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
 			fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
 			if (fl == FTRACE_FL_NOTRACE)
 				return 0;
-
-			new = ftrace_call_replace(ip, FTRACE_ADDR);
-		} else
-			old = ftrace_call_replace(ip, FTRACE_ADDR);
+		}
 
 		if (enable) {
 			if (rec->flags & FTRACE_FL_ENABLED)
@@ -469,21 +459,18 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
 		}
 	}
 
-	return ftrace_modify_code(ip, old, new);
+	if (enable)
+		return ftrace_make_call(rec, FTRACE_ADDR);
+	else
+		return ftrace_make_nop(NULL, rec, FTRACE_ADDR);
 }
 
 static void ftrace_replace_code(int enable)
 {
 	int i, failed;
-	unsigned char *new = NULL, *old = NULL;
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
 
-	if (enable)
-		old = ftrace_nop_replace();
-	else
-		new = ftrace_nop_replace();
-
 	for (pg = ftrace_pages_start; pg; pg = pg->next) {
 		for (i = 0; i < pg->index; i++) {
 			rec = &pg->records[i];
@@ -504,34 +491,30 @@ static void ftrace_replace_code(int enable)
 				unfreeze_record(rec);
 			}
 
-			failed = __ftrace_replace_code(rec, old, new, enable);
+			failed = __ftrace_replace_code(rec, enable);
 			if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
 				rec->flags |= FTRACE_FL_FAILED;
 				if ((system_state == SYSTEM_BOOTING) ||
 				    !core_kernel_text(rec->ip)) {
 					ftrace_free_rec(rec);
 				} else
-					ftrace_bug(failed, rec->ip, old, new);
+					ftrace_bug(failed, rec->ip);
 			}
 		}
 	}
 }
 
 static int
-ftrace_code_disable(struct dyn_ftrace *rec)
+ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
 {
 	unsigned long ip;
-	unsigned char *nop, *call;
 	int ret;
 
 	ip = rec->ip;
 
-	nop = ftrace_nop_replace();
-	call = ftrace_call_replace(ip, mcount_addr);
-
-	ret = ftrace_modify_code(ip, call, nop);
+	ret = ftrace_make_nop(mod, rec, mcount_addr);
 	if (ret) {
-		ftrace_bug(ret, ip, call, nop);
+		ftrace_bug(ret, ip);
 		rec->flags |= FTRACE_FL_FAILED;
 		return 0;
 	}
@@ -650,7 +633,7 @@ static cycle_t		ftrace_update_time;
 static unsigned long	ftrace_update_cnt;
 unsigned long		ftrace_update_tot_cnt;
 
-static int ftrace_update_code(void)
+static int ftrace_update_code(struct module *mod)
 {
 	struct dyn_ftrace *p, *t;
 	cycle_t start, stop;
@@ -667,7 +650,7 @@ static int ftrace_update_code(void)
 		list_del_init(&p->list);
 
 		/* convert record (i.e, patch mcount-call with NOP) */
-		if (ftrace_code_disable(p)) {
+		if (ftrace_code_disable(mod, p)) {
 			p->flags |= FTRACE_FL_CONVERTED;
 			ftrace_update_cnt++;
 		} else
@@ -1309,7 +1292,8 @@ static __init int ftrace_init_debugfs(void)
 
 fs_initcall(ftrace_init_debugfs);
 
-static int ftrace_convert_nops(unsigned long *start,
+static int ftrace_convert_nops(struct module *mod,
+			       unsigned long *start,
 			       unsigned long *end)
 {
 	unsigned long *p;
@@ -1325,18 +1309,19 @@ static int ftrace_convert_nops(unsigned long *start,
 
 	/* disable interrupts to prevent kstop machine */
 	local_irq_save(flags);
-	ftrace_update_code();
+	ftrace_update_code(mod);
 	local_irq_restore(flags);
 	mutex_unlock(&ftrace_start_lock);
 
 	return 0;
 }
 
-void ftrace_init_module(unsigned long *start, unsigned long *end)
+void ftrace_init_module(struct module *mod,
+			unsigned long *start, unsigned long *end)
 {
 	if (ftrace_disabled || start == end)
 		return;
-	ftrace_convert_nops(start, end);
+	ftrace_convert_nops(mod, start, end);
 }
 
 extern unsigned long __start_mcount_loc[];
@@ -1366,7 +1351,8 @@ void __init ftrace_init(void)
 
 	last_ftrace_enabled = ftrace_enabled = 1;
 
-	ret = ftrace_convert_nops(__start_mcount_loc,
+	ret = ftrace_convert_nops(NULL,
+				  __start_mcount_loc,
 				  __stop_mcount_loc);
 
 	return;
-- 
cgit v1.2.3


From 20e5227e9f55ae1969934821ccbf581563785bbe Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 14 Nov 2008 16:21:19 -0800
Subject: ftrace: allow NULL pointers in mcount_loc

Impact: make ftrace_convert_nops() more permissive

Due to the way different architecture linkers combine the data sections
of the mcount_loc (the section that lists all the locations that
call mcount), there may be zeros added in that section. This is usually
due to strange alignments that the linker performs, that pads in zeros.

This patch makes the conversion code to nops skip any pointer in
the mcount_loc section that is NULL.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index e9a5fbfce08..cc4219135dc 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1304,6 +1304,14 @@ static int ftrace_convert_nops(struct module *mod,
 	p = start;
 	while (p < end) {
 		addr = ftrace_call_adjust(*p++);
+		/*
+		 * Some architecture linkers will pad between
+		 * the different mcount_loc sections of different
+		 * object files to satisfy alignments.
+		 * Skip any NULL pointers.
+		 */
+		if (!addr)
+			continue;
 		ftrace_record_ip(addr);
 	}
 
-- 
cgit v1.2.3


From 982c350b9ec4b3564d67f3627a274ae61bbc7e95 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 15 Nov 2008 16:31:41 -0500
Subject: ftrace: fix dyn ftrace filter

Impact: correct implementation of dyn ftrace filter

The old decisions made by the filter algorithm was complex and incorrect.
This lead to inconsistent enabling or disabling of functions when
the filter was used.

This patch simplifies that code and in doing so, corrects the usage
of the filters.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 83 ++++++++++++++++++++++-----------------------------
 1 file changed, 36 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index cc4219135dc..b9f2e22faf2 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -394,72 +394,62 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 
 	ip = rec->ip;
 
-	if (ftrace_filtered && enable) {
+	/*
+	 * If this record is not to be traced and
+	 * it is not enabled then do nothing.
+	 *
+	 * If this record is not to be traced and
+	 * it is enabled then disabled it.
+	 *
+	 */
+	if (rec->flags & FTRACE_FL_NOTRACE) {
+		if (rec->flags & FTRACE_FL_ENABLED)
+			rec->flags &= ~FTRACE_FL_ENABLED;
+		else
+			return 0;
+
+	} else if (ftrace_filtered && enable) {
 		/*
-		 * If filtering is on:
-		 *
-		 * If this record is set to be filtered and
-		 * is enabled then do nothing.
-		 *
-		 * If this record is set to be filtered and
-		 * it is not enabled, enable it.
-		 *
-		 * If this record is not set to be filtered
-		 * and it is not enabled do nothing.
-		 *
-		 * If this record is set not to trace then
-		 * do nothing.
-		 *
-		 * If this record is set not to trace and
-		 * it is enabled then disable it.
-		 *
-		 * If this record is not set to be filtered and
-		 * it is enabled, disable it.
+		 * Filtering is on:
 		 */
 
-		fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE |
-				   FTRACE_FL_ENABLED);
+		fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
 
-		if ((fl ==  (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) ||
-		    (fl ==  (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) ||
-		    !fl || (fl == FTRACE_FL_NOTRACE))
+		/* Record is filtered and enabled, do nothing */
+		if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
 			return 0;
 
-		/*
-		 * If it is enabled disable it,
-		 * otherwise enable it!
-		 */
-		if (fl & FTRACE_FL_ENABLED) {
-			enable = 0;
+		/* Record is not filtered and is not enabled do nothing */
+		if (!fl)
+			return 0;
+
+		/* Record is not filtered but enabled, disable it */
+		if (fl == FTRACE_FL_ENABLED)
 			rec->flags &= ~FTRACE_FL_ENABLED;
-		} else {
-			enable = 1;
+		else
+		/* Otherwise record is filtered but not enabled, enable it */
 			rec->flags |= FTRACE_FL_ENABLED;
-		}
 	} else {
+		/* Disable or not filtered */
 
 		if (enable) {
-			/*
-			 * If this record is set not to trace and is
-			 * not enabled, do nothing.
-			 */
-			fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
-			if (fl == FTRACE_FL_NOTRACE)
-				return 0;
-		}
-
-		if (enable) {
+			/* if record is enabled, do nothing */
 			if (rec->flags & FTRACE_FL_ENABLED)
 				return 0;
+
 			rec->flags |= FTRACE_FL_ENABLED;
+
 		} else {
+
+			/* if record is not enabled do nothing */
 			if (!(rec->flags & FTRACE_FL_ENABLED))
 				return 0;
+
 			rec->flags &= ~FTRACE_FL_ENABLED;
 		}
 	}
 
-	if (enable)
+	if (rec->flags & FTRACE_FL_ENABLED)
 		return ftrace_make_call(rec, FTRACE_ADDR);
 	else
 		return ftrace_make_nop(NULL, rec, FTRACE_ADDR);
@@ -554,8 +544,7 @@ static void ftrace_startup(void)
 
 	mutex_lock(&ftrace_start_lock);
 	ftrace_start_up++;
-	if (ftrace_start_up == 1)
-		command |= FTRACE_ENABLE_CALLS;
+	command |= FTRACE_ENABLE_CALLS;
 
 	if (saved_ftrace_func != ftrace_trace_function) {
 		saved_ftrace_func = ftrace_trace_function;
-- 
cgit v1.2.3


From ee02a2e5c88ca2e4d6921f08d037b46d5bf82641 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 15 Nov 2008 16:31:41 -0500
Subject: ftrace: make filtered functions effective on setting

Impact: set filtered functions at time the filter is set

It can be confusing when the set_filter_functions is set (or cleared)
and the functions being recorded by the dynamic tracer does not
match.

This patch causes the code to be updated if the function tracer is
enabled and the filter is changed.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b9f2e22faf2..b42ec1de546 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1194,7 +1194,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
 
 	mutex_lock(&ftrace_sysctl_lock);
 	mutex_lock(&ftrace_start_lock);
-	if (iter->filtered && ftrace_start_up && ftrace_enabled)
+	if (ftrace_start_up && ftrace_enabled)
 		ftrace_run_update_code(FTRACE_ENABLE_CALLS);
 	mutex_unlock(&ftrace_start_lock);
 	mutex_unlock(&ftrace_sysctl_lock);
-- 
cgit v1.2.3


From e6e7a65aabdb696cf05a56cfd495c49a11fd4cde Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 16 Nov 2008 05:53:19 +0100
Subject: tracing/ftrace: fix unexpected -EINVAL when longest tracer name is
 set

Impact: fix confusing write() -EINVAL when changing the tracer

The following commit d9e540762f5cdd89f24e518ad1fd31142d0b9726 remade
alive the bug which made the set of a new tracer returning -EINVAL if
this is the longest name of tracer. This patch corrects it.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dff4bee591b..80898f4870c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2655,6 +2655,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
 	char buf[max_tracer_type_len+1];
 	int i;
 	size_t ret;
+	int err;
+
+	ret = cnt;
 
 	if (cnt > max_tracer_type_len)
 		cnt = max_tracer_type_len;
@@ -2668,12 +2671,11 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
 	for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
 		buf[i] = 0;
 
-	ret = tracing_set_tracer(buf);
-	if (!ret)
-		ret = cnt;
+	err = tracing_set_tracer(buf);
+	if (err)
+		return err;
 
-	if (ret > 0)
-		filp->f_pos += ret;
+	filp->f_pos += ret;
 
 	return ret;
 }
-- 
cgit v1.2.3


From 1c80025a49855b12fa09bb6db71820e3367b1369 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 16 Nov 2008 05:57:26 +0100
Subject: tracing/ftrace: change the type of the init() callback

Impact: extend the ->init() method with the ability to fail

This bring a way to know if the initialization of a tracer successed.
A tracer must return 0 on success and a traditional error (ie:
-ENOMEM) if it fails.

If a tracer fails to init, it is free to print a detailed warn. The
tracing api will not and switch to a new tracer will just return the
error from the init callback.

Note: this will be used for the return tracer.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c                  |  7 ++--
 kernel/trace/trace.h                  |  3 +-
 kernel/trace/trace_boot.c             |  3 +-
 kernel/trace/trace_branch.c           |  3 +-
 kernel/trace/trace_functions.c        |  3 +-
 kernel/trace/trace_functions_return.c |  3 +-
 kernel/trace/trace_irqsoff.c          |  9 +++--
 kernel/trace/trace_mmiotrace.c        |  3 +-
 kernel/trace/trace_nop.c              |  3 +-
 kernel/trace/trace_sched_switch.c     |  3 +-
 kernel/trace/trace_sched_wakeup.c     |  3 +-
 kernel/trace/trace_selftest.c         | 66 ++++++++++++++++++++++++++++++-----
 kernel/trace/trace_sysprof.c          |  3 +-
 13 files changed, 88 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 80898f4870c..396fda034e3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2638,8 +2638,11 @@ static int tracing_set_tracer(char *buf)
 		current_trace->reset(tr);
 
 	current_trace = t;
-	if (t->init)
-		t->init(tr);
+	if (t->init) {
+		ret = t->init(tr);
+		if (ret)
+			goto out;
+	}
 
 	trace_branch_enable(tr);
  out:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 790ea8c0e1f..cdbd5cc22be 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -264,7 +264,8 @@ enum print_line_t {
  */
 struct tracer {
 	const char		*name;
-	void			(*init)(struct trace_array *tr);
+	/* Your tracer should raise a warning if init fails */
+	int			(*init)(struct trace_array *tr);
 	void			(*reset)(struct trace_array *tr);
 	void			(*start)(struct trace_array *tr);
 	void			(*stop)(struct trace_array *tr);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index cb333b7fd11..a4fa2c57e34 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -47,7 +47,7 @@ static void reset_boot_trace(struct trace_array *tr)
 		tracing_reset(tr, cpu);
 }
 
-static void boot_trace_init(struct trace_array *tr)
+static int boot_trace_init(struct trace_array *tr)
 {
 	int cpu;
 	boot_trace = tr;
@@ -56,6 +56,7 @@ static void boot_trace_init(struct trace_array *tr)
 		tracing_reset(tr, cpu);
 
 	tracing_sched_switch_assign_trace(tr);
+	return 0;
 }
 
 static enum print_line_t
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 85265553918..44bd39539d6 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -125,7 +125,7 @@ static void stop_branch_trace(struct trace_array *tr)
 	disable_branch_tracing();
 }
 
-static void branch_trace_init(struct trace_array *tr)
+static int branch_trace_init(struct trace_array *tr)
 {
 	int cpu;
 
@@ -133,6 +133,7 @@ static void branch_trace_init(struct trace_array *tr)
 		tracing_reset(tr, cpu);
 
 	start_branch_trace(tr);
+	return 0;
 }
 
 static void branch_trace_reset(struct trace_array *tr)
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8693b7a0a5b..e74f6d0a321 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -42,9 +42,10 @@ static void stop_function_trace(struct trace_array *tr)
 	tracing_stop_cmdline_record();
 }
 
-static void function_trace_init(struct trace_array *tr)
+static int function_trace_init(struct trace_array *tr)
 {
 	start_function_trace(tr);
+	return 0;
 }
 
 static void function_trace_reset(struct trace_array *tr)
diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c
index 7680b21537d..61185f756a1 100644
--- a/kernel/trace/trace_functions_return.c
+++ b/kernel/trace/trace_functions_return.c
@@ -24,13 +24,14 @@ static void stop_return_trace(struct trace_array *tr)
 	unregister_ftrace_return();
 }
 
-static void return_trace_init(struct trace_array *tr)
+static int return_trace_init(struct trace_array *tr)
 {
 	int cpu;
 	for_each_online_cpu(cpu)
 		tracing_reset(tr, cpu);
 
 	start_return_trace(tr);
+	return 0;
 }
 
 static void return_trace_reset(struct trace_array *tr)
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index d919d4eaa7c..7c2e326bbc8 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -416,11 +416,12 @@ static void irqsoff_tracer_close(struct trace_iterator *iter)
 }
 
 #ifdef CONFIG_IRQSOFF_TRACER
-static void irqsoff_tracer_init(struct trace_array *tr)
+static int irqsoff_tracer_init(struct trace_array *tr)
 {
 	trace_type = TRACER_IRQS_OFF;
 
 	__irqsoff_tracer_init(tr);
+	return 0;
 }
 static struct tracer irqsoff_tracer __read_mostly =
 {
@@ -442,11 +443,12 @@ static struct tracer irqsoff_tracer __read_mostly =
 #endif
 
 #ifdef CONFIG_PREEMPT_TRACER
-static void preemptoff_tracer_init(struct trace_array *tr)
+static int preemptoff_tracer_init(struct trace_array *tr)
 {
 	trace_type = TRACER_PREEMPT_OFF;
 
 	__irqsoff_tracer_init(tr);
+	return 0;
 }
 
 static struct tracer preemptoff_tracer __read_mostly =
@@ -471,11 +473,12 @@ static struct tracer preemptoff_tracer __read_mostly =
 #if defined(CONFIG_IRQSOFF_TRACER) && \
 	defined(CONFIG_PREEMPT_TRACER)
 
-static void preemptirqsoff_tracer_init(struct trace_array *tr)
+static int preemptirqsoff_tracer_init(struct trace_array *tr)
 {
 	trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
 
 	__irqsoff_tracer_init(tr);
+	return 0;
 }
 
 static struct tracer preemptirqsoff_tracer __read_mostly =
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 51bcf370215..433d650eda9 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -30,13 +30,14 @@ static void mmio_reset_data(struct trace_array *tr)
 		tracing_reset(tr, cpu);
 }
 
-static void mmio_trace_init(struct trace_array *tr)
+static int mmio_trace_init(struct trace_array *tr)
 {
 	pr_debug("in %s\n", __func__);
 	mmio_trace_array = tr;
 
 	mmio_reset_data(tr);
 	enable_mmiotrace();
+	return 0;
 }
 
 static void mmio_trace_reset(struct trace_array *tr)
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 2ef1d227e7d..0e77415caed 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -24,7 +24,7 @@ static void stop_nop_trace(struct trace_array *tr)
 	/* Nothing to do! */
 }
 
-static void nop_trace_init(struct trace_array *tr)
+static int nop_trace_init(struct trace_array *tr)
 {
 	int cpu;
 	ctx_trace = tr;
@@ -33,6 +33,7 @@ static void nop_trace_init(struct trace_array *tr)
 		tracing_reset(tr, cpu);
 
 	start_nop_trace(tr);
+	return 0;
 }
 
 static void nop_trace_reset(struct trace_array *tr)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index be35bdfe2e3..863390557b4 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -206,10 +206,11 @@ static void stop_sched_trace(struct trace_array *tr)
 	tracing_stop_sched_switch_record();
 }
 
-static void sched_switch_trace_init(struct trace_array *tr)
+static int sched_switch_trace_init(struct trace_array *tr)
 {
 	ctx_trace = tr;
 	start_sched_trace(tr);
+	return 0;
 }
 
 static void sched_switch_trace_reset(struct trace_array *tr)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 983f2b1478c..0067b49746c 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -331,10 +331,11 @@ static void stop_wakeup_tracer(struct trace_array *tr)
 	unregister_trace_sched_wakeup(probe_wakeup);
 }
 
-static void wakeup_tracer_init(struct trace_array *tr)
+static int wakeup_tracer_init(struct trace_array *tr)
 {
 	wakeup_trace = tr;
 	start_wakeup_tracer(tr);
+	return 0;
 }
 
 static void wakeup_tracer_reset(struct trace_array *tr)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 5cb64ea061b..88c8eb70f54 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -71,6 +71,11 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
 	return ret;
 }
 
+static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
+{
+	printk(KERN_WARNING "Failed to init %s tracer, init returned %d\n",
+		trace->name, init_ret);
+}
 #ifdef CONFIG_FUNCTION_TRACER
 
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -111,7 +116,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 	ftrace_set_filter(func_name, strlen(func_name), 1);
 
 	/* enable tracing */
-	trace->init(tr);
+	ret = trace->init(tr);
+	if (ret) {
+		warn_failed_init_tracer(trace, ret);
+		goto out;
+	}
 
 	/* Sleep for a 1/10 of a second */
 	msleep(100);
@@ -181,7 +190,12 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 	ftrace_enabled = 1;
 	tracer_enabled = 1;
 
-	trace->init(tr);
+	ret = trace->init(tr);
+	if (ret) {
+		warn_failed_init_tracer(trace, ret);
+		goto out;
+	}
+
 	/* Sleep for a 1/10 of a second */
 	msleep(100);
 	/* stop the tracing. */
@@ -223,7 +237,12 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
 	int ret;
 
 	/* start the tracing */
-	trace->init(tr);
+	ret = trace->init(tr);
+	if (ret) {
+		warn_failed_init_tracer(trace, ret);
+		return ret;
+	}
+
 	/* reset the max latency */
 	tracing_max_latency = 0;
 	/* disable interrupts for a bit */
@@ -272,7 +291,12 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
 	}
 
 	/* start the tracing */
-	trace->init(tr);
+	ret = trace->init(tr);
+	if (ret) {
+		warn_failed_init_tracer(trace, ret);
+		return ret;
+	}
+
 	/* reset the max latency */
 	tracing_max_latency = 0;
 	/* disable preemption for a bit */
@@ -321,7 +345,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	}
 
 	/* start the tracing */
-	trace->init(tr);
+	ret = trace->init(tr);
+	if (ret) {
+		warn_failed_init_tracer(trace, ret);
+		goto out;
+	}
 
 	/* reset the max latency */
 	tracing_max_latency = 0;
@@ -449,7 +477,12 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
 	wait_for_completion(&isrt);
 
 	/* start the tracing */
-	trace->init(tr);
+	ret = trace->init(tr);
+	if (ret) {
+		warn_failed_init_tracer(trace, ret);
+		return ret;
+	}
+
 	/* reset the max latency */
 	tracing_max_latency = 0;
 
@@ -505,7 +538,12 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
 	int ret;
 
 	/* start the tracing */
-	trace->init(tr);
+	ret = trace->init(tr);
+	if (ret) {
+		warn_failed_init_tracer(trace, ret);
+		return ret;
+	}
+
 	/* Sleep for a 1/10 of a second */
 	msleep(100);
 	/* stop the tracing. */
@@ -532,7 +570,12 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
 	int ret;
 
 	/* start the tracing */
-	trace->init(tr);
+	ret = trace->init(tr);
+	if (ret) {
+		warn_failed_init_tracer(trace, ret);
+		return 0;
+	}
+
 	/* Sleep for a 1/10 of a second */
 	msleep(100);
 	/* stop the tracing. */
@@ -554,7 +597,12 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
 	int ret;
 
 	/* start the tracing */
-	trace->init(tr);
+	ret = trace->init(tr);
+	if (ret) {
+		warn_failed_init_tracer(trace, ret);
+		return ret;
+	}
+
 	/* Sleep for a 1/10 of a second */
 	msleep(100);
 	/* stop the tracing. */
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 05f753422ae..54960edb96d 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -261,11 +261,12 @@ static void stop_stack_trace(struct trace_array *tr)
 	mutex_unlock(&sample_timer_lock);
 }
 
-static void stack_trace_init(struct trace_array *tr)
+static int stack_trace_init(struct trace_array *tr)
 {
 	sysprof_trace = tr;
 
 	start_stack_trace(tr);
+	return 0;
 }
 
 static void stack_trace_reset(struct trace_array *tr)
-- 
cgit v1.2.3


From 072b40a15616fe6bea68466e6bffcfcbf5c8f26f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 16 Nov 2008 05:59:52 +0100
Subject: tracing/branch-tracer: fix a trace recursion on branch tracer

Impact: fix crash when enabling the branch-tracer

When the branch tracer inserts an event through
probe_likely_condition(), it calls local_irq_save() and then results
in a trace recursion.

local_irq_save() -> trace_hardirqs_off() -> trace_hardirqs_off_caller()
	-> unlikely()

The trace_branch.c file is protected by DISABLE_BRANCH_PROFILING but
that doesn't prevent from external call to functions that use
unlikely().

My box crashed each time I tried to set this tracer (sudden and hard
reboot).

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_branch.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 85265553918..2511e32572c 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -41,7 +41,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	if (unlikely(!tr))
 		return;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
 		goto out;
@@ -73,7 +73,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 
  out:
 	atomic_dec(&tr->data[cpu]->disabled);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static inline
-- 
cgit v1.2.3


From e7d3737ea1b102030f44e96c97754101e41515f0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 16 Nov 2008 06:02:06 +0100
Subject: tracing/function-return-tracer: support for dynamic ftrace on
 function return tracer

This patch adds the support for dynamic tracing on the function return tracer.
The whole difference with normal dynamic function tracing is that we don't need
to hook on a particular callback. The only pro that we want is to nop or set
dynamically the calls to ftrace_caller (which is ftrace_return_caller here).

Some security checks ensure that we are not trying to launch dynamic tracing for
return tracing while normal function tracing is already running.

An example of trace with getnstimeofday set as a filter:

ktime_get_ts+0x22/0x50 -> getnstimeofday (2283 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1396 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1382 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1825 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1426 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1464 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1524 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1382 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1382 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1434 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1464 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1502 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1404 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1397 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1051 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1314 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1344 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1163 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1390 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1374 ns)

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig                  |  1 -
 kernel/trace/ftrace.c                 | 58 ++++++++++++++++++++++++++++++++---
 kernel/trace/trace_functions_return.c | 15 ++-------
 3 files changed, 55 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 9c89526b6b7..b8378fad29a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -59,7 +59,6 @@ config FUNCTION_TRACER
 
 config FUNCTION_RET_TRACER
 	bool "Kernel Function return Tracer"
-	depends on !DYNAMIC_FTRACE
 	depends on HAVE_FUNCTION_RET_TRACER
 	depends on FUNCTION_TRACER
 	help
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b42ec1de546..2f78a45aac1 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -50,6 +50,9 @@ static int last_ftrace_enabled;
 /* Quick disabling of function tracer. */
 int function_trace_stop;
 
+/* By default, current tracing type is normal tracing. */
+enum ftrace_tracing_type_t ftrace_tracing_type = FTRACE_TYPE_ENTER;
+
 /*
  * ftrace_disabled is set when an anomaly is discovered.
  * ftrace_disabled is much stronger than ftrace_enabled.
@@ -385,12 +388,21 @@ static void ftrace_bug(int failed, unsigned long ip)
 	}
 }
 
-#define FTRACE_ADDR ((long)(ftrace_caller))
 
 static int
 __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 {
 	unsigned long ip, fl;
+	unsigned long ftrace_addr;
+
+#ifdef CONFIG_FUNCTION_RET_TRACER
+	if (ftrace_tracing_type == FTRACE_TYPE_ENTER)
+		ftrace_addr = (unsigned long)ftrace_caller;
+	else
+		ftrace_addr = (unsigned long)ftrace_return_caller;
+#else
+	ftrace_addr = (unsigned long)ftrace_caller;
+#endif
 
 	ip = rec->ip;
 
@@ -450,9 +462,9 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 	}
 
 	if (rec->flags & FTRACE_FL_ENABLED)
-		return ftrace_make_call(rec, FTRACE_ADDR);
+		return ftrace_make_call(rec, ftrace_addr);
 	else
-		return ftrace_make_nop(NULL, rec, FTRACE_ADDR);
+		return ftrace_make_nop(NULL, rec, ftrace_addr);
 }
 
 static void ftrace_replace_code(int enable)
@@ -1405,10 +1417,17 @@ int register_ftrace_function(struct ftrace_ops *ops)
 		return -1;
 
 	mutex_lock(&ftrace_sysctl_lock);
+
+	if (ftrace_tracing_type == FTRACE_TYPE_RETURN) {
+		ret = -EBUSY;
+		goto out;
+	}
+
 	ret = __register_ftrace_function(ops);
 	ftrace_startup();
-	mutex_unlock(&ftrace_sysctl_lock);
 
+out:
+	mutex_unlock(&ftrace_sysctl_lock);
 	return ret;
 }
 
@@ -1474,16 +1493,45 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 }
 
 #ifdef CONFIG_FUNCTION_RET_TRACER
+
+/* The callback that hooks the return of a function */
 trace_function_return_t ftrace_function_return =
 			(trace_function_return_t)ftrace_stub;
-void register_ftrace_return(trace_function_return_t func)
+
+int register_ftrace_return(trace_function_return_t func)
 {
+	int ret = 0;
+
+	mutex_lock(&ftrace_sysctl_lock);
+
+	/*
+	 * Don't launch return tracing if normal function
+	 * tracing is already running.
+	 */
+	if (ftrace_trace_function != ftrace_stub) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	ftrace_tracing_type = FTRACE_TYPE_RETURN;
 	ftrace_function_return = func;
+	ftrace_startup();
+
+out:
+	mutex_unlock(&ftrace_sysctl_lock);
+	return ret;
 }
 
 void unregister_ftrace_return(void)
 {
+	mutex_lock(&ftrace_sysctl_lock);
+
 	ftrace_function_return = (trace_function_return_t)ftrace_stub;
+	ftrace_shutdown();
+	/* Restore normal tracing type */
+	ftrace_tracing_type = FTRACE_TYPE_ENTER;
+
+	mutex_unlock(&ftrace_sysctl_lock);
 }
 #endif
 
diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c
index 61185f756a1..a68564af022 100644
--- a/kernel/trace/trace_functions_return.c
+++ b/kernel/trace/trace_functions_return.c
@@ -14,29 +14,18 @@
 #include "trace.h"
 
 
-static void start_return_trace(struct trace_array *tr)
-{
-	register_ftrace_return(&trace_function_return);
-}
-
-static void stop_return_trace(struct trace_array *tr)
-{
-	unregister_ftrace_return();
-}
-
 static int return_trace_init(struct trace_array *tr)
 {
 	int cpu;
 	for_each_online_cpu(cpu)
 		tracing_reset(tr, cpu);
 
-	start_return_trace(tr);
-	return 0;
+	return register_ftrace_return(&trace_function_return);
 }
 
 static void return_trace_reset(struct trace_array *tr)
 {
-		stop_return_trace(tr);
+		unregister_ftrace_return();
 }
 
 
-- 
cgit v1.2.3


From 2bdba316c989da028a59becf7516c6350ce3c173 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 14 Nov 2008 17:47:35 -0500
Subject: markers: fix unregister

Impact: fix marker registers/unregister race

get_marker() can return a NULL entry because the mutex is released in
the middle of those functions. Make sure we check to see if it has been
concurrently removed.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/marker.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/marker.c b/kernel/marker.c
index 2898b647d41..de683a7799e 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -653,10 +653,11 @@ int marker_probe_register(const char *name, const char *format,
 		goto end;
 	}
 	mutex_unlock(&markers_mutex);
-	marker_update_probes();		/* may update entry */
+	marker_update_probes();
 	mutex_lock(&markers_mutex);
 	entry = get_marker(name);
-	WARN_ON(!entry);
+	if (!entry)
+		goto end;
 	if (entry->rcu_pending)
 		rcu_barrier_sched();
 	entry->oldptr = old;
@@ -697,7 +698,7 @@ int marker_probe_unregister(const char *name,
 		rcu_barrier_sched();
 	old = marker_entry_remove_probe(entry, probe, probe_private);
 	mutex_unlock(&markers_mutex);
-	marker_update_probes();		/* may update entry */
+	marker_update_probes();
 	mutex_lock(&markers_mutex);
 	entry = get_marker(name);
 	if (!entry)
@@ -778,10 +779,11 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
 		rcu_barrier_sched();
 	old = marker_entry_remove_probe(entry, NULL, probe_private);
 	mutex_unlock(&markers_mutex);
-	marker_update_probes();		/* may update entry */
+	marker_update_probes();
 	mutex_lock(&markers_mutex);
 	entry = get_marker_from_private_data(probe, probe_private);
-	WARN_ON(!entry);
+	if (!entry)
+		goto end;
 	if (entry->rcu_pending)
 		rcu_barrier_sched();
 	entry->oldptr = old;
-- 
cgit v1.2.3


From 021aeb057fc48af03fe5f37d3dda366c0d97aaf3 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 14 Nov 2008 17:47:37 -0500
Subject: markers: use rcu_*_sched_notrace and notrace

Make marker critical code use notrace to make sure they can be used as an
ftrace callback.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/marker.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/marker.c b/kernel/marker.c
index de683a7799e..22cd7bae63e 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -81,7 +81,7 @@ struct marker_entry {
  * though the function pointer change and the marker enabling are two distinct
  * operations that modifies the execution flow of preemptible code.
  */
-void __mark_empty_function(void *probe_private, void *call_private,
+notrace void __mark_empty_function(void *probe_private, void *call_private,
 	const char *fmt, va_list *args)
 {
 }
@@ -97,7 +97,8 @@ EXPORT_SYMBOL_GPL(__mark_empty_function);
  * need to put a full smp_rmb() in this branch. This is why we do not use
  * rcu_dereference() for the pointer read.
  */
-void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
+notrace void marker_probe_cb(const struct marker *mdata,
+		void *call_private, ...)
 {
 	va_list args;
 	char ptype;
@@ -107,7 +108,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
 	 * sure the teardown of the callbacks can be done correctly when they
 	 * are in modules and they insure RCU read coherency.
 	 */
-	rcu_read_lock_sched();
+	rcu_read_lock_sched_notrace();
 	ptype = mdata->ptype;
 	if (likely(!ptype)) {
 		marker_probe_func *func;
@@ -145,7 +146,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
 			va_end(args);
 		}
 	}
-	rcu_read_unlock_sched();
+	rcu_read_unlock_sched_notrace();
 }
 EXPORT_SYMBOL_GPL(marker_probe_cb);
 
@@ -157,12 +158,13 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
  *
  * Should be connected to markers "MARK_NOARGS".
  */
-static void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
+static notrace void marker_probe_cb_noarg(const struct marker *mdata,
+		void *call_private, ...)
 {
 	va_list args;	/* not initialized */
 	char ptype;
 
-	rcu_read_lock_sched();
+	rcu_read_lock_sched_notrace();
 	ptype = mdata->ptype;
 	if (likely(!ptype)) {
 		marker_probe_func *func;
@@ -195,7 +197,7 @@ static void marker_probe_cb_noarg(const struct marker *mdata, void *call_private
 			multi[i].func(multi[i].probe_private, call_private,
 				mdata->format, &args);
 	}
-	rcu_read_unlock_sched();
+	rcu_read_unlock_sched_notrace();
 }
 
 static void free_old_closure(struct rcu_head *head)
-- 
cgit v1.2.3


From a419246ac7c2d9282dfd843103702895bb3f3fd7 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 14 Nov 2008 17:47:38 -0500
Subject: markers: use module notifier

Impact: cleanup

Use module notifiers instead of adding a hook in module.c.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/marker.c | 29 +++++++++++++++++++++++++++++
 kernel/module.c |  4 ----
 2 files changed, 29 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/marker.c b/kernel/marker.c
index 22cd7bae63e..348e70cc355 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -846,3 +846,32 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
 	return ERR_PTR(-ENOENT);
 }
 EXPORT_SYMBOL_GPL(marker_get_private_data);
+
+int marker_module_notify(struct notifier_block *self,
+			 unsigned long val, void *data)
+{
+	struct module *mod = data;
+
+	switch (val) {
+	case MODULE_STATE_COMING:
+		marker_update_probe_range(mod->markers,
+			mod->markers + mod->num_markers);
+		break;
+	case MODULE_STATE_GOING:
+		marker_update_probe_range(mod->markers,
+			mod->markers + mod->num_markers);
+		break;
+	}
+	return 0;
+}
+
+struct notifier_block marker_module_nb = {
+	.notifier_call = marker_module_notify,
+	.priority = 0,
+};
+
+static int init_markers(void)
+{
+	return register_module_notifier(&marker_module_nb);
+}
+__initcall(init_markers);
diff --git a/kernel/module.c b/kernel/module.c
index 1f4cc00e0c2..72c6ca57421 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2184,10 +2184,6 @@ static noinline struct module *load_module(void __user *umod,
 		struct mod_debug *debug;
 		unsigned int num_debug;
 
-#ifdef CONFIG_MARKERS
-		marker_update_probe_range(mod->markers,
-			mod->markers + mod->num_markers);
-#endif
 		debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
 				     sizeof(*debug), &num_debug);
 		dynamic_printk_setup(debug, num_debug);
-- 
cgit v1.2.3


From c1df1bd2c4d4b20c83755a0f41956b57aec4842a Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 14 Nov 2008 17:47:39 -0500
Subject: markers: auto enable tracepoints (new API : trace_mark_tp())

Impact: new API

Add a new API trace_mark_tp(), which declares a marker within a
tracepoint probe. When the marker is activated, the tracepoint is
automatically enabled.

No branch test is used at the marker site, because it would be a
duplicate of the branch already present in the tracepoint.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/marker.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 51 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/marker.c b/kernel/marker.c
index 348e70cc355..c14ec26a9b9 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -479,7 +479,7 @@ static int marker_set_format(struct marker_entry *entry, const char *format)
 static int set_marker(struct marker_entry *entry, struct marker *elem,
 		int active)
 {
-	int ret;
+	int ret = 0;
 	WARN_ON(strcmp(entry->name, elem->name) != 0);
 
 	if (entry->format) {
@@ -531,9 +531,40 @@ static int set_marker(struct marker_entry *entry, struct marker *elem,
 	 */
 	smp_wmb();
 	elem->ptype = entry->ptype;
+
+	if (elem->tp_name && (active ^ elem->state)) {
+		WARN_ON(!elem->tp_cb);
+		/*
+		 * It is ok to directly call the probe registration because type
+		 * checking has been done in the __trace_mark_tp() macro.
+		 */
+
+		if (active) {
+			/*
+			 * try_module_get should always succeed because we hold
+			 * lock_module() to get the tp_cb address.
+			 */
+			ret = try_module_get(__module_text_address(
+				(unsigned long)elem->tp_cb));
+			BUG_ON(!ret);
+			ret = tracepoint_probe_register_noupdate(
+				elem->tp_name,
+				elem->tp_cb);
+		} else {
+			ret = tracepoint_probe_unregister_noupdate(
+				elem->tp_name,
+				elem->tp_cb);
+			/*
+			 * tracepoint_probe_update_all() must be called
+			 * before the module containing tp_cb is unloaded.
+			 */
+			module_put(__module_text_address(
+				(unsigned long)elem->tp_cb));
+		}
+	}
 	elem->state = active;
 
-	return 0;
+	return ret;
 }
 
 /*
@@ -544,7 +575,24 @@ static int set_marker(struct marker_entry *entry, struct marker *elem,
  */
 static void disable_marker(struct marker *elem)
 {
+	int ret;
+
 	/* leave "call" as is. It is known statically. */
+	if (elem->tp_name && elem->state) {
+		WARN_ON(!elem->tp_cb);
+		/*
+		 * It is ok to directly call the probe registration because type
+		 * checking has been done in the __trace_mark_tp() macro.
+		 */
+		ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
+			elem->tp_cb);
+		WARN_ON(ret);
+		/*
+		 * tracepoint_probe_update_all() must be called
+		 * before the module containing tp_cb is unloaded.
+		 */
+		module_put(__module_text_address((unsigned long)elem->tp_cb));
+	}
 	elem->state = 0;
 	elem->single.func = __mark_empty_function;
 	/* Update the function before setting the ptype */
@@ -608,6 +656,7 @@ static void marker_update_probes(void)
 	marker_update_probe_range(__start___markers, __stop___markers);
 	/* Markers in modules. */
 	module_update_markers();
+	tracepoint_probe_update_all();
 }
 
 /**
-- 
cgit v1.2.3


From de0baf9ad661ac630a45a50ea1717cc4f4b33ace Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 14 Nov 2008 17:47:42 -0500
Subject: tracepoints: fix disable

Impact: fix race

Set the probe array pointer to NULL when the tracepoint is disabled.
The probe array point not being NULL could generate a race condition
where the reader would dereference a freed pointer.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/tracepoint.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index e96590f17de..47a7303d6cd 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -262,6 +262,7 @@ static void set_tracepoint(struct tracepoint_entry **entry,
 static void disable_tracepoint(struct tracepoint *elem)
 {
 	elem->state = 0;
+	rcu_assign_pointer(elem->funcs, NULL);
 }
 
 /**
-- 
cgit v1.2.3


From 32f85742778dfc2c74975cf0b9f5bdb13470cb32 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 14 Nov 2008 17:47:46 -0500
Subject: tracepoints: use modules notifiers

Impact: cleanup

Use module notifiers for tracepoint updates rather than adding a hook in
module.c.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/module.c     |  5 -----
 kernel/tracepoint.c | 29 +++++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 72c6ca57421..fc1dff9a178 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2187,11 +2187,6 @@ static noinline struct module *load_module(void __user *umod,
 		debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
 				     sizeof(*debug), &num_debug);
 		dynamic_printk_setup(debug, num_debug);
-
-#ifdef CONFIG_TRACEPOINTS
-		tracepoint_update_probe_range(mod->tracepoints,
-			mod->tracepoints + mod->num_tracepoints);
-#endif
 	}
 
 	/* sechdrs[0].sh_size is always zero */
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 47a7303d6cd..94ac4e35530 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -541,3 +541,32 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter)
 	iter->tracepoint = NULL;
 }
 EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
+
+int tracepoint_module_notify(struct notifier_block *self,
+			     unsigned long val, void *data)
+{
+	struct module *mod = data;
+
+	switch (val) {
+	case MODULE_STATE_COMING:
+		tracepoint_update_probe_range(mod->tracepoints,
+			mod->tracepoints + mod->num_tracepoints);
+		break;
+	case MODULE_STATE_GOING:
+		tracepoint_update_probe_range(mod->tracepoints,
+			mod->tracepoints + mod->num_tracepoints);
+		break;
+	}
+	return 0;
+}
+
+struct notifier_block tracepoint_module_nb = {
+	.notifier_call = tracepoint_module_notify,
+	.priority = 0,
+};
+
+static int init_tracepoints(void)
+{
+	return register_module_notifier(&tracepoint_module_nb);
+}
+__initcall(init_tracepoints);
-- 
cgit v1.2.3


From 7e066fb870fcd1025ec3ba7bbde5d541094f4ce1 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 14 Nov 2008 17:47:47 -0500
Subject: tracepoints: add DECLARE_TRACE() and DEFINE_TRACE()

Impact: API *CHANGE*. Must update all tracepoint users.

Add DEFINE_TRACE() to tracepoints to let them declare the tracepoint
structure in a single spot for all the kernel. It helps reducing memory
consumption, especially when declaring a lot of tracepoints, e.g. for
kmalloc tracing.

*API CHANGE WARNING*: now, DECLARE_TRACE() must be used in headers for
tracepoint declarations rather than DEFINE_TRACE(). This is the sane way
to do it. The name previously used was misleading.

Updates scheduler instrumentation to follow this API change.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c    | 4 ++++
 kernel/fork.c    | 2 ++
 kernel/kthread.c | 3 +++
 kernel/sched.c   | 6 ++++++
 kernel/signal.c  | 2 ++
 5 files changed, 17 insertions(+)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index ae2b92be5fa..f995d241866 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -54,6 +54,10 @@
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
 
+DEFINE_TRACE(sched_process_free);
+DEFINE_TRACE(sched_process_exit);
+DEFINE_TRACE(sched_process_wait);
+
 static void exit_mm(struct task_struct * tsk);
 
 static inline int task_detached(struct task_struct *p)
diff --git a/kernel/fork.c b/kernel/fork.c
index f6083561dfe..0837d0deee5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -79,6 +79,8 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 
 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 
+DEFINE_TRACE(sched_process_fork);
+
 int nr_processes(void)
 {
 	int cpu;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 8e7a7ce3ed0..4fbc456f393 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -21,6 +21,9 @@ static DEFINE_SPINLOCK(kthread_create_lock);
 static LIST_HEAD(kthread_create_list);
 struct task_struct *kthreadd_task;
 
+DEFINE_TRACE(sched_kthread_stop);
+DEFINE_TRACE(sched_kthread_stop_ret);
+
 struct kthread_create_info
 {
 	/* Information passed to kthread() from kthreadd. */
diff --git a/kernel/sched.c b/kernel/sched.c
index 50a21f96467..327f91c63c9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -118,6 +118,12 @@
  */
 #define RUNTIME_INF	((u64)~0ULL)
 
+DEFINE_TRACE(sched_wait_task);
+DEFINE_TRACE(sched_wakeup);
+DEFINE_TRACE(sched_wakeup_new);
+DEFINE_TRACE(sched_switch);
+DEFINE_TRACE(sched_migrate_task);
+
 #ifdef CONFIG_SMP
 /*
  * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
diff --git a/kernel/signal.c b/kernel/signal.c
index 4530fc65445..e9afe63da24 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -41,6 +41,8 @@
 
 static struct kmem_cache *sigqueue_cachep;
 
+DEFINE_TRACE(sched_signal_send);
+
 static void __user *sig_handler(struct task_struct *t, int sig)
 {
 	return t->sighand->action[sig - 1].sa.sa_handler;
-- 
cgit v1.2.3


From 227a837567e339c74d9d4243d03a29bd943a018c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 16 Nov 2008 09:50:34 +0100
Subject: markers/tracpoints: fix non-modular build

fix:

 kernel/marker.c: In function 'marker_module_notify':
 kernel/marker.c:905: error: 'MODULE_STATE_COMING' undeclared (first use in this function)
 [...]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/marker.c     | 4 ++++
 kernel/tracepoint.c | 4 ++++
 2 files changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/marker.c b/kernel/marker.c
index c14ec26a9b9..ea54f264786 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -896,6 +896,8 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
 }
 EXPORT_SYMBOL_GPL(marker_get_private_data);
 
+#ifdef CONFIG_MODULES
+
 int marker_module_notify(struct notifier_block *self,
 			 unsigned long val, void *data)
 {
@@ -924,3 +926,5 @@ static int init_markers(void)
 	return register_module_notifier(&marker_module_nb);
 }
 __initcall(init_markers);
+
+#endif /* CONFIG_MODULES */
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 94ac4e35530..79602740bbb 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -542,6 +542,8 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter)
 }
 EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
 
+#ifdef CONFIG_MODULES
+
 int tracepoint_module_notify(struct notifier_block *self,
 			     unsigned long val, void *data)
 {
@@ -570,3 +572,5 @@ static int init_tracepoints(void)
 	return register_module_notifier(&tracepoint_module_nb);
 }
 __initcall(init_tracepoints);
+
+#endif /* CONFIG_MODULES */
-- 
cgit v1.2.3


From 0c726da983de0704254250ef6495ca152e7abcca Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sun, 16 Nov 2008 16:07:58 +0530
Subject: tracing: branch tracer, fix writing to trace/trace_options

Impact: fix trace_options behavior

writing to trace/trace_options use the index of the array
to find the value of the flag. With branch tracer flag
defined conditionally, this breaks writing to trace_options
with branch tracer disabled.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 2 --
 kernel/trace/trace.h | 2 --
 2 files changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4a904623e05..b04923b72ce 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -259,9 +259,7 @@ static const char *trace_options[] = {
 	"sched-tree",
 	"ftrace_printk",
 	"ftrace_preempt",
-#ifdef CONFIG_BRANCH_TRACER
 	"branch",
-#endif
 	"annotate",
 	NULL
 };
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 790ea8c0e1f..b41d7b4c2ca 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -470,9 +470,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_SCHED_TREE		= 0x200,
 	TRACE_ITER_PRINTK		= 0x400,
 	TRACE_ITER_PREEMPTONLY		= 0x800,
-#ifdef CONFIG_BRANCH_TRACER
 	TRACE_ITER_BRANCH		= 0x1000,
-#endif
 	TRACE_ITER_ANNOTATE		= 0x2000,
 };
 
-- 
cgit v1.2.3


From 74fcd524e808975dd546dac847119f1995a7c622 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 17 Nov 2008 15:39:52 +0100
Subject: account_steal_time: kill the unneeded account_group_system_time()

Impact: remove unnecessary accounting call

I don't actually understand account_steal_time() and I failed to find the
commit which added account_group_system_time(), but this looks bogus.
In any case rq->idle must be single-threaded, so it can't have ->totals.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c94baf2969e..b388c9b243e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4202,7 +4202,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
 
 	if (p == rq->idle) {
 		p->stime = cputime_add(p->stime, steal);
-		account_group_system_time(p, steal);
 		if (atomic_read(&rq->nr_iowait) > 0)
 			cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
 		else
-- 
cgit v1.2.3


From ce394471d13bf071939a9a0b48c64c297676d233 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 17 Nov 2008 15:40:01 +0100
Subject: thread_group_cputime: kill the bogus ->signal != NULL check

Impact: simplify the code

thread_group_cputime() is called by current when it must have the valid
->signal, or under ->siglock, or under tasklist_lock after the ->signal
check, or the caller is wait_task_zombie() which reaps the child. In any
case ->signal can't be NULL.

But the point of this patch is not optimization. If it is possible to call
thread_group_cputime() when ->signal == NULL we are doing something wrong,
and we should not mask the problem. thread_group_cputime() fills *times
and the caller will use it, if we silently use task_struct->*times* we
report the wrong values.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/posix-cpu-timers.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 895337b16a2..3f4377e0aa0 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -58,21 +58,21 @@ void thread_group_cputime(
 	struct task_struct *tsk,
 	struct task_cputime *times)
 {
-	struct signal_struct *sig;
+	struct task_cputime *totals, *tot;
 	int i;
-	struct task_cputime *tot;
 
-	sig = tsk->signal;
-	if (unlikely(!sig) || !sig->cputime.totals) {
+	totals = tsk->signal->cputime.totals;
+	if (!totals) {
 		times->utime = tsk->utime;
 		times->stime = tsk->stime;
 		times->sum_exec_runtime = tsk->se.sum_exec_runtime;
 		return;
 	}
+
 	times->stime = times->utime = cputime_zero;
 	times->sum_exec_runtime = 0;
 	for_each_possible_cpu(i) {
-		tot = per_cpu_ptr(tsk->signal->cputime.totals, i);
+		tot = per_cpu_ptr(totals, i);
 		times->utime = cputime_add(times->utime, tot->utime);
 		times->stime = cputime_add(times->stime, tot->stime);
 		times->sum_exec_runtime += tot->sum_exec_runtime;
-- 
cgit v1.2.3


From 2b5fe6de58276d0b5a7c884d5dbfc300ca47db78 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 17 Nov 2008 15:40:08 +0100
Subject: thread_group_cputime: move a couple of callsites outside of ->siglock

Impact: relax the locking of cpu-time accounting calls

->siglock buys nothing for thread_group_cputime() in do_sys_times() and
wait_task_zombie() (which btw takes the unrelated parent's ->siglock).

Actually I think do_sys_times() doesn't need ->siglock at all.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c | 2 +-
 kernel/sys.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index ae2b92be5fa..b9c4d8bb72e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1330,10 +1330,10 @@ static int wait_task_zombie(struct task_struct *p, int options,
 		 * group, which consolidates times for all threads in the
 		 * group including the group leader.
 		 */
+		thread_group_cputime(p, &cputime);
 		spin_lock_irq(&p->parent->sighand->siglock);
 		psig = p->parent->signal;
 		sig = p->signal;
-		thread_group_cputime(p, &cputime);
 		psig->cutime =
 			cputime_add(psig->cutime,
 			cputime_add(cputime.utime,
diff --git a/kernel/sys.c b/kernel/sys.c
index 31deba8f7d1..5fc3a0cfb99 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -858,8 +858,8 @@ void do_sys_times(struct tms *tms)
 	struct task_cputime cputime;
 	cputime_t cutime, cstime;
 
-	spin_lock_irq(&current->sighand->siglock);
 	thread_group_cputime(current, &cputime);
+	spin_lock_irq(&current->sighand->siglock);
 	cutime = current->signal->cutime;
 	cstime = current->signal->cstime;
 	spin_unlock_irq(&current->sighand->siglock);
-- 
cgit v1.2.3


From adf9f19574334c9a29a2bc956009fcac7edf1a6b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 17 Nov 2008 19:23:42 +0100
Subject: tracing/ftrace: implement a set_flag callback for tracers

Impact: give a way to send specific messages to tracers

The current implementation of tracing uses some flags to control the
output of general tracers. But we have no way to implement custom
flags handling for a specific tracer. This patch proposes a new
callback for the struct tracer which called set_flag and a structure
that represents a 32 bits variable flag.

A tracer can implement a struct tracer_flags on which it puts the
initial value of the flag integer. Than it can place a range of flags
with their name and their flag mask on the flag integer. The structure
that implement a single flag is called struct tracer_opt.

These custom flags will be available through the trace_options file
like the general tracing flags. Changing their value is done like the
other general flags. For example if you have a flag that calls "foo",
you can activate it by writing "foo" or "nofoo" on trace_options.

Note that the set_flag callback is optional and is only needed if you
want the flags changing to be signaled to your tracer and let it to
accept or refuse their assignment.

V2: Some arrangements in coding style....

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++----
 kernel/trace/trace.h | 26 +++++++++++++++
 2 files changed, 112 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2596b5a968c..9531fddcfb8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -43,6 +43,20 @@
 unsigned long __read_mostly	tracing_max_latency = (cycle_t)ULONG_MAX;
 unsigned long __read_mostly	tracing_thresh;
 
+/* For tracers that don't implement custom flags */
+static struct tracer_opt dummy_tracer_opt[] = {
+	{ }
+};
+
+static struct tracer_flags dummy_tracer_flags = {
+	.val = 0,
+	.opts = dummy_tracer_opt
+};
+
+static int dummy_set_flag(u32 old_flags, u32 bit, int set)
+{
+	return 0;
+}
 
 /*
  * Kill all tracing for good (never come back).
@@ -529,6 +543,14 @@ int register_tracer(struct tracer *type)
 		}
 	}
 
+	if (!type->set_flag)
+		type->set_flag = &dummy_set_flag;
+	if (!type->flags)
+		type->flags = &dummy_tracer_flags;
+	else
+		if (!type->flags->opts)
+			type->flags->opts = dummy_tracer_opt;
+
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 	if (type->selftest) {
 		struct tracer *saved_tracer = current_trace;
@@ -2426,10 +2448,13 @@ static ssize_t
 tracing_trace_options_read(struct file *filp, char __user *ubuf,
 		       size_t cnt, loff_t *ppos)
 {
+	int i;
 	char *buf;
 	int r = 0;
 	int len = 0;
-	int i;
+	u32 tracer_flags = current_trace->flags->val;
+	struct tracer_opt *trace_opts = current_trace->flags->opts;
+
 
 	/* calulate max size */
 	for (i = 0; trace_options[i]; i++) {
@@ -2437,6 +2462,15 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
 		len += 3; /* "no" and space */
 	}
 
+	/*
+	 * Increase the size with names of options specific
+	 * of the current tracer.
+	 */
+	for (i = 0; trace_opts[i].name; i++) {
+		len += strlen(trace_opts[i].name);
+		len += 3; /* "no" and space */
+	}
+
 	/* +2 for \n and \0 */
 	buf = kmalloc(len + 2, GFP_KERNEL);
 	if (!buf)
@@ -2449,6 +2483,15 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
 			r += sprintf(buf + r, "no%s ", trace_options[i]);
 	}
 
+	for (i = 0; trace_opts[i].name; i++) {
+		if (tracer_flags & trace_opts[i].bit)
+			r += sprintf(buf + r, "%s ",
+				trace_opts[i].name);
+		else
+			r += sprintf(buf + r, "no%s ",
+				trace_opts[i].name);
+	}
+
 	r += sprintf(buf + r, "\n");
 	WARN_ON(r >= len + 2);
 
@@ -2459,6 +2502,40 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
 	return r;
 }
 
+/* Try to assign a tracer specific option */
+static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
+{
+	struct tracer_flags *trace_flags = trace->flags;
+	struct tracer_opt *opts = NULL;
+	int ret = 0, i = 0;
+	int len;
+
+	for (i = 0; trace_flags->opts[i].name; i++) {
+		opts = &trace_flags->opts[i];
+		len = strlen(opts->name);
+
+		if (strncmp(cmp, opts->name, len) == 0) {
+			ret = trace->set_flag(trace_flags->val,
+				opts->bit, !neg);
+			break;
+		}
+	}
+	/* Not found */
+	if (!trace_flags->opts[i].name)
+		return -EINVAL;
+
+	/* Refused to handle */
+	if (ret)
+		return ret;
+
+	if (neg)
+		trace_flags->val &= ~opts->bit;
+	else
+		trace_flags->val |= opts->bit;
+
+	return 0;
+}
+
 static ssize_t
 tracing_trace_options_write(struct file *filp, const char __user *ubuf,
 			size_t cnt, loff_t *ppos)
@@ -2466,6 +2543,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
 	char buf[64];
 	char *cmp = buf;
 	int neg = 0;
+	int ret;
 	int i;
 
 	if (cnt >= sizeof(buf))
@@ -2492,11 +2570,13 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
 			break;
 		}
 	}
-	/*
-	 * If no option could be set, return an error:
-	 */
-	if (!trace_options[i])
-		return -EINVAL;
+
+	/* If no option could be set, test the specific tracer options */
+	if (!trace_options[i]) {
+		ret = set_tracer_option(current_trace, cmp, neg);
+		if (ret)
+			return ret;
+	}
 
 	filp->f_pos += cnt;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 37947f6b92b..9d22618bf99 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -259,6 +259,29 @@ enum print_line_t {
 	TRACE_TYPE_UNHANDLED	= 2	/* Relay to other output functions */
 };
 
+
+/*
+ * An option specific to a tracer. This is a boolean value.
+ * The bit is the bit index that sets its value on the
+ * flags value in struct tracer_flags.
+ */
+struct tracer_opt {
+	const char 	*name; /* Will appear on the trace_options file */
+	u32 		bit; /* Mask assigned in val field in tracer_flags */
+};
+
+/*
+ * The set of specific options for a tracer. Your tracer
+ * have to set the initial value of the flags val.
+ */
+struct tracer_flags {
+	u32			val;
+	struct tracer_opt 	*opts;
+};
+
+/* Makes more easy to define a tracer opt */
+#define TRACER_OPT(s, b)	.name = #s, .bit = b
+
 /*
  * A specific tracer, represented by methods that operate on a trace array:
  */
@@ -280,8 +303,11 @@ struct tracer {
 					    struct trace_array *tr);
 #endif
 	enum print_line_t	(*print_line)(struct trace_iterator *iter);
+	/* If you handled the flag setting, return 0 */
+	int			(*set_flag)(u32 old_flags, u32 bit, int set);
 	struct tracer		*next;
 	int			print_max;
+	struct tracer_flags 	*flags;
 };
 
 struct trace_seq {
-- 
cgit v1.2.3


From 0619faf657806b943e6acf51f60f1cd023a96c78 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 17 Nov 2008 19:26:30 +0100
Subject: tracing/ftrace: make nop tracer using tracer flags

Impact: give an example on how to use specific tracer flags

This patch propose to use the nop tracer to provide an
example for using the tracer's custom flags implementation.

V2: replace structures and defines just after the headers includes for
    cleanliness.
V3: replace defines by enum values.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Noonan <steven@uplinklabs.net>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_nop.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 0e77415caed..b9767acd30a 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -12,6 +12,27 @@
 
 #include "trace.h"
 
+/* Our two options */
+enum {
+	TRACE_NOP_OPT_ACCEPT = 0x1,
+	TRACE_NOP_OPT_REFUSE = 0x2
+};
+
+/* Options for the tracer (see trace_options file) */
+static struct tracer_opt nop_opts[] = {
+	/* Option that will be accepted by set_flag callback */
+	{ TRACER_OPT(test_nop_accept, TRACE_NOP_OPT_ACCEPT) },
+	/* Option that will be refused by set_flag callback */
+	{ TRACER_OPT(test_nop_refuse, TRACE_NOP_OPT_REFUSE) },
+	{ } /* Always set a last empty entry */
+};
+
+static struct tracer_flags nop_flags = {
+	/* You can check your flags value here when you want. */
+	.val = 0, /* By default: all flags disabled */
+	.opts = nop_opts
+};
+
 static struct trace_array	*ctx_trace;
 
 static void start_nop_trace(struct trace_array *tr)
@@ -41,6 +62,35 @@ static void nop_trace_reset(struct trace_array *tr)
 	stop_nop_trace(tr);
 }
 
+/* It only serves as a signal handler and a callback to
+ * accept or refuse tthe setting of a flag.
+ * If you don't implement it, then the flag setting will be
+ * automatically accepted.
+ */
+static int nop_set_flag(u32 old_flags, u32 bit, int set)
+{
+	/*
+	 * Note that you don't need to update nop_flags.val yourself.
+	 * The tracing Api will do it automatically if you return 0
+	 */
+	if (bit == TRACE_NOP_OPT_ACCEPT) {
+		printk(KERN_DEBUG "nop_test_accept flag set to %d: we accept."
+			" Now cat trace_options to see the result\n",
+			set);
+		return 0;
+	}
+
+	if (bit == TRACE_NOP_OPT_REFUSE) {
+		printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse."
+			"Now cat trace_options to see the result\n",
+			set);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+
 struct tracer nop_trace __read_mostly =
 {
 	.name		= "nop",
@@ -49,5 +99,7 @@ struct tracer nop_trace __read_mostly =
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest	= trace_selftest_startup_nop,
 #endif
+	.flags		= &nop_flags,
+	.set_flag	= nop_set_flag
 };
 
-- 
cgit v1.2.3


From 0231022cc32d5f2e7f3c06b75691dda0ad6aec33 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 17 Nov 2008 03:22:41 +0100
Subject: tracing/function-return-tracer: add the overrun field

Impact: help to find the better depth of trace

We decided to arbitrary define the depth of function return trace as
"20". Perhaps this is not enough. To help finding an optimal depth, we
measure now the overrun: the number of functions that have been missed
for the current thread. By default this is not displayed, we have to
do set a particular flag on the return tracer: echo overrun >
/debug/tracing/trace_options And the overrun will be printed on the
right.

As the trace shows below, the current 20 depth is not enough.

update_wall_time+0x37f/0x8c0 -> update_xtime_cache (345 ns) (Overruns: 2838)
update_wall_time+0x384/0x8c0 -> clocksource_get_next (1141 ns) (Overruns: 2838)
do_timer+0x23/0x100 -> update_wall_time (3882 ns) (Overruns: 2838)
tick_do_update_jiffies64+0xbf/0x160 -> do_timer (5339 ns) (Overruns: 2838)
tick_sched_timer+0x6a/0xf0 -> tick_do_update_jiffies64 (7209 ns) (Overruns: 2838)
vgacon_set_cursor_size+0x98/0x120 -> native_io_delay (2613 ns) (Overruns: 274)
vgacon_cursor+0x16e/0x1d0 -> vgacon_set_cursor_size (33151 ns) (Overruns: 274)
set_cursor+0x5f/0x80 -> vgacon_cursor (36432 ns) (Overruns: 274)
con_flush_chars+0x34/0x40 -> set_cursor (38790 ns) (Overruns: 274)
release_console_sem+0x1ec/0x230 -> up (721 ns) (Overruns: 274)
release_console_sem+0x225/0x230 -> wake_up_klogd (316 ns) (Overruns: 274)
con_flush_chars+0x39/0x40 -> release_console_sem (2996 ns) (Overruns: 274)
con_write+0x22/0x30 -> con_flush_chars (46067 ns) (Overruns: 274)
n_tty_write+0x1cc/0x360 -> con_write (292670 ns) (Overruns: 274)
smp_apic_timer_interrupt+0x2a/0x90 -> native_apic_mem_write (330 ns) (Overruns: 274)
irq_enter+0x17/0x70 -> idle_cpu (413 ns) (Overruns: 274)
smp_apic_timer_interrupt+0x2f/0x90 -> irq_enter (1525 ns) (Overruns: 274)
ktime_get_ts+0x40/0x70 -> getnstimeofday (465 ns) (Overruns: 274)
ktime_get_ts+0x60/0x70 -> set_normalized_timespec (436 ns) (Overruns: 274)
ktime_get+0x16/0x30 -> ktime_get_ts (2501 ns) (Overruns: 274)
hrtimer_interrupt+0x77/0x1a0 -> ktime_get (3439 ns) (Overruns: 274)

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c                  |  1 +
 kernel/trace/trace.h                  |  1 +
 kernel/trace/trace_functions_return.c | 38 +++++++++++++++++++++++++++++------
 3 files changed, 34 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9531fddcfb8..e97c29a6e7b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -853,6 +853,7 @@ static void __trace_function_return(struct trace_array *tr,
 	entry->parent_ip	= trace->ret;
 	entry->rettime		= trace->rettime;
 	entry->calltime		= trace->calltime;
+	entry->overrun		= trace->overrun;
 	ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
 }
 #endif
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9d22618bf99..2cb12fd98f6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -60,6 +60,7 @@ struct ftrace_ret_entry {
 	unsigned long		parent_ip;
 	unsigned long long	calltime;
 	unsigned long long	rettime;
+	unsigned long		overrun;
 };
 extern struct tracer boot_tracer;
 
diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c
index a68564af022..e00d64509c9 100644
--- a/kernel/trace/trace_functions_return.c
+++ b/kernel/trace/trace_functions_return.c
@@ -14,6 +14,19 @@
 #include "trace.h"
 
 
+#define TRACE_RETURN_PRINT_OVERRUN	0x1
+static struct tracer_opt trace_opts[] = {
+	/* Display overruns or not */
+	{ TRACER_OPT(overrun, TRACE_RETURN_PRINT_OVERRUN) },
+	{ } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+	.val = 0, /* Don't display overruns by default */
+	.opts = trace_opts
+};
+
+
 static int return_trace_init(struct trace_array *tr)
 {
 	int cpu;
@@ -42,26 +55,39 @@ print_return_function(struct trace_iterator *iter)
 		ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip);
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
+
 		ret = seq_print_ip_sym(s, field->ip,
 					trace_flags & TRACE_ITER_SYM_MASK);
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
-		ret = trace_seq_printf(s, " (%llu ns)\n",
+
+		ret = trace_seq_printf(s, " (%llu ns)",
 					field->rettime - field->calltime);
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
-		else
-			return TRACE_TYPE_HANDLED;
+
+		if (tracer_flags.val & TRACE_RETURN_PRINT_OVERRUN) {
+			ret = trace_seq_printf(s, " (Overruns: %lu)",
+						field->overrun);
+			if (!ret)
+				return TRACE_TYPE_PARTIAL_LINE;
+		}
+
+		ret = trace_seq_printf(s, "\n");
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+
+		return TRACE_TYPE_HANDLED;
 	}
 	return TRACE_TYPE_UNHANDLED;
 }
 
-static struct tracer return_trace __read_mostly =
-{
+static struct tracer return_trace __read_mostly = {
 	.name	     = "return",
 	.init	     = return_trace_init,
 	.reset	     = return_trace_reset,
-	.print_line = print_return_function
+	.print_line = print_return_function,
+	.flags		= &tracer_flags,
 };
 
 static __init int init_return_trace(void)
-- 
cgit v1.2.3


From a22506347d038a66506c6f57e9b97104128e280d Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 18 Nov 2008 18:06:35 +0100
Subject: ftrace: preemptoff selftest not working

Impact: fix preemptoff and preemptirqsoff tracer self-tests

I was wondering why the preemptoff and preemptirqsoff tracer selftests
don't work on s390. After all its just that they get called from
non-preemptible context:

kernel_init() will execute all initcalls, however the first line in
kernel_init() is lock_kernel(), which causes the preempt_count to be
increased. Any later calls to add_preempt_count() (especially those
from the selftests) will therefore not result in a call to
trace_preempt_off() since the check below in add_preempt_count()
will be false:

        if (preempt_count() == val)
                trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));

Hence the trace buffer will be empty.

Fix this by releasing the BKL during the self-tests.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 396fda034e3..16892121cb7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -532,6 +532,13 @@ int register_tracer(struct tracer *type)
 	}
 
 #ifdef CONFIG_FTRACE_STARTUP_TEST
+	/*
+	 * When this gets called we hold the BKL which means that preemption
+	 * is disabled. Various trace selftests however need to disable
+	 * and enable preemption for successful tests. So we drop the BKL here
+	 * and grab it after the tests again.
+	 */
+	unlock_kernel();
 	if (type->selftest) {
 		struct tracer *saved_tracer = current_trace;
 		struct trace_array *tr = &global_trace;
@@ -562,6 +569,7 @@ int register_tracer(struct tracer *type)
 		}
 		printk(KERN_CONT "PASSED\n");
 	}
+	lock_kernel();
 #endif
 
 	type->next = trace_types;
-- 
cgit v1.2.3


From 86fa2f60674540df0b34f5c547ed0c1cf3a8f212 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 19 Nov 2008 10:00:15 +0100
Subject: ftrace: fix selftest locking

Impact: fix self-test boot crash

Self-test failure forgot to re-lock the BKL - crashing the next
initcall:

Testing tracer irqsoff: .. no entries found ..FAILED!
initcall init_irqsoff_tracer+0x0/0x11 returned 0 after 3906 usecs
calling  init_mmio_trace+0x0/0xf @ 1
------------[ cut here ]------------
Kernel BUG at c0c0a915 [verbose debug info unavailable]
invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC
last sysfs file:

Pid: 1, comm: swapper Not tainted (2.6.28-rc5-tip #53704)
EIP: 0060:[<c0c0a915>] EFLAGS: 00010286 CPU: 1
EIP is at unlock_kernel+0x10/0x2b
EAX: ffffffff EBX: 00000000 ECX: 00000000 EDX: f7030000
ESI: c12da19c EDI: 00000000 EBP: f7039f54 ESP: f7039f54
 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
Process swapper (pid: 1, ti=f7038000 task=f7030000 task.ti=f7038000)
Stack:
 f7039f6c c0164d30 c013fed8 a7d8d7b4 00000000 00000000 f7039f74 c12fb78a
 f7039fd0 c0101132 c12fb77d 00000000 6f727200 6f632072 2d206564 c1002031
 0000000f f7039fa2 f7039fb0 3531b171 00000000 00000000 0000002f c12ca480
Call Trace:
 [<c0164d30>] ? register_tracer+0x66/0x13f
 [<c013fed8>] ? ktime_get+0x19/0x1b
 [<c12fb78a>] ? init_mmio_trace+0xd/0xf
 [<c0101132>] ? do_one_initcall+0x4a/0x111
 [<c12fb77d>] ? init_mmio_trace+0x0/0xf
 [<c015c7e6>] ? init_irq_proc+0x46/0x59
 [<c12e851d>] ? kernel_init+0x104/0x152
 [<c12e8419>] ? kernel_init+0x0/0x152
 [<c01038b7>] ? kernel_thread_helper+0x7/0x10
Code: 58 14 43 75 0a b8 00 9b 2d c1 e8 51 43 7a ff 64 a1 00 a0 37 c1 89 58 14 5b 5d c3 55 64 8b 15 00 a0 37 c1 83 7a 14 00 89 e5 79 04 <0f> 0b eb fe 8b 42 14 48 85 c0 89 42 14 79 0a b8 00 9b 2d c1 e8
EIP: [<c0c0a915>] unlock_kernel+0x10/0x2b SS:ESP 0068:f7039f54
---[ end trace a7919e7f17c0a725 ]---
Kernel panic - not syncing: Attempted to kill init!

So clean up the flow a bit.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 16892121cb7..24b6238884f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -520,7 +520,15 @@ int register_tracer(struct tracer *type)
 		return -1;
 	}
 
+	/*
+	 * When this gets called we hold the BKL which means that
+	 * preemption is disabled. Various trace selftests however
+	 * need to disable and enable preemption for successful tests.
+	 * So we drop the BKL here and grab it after the tests again.
+	 */
+	unlock_kernel();
 	mutex_lock(&trace_types_lock);
+
 	for (t = trace_types; t; t = t->next) {
 		if (strcmp(type->name, t->name) == 0) {
 			/* already found */
@@ -532,13 +540,6 @@ int register_tracer(struct tracer *type)
 	}
 
 #ifdef CONFIG_FTRACE_STARTUP_TEST
-	/*
-	 * When this gets called we hold the BKL which means that preemption
-	 * is disabled. Various trace selftests however need to disable
-	 * and enable preemption for successful tests. So we drop the BKL here
-	 * and grab it after the tests again.
-	 */
-	unlock_kernel();
 	if (type->selftest) {
 		struct tracer *saved_tracer = current_trace;
 		struct trace_array *tr = &global_trace;
@@ -550,9 +551,9 @@ int register_tracer(struct tracer *type)
 		 * internal tracing to verify that everything is in order.
 		 * If we fail, we do not register this tracer.
 		 */
-		for_each_tracing_cpu(i) {
+		for_each_tracing_cpu(i)
 			tracing_reset(tr, i);
-		}
+
 		current_trace = type;
 		/* the test is responsible for initializing and enabling */
 		pr_info("Testing tracer %s: ", type->name);
@@ -564,12 +565,11 @@ int register_tracer(struct tracer *type)
 			goto out;
 		}
 		/* Only reset on passing, to avoid touching corrupted buffers */
-		for_each_tracing_cpu(i) {
+		for_each_tracing_cpu(i)
 			tracing_reset(tr, i);
-		}
+
 		printk(KERN_CONT "PASSED\n");
 	}
-	lock_kernel();
 #endif
 
 	type->next = trace_types;
@@ -580,6 +580,7 @@ int register_tracer(struct tracer *type)
 
  out:
 	mutex_unlock(&trace_types_lock);
+	lock_kernel();
 
 	return ret;
 }
-- 
cgit v1.2.3


From 60a515132086b2c28a8141d873297fdf7a180ca7 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 18 Nov 2008 22:20:10 -0800
Subject: profiling: clean up profile_nop()

Impact: cleanup

No point in inlining this.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/profile.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/profile.c b/kernel/profile.c
index 5b7d1ac7124..7f93a5042d3 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -544,7 +544,7 @@ static const struct file_operations proc_profile_operations = {
 };
 
 #ifdef CONFIG_SMP
-static inline void profile_nop(void *unused)
+static void profile_nop(void *unused)
 {
 }
 
-- 
cgit v1.2.3


From ec4e0e2fe018992d980910db901637c814575914 Mon Sep 17 00:00:00 2001
From: Ken Chen <kenchen@google.com>
Date: Tue, 18 Nov 2008 22:41:57 -0800
Subject: sched: fix inconsistency when redistribute per-cpu tg->cfs_rq shares

Impact: make load-balancing more consistent

In the update_shares() path leading to tg_shares_up(), the calculation of
per-cpu cfs_rq shares is rather erratic even under moderate task wake up
rate.  The problem is that the per-cpu tg->cfs_rq load weight used in the
sd_rq_weight aggregation and actual redistribution of the cfs_rq->shares
are collected at different time.  Under moderate system load, we've seen
quite a bit of variation on the cfs_rq->shares and ultimately wildly
affects sched_entity's load weight.

This patch caches the result of initial per-cpu load weight when doing the
sum calculation, and then pass it down to update_group_shares_cpu() for
redistributing per-cpu cfs_rq shares.  This allows consistent total cfs_rq
shares across all CPUs. It also simplifies the rounding and zero load
weight check.

Signed-off-by: Ken Chen <kenchen@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 41 +++++++++++++++--------------------------
 1 file changed, 15 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index a4c156d9a4a..93bfb086e60 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1453,27 +1453,13 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
 			unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-	int boost = 0;
 	unsigned long shares;
 	unsigned long rq_weight;
 
 	if (!tg->se[cpu])
 		return;
 
-	rq_weight = tg->cfs_rq[cpu]->load.weight;
-
-	/*
-	 * If there are currently no tasks on the cpu pretend there is one of
-	 * average load so that when a new task gets to run here it will not
-	 * get delayed by group starvation.
-	 */
-	if (!rq_weight) {
-		boost = 1;
-		rq_weight = NICE_0_LOAD;
-	}
-
-	if (unlikely(rq_weight > sd_rq_weight))
-		rq_weight = sd_rq_weight;
+	rq_weight = tg->cfs_rq[cpu]->rq_weight;
 
 	/*
 	 *           \Sum shares * rq_weight
@@ -1481,7 +1467,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 	 *               \Sum rq_weight
 	 *
 	 */
-	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+	shares = (sd_shares * rq_weight) / sd_rq_weight;
 	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
 	if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1490,11 +1476,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		/*
-		 * record the actual number of shares, not the boosted amount.
-		 */
-		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-		tg->cfs_rq[cpu]->rq_weight = rq_weight;
+		tg->cfs_rq[cpu]->shares = shares;
 
 		__set_se_shares(tg->se[cpu], shares);
 		spin_unlock_irqrestore(&rq->lock, flags);
@@ -1508,13 +1490,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-	unsigned long rq_weight = 0;
+	unsigned long weight, rq_weight = 0;
 	unsigned long shares = 0;
 	struct sched_domain *sd = data;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
-		rq_weight += tg->cfs_rq[i]->load.weight;
+		/*
+		 * If there are currently no tasks on the cpu pretend there
+		 * is one of average load so that when a new task gets to
+		 * run here it will not get delayed by group starvation.
+		 */
+		weight = tg->cfs_rq[i]->load.weight;
+		if (!weight)
+			weight = NICE_0_LOAD;
+
+		tg->cfs_rq[i]->rq_weight = weight;
+		rq_weight += weight;
 		shares += tg->cfs_rq[i]->shares;
 	}
 
@@ -1524,9 +1516,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
 	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
 		shares = tg->shares;
 
-	if (!rq_weight)
-		rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-
 	for_each_cpu_mask(i, sd->span)
 		update_group_shares_cpu(tg, i, shares, rq_weight);
 
-- 
cgit v1.2.3


From 957ad0166e9f76a8561dafa5e14ef5bd3f5e9a3b Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@gmail.com>
Date: Fri, 21 Nov 2008 01:30:36 +0100
Subject: sched: update comment for move_task_off_dead_cpu

Impact: cleanup

This commit:

commit f7b4cddcc5aca03e80e357360c9424dfba5056c2
Author: Oleg Nesterov <oleg@tv-sign.ru>
Date:   Tue Oct 16 23:30:56 2007 -0700

    do CPU_DEAD migrating under read_lock(tasklist) instead of write_lock_irq(ta

    Currently move_task_off_dead_cpu() is called under
    write_lock_irq(tasklist).  This means it can't use task_lock() which is
    needed to improve migrating to take task's ->cpuset into account.

    Change the code to call move_task_off_dead_cpu() with irqs enabled, and
    change migrate_live_tasks() to use read_lock(tasklist).

...forgot to update the comment in front of move_task_off_dead_cpu.

Reference: http://lkml.org/lkml/2008/6/23/135

Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 93bfb086e60..a6085d5166d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6094,7 +6094,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
 
 /*
  * Figure out where task on dead CPU should go, use force if necessary.
- * NOTE: interrupts should be disabled by the caller
  */
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
-- 
cgit v1.2.3


From f201ae2356c74bcae130b2177b3dca903ea98071 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 23 Nov 2008 06:22:56 +0100
Subject: tracing/function-return-tracer: store return stack into task_struct
 and allocate it dynamically

Impact: use deeper function tracing depth safely

Some tests showed that function return tracing needed a more deeper depth
of function calls. But it could be unsafe to store these return addresses
to the stack.

So these arrays will now be allocated dynamically into task_struct of current
only when the tracer is activated.

Typical scheme when tracer is activated:
- allocate a return stack for each task in global list.
- fork: allocate the return stack for the newly created task
- exit: free return stack of current
- idle init: same as fork

I chose a default depth of 50. I don't have overruns anymore.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c         |  5 ++-
 kernel/fork.c         |  4 +++
 kernel/sched.c        |  3 ++
 kernel/trace/ftrace.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 106 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 35c8ec2ba03..b9d446329da 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -47,6 +47,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/tracehook.h>
 #include <trace/sched.h>
+#include <linux/ftrace.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1127,7 +1128,9 @@ NORET_TYPE void do_exit(long code)
 	preempt_disable();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
-
+#ifdef CONFIG_FUNCTION_RET_TRACER
+	ftrace_retfunc_exit_task(tsk);
+#endif
 	schedule();
 	BUG();
 	/* Avoid "noreturn function does return".  */
diff --git a/kernel/fork.c b/kernel/fork.c
index ac62f43ee43..d1eb30e69cc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -47,6 +47,7 @@
 #include <linux/mount.h>
 #include <linux/audit.h>
 #include <linux/memcontrol.h>
+#include <linux/ftrace.h>
 #include <linux/profile.h>
 #include <linux/rmap.h>
 #include <linux/acct.h>
@@ -1269,6 +1270,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
+#ifdef CONFIG_FUNCTION_RET_TRACER
+	ftrace_retfunc_init_task(p);
+#endif
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
 	return p;
diff --git a/kernel/sched.c b/kernel/sched.c
index 4de56108c86..fb17205950d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5901,6 +5901,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	 * The idle tasks have their own, simple scheduling class:
 	 */
 	idle->sched_class = &idle_sched_class;
+#ifdef CONFIG_FUNCTION_RET_TRACER
+	ftrace_retfunc_init_task(idle);
+#endif
 }
 
 /*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f212da48668..90d99fb02ae 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1498,10 +1498,77 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 
 #ifdef CONFIG_FUNCTION_RET_TRACER
 
+static atomic_t ftrace_retfunc_active;
+
 /* The callback that hooks the return of a function */
 trace_function_return_t ftrace_function_return =
 			(trace_function_return_t)ftrace_stub;
 
+
+/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
+static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
+{
+	int i;
+	int ret = 0;
+	unsigned long flags;
+	int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
+	struct task_struct *g, *t;
+
+	for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
+		ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH
+					* sizeof(struct ftrace_ret_stack),
+					GFP_KERNEL);
+		if (!ret_stack_list[i]) {
+			start = 0;
+			end = i;
+			ret = -ENOMEM;
+			goto free;
+		}
+	}
+
+	read_lock_irqsave(&tasklist_lock, flags);
+	do_each_thread(g, t) {
+		if (start == end) {
+			ret = -EAGAIN;
+			goto unlock;
+		}
+
+		if (t->ret_stack == NULL) {
+			t->ret_stack = ret_stack_list[start++];
+			t->curr_ret_stack = -1;
+			atomic_set(&t->trace_overrun, 0);
+		}
+	} while_each_thread(g, t);
+
+unlock:
+	read_unlock_irqrestore(&tasklist_lock, flags);
+free:
+	for (i = start; i < end; i++)
+		kfree(ret_stack_list[i]);
+	return ret;
+}
+
+/* Allocate a return stack for each task */
+static int start_return_tracing(void)
+{
+	struct ftrace_ret_stack **ret_stack_list;
+	int ret;
+
+	ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE *
+				sizeof(struct ftrace_ret_stack *),
+				GFP_KERNEL);
+
+	if (!ret_stack_list)
+		return -ENOMEM;
+
+	do {
+		ret = alloc_retstack_tasklist(ret_stack_list);
+	} while (ret == -EAGAIN);
+
+	kfree(ret_stack_list);
+	return ret;
+}
+
 int register_ftrace_return(trace_function_return_t func)
 {
 	int ret = 0;
@@ -1516,7 +1583,12 @@ int register_ftrace_return(trace_function_return_t func)
 		ret = -EBUSY;
 		goto out;
 	}
-
+	atomic_inc(&ftrace_retfunc_active);
+	ret = start_return_tracing();
+	if (ret) {
+		atomic_dec(&ftrace_retfunc_active);
+		goto out;
+	}
 	ftrace_tracing_type = FTRACE_TYPE_RETURN;
 	ftrace_function_return = func;
 	ftrace_startup();
@@ -1530,6 +1602,7 @@ void unregister_ftrace_return(void)
 {
 	mutex_lock(&ftrace_sysctl_lock);
 
+	atomic_dec(&ftrace_retfunc_active);
 	ftrace_function_return = (trace_function_return_t)ftrace_stub;
 	ftrace_shutdown();
 	/* Restore normal tracing type */
@@ -1537,6 +1610,27 @@ void unregister_ftrace_return(void)
 
 	mutex_unlock(&ftrace_sysctl_lock);
 }
+
+/* Allocate a return stack for newly created task */
+void ftrace_retfunc_init_task(struct task_struct *t)
+{
+	if (atomic_read(&ftrace_retfunc_active)) {
+		t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
+				* sizeof(struct ftrace_ret_stack),
+				GFP_KERNEL);
+		if (!t->ret_stack)
+			return;
+		t->curr_ret_stack = -1;
+		atomic_set(&t->trace_overrun, 0);
+	} else
+		t->ret_stack = NULL;
+}
+
+void ftrace_retfunc_exit_task(struct task_struct *t)
+{
+	kfree(t->ret_stack);
+	t->ret_stack = NULL;
+}
 #endif
 
 
-- 
cgit v1.2.3


From 82f60f0bc854aada696f27d863c03bef91f1509d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 23 Nov 2008 09:18:56 +0100
Subject: tracing/function-return-tracer: clean up task start/exit callbacks

Impact: cleanup

Eliminate #ifdefs in core code by using empty inline functions.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c  | 2 --
 kernel/fork.c  | 2 --
 kernel/sched.c | 2 --
 3 files changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index b9d446329da..ef04d03b328 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1128,9 +1128,7 @@ NORET_TYPE void do_exit(long code)
 	preempt_disable();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
-#ifdef CONFIG_FUNCTION_RET_TRACER
 	ftrace_retfunc_exit_task(tsk);
-#endif
 	schedule();
 	BUG();
 	/* Avoid "noreturn function does return".  */
diff --git a/kernel/fork.c b/kernel/fork.c
index d1eb30e69cc..fbf4a4c0a62 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1270,9 +1270,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
-#ifdef CONFIG_FUNCTION_RET_TRACER
 	ftrace_retfunc_init_task(p);
-#endif
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
 	return p;
diff --git a/kernel/sched.c b/kernel/sched.c
index fb17205950d..388d9db044a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5901,9 +5901,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	 * The idle tasks have their own, simple scheduling class:
 	 */
 	idle->sched_class = &idle_sched_class;
-#ifdef CONFIG_FUNCTION_RET_TRACER
 	ftrace_retfunc_init_task(idle);
-#endif
 }
 
 /*
-- 
cgit v1.2.3


From 02b67518e2b1c490787dac7f35e1204e74fe21ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= <edwintorok@gmail.com>
Date: Sat, 22 Nov 2008 13:28:47 +0200
Subject: tracing: add support for userspace stacktraces in tracing/iter_ctrl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: add new (default-off) tracing visualization feature

Usage example:

 mount -t debugfs nodev /sys/kernel/debug
 cd /sys/kernel/debug/tracing
 echo userstacktrace >iter_ctrl
 echo sched_switch >current_tracer
 echo 1 >tracing_enabled
 .... run application ...
 echo 0 >tracing_enabled

Then read one of 'trace','latency_trace','trace_pipe'.

To get the best output you can compile your userspace programs with
frame pointers (at least glibc + the app you are tracing).

Signed-off-by: Török Edwin <edwintorok@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h |  9 +++++
 2 files changed, 102 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4ee6f037522..ced8b4fa9f5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -275,6 +275,7 @@ static const char *trace_options[] = {
 	"ftrace_preempt",
 	"branch",
 	"annotate",
+	"userstacktrace",
 	NULL
 };
 
@@ -918,6 +919,44 @@ void __trace_stack(struct trace_array *tr,
 	ftrace_trace_stack(tr, data, flags, skip, preempt_count());
 }
 
+static void ftrace_trace_userstack(struct trace_array *tr,
+		   struct trace_array_cpu *data,
+		   unsigned long flags, int pc)
+{
+	struct userstack_entry *entry;
+	struct stack_trace trace;
+	struct ring_buffer_event *event;
+	unsigned long irq_flags;
+
+	if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
+		return;
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		return;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, flags, pc);
+	entry->ent.type		= TRACE_USER_STACK;
+
+	memset(&entry->caller, 0, sizeof(entry->caller));
+
+	trace.nr_entries	= 0;
+	trace.max_entries	= FTRACE_STACK_ENTRIES;
+	trace.skip		= 0;
+	trace.entries		= entry->caller;
+
+	save_stack_trace_user(&trace);
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+}
+
+void __trace_userstack(struct trace_array *tr,
+		   struct trace_array_cpu *data,
+		   unsigned long flags)
+{
+	ftrace_trace_userstack(tr, data, flags, preempt_count());
+}
+
 static void
 ftrace_trace_special(void *__tr, void *__data,
 		     unsigned long arg1, unsigned long arg2, unsigned long arg3,
@@ -941,6 +980,7 @@ ftrace_trace_special(void *__tr, void *__data,
 	entry->arg3			= arg3;
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
 	ftrace_trace_stack(tr, data, irq_flags, 4, pc);
+	ftrace_trace_userstack(tr, data, irq_flags, pc);
 
 	trace_wake_up();
 }
@@ -979,6 +1019,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->next_cpu	= task_cpu(next);
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
 	ftrace_trace_stack(tr, data, flags, 5, pc);
+	ftrace_trace_userstack(tr, data, flags, pc);
 }
 
 void
@@ -1008,6 +1049,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_cpu			= task_cpu(wakee);
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
 	ftrace_trace_stack(tr, data, flags, 6, pc);
+	ftrace_trace_userstack(tr, data, flags, pc);
 
 	trace_wake_up();
 }
@@ -1387,6 +1429,31 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 	return ret;
 }
 
+static int
+seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
+		unsigned long sym_flags)
+{
+	int ret = 1;
+	unsigned i;
+
+	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+		unsigned long ip = entry->caller[i];
+
+		if (ip == ULONG_MAX || !ret)
+			break;
+		if (i)
+			ret = trace_seq_puts(s, " <- ");
+		if (!ip) {
+			ret = trace_seq_puts(s, "??");
+			continue;
+		}
+		if (ret /*&& (sym_flags & TRACE_ITER_SYM_ADDR)*/)
+			ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+	}
+
+	return ret;
+}
+
 static void print_lat_help_header(struct seq_file *m)
 {
 	seq_puts(m, "#                  _------=> CPU#            \n");
@@ -1702,6 +1769,16 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 				 field->line);
 		break;
 	}
+	case TRACE_USER_STACK: {
+		struct userstack_entry *field;
+
+		trace_assign_type(field, entry);
+
+		seq_print_userip_objs(field, s, sym_flags);
+		if (entry->flags & TRACE_FLAG_CONT)
+			trace_seq_print_cont(s, iter);
+		break;
+	}
 	default:
 		trace_seq_printf(s, "Unknown type %d\n", entry->type);
 	}
@@ -1853,6 +1930,19 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 				 field->line);
 		break;
 	}
+	case TRACE_USER_STACK: {
+		struct userstack_entry *field;
+
+		trace_assign_type(field, entry);
+
+		ret = seq_print_userip_objs(field, s, sym_flags);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+		ret = trace_seq_putc(s, '\n');
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+		break;
+	}
 	}
 	return TRACE_TYPE_HANDLED;
 }
@@ -1912,6 +2002,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 		break;
 	}
 	case TRACE_SPECIAL:
+	case TRACE_USER_STACK:
 	case TRACE_STACK: {
 		struct special_entry *field;
 
@@ -2000,6 +2091,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 		break;
 	}
 	case TRACE_SPECIAL:
+	case TRACE_USER_STACK:
 	case TRACE_STACK: {
 		struct special_entry *field;
 
@@ -2054,6 +2146,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 		break;
 	}
 	case TRACE_SPECIAL:
+	case TRACE_USER_STACK:
 	case TRACE_STACK: {
 		struct special_entry *field;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2cb12fd98f6..17bb4c830b0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -26,6 +26,7 @@ enum trace_type {
 	TRACE_BOOT_CALL,
 	TRACE_BOOT_RET,
 	TRACE_FN_RET,
+	TRACE_USER_STACK,
 
 	__TRACE_LAST_TYPE
 };
@@ -42,6 +43,7 @@ struct trace_entry {
 	unsigned char		flags;
 	unsigned char		preempt_count;
 	int			pid;
+	int			tgid;
 };
 
 /*
@@ -99,6 +101,11 @@ struct stack_entry {
 	unsigned long		caller[FTRACE_STACK_ENTRIES];
 };
 
+struct userstack_entry {
+	struct trace_entry	ent;
+	unsigned long		caller[FTRACE_STACK_ENTRIES];
+};
+
 /*
  * ftrace_printk entry:
  */
@@ -240,6 +247,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct ctx_switch_entry, 0);	\
 		IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
 		IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK);	\
+		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
 		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\
 		IF_ASSIGN(var, ent, struct special_entry, 0);		\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\
@@ -500,6 +508,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_PREEMPTONLY		= 0x800,
 	TRACE_ITER_BRANCH		= 0x1000,
 	TRACE_ITER_ANNOTATE		= 0x2000,
+	TRACE_ITER_USERSTACKTRACE       = 0x4000
 };
 
 /*
-- 
cgit v1.2.3


From b54d3de9f3b8956653b06f1a32e9f9321c6d9027 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= <edwintorok@gmail.com>
Date: Sat, 22 Nov 2008 13:28:48 +0200
Subject: tracing: identify which executable object the userspace address
 belongs to
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: modify+improve the userstacktrace tracing visualization feature

Store thread group leader id, and use it to lookup the address in the
process's map. We could have looked up the address on thread's map,
but the thread might not exist by the time we are called. The process
might not exist either, but if you are reading trace_pipe, that is
unlikely.

Example usage:

 mount -t debugfs nodev /sys/kernel/debug
 cd /sys/kernel/debug/tracing
 echo userstacktrace >iter_ctrl
 echo sym-userobj >iter_ctrl
 echo sched_switch >current_tracer
 echo 1 >tracing_enabled
 cat trace_pipe >/tmp/trace&
 .... run application ...
 echo 0 >tracing_enabled
 cat /tmp/trace

You'll see stack entries like:

   /lib/libpthread-2.7.so[+0xd370]

You can convert them to function/line using:

   addr2line -fie /lib/libpthread-2.7.so 0xd370

Or:

   addr2line -fie /usr/lib/debug/libpthread-2.7.so 0xd370

For non-PIC/PIE executables this won't work:

   a.out[+0x73b]

You need to run the following: addr2line -fie a.out 0x40073b
(where 0x400000 is the default load address of a.out)

Signed-off-by: Török Edwin <edwintorok@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++-----
 kernel/trace/trace.h |  3 +-
 2 files changed, 81 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ced8b4fa9f5..62776b71b1c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -30,6 +30,7 @@
 #include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/kprobes.h>
+#include <linux/seq_file.h>
 #include <linux/writeback.h>
 
 #include <linux/stacktrace.h>
@@ -276,6 +277,7 @@ static const char *trace_options[] = {
 	"branch",
 	"annotate",
 	"userstacktrace",
+	"sym-userobj",
 	NULL
 };
 
@@ -422,6 +424,28 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
 	return trace_seq_putmem(s, hex, j);
 }
 
+static int
+trace_seq_path(struct trace_seq *s, struct path *path)
+{
+	unsigned char *p;
+
+	if (s->len >= (PAGE_SIZE - 1))
+		return 0;
+	p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
+	if (!IS_ERR(p)) {
+		p = mangle_path(s->buffer + s->len, p, "\n");
+		if (p) {
+			s->len = p - s->buffer;
+			return 1;
+		}
+	} else {
+		s->buffer[s->len++] = '?';
+		return 1;
+	}
+
+	return 0;
+}
+
 static void
 trace_seq_reset(struct trace_seq *s)
 {
@@ -802,6 +826,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 
 	entry->preempt_count		= pc & 0xff;
 	entry->pid			= (tsk) ? tsk->pid : 0;
+	entry->tgid               	= (tsk) ? tsk->tgid : 0;
 	entry->flags =
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
 		(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1429,28 +1454,73 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 	return ret;
 }
 
+static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+				    unsigned long ip, unsigned long sym_flags)
+{
+	struct file *file = NULL;
+	unsigned long vmstart = 0;
+	int ret = 1;
+
+	if (mm) {
+		const struct vm_area_struct *vma = find_vma(mm, ip);
+		if (vma) {
+			file = vma->vm_file;
+			vmstart = vma->vm_start;
+		}
+	}
+	if (file) {
+		ret = trace_seq_path(s, &file->f_path);
+		if (ret)
+			ret = trace_seq_printf(s, "[+0x%lx]",
+					ip - vmstart);
+	}
+	if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
+		ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+	return ret;
+}
+
 static int
 seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
-		unsigned long sym_flags)
+		      unsigned long sym_flags)
 {
+	struct mm_struct *mm = NULL;
 	int ret = 1;
 	unsigned i;
 
+	if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
+		struct task_struct *task;
+		/*
+		 * we do the lookup on the thread group leader,
+		 * since individual threads might have already quit!
+		 */
+		rcu_read_lock();
+		task = find_task_by_vpid(entry->ent.tgid);
+		rcu_read_unlock();
+
+		if (task)
+			mm = get_task_mm(task);
+	}
+
 	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
 		unsigned long ip = entry->caller[i];
 
 		if (ip == ULONG_MAX || !ret)
 			break;
-		if (i)
+		if (i && ret)
 			ret = trace_seq_puts(s, " <- ");
 		if (!ip) {
-			ret = trace_seq_puts(s, "??");
+			if (ret)
+				ret = trace_seq_puts(s, "??");
 			continue;
 		}
-		if (ret /*&& (sym_flags & TRACE_ITER_SYM_ADDR)*/)
-			ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+		if (!ret)
+			break;
+		if (ret)
+			ret = seq_print_user_ip(s, mm, ip, sym_flags);
 	}
 
+	if (mm)
+		mmput(mm);
 	return ret;
 }
 
@@ -1775,8 +1845,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 		trace_assign_type(field, entry);
 
 		seq_print_userip_objs(field, s, sym_flags);
-		if (entry->flags & TRACE_FLAG_CONT)
-			trace_seq_print_cont(s, iter);
+		trace_seq_putc(s, '\n');
 		break;
 	}
 	default:
@@ -3581,6 +3650,9 @@ void ftrace_dump(void)
 		atomic_inc(&global_trace.data[cpu]->disabled);
 	}
 
+	/* don't look at user memory in panic mode */
+	trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
+
 	printk(KERN_TRACE "Dumping ftrace buffer:\n");
 
 	iter.tr = &global_trace;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 17bb4c830b0..28c15c2ebc2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -508,7 +508,8 @@ enum trace_iterator_flags {
 	TRACE_ITER_PREEMPTONLY		= 0x800,
 	TRACE_ITER_BRANCH		= 0x1000,
 	TRACE_ITER_ANNOTATE		= 0x2000,
-	TRACE_ITER_USERSTACKTRACE       = 0x4000
+	TRACE_ITER_USERSTACKTRACE       = 0x4000,
+	TRACE_ITER_SYM_USEROBJ          = 0x8000
 };
 
 /*
-- 
cgit v1.2.3


From cbe2f5a6e84eebb98ab42fc5e58c3cd5b7767349 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 23 Nov 2008 10:37:12 +0100
Subject: tracing: allow tracing of suspend/resume & hibernation code again

Impact: widen function-tracing to suspend+resume (and hibernation) sequences

Now that the ftrace kernel thread is gone, we can allow tracing
during suspend/resume again.

So revert these two commits:

  f42ac38c5 "ftrace: disable tracing for suspend to ram"
  41108eb10 "ftrace: disable tracing for hibernation"

This should be tested very carefully, as it could interact with
altneratives instruction patching, etc.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/power/disk.c | 13 +++----------
 kernel/power/main.c |  5 +----
 2 files changed, 4 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index c9d74083746..f77d3819ef5 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,7 +22,6 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
-#include <linux/ftrace.h>
 
 #include "power.h"
 
@@ -257,7 +256,7 @@ static int create_image(int platform_mode)
 
 int hibernation_snapshot(int platform_mode)
 {
-	int error, ftrace_save;
+	int error;
 
 	/* Free memory before shutting down devices. */
 	error = swsusp_shrink_memory();
@@ -269,7 +268,6 @@ int hibernation_snapshot(int platform_mode)
 		goto Close;
 
 	suspend_console();
-	ftrace_save = __ftrace_enabled_save();
 	error = device_suspend(PMSG_FREEZE);
 	if (error)
 		goto Recover_platform;
@@ -299,7 +297,6 @@ int hibernation_snapshot(int platform_mode)
  Resume_devices:
 	device_resume(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
-	__ftrace_enabled_restore(ftrace_save);
 	resume_console();
  Close:
 	platform_end(platform_mode);
@@ -370,11 +367,10 @@ static int resume_target_kernel(void)
 
 int hibernation_restore(int platform_mode)
 {
-	int error, ftrace_save;
+	int error;
 
 	pm_prepare_console();
 	suspend_console();
-	ftrace_save = __ftrace_enabled_save();
 	error = device_suspend(PMSG_QUIESCE);
 	if (error)
 		goto Finish;
@@ -389,7 +385,6 @@ int hibernation_restore(int platform_mode)
 	platform_restore_cleanup(platform_mode);
 	device_resume(PMSG_RECOVER);
  Finish:
-	__ftrace_enabled_restore(ftrace_save);
 	resume_console();
 	pm_restore_console();
 	return error;
@@ -402,7 +397,7 @@ int hibernation_restore(int platform_mode)
 
 int hibernation_platform_enter(void)
 {
-	int error, ftrace_save;
+	int error;
 
 	if (!hibernation_ops)
 		return -ENOSYS;
@@ -417,7 +412,6 @@ int hibernation_platform_enter(void)
 		goto Close;
 
 	suspend_console();
-	ftrace_save = __ftrace_enabled_save();
 	error = device_suspend(PMSG_HIBERNATE);
 	if (error) {
 		if (hibernation_ops->recover)
@@ -452,7 +446,6 @@ int hibernation_platform_enter(void)
 	hibernation_ops->finish();
  Resume_devices:
 	device_resume(PMSG_RESTORE);
-	__ftrace_enabled_restore(ftrace_save);
 	resume_console();
  Close:
 	hibernation_ops->end();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b8f7ce9473e..613f16941b8 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -22,7 +22,6 @@
 #include <linux/freezer.h>
 #include <linux/vmstat.h>
 #include <linux/syscalls.h>
-#include <linux/ftrace.h>
 
 #include "power.h"
 
@@ -317,7 +316,7 @@ static int suspend_enter(suspend_state_t state)
  */
 int suspend_devices_and_enter(suspend_state_t state)
 {
-	int error, ftrace_save;
+	int error;
 
 	if (!suspend_ops)
 		return -ENOSYS;
@@ -328,7 +327,6 @@ int suspend_devices_and_enter(suspend_state_t state)
 			goto Close;
 	}
 	suspend_console();
-	ftrace_save = __ftrace_enabled_save();
 	suspend_test_start();
 	error = device_suspend(PMSG_SUSPEND);
 	if (error) {
@@ -360,7 +358,6 @@ int suspend_devices_and_enter(suspend_state_t state)
 	suspend_test_start();
 	device_resume(PMSG_RESUME);
 	suspend_test_finish("resume devices");
-	__ftrace_enabled_restore(ftrace_save);
 	resume_console();
  Close:
 	if (suspend_ops->end)
-- 
cgit v1.2.3


From 45b797492a0758e64dff74e9db70e1f65e0603a5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 21 Nov 2008 00:40:40 -0500
Subject: trace: consolidate unlikely and likely profiler

Impact: clean up to make one profiler of like and unlikely tracer

The likely and unlikely profiler prints out the file and line numbers
of the annotated branches that it is profiling. It shows the number
of times it was correct or incorrect in its guess. Having two
different files or sections for that matter to tell us if it was a
likely or unlikely is pretty pointless. We really only care if
it was correct or not.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig        |  3 +--
 kernel/trace/trace_branch.c | 39 +++++++++++++--------------------------
 2 files changed, 14 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b8378fad29a..7e354870570 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -166,8 +166,7 @@ config TRACE_BRANCH_PROFILING
 	  This tracer profiles all the the likely and unlikely macros
 	  in the kernel. It will display the results in:
 
-	  /debugfs/tracing/profile_likely
-	  /debugfs/tracing/profile_unlikely
+	  /debugfs/tracing/profile_annotated_branch
 
 	  Note: this will add a significant overhead, only turn this
 	  on if you need to profile the system's use of these macros.
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 23f9b02ce96..21dedc8b50a 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -261,7 +261,7 @@ static struct seq_operations tracing_likely_seq_ops = {
 	.show		= t_show,
 };
 
-static int tracing_likely_open(struct inode *inode, struct file *file)
+static int tracing_branch_open(struct inode *inode, struct file *file)
 {
 	int ret;
 
@@ -274,25 +274,18 @@ static int tracing_likely_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
-static struct file_operations tracing_likely_fops = {
-	.open		= tracing_likely_open,
+static const struct file_operations tracing_branch_fops = {
+	.open		= tracing_branch_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 };
 
-extern unsigned long __start_likely_profile[];
-extern unsigned long __stop_likely_profile[];
-extern unsigned long __start_unlikely_profile[];
-extern unsigned long __stop_unlikely_profile[];
+extern unsigned long __start_annotated_branch_profile[];
+extern unsigned long __stop_annotated_branch_profile[];
 
-static struct ftrace_pointer ftrace_likely_pos = {
-	.start			= __start_likely_profile,
-	.stop			= __stop_likely_profile,
-};
-
-static struct ftrace_pointer ftrace_unlikely_pos = {
-	.start			= __start_unlikely_profile,
-	.stop			= __stop_unlikely_profile,
+static const struct ftrace_pointer ftrace_annotated_branch_pos = {
+	.start			= __start_annotated_branch_profile,
+	.stop			= __stop_annotated_branch_profile,
 };
 
 static __init int ftrace_branch_init(void)
@@ -302,18 +295,12 @@ static __init int ftrace_branch_init(void)
 
 	d_tracer = tracing_init_dentry();
 
-	entry = debugfs_create_file("profile_likely", 0444, d_tracer,
-				    &ftrace_likely_pos,
-				    &tracing_likely_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'profile_likely' entry\n");
-
-	entry = debugfs_create_file("profile_unlikely", 0444, d_tracer,
-				    &ftrace_unlikely_pos,
-				    &tracing_likely_fops);
+	entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer,
+				    &ftrace_annotated_branch_pos,
+				    &tracing_branch_fops);
 	if (!entry)
-		pr_warning("Could not create debugfs"
-			   " 'profile_unlikely' entry\n");
+		pr_warning("Could not create debugfs "
+			   "'profile_annotatet_branch' entry\n");
 
 	return 0;
 }
-- 
cgit v1.2.3


From bac28bfe42ba98ee67503f78984d1d5e1ebbbb78 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 21 Nov 2008 01:51:53 -0500
Subject: trace: branch profiling should not print percent without data

Impact: cleanup on output of branch profiler

When a branch has not been taken, it does not make sense to show
a percentage incorrect or hit. This patch changes the behaviour
to print out a 'X' when the branch has not been executed yet.

For example:

 correct incorrect  %        Function                  File              Line
 ------- ---------  -        --------                  ----              ----
    2096        0   0 do_arch_prctl                  process_64.c         832
       0        0   X do_arch_prctl                  process_64.c         804
    2604        0   0 IS_ERR                         err.h                34
  130228     5765   4 __switch_to                    process_64.c         673
       0        0   X enable_TSC                     process_64.c         448
       0        0   X disable_TSC                    process_64.c         431

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_branch.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 21dedc8b50a..142acb3b4e0 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -225,7 +225,7 @@ static int t_show(struct seq_file *m, void *v)
 {
 	struct ftrace_branch_data *p = v;
 	const char *f;
-	unsigned long percent;
+	long percent;
 
 	if (v == (void *)1) {
 		seq_printf(m, " correct incorrect  %% "
@@ -247,9 +247,13 @@ static int t_show(struct seq_file *m, void *v)
 		percent = p->incorrect * 100;
 		percent /= p->correct + p->incorrect;
 	} else
-		percent = p->incorrect ? 100 : 0;
+		percent = p->incorrect ? 100 : -1;
 
-	seq_printf(m, "%8lu %8lu %3lu ", p->correct, p->incorrect, percent);
+	seq_printf(m, "%8lu %8lu ",  p->correct, p->incorrect);
+	if (percent < 0)
+		seq_printf(m, "  X ");
+	else
+		seq_printf(m, "%3ld ", percent);
 	seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
 	return 0;
 }
-- 
cgit v1.2.3


From 2bcd521a684cc94befbe2ce7d5b613c841b0d304 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 21 Nov 2008 01:30:54 -0500
Subject: trace: profile all if conditionals

Impact: feature to profile if statements

This patch adds a branch profiler for all if () statements.
The results will be found in:

  /debugfs/tracing/profile_branch

For example:

   miss      hit    %        Function                  File              Line
 ------- ---------  -        --------                  ----              ----
       0        1 100 x86_64_start_reservations      head64.c             127
       0        1 100 copy_bootdata                  head64.c             69
       1        0   0 x86_64_start_kernel            head64.c             111
      32        0   0 set_intr_gate                  desc.h               319
       1        0   0 reserve_ebda_region            head.c               51
       1        0   0 reserve_ebda_region            head.c               47
       0        1 100 reserve_ebda_region            head.c               42
       0        0   X maxcpus                        main.c               165

Miss means the branch was not taken. Hit means the branch was taken.
The percent is the percentage the branch was taken.

This adds a significant amount of overhead and should only be used
by those analyzing their system.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig        | 16 ++++++++++++++++
 kernel/trace/trace_branch.c | 33 +++++++++++++++++++++++++++++++--
 2 files changed, 47 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 7e354870570..61e8cca6ff4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -173,6 +173,22 @@ config TRACE_BRANCH_PROFILING
 
 	  Say N if unsure.
 
+config PROFILE_ALL_BRANCHES
+	bool "Profile all if conditionals"
+	depends on TRACE_BRANCH_PROFILING
+	help
+	  This tracer profiles all branch conditions. Every if ()
+	  taken in the kernel is recorded whether it hit or miss.
+	  The results will be displayed in:
+
+	  /debugfs/tracing/profile_branch
+
+	  This configuration, when enabled, will impose a great overhead
+	  on the system. This should only be enabled when the system
+	  is to be analyzed
+
+	  Say N if unsure.
+
 config TRACING_BRANCHES
 	bool
 	help
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 142acb3b4e0..85792aec64d 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -185,6 +185,7 @@ EXPORT_SYMBOL(ftrace_likely_update);
 struct ftrace_pointer {
 	void		*start;
 	void		*stop;
+	int		hit;
 };
 
 static void *
@@ -223,13 +224,17 @@ static void t_stop(struct seq_file *m, void *p)
 
 static int t_show(struct seq_file *m, void *v)
 {
+	struct ftrace_pointer *fp = m->private;
 	struct ftrace_branch_data *p = v;
 	const char *f;
 	long percent;
 
 	if (v == (void *)1) {
-		seq_printf(m, " correct incorrect  %% "
-			      "       Function                "
+		if (fp->hit)
+			seq_printf(m, "   miss      hit    %% ");
+		else
+			seq_printf(m, " correct incorrect  %% ");
+		seq_printf(m, "       Function                "
 			      "  File              Line\n"
 			      " ------- ---------  - "
 			      "       --------                "
@@ -243,6 +248,9 @@ static int t_show(struct seq_file *m, void *v)
 		f--;
 	f++;
 
+	/*
+	 * The miss is overlayed on correct, and hit on incorrect.
+	 */
 	if (p->correct) {
 		percent = p->incorrect * 100;
 		percent /= p->correct + p->incorrect;
@@ -284,6 +292,18 @@ static const struct file_operations tracing_branch_fops = {
 	.llseek		= seq_lseek,
 };
 
+#ifdef CONFIG_PROFILE_ALL_BRANCHES
+extern unsigned long __start_branch_profile[];
+extern unsigned long __stop_branch_profile[];
+
+static struct ftrace_pointer ftrace_branch_pos = {
+	.start			= __start_branch_profile,
+	.stop			= __stop_branch_profile,
+	.hit			= 1,
+};
+
+#endif /* CONFIG_PROFILE_ALL_BRANCHES */
+
 extern unsigned long __start_annotated_branch_profile[];
 extern unsigned long __stop_annotated_branch_profile[];
 
@@ -306,6 +326,15 @@ static __init int ftrace_branch_init(void)
 		pr_warning("Could not create debugfs "
 			   "'profile_annotatet_branch' entry\n");
 
+#ifdef CONFIG_PROFILE_ALL_BRANCHES
+	entry = debugfs_create_file("profile_branch", 0444, d_tracer,
+				    &ftrace_branch_pos,
+				    &tracing_branch_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs"
+			   " 'profile_branch' entry\n");
+#endif
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 033601a32b2012b6948e80e739cca40bff4de4a0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 21 Nov 2008 12:41:55 -0500
Subject: ring-buffer: add tracing_off_permanent

Impact: feature to permanently disable ring buffer

This patch adds a API to the ring buffer code that will permanently
disable the ring buffer from ever recording. This should only be
called when some serious anomaly is detected, and the system
may be in an unstable state. When that happens, shutting down the
recording to the ring buffers may be appropriate.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 79 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 66 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 85ced143c2c..e206951603c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -18,8 +18,46 @@
 
 #include "trace.h"
 
-/* Global flag to disable all recording to ring buffers */
-static int ring_buffers_off __read_mostly;
+/*
+ * A fast way to enable or disable all ring buffers is to
+ * call tracing_on or tracing_off. Turning off the ring buffers
+ * prevents all ring buffers from being recorded to.
+ * Turning this switch on, makes it OK to write to the
+ * ring buffer, if the ring buffer is enabled itself.
+ *
+ * There's three layers that must be on in order to write
+ * to the ring buffer.
+ *
+ * 1) This global flag must be set.
+ * 2) The ring buffer must be enabled for recording.
+ * 3) The per cpu buffer must be enabled for recording.
+ *
+ * In case of an anomaly, this global flag has a bit set that
+ * will permantly disable all ring buffers.
+ */
+
+/*
+ * Global flag to disable all recording to ring buffers
+ *  This has two bits: ON, DISABLED
+ *
+ *  ON   DISABLED
+ * ---- ----------
+ *   0      0        : ring buffers are off
+ *   1      0        : ring buffers are on
+ *   X      1        : ring buffers are permanently disabled
+ */
+
+enum {
+	RB_BUFFERS_ON_BIT	= 0,
+	RB_BUFFERS_DISABLED_BIT	= 1,
+};
+
+enum {
+	RB_BUFFERS_ON		= 1 << RB_BUFFERS_ON_BIT,
+	RB_BUFFERS_DISABLED	= 1 << RB_BUFFERS_DISABLED_BIT,
+};
+
+static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
 
 /**
  * tracing_on - enable all tracing buffers
@@ -29,7 +67,7 @@ static int ring_buffers_off __read_mostly;
  */
 void tracing_on(void)
 {
-	ring_buffers_off = 0;
+	set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
 }
 
 /**
@@ -42,7 +80,18 @@ void tracing_on(void)
  */
 void tracing_off(void)
 {
-	ring_buffers_off = 1;
+	clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
+}
+
+/**
+ * tracing_off_permanent - permanently disable ring buffers
+ *
+ * This function, once called, will disable all ring buffers
+ * permanenty.
+ */
+void tracing_off_permanent(void)
+{
+	set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
 }
 
 #include "trace.h"
@@ -1185,7 +1234,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
 	struct ring_buffer_event *event;
 	int cpu, resched;
 
-	if (ring_buffers_off)
+	if (ring_buffer_flags != RB_BUFFERS_ON)
 		return NULL;
 
 	if (atomic_read(&buffer->record_disabled))
@@ -1297,7 +1346,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
 	int ret = -EBUSY;
 	int cpu, resched;
 
-	if (ring_buffers_off)
+	if (ring_buffer_flags != RB_BUFFERS_ON)
 		return -EBUSY;
 
 	if (atomic_read(&buffer->record_disabled))
@@ -2178,12 +2227,14 @@ static ssize_t
 rb_simple_read(struct file *filp, char __user *ubuf,
 	       size_t cnt, loff_t *ppos)
 {
-	int *p = filp->private_data;
+	long *p = filp->private_data;
 	char buf[64];
 	int r;
 
-	/* !ring_buffers_off == tracing_on */
-	r = sprintf(buf, "%d\n", !*p);
+	if (test_bit(RB_BUFFERS_DISABLED_BIT, p))
+		r = sprintf(buf, "permanently disabled\n");
+	else
+		r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p));
 
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
@@ -2192,7 +2243,7 @@ static ssize_t
 rb_simple_write(struct file *filp, const char __user *ubuf,
 		size_t cnt, loff_t *ppos)
 {
-	int *p = filp->private_data;
+	long *p = filp->private_data;
 	char buf[64];
 	long val;
 	int ret;
@@ -2209,8 +2260,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
 	if (ret < 0)
 		return ret;
 
-	/* !ring_buffers_off == tracing_on */
-	*p = !val;
+	if (val)
+		set_bit(RB_BUFFERS_ON_BIT, p);
+	else
+		clear_bit(RB_BUFFERS_ON_BIT, p);
 
 	(*ppos)++;
 
@@ -2232,7 +2285,7 @@ static __init int rb_init_debugfs(void)
 	d_tracer = tracing_init_dentry();
 
 	entry = debugfs_create_file("tracing_on", 0644, d_tracer,
-				    &ring_buffers_off, &rb_simple_fops);
+				    &ring_buffer_flags, &rb_simple_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs 'tracing_on' entry\n");
 
-- 
cgit v1.2.3


From 69bb54ec05f57da7f6fac2cec0820cbc970df20f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 21 Nov 2008 12:59:38 -0500
Subject: ftrace: add ftrace_off_permanent

Impact: add new API to disable all of ftrace on anomalies

It case of a serious anomaly being detected (like something caught by
lockdep) it is a good idea to disable all tracing immediately, without
grabing any locks.

This patch adds ftrace_off_permanent that disables the tracers, function
tracing and ring buffers without a way to enable them again. This should
only be used when something serious has been detected.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4ee6f037522..0dbfb23ced9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -660,6 +660,21 @@ static void trace_init_cmdlines(void)
 static int trace_stop_count;
 static DEFINE_SPINLOCK(tracing_start_lock);
 
+/**
+ * ftrace_off_permanent - disable all ftrace code permanently
+ *
+ * This should only be called when a serious anomally has
+ * been detected.  This will turn off the function tracing,
+ * ring buffers, and other tracing utilites. It takes no
+ * locks and can be called from any context.
+ */
+void ftrace_off_permanent(void)
+{
+	tracing_disabled = 1;
+	ftrace_stop();
+	tracing_off_permanent();
+}
+
 /**
  * tracing_start - quick start of the tracer
  *
-- 
cgit v1.2.3


From 0429149fb5e01edc410648591c19095d2074ee00 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 21 Nov 2008 14:44:57 -0500
Subject: trace: fix compiler warning in branch profiler

Impact: fix compiler warning

The ftrace_pointers used in the branch profiler are constant values.
They should never change. But the compiler complains when they are
passed into the debugfs_create_file as a data pointer, because the
function discards the qualifier.

This patch typecasts the parameter to debugfs_create_file back to
a void pointer. To remind the callbacks that they are pointing to
a constant value, I also modified the callback local pointers to
be const struct ftrace_pointer * as well.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_branch.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 85792aec64d..877ee88e6a7 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -191,7 +191,7 @@ struct ftrace_pointer {
 static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct ftrace_pointer *f = m->private;
+	const struct ftrace_pointer *f = m->private;
 	struct ftrace_branch_data *p = v;
 
 	(*pos)++;
@@ -224,7 +224,7 @@ static void t_stop(struct seq_file *m, void *p)
 
 static int t_show(struct seq_file *m, void *v)
 {
-	struct ftrace_pointer *fp = m->private;
+	const struct ftrace_pointer *fp = m->private;
 	struct ftrace_branch_data *p = v;
 	const char *f;
 	long percent;
@@ -296,7 +296,7 @@ static const struct file_operations tracing_branch_fops = {
 extern unsigned long __start_branch_profile[];
 extern unsigned long __stop_branch_profile[];
 
-static struct ftrace_pointer ftrace_branch_pos = {
+static const struct ftrace_pointer ftrace_branch_pos = {
 	.start			= __start_branch_profile,
 	.stop			= __stop_branch_profile,
 	.hit			= 1,
@@ -320,7 +320,7 @@ static __init int ftrace_branch_init(void)
 	d_tracer = tracing_init_dentry();
 
 	entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer,
-				    &ftrace_annotated_branch_pos,
+				    (void *)&ftrace_annotated_branch_pos,
 				    &tracing_branch_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs "
@@ -328,7 +328,7 @@ static __init int ftrace_branch_init(void)
 
 #ifdef CONFIG_PROFILE_ALL_BRANCHES
 	entry = debugfs_create_file("profile_branch", 0444, d_tracer,
-				    &ftrace_branch_pos,
+				    (void *)&ftrace_branch_pos,
 				    &tracing_branch_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs"
-- 
cgit v1.2.3


From 8d7c6a96164651dbbab449ef0b5c20ae1f76a3a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= <edwintorok@gmail.com>
Date: Sun, 23 Nov 2008 12:39:06 +0200
Subject: tracing/stack-tracer: fix style issues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: cleanup

Signed-off-by: Török Edwin <edwintorok@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 62776b71b1c..dedf35f3697 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -948,9 +948,9 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 		   struct trace_array_cpu *data,
 		   unsigned long flags, int pc)
 {
+	struct ring_buffer_event *event;
 	struct userstack_entry *entry;
 	struct stack_trace trace;
-	struct ring_buffer_event *event;
 	unsigned long irq_flags;
 
 	if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
@@ -1471,8 +1471,7 @@ static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
 	if (file) {
 		ret = trace_seq_path(s, &file->f_path);
 		if (ret)
-			ret = trace_seq_printf(s, "[+0x%lx]",
-					ip - vmstart);
+			ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
 	}
 	if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
 		ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
@@ -1485,7 +1484,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
 {
 	struct mm_struct *mm = NULL;
 	int ret = 1;
-	unsigned i;
+	unsigned int i;
 
 	if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
 		struct task_struct *task;
-- 
cgit v1.2.3


From cffa10aecb6891f090a4d53a075bc40c082c45fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= <edwintorok@gmail.com>
Date: Sun, 23 Nov 2008 12:39:07 +0200
Subject: tracing/stack-tracer: fix locking and refcounts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: fix refcounting/object-access bug

Hold mmap_sem while looking up/accessing vma.
Hold the RCU lock while using the task we looked up.

Signed-off-by: Török Edwin <edwintorok@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dedf35f3697..4c3bd82cec4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1462,11 +1462,15 @@ static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
 	int ret = 1;
 
 	if (mm) {
-		const struct vm_area_struct *vma = find_vma(mm, ip);
+		const struct vm_area_struct *vma;
+
+		down_read(&mm->mmap_sem);
+		vma = find_vma(mm, ip);
 		if (vma) {
 			file = vma->vm_file;
 			vmstart = vma->vm_start;
 		}
+		up_read(&mm->mmap_sem);
 	}
 	if (file) {
 		ret = trace_seq_path(s, &file->f_path);
@@ -1494,10 +1498,9 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
 		 */
 		rcu_read_lock();
 		task = find_task_by_vpid(entry->ent.tgid);
-		rcu_read_unlock();
-
 		if (task)
 			mm = get_task_mm(task);
+		rcu_read_unlock();
 	}
 
 	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
-- 
cgit v1.2.3


From 8d26487fd4ddda7a0237da418fb8669fb06ae557 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= <edwintorok@gmail.com>
Date: Sun, 23 Nov 2008 12:39:08 +0200
Subject: tracing/stack-tracer: introduce CONFIG_USER_STACKTRACE_SUPPORT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: cleanup

User stack tracing is just implemented for x86, but it is not x86 specific.

Introduce a generic config flag, that is currently enabled only for x86.
When other arches implement it, they will have to
SELECT USER_STACKTRACE_SUPPORT.

Signed-off-by: Török Edwin <edwintorok@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b8378fad29a..87fc34a1bb9 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -3,6 +3,9 @@
 #  select HAVE_FUNCTION_TRACER:
 #
 
+config USER_STACKTRACE_SUPPORT
+	bool
+
 config NOP_TRACER
 	bool
 
-- 
cgit v1.2.3


From e38da59269be8c0196d16dff1be5bb26076afc6a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= <edwintorok@gmail.com>
Date: Sun, 23 Nov 2008 13:08:10 +0200
Subject: tracing/stack-tracer: avoid races accessing file

Impact: fix race

vma->vm_file reference is only stable while holding the mmap_sem,
so move usage of it to within the critical section.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4c3bd82cec4..48d1536f1ca 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1470,13 +1470,13 @@ static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
 			file = vma->vm_file;
 			vmstart = vma->vm_start;
 		}
+		if (file) {
+			ret = trace_seq_path(s, &file->f_path);
+			if (ret)
+				ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
+		}
 		up_read(&mm->mmap_sem);
 	}
-	if (file) {
-		ret = trace_seq_path(s, &file->f_path);
-		if (ret)
-			ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
-	}
 	if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
 		ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
 	return ret;
-- 
cgit v1.2.3


From eae849ca034c7f1015f0a6f17421ebc737f0a069 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 23 Nov 2008 17:33:12 +0100
Subject: tracing/function-return-tracer: don't trace kfree while it frees the
 return stack

Impact: fix a crash

While I killed the cat process, I got sometimes the following (but rare)
crash:

[   65.689027] Pid: 2969, comm: cat Not tainted (2.6.28-rc6-tip #83) AMILO Li 2727
[   65.689027] EIP: 0060:[<00000000>] EFLAGS: 00010082 CPU: 1
[   65.689027] EIP is at 0x0
[   65.689027] EAX: 00000000 EBX: f66cd780 ECX: c019a64a EDX: f66cd780
[   65.689027] ESI: 00000286 EDI: f66cd780 EBP: f630be2c ESP: f630be24
[   65.689027]  DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
[   65.689027] Process cat (pid: 2969, ti=f630a000 task=f66cd780 task.ti=f630a000)
[   65.689027] Stack:
[   65.689027]  00000012 f630bd54 f630be7c c012c853 00000000 c0133cc9 f66cda54 f630be5c
[   65.689027]  f630be68 f66cda54 f66cd88c f66cd878 f7070000 00000001 f630be90 c0135dbc
[   65.689027]  f614a614 f630be68 f630be68 f65ba200 00000002 f630bf10 f630be90 c012cad6
[   65.689027] Call Trace:
[   65.689027]  [<c012c853>] ? do_exit+0x603/0x850
[   65.689027]  [<c0133cc9>] ? next_signal+0x9/0x40
[   65.689027]  [<c0135dbc>] ? dequeue_signal+0x8c/0x180
[   65.689027]  [<c012cad6>] ? do_group_exit+0x36/0x90
[   65.689027]  [<c013709c>] ? get_signal_to_deliver+0x20c/0x390
[   65.689027]  [<c0102b69>] ? do_notify_resume+0x99/0x8b0
[   65.689027]  [<c02e6d1a>] ? tty_ldisc_deref+0x5a/0x80
[   65.689027]  [<c014db9b>] ? trace_hardirqs_on+0xb/0x10
[   65.689027]  [<c02e6d1a>] ? tty_ldisc_deref+0x5a/0x80
[   65.689027]  [<c02e39b0>] ? n_tty_write+0x0/0x340
[   65.689027]  [<c02e1812>] ? redirected_tty_write+0x82/0x90
[   65.689027]  [<c019ee99>] ? vfs_write+0x99/0xd0
[   65.689027]  [<c02e1790>] ? redirected_tty_write+0x0/0x90
[   65.689027]  [<c019f342>] ? sys_write+0x42/0x70
[   65.689027]  [<c01035ca>] ? work_notifysig+0x13/0x19
[   65.689027] Code:  Bad EIP value.
[   65.689027] EIP: [<00000000>] 0x0 SS:ESP 0068:f630be24

This is because on do_exit(), kfree is called to free the return addresses stack
but kfree is traced and stored its return address in this stack.
This patch fixes it.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 90d99fb02ae..53042f118f2 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1628,8 +1628,13 @@ void ftrace_retfunc_init_task(struct task_struct *t)
 
 void ftrace_retfunc_exit_task(struct task_struct *t)
 {
-	kfree(t->ret_stack);
+	struct ftrace_ret_stack	*ret_stack = t->ret_stack;
+
 	t->ret_stack = NULL;
+	/* NULL must become visible to IRQs before we free it: */
+	barrier();
+
+	kfree(ret_stack);
 }
 #endif
 
-- 
cgit v1.2.3


From 65afa5e603d507014580ead016ec887b49e1afa6 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 23 Nov 2008 18:43:39 +0100
Subject: tracing/function-return-tracer: free the return stack on free_task()

Impact: avoid losing some traces when a task is freed

do_exit() is not the last function called when a task finishes.
There are still some functions which are to be called such as
ree_task().  So we delay the freeing of the return stack to the
last moment.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c | 2 --
 kernel/fork.c | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index ef04d03b328..e5ae36ebe8a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -47,7 +47,6 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/tracehook.h>
 #include <trace/sched.h>
-#include <linux/ftrace.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1128,7 +1127,6 @@ NORET_TYPE void do_exit(long code)
 	preempt_disable();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
-	ftrace_retfunc_exit_task(tsk);
 	schedule();
 	BUG();
 	/* Avoid "noreturn function does return".  */
diff --git a/kernel/fork.c b/kernel/fork.c
index fbf4a4c0a62..d6e1a3205f6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -140,6 +140,7 @@ void free_task(struct task_struct *tsk)
 	prop_local_destroy_single(&tsk->dirties);
 	free_thread_info(tsk->stack);
 	rt_mutex_debug_task_free(tsk);
+	ftrace_retfunc_exit_task(tsk);
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
-- 
cgit v1.2.3


From 7918baa555140989eeee1270f48533987d48fdba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= <edwintorok@gmail.com>
Date: Mon, 24 Nov 2008 10:17:42 +0200
Subject: mutex: __used is needed for function referenced only from inline asm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: fix build failure on llvm-gcc-4.2

According to the gcc manual, the 'used' attribute should be applied to
functions referenced only from inline assembly.
This fixes a build failure with llvm-gcc-4.2, which deleted
__mutex_lock_slowpath, __mutex_unlock_slowpath.

Signed-off-by: Török Edwin <edwintorok@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/mutex.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/mutex.c b/kernel/mutex.c
index 39a3816b68d..4f45d4b658e 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -59,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init);
  * We also put the fastpath first in the kernel image, to make sure the
  * branch is predicted by the CPU as default-untaken.
  */
-static void noinline __sched
+static __used noinline void __sched
 __mutex_lock_slowpath(atomic_t *lock_count);
 
 /***
@@ -96,7 +96,7 @@ void inline __sched mutex_lock(struct mutex *lock)
 EXPORT_SYMBOL(mutex_lock);
 #endif
 
-static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
+static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
 
 /***
  * mutex_unlock - release the mutex
@@ -268,7 +268,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
 /*
  * Release the lock, slowpath:
  */
-static noinline void
+static __used noinline void
 __mutex_unlock_slowpath(atomic_t *lock_count)
 {
 	__mutex_unlock_common_slowpath(lock_count, 1);
@@ -313,7 +313,7 @@ int __sched mutex_lock_killable(struct mutex *lock)
 }
 EXPORT_SYMBOL(mutex_lock_killable);
 
-static noinline void __sched
+static __used noinline void __sched
 __mutex_lock_slowpath(atomic_t *lock_count)
 {
 	struct mutex *lock = container_of(lock_count, struct mutex, count);
-- 
cgit v1.2.3


From ea6f18ed5a1531caf678374f30a0990c9e6742f3 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Tue, 25 Nov 2008 02:35:02 +1030
Subject: sched: reduce stack size requirements in kernel/sched.c

Impact: cleanup

  * use node_to_cpumask_ptr in place of node_to_cpumask to reduce stack
    requirements in sched.c

Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index bb827651558..dd22cec499b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6110,8 +6110,9 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 
 	do {
 		/* On same node? */
-		mask = node_to_cpumask(cpu_to_node(dead_cpu));
-		cpus_and(mask, mask, p->cpus_allowed);
+		node_to_cpumask_ptr(pnodemask, cpu_to_node(dead_cpu));
+
+		cpus_and(mask, *pnodemask, p->cpus_allowed);
 		dest_cpu = any_online_cpu(mask);
 
 		/* On any allowed CPU? */
@@ -7098,9 +7099,9 @@ static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
 				 struct sched_group **sg, cpumask_t *nodemask)
 {
 	int group;
+	node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));
 
-	*nodemask = node_to_cpumask(cpu_to_node(cpu));
-	cpus_and(*nodemask, *nodemask, *cpu_map);
+	cpus_and(*nodemask, *pnodemask, *cpu_map);
 	group = first_cpu(*nodemask);
 
 	if (sg)
@@ -7150,9 +7151,9 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 
 		for (i = 0; i < nr_node_ids; i++) {
 			struct sched_group *oldsg, *sg = sched_group_nodes[i];
+			node_to_cpumask_ptr(pnodemask, i);
 
-			*nodemask = node_to_cpumask(i);
-			cpus_and(*nodemask, *nodemask, *cpu_map);
+			cpus_and(*nodemask, *pnodemask, *cpu_map);
 			if (cpus_empty(*nodemask))
 				continue;
 
-- 
cgit v1.2.3


From abcd083a1a658d2bc1f7fced02632bfe03918002 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:02 +1030
Subject: sched: convert sched.c from for_each_cpu_mask to for_each_cpu.

Impact: trivial API conversion

This is a simple conversion, but note that for_each_cpu() terminates
with i >= nr_cpu_ids, not i == NR_CPUS like for_each_cpu_mask() did.

I don't convert all of them: sd->span changes in a later patch, so
change those iterators there rather than here.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index dd22cec499b..e59978eead1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2061,7 +2061,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 		/* Tally up the load of all CPUs in the group */
 		avg_load = 0;
 
-		for_each_cpu_mask_nr(i, group->cpumask) {
+		for_each_cpu(i, &group->cpumask) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
 				load = source_load(i, load_idx);
@@ -2103,7 +2103,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
 	/* Traverse only the allowed CPUs */
 	cpus_and(*tmp, group->cpumask, p->cpus_allowed);
 
-	for_each_cpu_mask_nr(i, *tmp) {
+	for_each_cpu(i, tmp) {
 		load = weighted_cpuload(i);
 
 		if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -3121,7 +3121,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		max_cpu_load = 0;
 		min_cpu_load = ~0UL;
 
-		for_each_cpu_mask_nr(i, group->cpumask) {
+		for_each_cpu(i, &group->cpumask) {
 			struct rq *rq;
 
 			if (!cpu_isset(i, *cpus))
@@ -3400,7 +3400,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 	unsigned long max_load = 0;
 	int i;
 
-	for_each_cpu_mask_nr(i, group->cpumask) {
+	for_each_cpu(i, &group->cpumask) {
 		unsigned long wl;
 
 		if (!cpu_isset(i, *cpus))
@@ -3942,7 +3942,7 @@ static void run_rebalance_domains(struct softirq_action *h)
 		int balance_cpu;
 
 		cpu_clear(this_cpu, cpus);
-		for_each_cpu_mask_nr(balance_cpu, cpus) {
+		for_each_cpu(balance_cpu, &cpus) {
 			/*
 			 * If this cpu gets work to do, stop the load balancing
 			 * work being done for other cpus. Next load
@@ -6906,7 +6906,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
 
 	cpus_clear(*covered);
 
-	for_each_cpu_mask_nr(i, *span) {
+	for_each_cpu(i, span) {
 		struct sched_group *sg;
 		int group = group_fn(i, cpu_map, &sg, tmpmask);
 		int j;
@@ -6917,7 +6917,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
 		cpus_clear(sg->cpumask);
 		sg->__cpu_power = 0;
 
-		for_each_cpu_mask_nr(j, *span) {
+		for_each_cpu(j, span) {
 			if (group_fn(j, cpu_map, NULL, tmpmask) != group)
 				continue;
 
@@ -7117,7 +7117,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 	if (!sg)
 		return;
 	do {
-		for_each_cpu_mask_nr(j, sg->cpumask) {
+		for_each_cpu(j, &sg->cpumask) {
 			struct sched_domain *sd;
 
 			sd = &per_cpu(phys_domains, j);
@@ -7142,7 +7142,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 {
 	int cpu, i;
 
-	for_each_cpu_mask_nr(cpu, *cpu_map) {
+	for_each_cpu(cpu, cpu_map) {
 		struct sched_group **sched_group_nodes
 			= sched_group_nodes_bycpu[cpu];
 
@@ -7396,7 +7396,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	/*
 	 * Set up domains for cpus specified by the cpu_map.
 	 */
-	for_each_cpu_mask_nr(i, *cpu_map) {
+	for_each_cpu(i, cpu_map) {
 		struct sched_domain *sd = NULL, *p;
 		SCHED_CPUMASK_VAR(nodemask, allmasks);
 
@@ -7463,7 +7463,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_SMT
 	/* Set up CPU (sibling) groups */
-	for_each_cpu_mask_nr(i, *cpu_map) {
+	for_each_cpu(i, cpu_map) {
 		SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
 		SCHED_CPUMASK_VAR(send_covered, allmasks);
 
@@ -7480,7 +7480,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_MC
 	/* Set up multi-core groups */
-	for_each_cpu_mask_nr(i, *cpu_map) {
+	for_each_cpu(i, cpu_map) {
 		SCHED_CPUMASK_VAR(this_core_map, allmasks);
 		SCHED_CPUMASK_VAR(send_covered, allmasks);
 
@@ -7547,7 +7547,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			goto error;
 		}
 		sched_group_nodes[i] = sg;
-		for_each_cpu_mask_nr(j, *nodemask) {
+		for_each_cpu(j, nodemask) {
 			struct sched_domain *sd;
 
 			sd = &per_cpu(node_domains, j);
@@ -7593,21 +7593,21 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 	/* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
-	for_each_cpu_mask_nr(i, *cpu_map) {
+	for_each_cpu(i, cpu_map) {
 		struct sched_domain *sd = &per_cpu(cpu_domains, i);
 
 		init_sched_groups_power(i, sd);
 	}
 #endif
 #ifdef CONFIG_SCHED_MC
-	for_each_cpu_mask_nr(i, *cpu_map) {
+	for_each_cpu(i, cpu_map) {
 		struct sched_domain *sd = &per_cpu(core_domains, i);
 
 		init_sched_groups_power(i, sd);
 	}
 #endif
 
-	for_each_cpu_mask_nr(i, *cpu_map) {
+	for_each_cpu(i, cpu_map) {
 		struct sched_domain *sd = &per_cpu(phys_domains, i);
 
 		init_sched_groups_power(i, sd);
@@ -7627,7 +7627,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
 
 	/* Attach the domains */
-	for_each_cpu_mask_nr(i, *cpu_map) {
+	for_each_cpu(i, cpu_map) {
 		struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
 		sd = &per_cpu(cpu_domains, i);
@@ -7709,7 +7709,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
 	cpumask_t tmpmask;
 	int i;
 
-	for_each_cpu_mask_nr(i, *cpu_map)
+	for_each_cpu(i, cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
 	synchronize_sched();
 	arch_destroy_sched_domains(cpu_map, &tmpmask);
-- 
cgit v1.2.3


From 3404c8d97c2d3eb87b1bf4aadad957bfb5235b14 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:03 +1030
Subject: sched: get rid of boutique sched.c allocations, use cpumask_var_t.

Impact: use new general API

Using lots of allocs rather than one big alloc is less efficient, but
who cares for this setup function?

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 139 +++++++++++++++++++++++----------------------------------
 1 file changed, 55 insertions(+), 84 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e59978eead1..0dc9d5752d6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7263,48 +7263,6 @@ SD_INIT_FUNC(CPU)
  SD_INIT_FUNC(MC)
 #endif
 
-/*
- * To minimize stack usage kmalloc room for cpumasks and share the
- * space as the usage in build_sched_domains() dictates.  Used only
- * if the amount of space is significant.
- */
-struct allmasks {
-	cpumask_t tmpmask;			/* make this one first */
-	union {
-		cpumask_t nodemask;
-		cpumask_t this_sibling_map;
-		cpumask_t this_core_map;
-	};
-	cpumask_t send_covered;
-
-#ifdef CONFIG_NUMA
-	cpumask_t domainspan;
-	cpumask_t covered;
-	cpumask_t notcovered;
-#endif
-};
-
-#if	NR_CPUS > 128
-#define SCHED_CPUMASK_DECLARE(v)	struct allmasks *v
-static inline void sched_cpumask_alloc(struct allmasks **masks)
-{
-	*masks = kmalloc(sizeof(**masks), GFP_KERNEL);
-}
-static inline void sched_cpumask_free(struct allmasks *masks)
-{
-	kfree(masks);
-}
-#else
-#define SCHED_CPUMASK_DECLARE(v)	struct allmasks _v, *v = &_v
-static inline void sched_cpumask_alloc(struct allmasks **masks)
-{ }
-static inline void sched_cpumask_free(struct allmasks *masks)
-{ }
-#endif
-
-#define	SCHED_CPUMASK_VAR(v, a) 	cpumask_t *v = (cpumask_t *) \
-			((unsigned long)(a) + offsetof(struct allmasks, v))
-
 static int default_relax_domain_level = -1;
 
 static int __init setup_relax_domain_level(char *str)
@@ -7347,14 +7305,35 @@ static void set_domain_attribute(struct sched_domain *sd,
 static int __build_sched_domains(const cpumask_t *cpu_map,
 				 struct sched_domain_attr *attr)
 {
-	int i;
+	int i, err = -ENOMEM;
 	struct root_domain *rd;
-	SCHED_CPUMASK_DECLARE(allmasks);
-	cpumask_t *tmpmask;
+	cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
+		tmpmask;
 #ifdef CONFIG_NUMA
+	cpumask_var_t domainspan, covered, notcovered;
 	struct sched_group **sched_group_nodes = NULL;
 	int sd_allnodes = 0;
 
+	if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
+		goto out;
+	if (!alloc_cpumask_var(&covered, GFP_KERNEL))
+		goto free_domainspan;
+	if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
+		goto free_covered;
+#endif
+
+	if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
+		goto free_notcovered;
+	if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
+		goto free_nodemask;
+	if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
+		goto free_this_sibling_map;
+	if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
+		goto free_this_core_map;
+	if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
+		goto free_send_covered;
+
+#ifdef CONFIG_NUMA
 	/*
 	 * Allocate the per-node list of sched groups
 	 */
@@ -7362,33 +7341,16 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 				    GFP_KERNEL);
 	if (!sched_group_nodes) {
 		printk(KERN_WARNING "Can not alloc sched group node list\n");
-		return -ENOMEM;
+		goto free_tmpmask;
 	}
 #endif
 
 	rd = alloc_rootdomain();
 	if (!rd) {
 		printk(KERN_WARNING "Cannot alloc root domain\n");
-#ifdef CONFIG_NUMA
-		kfree(sched_group_nodes);
-#endif
-		return -ENOMEM;
+		goto free_sched_groups;
 	}
 
-	/* get space for all scratch cpumask variables */
-	sched_cpumask_alloc(&allmasks);
-	if (!allmasks) {
-		printk(KERN_WARNING "Cannot alloc cpumask array\n");
-		kfree(rd);
-#ifdef CONFIG_NUMA
-		kfree(sched_group_nodes);
-#endif
-		return -ENOMEM;
-	}
-
-	tmpmask = (cpumask_t *)allmasks;
-
-
 #ifdef CONFIG_NUMA
 	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
 #endif
@@ -7398,7 +7360,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	 */
 	for_each_cpu(i, cpu_map) {
 		struct sched_domain *sd = NULL, *p;
-		SCHED_CPUMASK_VAR(nodemask, allmasks);
 
 		*nodemask = node_to_cpumask(cpu_to_node(i));
 		cpus_and(*nodemask, *nodemask, *cpu_map);
@@ -7464,9 +7425,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #ifdef CONFIG_SCHED_SMT
 	/* Set up CPU (sibling) groups */
 	for_each_cpu(i, cpu_map) {
-		SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
-		SCHED_CPUMASK_VAR(send_covered, allmasks);
-
 		*this_sibling_map = per_cpu(cpu_sibling_map, i);
 		cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
 		if (i != first_cpu(*this_sibling_map))
@@ -7481,9 +7439,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #ifdef CONFIG_SCHED_MC
 	/* Set up multi-core groups */
 	for_each_cpu(i, cpu_map) {
-		SCHED_CPUMASK_VAR(this_core_map, allmasks);
-		SCHED_CPUMASK_VAR(send_covered, allmasks);
-
 		*this_core_map = cpu_coregroup_map(i);
 		cpus_and(*this_core_map, *this_core_map, *cpu_map);
 		if (i != first_cpu(*this_core_map))
@@ -7497,9 +7452,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 	/* Set up physical groups */
 	for (i = 0; i < nr_node_ids; i++) {
-		SCHED_CPUMASK_VAR(nodemask, allmasks);
-		SCHED_CPUMASK_VAR(send_covered, allmasks);
-
 		*nodemask = node_to_cpumask(i);
 		cpus_and(*nodemask, *nodemask, *cpu_map);
 		if (cpus_empty(*nodemask))
@@ -7513,8 +7465,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #ifdef CONFIG_NUMA
 	/* Set up node groups */
 	if (sd_allnodes) {
-		SCHED_CPUMASK_VAR(send_covered, allmasks);
-
 		init_sched_build_groups(cpu_map, cpu_map,
 					&cpu_to_allnodes_group,
 					send_covered, tmpmask);
@@ -7523,9 +7473,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	for (i = 0; i < nr_node_ids; i++) {
 		/* Set up node groups */
 		struct sched_group *sg, *prev;
-		SCHED_CPUMASK_VAR(nodemask, allmasks);
-		SCHED_CPUMASK_VAR(domainspan, allmasks);
-		SCHED_CPUMASK_VAR(covered, allmasks);
 		int j;
 
 		*nodemask = node_to_cpumask(i);
@@ -7560,7 +7507,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		prev = sg;
 
 		for (j = 0; j < nr_node_ids; j++) {
-			SCHED_CPUMASK_VAR(notcovered, allmasks);
 			int n = (i + j) % nr_node_ids;
 			node_to_cpumask_ptr(pnodemask, n);
 
@@ -7639,15 +7585,40 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		cpu_attach_domain(sd, rd, i);
 	}
 
-	sched_cpumask_free(allmasks);
-	return 0;
+	err = 0;
+
+free_tmpmask:
+	free_cpumask_var(tmpmask);
+free_send_covered:
+	free_cpumask_var(send_covered);
+free_this_core_map:
+	free_cpumask_var(this_core_map);
+free_this_sibling_map:
+	free_cpumask_var(this_sibling_map);
+free_nodemask:
+	free_cpumask_var(nodemask);
+free_notcovered:
+#ifdef CONFIG_NUMA
+	free_cpumask_var(notcovered);
+free_covered:
+	free_cpumask_var(covered);
+free_domainspan:
+	free_cpumask_var(domainspan);
+out:
+#endif
+	return err;
+
+free_sched_groups:
+#ifdef CONFIG_NUMA
+	kfree(sched_group_nodes);
+#endif
+	goto free_tmpmask;
 
 #ifdef CONFIG_NUMA
 error:
 	free_sched_groups(cpu_map, tmpmask);
-	sched_cpumask_free(allmasks);
 	kfree(rd);
-	return -ENOMEM;
+	goto free_tmpmask;
 #endif
 }
 
-- 
cgit v1.2.3


From 1e5ce4f4a755ee498bd9217dae26143afa0d8f31 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:03 +1030
Subject: sched: remove any_online_cpu()

Impact: use new API

any_online_cpu() is a good name, but it takes a cpumask_t, not a
pointer.

There are several places where any_online_cpu() doesn't really want a
mask arg at all.  Replace all callers with cpumask_any() and
cpumask_any_and().

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0dc9d5752d6..a2de33d0534 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5964,7 +5964,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
 	if (cpu_isset(task_cpu(p), *new_mask))
 		goto out;
 
-	if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
+	if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
 		/* Need help from migration thread: drop lock and wait. */
 		task_rq_unlock(rq, &flags);
 		wake_up_process(rq->migration_thread);
@@ -6113,11 +6113,12 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 		node_to_cpumask_ptr(pnodemask, cpu_to_node(dead_cpu));
 
 		cpus_and(mask, *pnodemask, p->cpus_allowed);
-		dest_cpu = any_online_cpu(mask);
+		dest_cpu = cpumask_any_and(cpu_online_mask, &mask);
 
 		/* On any allowed CPU? */
 		if (dest_cpu >= nr_cpu_ids)
-			dest_cpu = any_online_cpu(p->cpus_allowed);
+			dest_cpu = cpumask_any_and(cpu_online_mask,
+						   &p->cpus_allowed);
 
 		/* No more Mr. Nice Guy. */
 		if (dest_cpu >= nr_cpu_ids) {
@@ -6133,7 +6134,8 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 			 */
 			rq = task_rq_lock(p, &flags);
 			p->cpus_allowed = cpus_allowed;
-			dest_cpu = any_online_cpu(p->cpus_allowed);
+			dest_cpu = cpumask_any_and(cpu_online_mask,
+						    &p->cpus_allowed);
 			task_rq_unlock(rq, &flags);
 
 			/*
@@ -6159,7 +6161,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
  */
 static void migrate_nr_uninterruptible(struct rq *rq_src)
 {
-	struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
+	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
 	unsigned long flags;
 
 	local_irq_save(flags);
@@ -6524,7 +6526,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 			break;
 		/* Unbind it from offline cpu so it can run. Fall thru. */
 		kthread_bind(cpu_rq(cpu)->migration_thread,
-			     any_online_cpu(cpu_online_map));
+			     cpumask_any(cpu_online_mask));
 		kthread_stop(cpu_rq(cpu)->migration_thread);
 		cpu_rq(cpu)->migration_thread = NULL;
 		break;
-- 
cgit v1.2.3


From 758b2cdc6f6a22c702bd8f2344382fb1270b2161 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:04 +1030
Subject: sched: wrap sched_group and sched_domain cpumask accesses.

Impact: trivial wrap of member accesses

This eases the transition in the next patch.

We also get rid of a temporary cpumask in find_idlest_cpu() thanks to
for_each_cpu_and, and sched_balance_self() due to getting weight before
setting sd to NULL.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c       | 114 +++++++++++++++++++++++++--------------------------
 kernel/sched_fair.c  |  10 ++---
 kernel/sched_rt.c    |   3 +-
 kernel/sched_stats.h |   3 +-
 4 files changed, 63 insertions(+), 67 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index a2de33d0534..575f38acf4d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1501,7 +1501,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
 	struct sched_domain *sd = data;
 	int i;
 
-	for_each_cpu_mask(i, sd->span) {
+	for_each_cpu(i, sched_domain_span(sd)) {
 		/*
 		 * If there are currently no tasks on the cpu pretend there
 		 * is one of average load so that when a new task gets to
@@ -1522,7 +1522,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
 	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
 		shares = tg->shares;
 
-	for_each_cpu_mask(i, sd->span)
+	for_each_cpu(i, sched_domain_span(sd))
 		update_group_shares_cpu(tg, i, shares, rq_weight);
 
 	return 0;
@@ -2053,15 +2053,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 		int i;
 
 		/* Skip over this group if it has no CPUs allowed */
-		if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+		if (!cpumask_intersects(sched_group_cpus(group),
+					&p->cpus_allowed))
 			continue;
 
-		local_group = cpu_isset(this_cpu, group->cpumask);
+		local_group = cpumask_test_cpu(this_cpu,
+					       sched_group_cpus(group));
 
 		/* Tally up the load of all CPUs in the group */
 		avg_load = 0;
 
-		for_each_cpu(i, &group->cpumask) {
+		for_each_cpu(i, sched_group_cpus(group)) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
 				load = source_load(i, load_idx);
@@ -2093,17 +2095,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
  * find_idlest_cpu - find the idlest cpu among the cpus in group.
  */
 static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
-		cpumask_t *tmp)
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 {
 	unsigned long load, min_load = ULONG_MAX;
 	int idlest = -1;
 	int i;
 
 	/* Traverse only the allowed CPUs */
-	cpus_and(*tmp, group->cpumask, p->cpus_allowed);
-
-	for_each_cpu(i, tmp) {
+	for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
 		load = weighted_cpuload(i);
 
 		if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2145,7 +2144,6 @@ static int sched_balance_self(int cpu, int flag)
 		update_shares(sd);
 
 	while (sd) {
-		cpumask_t span, tmpmask;
 		struct sched_group *group;
 		int new_cpu, weight;
 
@@ -2154,14 +2152,13 @@ static int sched_balance_self(int cpu, int flag)
 			continue;
 		}
 
-		span = sd->span;
 		group = find_idlest_group(sd, t, cpu);
 		if (!group) {
 			sd = sd->child;
 			continue;
 		}
 
-		new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
+		new_cpu = find_idlest_cpu(group, t, cpu);
 		if (new_cpu == -1 || new_cpu == cpu) {
 			/* Now try balancing at a lower domain level of cpu */
 			sd = sd->child;
@@ -2170,10 +2167,10 @@ static int sched_balance_self(int cpu, int flag)
 
 		/* Now try balancing at a lower domain level of new_cpu */
 		cpu = new_cpu;
+		weight = cpumask_weight(sched_domain_span(sd));
 		sd = NULL;
-		weight = cpus_weight(span);
 		for_each_domain(cpu, tmp) {
-			if (weight <= cpus_weight(tmp->span))
+			if (weight <= cpumask_weight(sched_domain_span(tmp)))
 				break;
 			if (tmp->flags & flag)
 				sd = tmp;
@@ -2218,7 +2215,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 		cpu = task_cpu(p);
 
 		for_each_domain(this_cpu, sd) {
-			if (cpu_isset(cpu, sd->span)) {
+			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
 				update_shares(sd);
 				break;
 			}
@@ -2266,7 +2263,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	else {
 		struct sched_domain *sd;
 		for_each_domain(this_cpu, sd) {
-			if (cpu_isset(cpu, sd->span)) {
+			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
 				schedstat_inc(sd, ttwu_wake_remote);
 				break;
 			}
@@ -3109,10 +3106,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		unsigned long sum_avg_load_per_task;
 		unsigned long avg_load_per_task;
 
-		local_group = cpu_isset(this_cpu, group->cpumask);
+		local_group = cpumask_test_cpu(this_cpu,
+					       sched_group_cpus(group));
 
 		if (local_group)
-			balance_cpu = first_cpu(group->cpumask);
+			balance_cpu = cpumask_first(sched_group_cpus(group));
 
 		/* Tally up the load of all CPUs in the group */
 		sum_weighted_load = sum_nr_running = avg_load = 0;
@@ -3121,13 +3119,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		max_cpu_load = 0;
 		min_cpu_load = ~0UL;
 
-		for_each_cpu(i, &group->cpumask) {
-			struct rq *rq;
-
-			if (!cpu_isset(i, *cpus))
-				continue;
-
-			rq = cpu_rq(i);
+		for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+			struct rq *rq = cpu_rq(i);
 
 			if (*sd_idle && rq->nr_running)
 				*sd_idle = 0;
@@ -3238,8 +3231,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 */
 		if ((sum_nr_running < min_nr_running) ||
 		    (sum_nr_running == min_nr_running &&
-		     first_cpu(group->cpumask) <
-		     first_cpu(group_min->cpumask))) {
+		     cpumask_first(sched_group_cpus(group)) <
+		     cpumask_first(sched_group_cpus(group_min)))) {
 			group_min = group;
 			min_nr_running = sum_nr_running;
 			min_load_per_task = sum_weighted_load /
@@ -3254,8 +3247,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		if (sum_nr_running <= group_capacity - 1) {
 			if (sum_nr_running > leader_nr_running ||
 			    (sum_nr_running == leader_nr_running &&
-			     first_cpu(group->cpumask) >
-			      first_cpu(group_leader->cpumask))) {
+			     cpumask_first(sched_group_cpus(group)) >
+			     cpumask_first(sched_group_cpus(group_leader)))) {
 				group_leader = group;
 				leader_nr_running = sum_nr_running;
 			}
@@ -3400,7 +3393,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 	unsigned long max_load = 0;
 	int i;
 
-	for_each_cpu(i, &group->cpumask) {
+	for_each_cpu(i, sched_group_cpus(group)) {
 		unsigned long wl;
 
 		if (!cpu_isset(i, *cpus))
@@ -3746,7 +3739,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 	/* Search for an sd spanning us and the target CPU. */
 	for_each_domain(target_cpu, sd) {
 		if ((sd->flags & SD_LOAD_BALANCE) &&
-		    cpu_isset(busiest_cpu, sd->span))
+		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
 				break;
 	}
 
@@ -6618,7 +6611,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 	struct sched_group *group = sd->groups;
 	char str[256];
 
-	cpulist_scnprintf(str, sizeof(str), sd->span);
+	cpulist_scnprintf(str, sizeof(str), *sched_domain_span(sd));
 	cpus_clear(*groupmask);
 
 	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
@@ -6633,11 +6626,11 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 
 	printk(KERN_CONT "span %s level %s\n", str, sd->name);
 
-	if (!cpu_isset(cpu, sd->span)) {
+	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
 		printk(KERN_ERR "ERROR: domain->span does not contain "
 				"CPU%d\n", cpu);
 	}
-	if (!cpu_isset(cpu, group->cpumask)) {
+	if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
 		printk(KERN_ERR "ERROR: domain->groups does not contain"
 				" CPU%d\n", cpu);
 	}
@@ -6657,31 +6650,32 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (!cpus_weight(group->cpumask)) {
+		if (!cpumask_weight(sched_group_cpus(group))) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: empty group\n");
 			break;
 		}
 
-		if (cpus_intersects(*groupmask, group->cpumask)) {
+		if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: repeated CPUs\n");
 			break;
 		}
 
-		cpus_or(*groupmask, *groupmask, group->cpumask);
+		cpumask_or(groupmask, groupmask, sched_group_cpus(group));
 
-		cpulist_scnprintf(str, sizeof(str), group->cpumask);
+		cpulist_scnprintf(str, sizeof(str), *sched_group_cpus(group));
 		printk(KERN_CONT " %s", str);
 
 		group = group->next;
 	} while (group != sd->groups);
 	printk(KERN_CONT "\n");
 
-	if (!cpus_equal(sd->span, *groupmask))
+	if (!cpumask_equal(sched_domain_span(sd), groupmask))
 		printk(KERN_ERR "ERROR: groups don't span domain->span\n");
 
-	if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
+	if (sd->parent &&
+	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
 		printk(KERN_ERR "ERROR: parent span is not a superset "
 			"of domain->span\n");
 	return 0;
@@ -6721,7 +6715,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 
 static int sd_degenerate(struct sched_domain *sd)
 {
-	if (cpus_weight(sd->span) == 1)
+	if (cpumask_weight(sched_domain_span(sd)) == 1)
 		return 1;
 
 	/* Following flags need at least 2 groups */
@@ -6752,7 +6746,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 	if (sd_degenerate(parent))
 		return 1;
 
-	if (!cpus_equal(sd->span, parent->span))
+	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
 		return 0;
 
 	/* Does parent contain flags not in child? */
@@ -6913,10 +6907,10 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
 		int group = group_fn(i, cpu_map, &sg, tmpmask);
 		int j;
 
-		if (cpu_isset(i, *covered))
+		if (cpumask_test_cpu(i, covered))
 			continue;
 
-		cpus_clear(sg->cpumask);
+		cpumask_clear(sched_group_cpus(sg));
 		sg->__cpu_power = 0;
 
 		for_each_cpu(j, span) {
@@ -6924,7 +6918,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
 				continue;
 
 			cpu_set(j, *covered);
-			cpu_set(j, sg->cpumask);
+			cpumask_set_cpu(j, sched_group_cpus(sg));
 		}
 		if (!first)
 			first = sg;
@@ -7119,11 +7113,11 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 	if (!sg)
 		return;
 	do {
-		for_each_cpu(j, &sg->cpumask) {
+		for_each_cpu(j, sched_group_cpus(sg)) {
 			struct sched_domain *sd;
 
 			sd = &per_cpu(phys_domains, j);
-			if (j != first_cpu(sd->groups->cpumask)) {
+			if (j != cpumask_first(sched_group_cpus(sd->groups))) {
 				/*
 				 * Only add "power" once for each
 				 * physical package.
@@ -7200,7 +7194,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
 	WARN_ON(!sd || !sd->groups);
 
-	if (cpu != first_cpu(sd->groups->cpumask))
+	if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
 		return;
 
 	child = sd->child;
@@ -7372,7 +7366,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			sd = &per_cpu(allnodes_domains, i);
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
-			sd->span = *cpu_map;
+			cpumask_copy(sched_domain_span(sd), cpu_map);
 			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 			p = sd;
 			sd_allnodes = 1;
@@ -7382,18 +7376,19 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		sd = &per_cpu(node_domains, i);
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
-		sched_domain_node_span(cpu_to_node(i), &sd->span);
+		sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
 		sd->parent = p;
 		if (p)
 			p->child = sd;
-		cpus_and(sd->span, sd->span, *cpu_map);
+		cpumask_and(sched_domain_span(sd),
+			    sched_domain_span(sd), cpu_map);
 #endif
 
 		p = sd;
 		sd = &per_cpu(phys_domains, i);
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
-		sd->span = *nodemask;
+		cpumask_copy(sched_domain_span(sd), nodemask);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7404,8 +7399,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		sd = &per_cpu(core_domains, i);
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
-		sd->span = cpu_coregroup_map(i);
-		cpus_and(sd->span, sd->span, *cpu_map);
+		*sched_domain_span(sd) = cpu_coregroup_map(i);
+		cpumask_and(sched_domain_span(sd),
+			    sched_domain_span(sd), cpu_map);
 		sd->parent = p;
 		p->child = sd;
 		cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7416,8 +7412,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		sd = &per_cpu(cpu_domains, i);
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
-		sd->span = per_cpu(cpu_sibling_map, i);
-		cpus_and(sd->span, sd->span, *cpu_map);
+		cpumask_and(sched_domain_span(sd),
+			    &per_cpu(cpu_sibling_map, i), cpu_map);
 		sd->parent = p;
 		p->child = sd;
 		cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7503,7 +7499,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			sd->groups = sg;
 		}
 		sg->__cpu_power = 0;
-		sg->cpumask = *nodemask;
+		cpumask_copy(sched_group_cpus(sg), nodemask);
 		sg->next = sg;
 		cpus_or(*covered, *covered, *nodemask);
 		prev = sg;
@@ -7530,7 +7526,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 				goto error;
 			}
 			sg->__cpu_power = 0;
-			sg->cpumask = *tmpmask;
+			cpumask_copy(sched_group_cpus(sg), tmpmask);
 			sg->next = prev->next;
 			cpus_or(*covered, *covered, *tmpmask);
 			prev->next = sg;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 98345e45b05..bba00402ed9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1024,7 +1024,6 @@ static void yield_task_fair(struct rq *rq)
 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
 static int wake_idle(int cpu, struct task_struct *p)
 {
-	cpumask_t tmp;
 	struct sched_domain *sd;
 	int i;
 
@@ -1044,10 +1043,9 @@ static int wake_idle(int cpu, struct task_struct *p)
 		if ((sd->flags & SD_WAKE_IDLE)
 		    || ((sd->flags & SD_WAKE_IDLE_FAR)
 			&& !task_hot(p, task_rq(p)->clock, sd))) {
-			cpus_and(tmp, sd->span, p->cpus_allowed);
-			cpus_and(tmp, tmp, cpu_active_map);
-			for_each_cpu_mask_nr(i, tmp) {
-				if (idle_cpu(i)) {
+			for_each_cpu_and(i, sched_domain_span(sd),
+					 &p->cpus_allowed) {
+				if (cpu_active(i) && idle_cpu(i)) {
 					if (i != task_cpu(p)) {
 						schedstat_inc(p,
 						       se.nr_wakeups_idle);
@@ -1240,7 +1238,7 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 	 * this_cpu and prev_cpu are present in:
 	 */
 	for_each_domain(this_cpu, sd) {
-		if (cpu_isset(prev_cpu, sd->span)) {
+		if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
 			this_sd = sd;
 			break;
 		}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 2bdd4442359..4cd813abc23 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1017,7 +1017,8 @@ static int find_lowest_rq(struct task_struct *task)
 			cpumask_t domain_mask;
 			int       best_cpu;
 
-			cpus_and(domain_mask, sd->span, *lowest_mask);
+			cpumask_and(&domain_mask, sched_domain_span(sd),
+				    lowest_mask);
 
 			best_cpu = pick_optimal_cpu(this_cpu,
 						    &domain_mask);
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 7dbf72a2b02..ce340835d05 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -42,7 +42,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
 		for_each_domain(cpu, sd) {
 			enum cpu_idle_type itype;
 
-			cpumask_scnprintf(mask_str, mask_len, sd->span);
+			cpumask_scnprintf(mask_str, mask_len,
+					  *sched_domain_span(sd));
 			seq_printf(seq, "domain%d %s", dcount++, mask_str);
 			for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
 					itype++) {
-- 
cgit v1.2.3


From 6c99e9ad47d9c082bd096f42fb49e397b05d58a8 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:04 +1030
Subject: sched: convert struct sched_group/sched_domain cpumask_ts to variable
 bitmaps

Impact: (future) size reduction for large NR_CPUS.

We move the 'cpumask' member of sched_group to the end, so when we
kmalloc it we can do a minimal allocation: saves space for small
nr_cpu_ids but big CONFIG_NR_CPUS.  Similar trick for 'span' in
sched_domain.

This isn't quite as good as converting to a cpumask_var_t, as some
sched_groups are actually static, but it's safer: we don't have to
figure out where to call alloc_cpumask_var/free_cpumask_var.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 65 ++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 41 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 575f38acf4d..6b9606a6cab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7005,19 +7005,34 @@ static void sched_domain_node_span(int node, cpumask_t *span)
 
 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
+/*
+ * The cpus mask in sched_group and sched_domain hangs off the end.
+ * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
+ * for nr_cpu_ids < CONFIG_NR_CPUS.
+ */
+struct static_sched_group {
+	struct sched_group sg;
+	DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
+};
+
+struct static_sched_domain {
+	struct sched_domain sd;
+	DECLARE_BITMAP(span, CONFIG_NR_CPUS);
+};
+
 /*
  * SMT sched-domains:
  */
 #ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
+static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
 
 static int
 cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
 		 cpumask_t *unused)
 {
 	if (sg)
-		*sg = &per_cpu(sched_group_cpus, cpu);
+		*sg = &per_cpu(sched_group_cpus, cpu).sg;
 	return cpu;
 }
 #endif /* CONFIG_SCHED_SMT */
@@ -7026,8 +7041,8 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
  * multi-core sched-domains:
  */
 #ifdef CONFIG_SCHED_MC
-static DEFINE_PER_CPU(struct sched_domain, core_domains);
-static DEFINE_PER_CPU(struct sched_group, sched_group_core);
+static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
 #endif /* CONFIG_SCHED_MC */
 
 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
@@ -7041,7 +7056,7 @@ cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
 	cpus_and(*mask, *mask, *cpu_map);
 	group = first_cpu(*mask);
 	if (sg)
-		*sg = &per_cpu(sched_group_core, group);
+		*sg = &per_cpu(sched_group_core, group).sg;
 	return group;
 }
 #elif defined(CONFIG_SCHED_MC)
@@ -7050,13 +7065,13 @@ cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
 		  cpumask_t *unused)
 {
 	if (sg)
-		*sg = &per_cpu(sched_group_core, cpu);
+		*sg = &per_cpu(sched_group_core, cpu).sg;
 	return cpu;
 }
 #endif
 
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
+static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
 
 static int
 cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
@@ -7075,7 +7090,7 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
 	group = cpu;
 #endif
 	if (sg)
-		*sg = &per_cpu(sched_group_phys, group);
+		*sg = &per_cpu(sched_group_phys, group).sg;
 	return group;
 }
 
@@ -7089,7 +7104,7 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
 static struct sched_group ***sched_group_nodes_bycpu;
 
 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
 
 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
 				 struct sched_group **sg, cpumask_t *nodemask)
@@ -7101,7 +7116,7 @@ static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
 	group = first_cpu(*nodemask);
 
 	if (sg)
-		*sg = &per_cpu(sched_group_allnodes, group);
+		*sg = &per_cpu(sched_group_allnodes, group).sg;
 	return group;
 }
 
@@ -7116,7 +7131,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 		for_each_cpu(j, sched_group_cpus(sg)) {
 			struct sched_domain *sd;
 
-			sd = &per_cpu(phys_domains, j);
+			sd = &per_cpu(phys_domains, j).sd;
 			if (j != cpumask_first(sched_group_cpus(sd->groups))) {
 				/*
 				 * Only add "power" once for each
@@ -7385,7 +7400,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
 
 		p = sd;
-		sd = &per_cpu(phys_domains, i);
+		sd = &per_cpu(phys_domains, i).sd;
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		cpumask_copy(sched_domain_span(sd), nodemask);
@@ -7396,7 +7411,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_MC
 		p = sd;
-		sd = &per_cpu(core_domains, i);
+		sd = &per_cpu(core_domains, i).sd;
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		*sched_domain_span(sd) = cpu_coregroup_map(i);
@@ -7409,7 +7424,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_SMT
 		p = sd;
-		sd = &per_cpu(cpu_domains, i);
+		sd = &per_cpu(cpu_domains, i).sd;
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		cpumask_and(sched_domain_span(sd),
@@ -7485,7 +7500,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		sched_domain_node_span(i, domainspan);
 		cpus_and(*domainspan, *domainspan, *cpu_map);
 
-		sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
+		sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+				  GFP_KERNEL, i);
 		if (!sg) {
 			printk(KERN_WARNING "Can not alloc domain group for "
 				"node %d\n", i);
@@ -7518,7 +7534,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			if (cpus_empty(*tmpmask))
 				continue;
 
-			sg = kmalloc_node(sizeof(struct sched_group),
+			sg = kmalloc_node(sizeof(struct sched_group) +
+					  cpumask_size(),
 					  GFP_KERNEL, i);
 			if (!sg) {
 				printk(KERN_WARNING
@@ -7538,21 +7555,21 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	/* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd = &per_cpu(cpu_domains, i);
+		struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
 
 		init_sched_groups_power(i, sd);
 	}
 #endif
 #ifdef CONFIG_SCHED_MC
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd = &per_cpu(core_domains, i);
+		struct sched_domain *sd = &per_cpu(core_domains, i).sd;
 
 		init_sched_groups_power(i, sd);
 	}
 #endif
 
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd = &per_cpu(phys_domains, i);
+		struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
 
 		init_sched_groups_power(i, sd);
 	}
@@ -7574,11 +7591,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	for_each_cpu(i, cpu_map) {
 		struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
-		sd = &per_cpu(cpu_domains, i);
+		sd = &per_cpu(cpu_domains, i).sd;
 #elif defined(CONFIG_SCHED_MC)
-		sd = &per_cpu(core_domains, i);
+		sd = &per_cpu(core_domains, i).sd;
 #else
-		sd = &per_cpu(phys_domains, i);
+		sd = &per_cpu(phys_domains, i).sd;
 #endif
 		cpu_attach_domain(sd, rd, i);
 	}
-- 
cgit v1.2.3


From 6a7b3dc3440f7b5a9b67594af01ed562cdeb41e4 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:04 +1030
Subject: sched: convert nohz_cpu_mask to cpumask_var_t.

Impact: (future) size reduction for large NR_CPUS.

Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves
space for small nr_cpu_ids but big CONFIG_NR_CPUS.  cpumask_var_t
is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c      |  2 +-
 kernel/sched.c           |  7 +++++--
 kernel/time/tick-sched.c | 10 +++++-----
 3 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index e503a002f33..c03ca3e6191 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -393,7 +393,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
 		 * unnecessarily.
 		 */
 		smp_mb();
-		cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
+		cpumask_andnot(&rcp->cpumask, cpu_online_mask, nohz_cpu_mask);
 
 		rcp->signaled = 0;
 	}
diff --git a/kernel/sched.c b/kernel/sched.c
index 6b9606a6cab..2723d7a4a42 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5870,9 +5870,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
  * indicates which cpus entered this state. This is used
  * in the rcu update to wait only for active cpus. For system
  * which do not switch off the HZ timer nohz_cpu_mask should
- * always be CPU_MASK_NONE.
+ * always be CPU_BITS_NONE.
  */
-cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
+cpumask_var_t nohz_cpu_mask;
 
 /*
  * Increase the granularity value when there are more CPUs,
@@ -8274,6 +8274,9 @@ void __init sched_init(void)
 	 */
 	current->sched_class = &fair_sched_class;
 
+	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
+	alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+
 	scheduler_running = 1;
 }
 
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 342fc9ccab4..70f872c71f4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -144,7 +144,7 @@ void tick_nohz_update_jiffies(void)
 	if (!ts->tick_stopped)
 		return;
 
-	cpu_clear(cpu, nohz_cpu_mask);
+	cpumask_clear_cpu(cpu, nohz_cpu_mask);
 	now = ktime_get();
 	ts->idle_waketime = now;
 
@@ -283,7 +283,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 	if ((long)delta_jiffies >= 1) {
 
 		if (delta_jiffies > 1)
-			cpu_set(cpu, nohz_cpu_mask);
+			cpumask_set_cpu(cpu, nohz_cpu_mask);
 		/*
 		 * nohz_stop_sched_tick can be called several times before
 		 * the nohz_restart_sched_tick is called. This happens when
@@ -296,7 +296,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 				/*
 				 * sched tick not stopped!
 				 */
-				cpu_clear(cpu, nohz_cpu_mask);
+				cpumask_clear_cpu(cpu, nohz_cpu_mask);
 				goto out;
 			}
 
@@ -354,7 +354,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 		 * softirq.
 		 */
 		tick_do_update_jiffies64(ktime_get());
-		cpu_clear(cpu, nohz_cpu_mask);
+		cpumask_clear_cpu(cpu, nohz_cpu_mask);
 	}
 	raise_softirq_irqoff(TIMER_SOFTIRQ);
 out:
@@ -432,7 +432,7 @@ void tick_nohz_restart_sched_tick(void)
 	select_nohz_load_balancer(0);
 	now = ktime_get();
 	tick_do_update_jiffies64(now);
-	cpu_clear(cpu, nohz_cpu_mask);
+	cpumask_clear_cpu(cpu, nohz_cpu_mask);
 
 	/*
 	 * We stopped the tick in idle. Update process times would miss the
-- 
cgit v1.2.3


From c6c4927b22a3514c6660f0e72c78716226bd3cc8 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:05 +1030
Subject: sched: convert struct root_domain to cpumask_var_t.

Impact: (future) size reduction for large NR_CPUS.

Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves
space for small nr_cpu_ids but big CONFIG_NR_CPUS.  cpumask_var_t
is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK.

def_root_domain is static, and so its masks are initialized with
alloc_bootmem_cpumask_var.  After that, alloc_cpumask_var is used.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    | 69 ++++++++++++++++++++++++++++++++++++++++---------------
 kernel/sched_rt.c | 26 ++++++++++-----------
 2 files changed, 64 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 2723d7a4a42..93309c3034d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -487,14 +487,14 @@ struct rt_rq {
  */
 struct root_domain {
 	atomic_t refcount;
-	cpumask_t span;
-	cpumask_t online;
+	cpumask_var_t span;
+	cpumask_var_t online;
 
 	/*
 	 * The "RT overload" flag: it gets set if a CPU has more than
 	 * one runnable RT task.
 	 */
-	cpumask_t rto_mask;
+	cpumask_var_t rto_mask;
 	atomic_t rto_count;
 #ifdef CONFIG_SMP
 	struct cpupri cpupri;
@@ -6444,7 +6444,7 @@ static void set_rq_online(struct rq *rq)
 	if (!rq->online) {
 		const struct sched_class *class;
 
-		cpu_set(rq->cpu, rq->rd->online);
+		cpumask_set_cpu(rq->cpu, rq->rd->online);
 		rq->online = 1;
 
 		for_each_class(class) {
@@ -6464,7 +6464,7 @@ static void set_rq_offline(struct rq *rq)
 				class->rq_offline(rq);
 		}
 
-		cpu_clear(rq->cpu, rq->rd->online);
+		cpumask_clear_cpu(rq->cpu, rq->rd->online);
 		rq->online = 0;
 	}
 }
@@ -6505,7 +6505,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {
-			BUG_ON(!cpu_isset(cpu, rq->rd->span));
+			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
 			set_rq_online(rq);
 		}
@@ -6567,7 +6567,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {
-			BUG_ON(!cpu_isset(cpu, rq->rd->span));
+			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 			set_rq_offline(rq);
 		}
 		spin_unlock_irqrestore(&rq->lock, flags);
@@ -6768,6 +6768,14 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 	return 1;
 }
 
+static void free_rootdomain(struct root_domain *rd)
+{
+	free_cpumask_var(rd->rto_mask);
+	free_cpumask_var(rd->online);
+	free_cpumask_var(rd->span);
+	kfree(rd);
+}
+
 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 {
 	unsigned long flags;
@@ -6777,38 +6785,60 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 	if (rq->rd) {
 		struct root_domain *old_rd = rq->rd;
 
-		if (cpu_isset(rq->cpu, old_rd->online))
+		if (cpumask_test_cpu(rq->cpu, old_rd->online))
 			set_rq_offline(rq);
 
-		cpu_clear(rq->cpu, old_rd->span);
+		cpumask_clear_cpu(rq->cpu, old_rd->span);
 
 		if (atomic_dec_and_test(&old_rd->refcount))
-			kfree(old_rd);
+			free_rootdomain(old_rd);
 	}
 
 	atomic_inc(&rd->refcount);
 	rq->rd = rd;
 
-	cpu_set(rq->cpu, rd->span);
-	if (cpu_isset(rq->cpu, cpu_online_map))
+	cpumask_set_cpu(rq->cpu, rd->span);
+	if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
 		set_rq_online(rq);
 
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
-static void init_rootdomain(struct root_domain *rd)
+static int init_rootdomain(struct root_domain *rd, bool bootmem)
 {
 	memset(rd, 0, sizeof(*rd));
 
-	cpus_clear(rd->span);
-	cpus_clear(rd->online);
+	if (bootmem) {
+		alloc_bootmem_cpumask_var(&def_root_domain.span);
+		alloc_bootmem_cpumask_var(&def_root_domain.online);
+		alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
+		cpupri_init(&rd->cpupri);
+		return 0;
+	}
+
+	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+		goto free_rd;
+	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+		goto free_span;
+	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+		goto free_online;
 
 	cpupri_init(&rd->cpupri);
+	return 0;
+
+free_online:
+	free_cpumask_var(rd->online);
+free_span:
+	free_cpumask_var(rd->span);
+free_rd:
+	kfree(rd);
+	return -ENOMEM;
 }
 
 static void init_defrootdomain(void)
 {
-	init_rootdomain(&def_root_domain);
+	init_rootdomain(&def_root_domain, true);
+
 	atomic_set(&def_root_domain.refcount, 1);
 }
 
@@ -6820,7 +6850,10 @@ static struct root_domain *alloc_rootdomain(void)
 	if (!rd)
 		return NULL;
 
-	init_rootdomain(rd);
+	if (init_rootdomain(rd, false) != 0) {
+		kfree(rd);
+		return NULL;
+	}
 
 	return rd;
 }
@@ -7632,7 +7665,7 @@ free_sched_groups:
 #ifdef CONFIG_NUMA
 error:
 	free_sched_groups(cpu_map, tmpmask);
-	kfree(rd);
+	free_rootdomain(rd);
 	goto free_tmpmask;
 #endif
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 4cd813abc23..820fc422c6d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -15,7 +15,7 @@ static inline void rt_set_overload(struct rq *rq)
 	if (!rq->online)
 		return;
 
-	cpu_set(rq->cpu, rq->rd->rto_mask);
+	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
 	/*
 	 * Make sure the mask is visible before we set
 	 * the overload count. That is checked to determine
@@ -34,7 +34,7 @@ static inline void rt_clear_overload(struct rq *rq)
 
 	/* the order here really doesn't matter */
 	atomic_dec(&rq->rd->rto_count);
-	cpu_clear(rq->cpu, rq->rd->rto_mask);
+	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
 }
 
 static void update_rt_migration(struct rq *rq)
@@ -139,14 +139,14 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
 }
 
 #ifdef CONFIG_SMP
-static inline cpumask_t sched_rt_period_mask(void)
+static inline const struct cpumask *sched_rt_period_mask(void)
 {
 	return cpu_rq(smp_processor_id())->rd->span;
 }
 #else
-static inline cpumask_t sched_rt_period_mask(void)
+static inline const struct cpumask *sched_rt_period_mask(void)
 {
-	return cpu_online_map;
+	return cpu_online_mask;
 }
 #endif
 
@@ -212,9 +212,9 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 	return rt_rq->rt_throttled;
 }
 
-static inline cpumask_t sched_rt_period_mask(void)
+static inline const struct cpumask *sched_rt_period_mask(void)
 {
-	return cpu_online_map;
+	return cpu_online_mask;
 }
 
 static inline
@@ -241,11 +241,11 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
 	int i, weight, more = 0;
 	u64 rt_period;
 
-	weight = cpus_weight(rd->span);
+	weight = cpumask_weight(rd->span);
 
 	spin_lock(&rt_b->rt_runtime_lock);
 	rt_period = ktime_to_ns(rt_b->rt_period);
-	for_each_cpu_mask_nr(i, rd->span) {
+	for_each_cpu(i, rd->span) {
 		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 		s64 diff;
 
@@ -324,7 +324,7 @@ static void __disable_runtime(struct rq *rq)
 		/*
 		 * Greedy reclaim, take back as much as we can.
 		 */
-		for_each_cpu_mask(i, rd->span) {
+		for_each_cpu(i, rd->span) {
 			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 			s64 diff;
 
@@ -429,13 +429,13 @@ static inline int balance_runtime(struct rt_rq *rt_rq)
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 {
 	int i, idle = 1;
-	cpumask_t span;
+	const struct cpumask *span;
 
 	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 		return 1;
 
 	span = sched_rt_period_mask();
-	for_each_cpu_mask(i, span) {
+	for_each_cpu(i, span) {
 		int enqueue = 0;
 		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 		struct rq *rq = rq_of_rt_rq(rt_rq);
@@ -1181,7 +1181,7 @@ static int pull_rt_task(struct rq *this_rq)
 
 	next = pick_next_task_rt(this_rq);
 
-	for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) {
+	for_each_cpu(cpu, this_rq->rd->rto_mask) {
 		if (this_cpu == cpu)
 			continue;
 
-- 
cgit v1.2.3


From 7d1e6a9b95e3edeac91888bc683ae62f18519432 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:09 +1030
Subject: sched: convert nohz struct to cpumask_var_t.

Impact: (future) size reduction for large NR_CPUS.

Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves
space for small nr_cpu_ids but big CONFIG_NR_CPUS.  cpumask_var_t
is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 93309c3034d..2f8ea99df16 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3758,10 +3758,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 #ifdef CONFIG_NO_HZ
 static struct {
 	atomic_t load_balancer;
-	cpumask_t cpu_mask;
+	cpumask_var_t cpu_mask;
 } nohz ____cacheline_aligned = {
 	.load_balancer = ATOMIC_INIT(-1),
-	.cpu_mask = CPU_MASK_NONE,
 };
 
 /*
@@ -3789,7 +3788,7 @@ int select_nohz_load_balancer(int stop_tick)
 	int cpu = smp_processor_id();
 
 	if (stop_tick) {
-		cpu_set(cpu, nohz.cpu_mask);
+		cpumask_set_cpu(cpu, nohz.cpu_mask);
 		cpu_rq(cpu)->in_nohz_recently = 1;
 
 		/*
@@ -3803,7 +3802,7 @@ int select_nohz_load_balancer(int stop_tick)
 		}
 
 		/* time for ilb owner also to sleep */
-		if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+		if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
 			if (atomic_read(&nohz.load_balancer) == cpu)
 				atomic_set(&nohz.load_balancer, -1);
 			return 0;
@@ -3816,10 +3815,10 @@ int select_nohz_load_balancer(int stop_tick)
 		} else if (atomic_read(&nohz.load_balancer) == cpu)
 			return 1;
 	} else {
-		if (!cpu_isset(cpu, nohz.cpu_mask))
+		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
 			return 0;
 
-		cpu_clear(cpu, nohz.cpu_mask);
+		cpumask_clear_cpu(cpu, nohz.cpu_mask);
 
 		if (atomic_read(&nohz.load_balancer) == cpu)
 			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
@@ -3930,12 +3929,13 @@ static void run_rebalance_domains(struct softirq_action *h)
 	 */
 	if (this_rq->idle_at_tick &&
 	    atomic_read(&nohz.load_balancer) == this_cpu) {
-		cpumask_t cpus = nohz.cpu_mask;
 		struct rq *rq;
 		int balance_cpu;
 
-		cpu_clear(this_cpu, cpus);
-		for_each_cpu(balance_cpu, &cpus) {
+		for_each_cpu(balance_cpu, nohz.cpu_mask) {
+			if (balance_cpu == this_cpu)
+				continue;
+
 			/*
 			 * If this cpu gets work to do, stop the load balancing
 			 * work being done for other cpus. Next load
@@ -3973,7 +3973,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
 		rq->in_nohz_recently = 0;
 
 		if (atomic_read(&nohz.load_balancer) == cpu) {
-			cpu_clear(cpu, nohz.cpu_mask);
+			cpumask_clear_cpu(cpu, nohz.cpu_mask);
 			atomic_set(&nohz.load_balancer, -1);
 		}
 
@@ -3986,7 +3986,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
 			 * TBD: Traverse the sched domains and nominate
 			 * the nearest cpu in the nohz.cpu_mask.
 			 */
-			int ilb = first_cpu(nohz.cpu_mask);
+			int ilb = cpumask_first(nohz.cpu_mask);
 
 			if (ilb < nr_cpu_ids)
 				resched_cpu(ilb);
@@ -3998,7 +3998,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
 	 * cpus with ticks stopped, is it time for that to stop?
 	 */
 	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
-	    cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+	    cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
 		resched_cpu(cpu);
 		return;
 	}
@@ -4008,7 +4008,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
 	 * someone else, then no need raise the SCHED_SOFTIRQ
 	 */
 	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
-	    cpu_isset(cpu, nohz.cpu_mask))
+	    cpumask_test_cpu(cpu, nohz.cpu_mask))
 		return;
 #endif
 	if (time_after_eq(jiffies, rq->next_balance))
@@ -8309,6 +8309,9 @@ void __init sched_init(void)
 
 	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
 	alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+#ifdef CONFIG_NO_HZ
+	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+#endif
 
 	scheduler_running = 1;
 }
-- 
cgit v1.2.3


From 4d2732c63e0c05cfef2a74868d08eace922dfc3e Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:10 +1030
Subject: sched: convert idle_balance() to cpumask_var_t.

Impact: stack usage reduction

Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves
space in the stack.  cpumask_var_t is just a struct cpumask for
!CONFIG_CPUMASK_OFFSTACK.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 2f8ea99df16..154a95fcea7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3676,7 +3676,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 	struct sched_domain *sd;
 	int pulled_task = -1;
 	unsigned long next_balance = jiffies + HZ;
-	cpumask_t tmpmask;
+	cpumask_var_t tmpmask;
+
+	if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
+		return;
 
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
@@ -3687,7 +3690,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 		if (sd->flags & SD_BALANCE_NEWIDLE)
 			/* If we've pulled tasks over stop searching: */
 			pulled_task = load_balance_newidle(this_cpu, this_rq,
-							   sd, &tmpmask);
+							   sd, tmpmask);
 
 		interval = msecs_to_jiffies(sd->balance_interval);
 		if (time_after(next_balance, sd->last_balance + interval))
@@ -3702,6 +3705,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 		 */
 		this_rq->next_balance = next_balance;
 	}
+	free_cpumask_var(tmpmask);
 }
 
 /*
-- 
cgit v1.2.3


From a0e902452da16b79d7c9230630ed8a595d14fa85 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:11 +1030
Subject: sched: convert rebalance_domains() to cpumask_var_t.

Impact: stack usage reduction

Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves
space in the stack.  cpumask_var_t is just a struct cpumask for
!CONFIG_CPUMASK_OFFSTACK.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 154a95fcea7..67383e7f1cc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3850,7 +3850,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	unsigned long next_balance = jiffies + 60*HZ;
 	int update_next_balance = 0;
 	int need_serialize;
-	cpumask_t tmp;
+	cpumask_var_t tmp;
+
+	/* Fails alloc?  Rebalancing probably not a priority right now. */
+	if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
+		return;
 
 	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3875,7 +3879,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 		}
 
 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
-			if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
+			if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
 				/*
 				 * We've pulled tasks over so either we're no
 				 * longer idle, or one of our SMT siblings is
@@ -3909,6 +3913,8 @@ out:
 	 */
 	if (likely(update_next_balance))
 		rq->next_balance = next_balance;
+
+	free_cpumask_var(tmp);
 }
 
 /*
-- 
cgit v1.2.3


From f17c860760927c2a8e41a021eab3317e4415e962 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:11 +1030
Subject: sched: convert sys_sched_getaffinity() to cpumask_var_t.

Impact: stack usage reduction

Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves
space in the stack.  cpumask_var_t is just a struct cpumask for
!CONFIG_CPUMASK_OFFSTACK.

Some jiggling here to make sure we always exit at the bottom (so we hit
the free_cpumask_var there).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 67383e7f1cc..6deff24349b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5499,19 +5499,24 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
 				      unsigned long __user *user_mask_ptr)
 {
 	int ret;
-	cpumask_t mask;
+	cpumask_var_t mask;
 
-	if (len < sizeof(cpumask_t))
+	if (len < cpumask_size())
 		return -EINVAL;
 
-	ret = sched_getaffinity(pid, &mask);
-	if (ret < 0)
-		return ret;
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
 
-	if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
-		return -EFAULT;
+	ret = sched_getaffinity(pid, mask);
+	if (ret == 0) {
+		if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+			ret = -EFAULT;
+		else
+			ret = cpumask_size();
+	}
+	free_cpumask_var(mask);
 
-	return sizeof(cpumask_t);
+	return ret;
 }
 
 /**
-- 
cgit v1.2.3


From e76bd8d9850c2296a7e8e24c9dce9b5e6b55fe2f Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:11 +1030
Subject: sched: avoid stack var in move_task_off_dead_cpu

Impact: stack usage reduction

With some care, we can avoid needing a temporary cpumask (we can't
really allocate here, since we can't fail).

This version calls cpuset_cpus_allowed_locked() with the task_rq_lock
held.  I'm fairly sure this works, but there might be a deadlock
hiding.

And of course, we can't get rid of the last cpumask on stack until we
can use cpumask_of_node instead of node_to_cpumask.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 78 +++++++++++++++++++++++++++-------------------------------
 1 file changed, 36 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6deff24349b..f7dee2029e4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6112,52 +6112,46 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
 	unsigned long flags;
-	cpumask_t mask;
 	struct rq *rq;
 	int dest_cpu;
+	/* FIXME: Use cpumask_of_node here. */
+	cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu));
+	const struct cpumask *nodemask = &_nodemask;
+
+again:
+	/* Look for allowed, online CPU in same node. */
+	for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
+		if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+			goto move;
+
+	/* Any allowed, online CPU? */
+	dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
+	if (dest_cpu < nr_cpu_ids)
+		goto move;
+
+	/* No more Mr. Nice Guy. */
+	if (dest_cpu >= nr_cpu_ids) {
+		rq = task_rq_lock(p, &flags);
+		cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
+		dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
+		task_rq_unlock(rq, &flags);
 
-	do {
-		/* On same node? */
-		node_to_cpumask_ptr(pnodemask, cpu_to_node(dead_cpu));
-
-		cpus_and(mask, *pnodemask, p->cpus_allowed);
-		dest_cpu = cpumask_any_and(cpu_online_mask, &mask);
-
-		/* On any allowed CPU? */
-		if (dest_cpu >= nr_cpu_ids)
-			dest_cpu = cpumask_any_and(cpu_online_mask,
-						   &p->cpus_allowed);
-
-		/* No more Mr. Nice Guy. */
-		if (dest_cpu >= nr_cpu_ids) {
-			cpumask_t cpus_allowed;
-
-			cpuset_cpus_allowed_locked(p, &cpus_allowed);
-			/*
-			 * Try to stay on the same cpuset, where the
-			 * current cpuset may be a subset of all cpus.
-			 * The cpuset_cpus_allowed_locked() variant of
-			 * cpuset_cpus_allowed() will not block. It must be
-			 * called within calls to cpuset_lock/cpuset_unlock.
-			 */
-			rq = task_rq_lock(p, &flags);
-			p->cpus_allowed = cpus_allowed;
-			dest_cpu = cpumask_any_and(cpu_online_mask,
-						    &p->cpus_allowed);
-			task_rq_unlock(rq, &flags);
-
-			/*
-			 * Don't tell them about moving exiting tasks or
-			 * kernel threads (both mm NULL), since they never
-			 * leave kernel.
-			 */
-			if (p->mm && printk_ratelimit()) {
-				printk(KERN_INFO "process %d (%s) no "
-				       "longer affine to cpu%d\n",
-					task_pid_nr(p), p->comm, dead_cpu);
-			}
+		/*
+		 * Don't tell them about moving exiting tasks or
+		 * kernel threads (both mm NULL), since they never
+		 * leave kernel.
+		 */
+		if (p->mm && printk_ratelimit()) {
+			printk(KERN_INFO "process %d (%s) no "
+			       "longer affine to cpu%d\n",
+			       task_pid_nr(p), p->comm, dead_cpu);
 		}
-	} while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
+	}
+
+move:
+	/* It can have affinity changed while we were choosing. */
+	if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
+		goto again;
 }
 
 /*
-- 
cgit v1.2.3


From 5a16f3d30ca4e3f166d691220c003066a14e32b5 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:11 +1030
Subject: sched: convert struct (sys_)sched_setaffinity() to cpumask_var_t.

Impact: stack usage reduction

Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves
space on the stack.  cpumask_var_t is just a struct cpumask for
!CONFIG_CPUMASK_OFFSTACK.

Note the removal of the initializer of new_mask: since the first thing
we did was "cpus_and(new_mask, new_mask, cpus_allowed)" I just changed
that to "cpumask_and(new_mask, in_mask, cpus_allowed);".

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 40 +++++++++++++++++++++++++++-------------
 1 file changed, 27 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f7dee2029e4..2d4ff91e0c9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5378,8 +5378,7 @@ out_unlock:
 
 long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
 {
-	cpumask_t cpus_allowed;
-	cpumask_t new_mask = *in_mask;
+	cpumask_var_t cpus_allowed, new_mask;
 	struct task_struct *p;
 	int retval;
 
@@ -5401,6 +5400,14 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
 	get_task_struct(p);
 	read_unlock(&tasklist_lock);
 
+	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto out_put_task;
+	}
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto out_free_cpus_allowed;
+	}
 	retval = -EPERM;
 	if ((current->euid != p->euid) && (current->euid != p->uid) &&
 			!capable(CAP_SYS_NICE))
@@ -5410,24 +5417,28 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
 	if (retval)
 		goto out_unlock;
 
-	cpuset_cpus_allowed(p, &cpus_allowed);
-	cpus_and(new_mask, new_mask, cpus_allowed);
+	cpuset_cpus_allowed(p, cpus_allowed);
+	cpumask_and(new_mask, in_mask, cpus_allowed);
  again:
-	retval = set_cpus_allowed_ptr(p, &new_mask);
+	retval = set_cpus_allowed_ptr(p, new_mask);
 
 	if (!retval) {
-		cpuset_cpus_allowed(p, &cpus_allowed);
-		if (!cpus_subset(new_mask, cpus_allowed)) {
+		cpuset_cpus_allowed(p, cpus_allowed);
+		if (!cpumask_subset(new_mask, cpus_allowed)) {
 			/*
 			 * We must have raced with a concurrent cpuset
 			 * update. Just reset the cpus_allowed to the
 			 * cpuset's cpus_allowed
 			 */
-			new_mask = cpus_allowed;
+			cpumask_copy(new_mask, cpus_allowed);
 			goto again;
 		}
 	}
 out_unlock:
+	free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+	free_cpumask_var(cpus_allowed);
+out_put_task:
 	put_task_struct(p);
 	put_online_cpus();
 	return retval;
@@ -5453,14 +5464,17 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
 				      unsigned long __user *user_mask_ptr)
 {
-	cpumask_t new_mask;
+	cpumask_var_t new_mask;
 	int retval;
 
-	retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
-	if (retval)
-		return retval;
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+		return -ENOMEM;
 
-	return sched_setaffinity(pid, &new_mask);
+	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+	if (retval == 0)
+		retval = sched_setaffinity(pid, new_mask);
+	free_cpumask_var(new_mask);
+	return retval;
 }
 
 long sched_getaffinity(pid_t pid, cpumask_t *mask)
-- 
cgit v1.2.3


From d5dd3db1dce73cdd5c45c5a3498c51bd21b8864b Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:12 +1030
Subject: sched: convert sched_domain_debug to cpumask_var_t.

Impact: stack usage reduction

Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves
stack space.  cpumask_var_t is just a struct cpumask for
!CONFIG_CPUMASK_OFFSTACK.

In this case, we always alloced, but we don't need to any more.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 2d4ff91e0c9..24012c2a889 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6706,7 +6706,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 
 static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
-	cpumask_t *groupmask;
+	cpumask_var_t groupmask;
 	int level = 0;
 
 	if (!sd) {
@@ -6716,8 +6716,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 
 	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
 
-	groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
-	if (!groupmask) {
+	if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
 		printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
 		return;
 	}
@@ -6730,7 +6729,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 		if (!sd)
 			break;
 	}
-	kfree(groupmask);
+	free_cpumask_var(groupmask);
 }
 #else /* !CONFIG_SCHED_DEBUG */
 # define sched_domain_debug(sd, cpu) do { } while (0)
-- 
cgit v1.2.3


From dcc30a35f71bcf51f1e9b336dc5e41923071509a Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:12 +1030
Subject: sched: convert cpu_isolated_map to cpumask_var_t.

Impact: stack usage reduction, (future) size reduction, cleanup

Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves
space for small nr_cpu_ids but big CONFIG_NR_CPUS.  cpumask_var_t
is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK.

We can also use cpulist_parse() instead of doing it manually in
isolated_cpu_setup.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 24012c2a889..526618fe4a7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6917,19 +6917,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 }
 
 /* cpus with isolated domains */
-static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_var_t cpu_isolated_map;
 
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
 {
-	static int __initdata ints[NR_CPUS];
-	int i;
-
-	str = get_options(str, ARRAY_SIZE(ints), ints);
-	cpus_clear(cpu_isolated_map);
-	for (i = 1; i <= ints[0]; i++)
-		if (ints[i] < NR_CPUS)
-			cpu_set(ints[i], cpu_isolated_map);
+	cpulist_parse(str, *cpu_isolated_map);
 	return 1;
 }
 
@@ -7727,7 +7720,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
 	doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
 	if (!doms_cur)
 		doms_cur = &fallback_doms;
-	cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
+	cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
 	dattr_cur = NULL;
 	err = build_sched_domains(doms_cur);
 	register_sched_domain_sysctl();
@@ -7826,7 +7819,7 @@ match1:
 	if (doms_new == NULL) {
 		ndoms_cur = 0;
 		doms_new = &fallback_doms;
-		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+		cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
 		WARN_ON_ONCE(dattr_new);
 	}
 
@@ -7985,7 +7978,9 @@ static int update_runtime(struct notifier_block *nfb,
 
 void __init sched_init_smp(void)
 {
-	cpumask_t non_isolated_cpus;
+	cpumask_var_t non_isolated_cpus;
+
+	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
 
 #if defined(CONFIG_NUMA)
 	sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -7994,10 +7989,10 @@ void __init sched_init_smp(void)
 #endif
 	get_online_cpus();
 	mutex_lock(&sched_domains_mutex);
-	arch_init_sched_domains(&cpu_online_map);
-	cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
-	if (cpus_empty(non_isolated_cpus))
-		cpu_set(smp_processor_id(), non_isolated_cpus);
+	arch_init_sched_domains(cpu_online_mask);
+	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+	if (cpumask_empty(non_isolated_cpus))
+		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
 	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
 
@@ -8012,9 +8007,10 @@ void __init sched_init_smp(void)
 	init_hrtick();
 
 	/* Move init over to a non-isolated CPU */
-	if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
+	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
 		BUG();
 	sched_init_granularity();
+	free_cpumask_var(non_isolated_cpus);
 }
 #else
 void __init sched_init_smp(void)
@@ -8334,6 +8330,7 @@ void __init sched_init(void)
 #ifdef CONFIG_NO_HZ
 	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
 #endif
+	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 
 	scheduler_running = 1;
 }
-- 
cgit v1.2.3


From 4212823fb459eacc8098dd420bb68ebb9917989d Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:12 +1030
Subject: sched: convert falback_doms to cpumask_var_t.

Impact: (future) size reduction for large NR_CPUS.

Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves
space for small nr_cpu_ids but big CONFIG_NR_CPUS.  cpumask_var_t
is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 526618fe4a7..42588ad93b2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7697,10 +7697,10 @@ static struct sched_domain_attr *dattr_cur;
 
 /*
  * Special case: If a kmalloc of a doms_cur partition (array of
- * cpumask_t) fails, then fallback to a single sched domain,
- * as determined by the single cpumask_t fallback_doms.
+ * cpumask) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask fallback_doms.
  */
-static cpumask_t fallback_doms;
+static cpumask_var_t fallback_doms;
 
 void __attribute__((weak)) arch_update_cpu_topology(void)
 {
@@ -7719,7 +7719,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
 	ndoms_cur = 1;
 	doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
 	if (!doms_cur)
-		doms_cur = &fallback_doms;
+		doms_cur = fallback_doms;
 	cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
 	dattr_cur = NULL;
 	err = build_sched_domains(doms_cur);
@@ -7818,7 +7818,7 @@ match1:
 
 	if (doms_new == NULL) {
 		ndoms_cur = 0;
-		doms_new = &fallback_doms;
+		doms_new = fallback_doms;
 		cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
 		WARN_ON_ONCE(dattr_new);
 	}
@@ -7838,7 +7838,7 @@ match2:
 	}
 
 	/* Remember the new sched domains */
-	if (doms_cur != &fallback_doms)
+	if (doms_cur != fallback_doms)
 		kfree(doms_cur);
 	kfree(dattr_cur);	/* kfree(NULL) is safe */
 	doms_cur = doms_new;
@@ -8011,6 +8011,8 @@ void __init sched_init_smp(void)
 		BUG();
 	sched_init_granularity();
 	free_cpumask_var(non_isolated_cpus);
+
+	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 }
 #else
 void __init sched_init_smp(void)
-- 
cgit v1.2.3


From 68e74568fbe5854952355e942acca51f138096d9 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:13 +1030
Subject: sched: convert struct cpupri_vec cpumask_var_t.

Impact: stack usage reduction, (future) size reduction for large NR_CPUS.

Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves
space for small nr_cpu_ids but big CONFIG_NR_CPUS.

The fact cpupro_init is called both before and after the slab is
available makes for an ugly parameter unfortunately.

We also use cpumask_any_and to get rid of a temporary in cpupri_find.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c        |  9 +++++++--
 kernel/sched_cpupri.c | 39 ++++++++++++++++++++++++++++-----------
 kernel/sched_cpupri.h |  5 +++--
 3 files changed, 38 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 42588ad93b2..94fa333c1e7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6792,6 +6792,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 
 static void free_rootdomain(struct root_domain *rd)
 {
+	cpupri_cleanup(&rd->cpupri);
+
 	free_cpumask_var(rd->rto_mask);
 	free_cpumask_var(rd->online);
 	free_cpumask_var(rd->span);
@@ -6834,7 +6836,7 @@ static int init_rootdomain(struct root_domain *rd, bool bootmem)
 		alloc_bootmem_cpumask_var(&def_root_domain.span);
 		alloc_bootmem_cpumask_var(&def_root_domain.online);
 		alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
-		cpupri_init(&rd->cpupri);
+		cpupri_init(&rd->cpupri, true);
 		return 0;
 	}
 
@@ -6845,9 +6847,12 @@ static int init_rootdomain(struct root_domain *rd, bool bootmem)
 	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
 		goto free_online;
 
-	cpupri_init(&rd->cpupri);
+	if (cpupri_init(&rd->cpupri, false) != 0)
+		goto free_rto_mask;
 	return 0;
 
+free_rto_mask:
+	free_cpumask_var(rd->rto_mask);
 free_online:
 	free_cpumask_var(rd->online);
 free_span:
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 52154fefab7..018b7be1db2 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -67,24 +67,21 @@ static int convert_prio(int prio)
  * Returns: (int)bool - CPUs were found
  */
 int cpupri_find(struct cpupri *cp, struct task_struct *p,
-		cpumask_t *lowest_mask)
+		struct cpumask *lowest_mask)
 {
 	int                  idx      = 0;
 	int                  task_pri = convert_prio(p->prio);
 
 	for_each_cpupri_active(cp->pri_active, idx) {
 		struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
-		cpumask_t mask;
 
 		if (idx >= task_pri)
 			break;
 
-		cpus_and(mask, p->cpus_allowed, vec->mask);
-
-		if (cpus_empty(mask))
+		if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
 			continue;
 
-		*lowest_mask = mask;
+		cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
 		return 1;
 	}
 
@@ -126,7 +123,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 		vec->count--;
 		if (!vec->count)
 			clear_bit(oldpri, cp->pri_active);
-		cpu_clear(cpu, vec->mask);
+		cpumask_clear_cpu(cpu, vec->mask);
 
 		spin_unlock_irqrestore(&vec->lock, flags);
 	}
@@ -136,7 +133,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 
 		spin_lock_irqsave(&vec->lock, flags);
 
-		cpu_set(cpu, vec->mask);
+		cpumask_set_cpu(cpu, vec->mask);
 		vec->count++;
 		if (vec->count == 1)
 			set_bit(newpri, cp->pri_active);
@@ -150,10 +147,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 /**
  * cpupri_init - initialize the cpupri structure
  * @cp: The cpupri context
+ * @bootmem: true if allocations need to use bootmem
  *
- * Returns: (void)
+ * Returns: -ENOMEM if memory fails.
  */
-void cpupri_init(struct cpupri *cp)
+int cpupri_init(struct cpupri *cp, bool bootmem)
 {
 	int i;
 
@@ -164,11 +162,30 @@ void cpupri_init(struct cpupri *cp)
 
 		spin_lock_init(&vec->lock);
 		vec->count = 0;
-		cpus_clear(vec->mask);
+		if (bootmem)
+			alloc_bootmem_cpumask_var(&vec->mask);
+		else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL))
+			goto cleanup;
 	}
 
 	for_each_possible_cpu(i)
 		cp->cpu_to_pri[i] = CPUPRI_INVALID;
+	return 0;
+
+cleanup:
+	for (i--; i >= 0; i--)
+		free_cpumask_var(cp->pri_to_cpu[i].mask);
+	return -ENOMEM;
 }
 
+/**
+ * cpupri_cleanup - clean up the cpupri structure
+ * @cp: The cpupri context
+ */
+void cpupri_cleanup(struct cpupri *cp)
+{
+	int i;
 
+	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
+		free_cpumask_var(cp->pri_to_cpu[i].mask);
+}
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index f25811b0f93..642a94ef8a0 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -14,7 +14,7 @@
 struct cpupri_vec {
 	spinlock_t lock;
 	int        count;
-	cpumask_t  mask;
+	cpumask_var_t mask;
 };
 
 struct cpupri {
@@ -27,7 +27,8 @@ struct cpupri {
 int  cpupri_find(struct cpupri *cp,
 		 struct task_struct *p, cpumask_t *lowest_mask);
 void cpupri_set(struct cpupri *cp, int cpu, int pri);
-void cpupri_init(struct cpupri *cp);
+int cpupri_init(struct cpupri *cp, bool bootmem);
+void cpupri_cleanup(struct cpupri *cp);
 #else
 #define cpupri_set(cp, cpu, pri) do { } while (0)
 #define cpupri_init() do { } while (0)
-- 
cgit v1.2.3


From 24600ce89a819a8f2fb4fd69fd777218a82ade20 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:13 +1030
Subject: sched: convert check_preempt_equal_prio to cpumask_var_t.

Impact: stack reduction for large NR_CPUS

Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves
stack space.

We simply return if the allocation fails: since we don't use it we
could just pass NULL to cpupri_find and have it handle that.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 820fc422c6d..1fa13624293 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -805,17 +805,20 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
 
 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 {
-	cpumask_t mask;
+	cpumask_var_t mask;
 
 	if (rq->curr->rt.nr_cpus_allowed == 1)
 		return;
 
-	if (p->rt.nr_cpus_allowed != 1
-	    && cpupri_find(&rq->rd->cpupri, p, &mask))
+	if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
 		return;
 
-	if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
-		return;
+	if (p->rt.nr_cpus_allowed != 1
+	    && cpupri_find(&rq->rd->cpupri, p, mask))
+		goto free;
+
+	if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
+		goto free;
 
 	/*
 	 * There appears to be other cpus that can accept
@@ -824,6 +827,8 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 	 */
 	requeue_task_rt(rq, p, 1);
 	resched_task(rq->curr);
+free:
+	free_cpumask_var(mask);
 }
 
 #endif /* CONFIG_SMP */
-- 
cgit v1.2.3


From 0e3900e6d3b04c44737ebc505604dcd8ed30e354 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:13 +1030
Subject: sched: convert local_cpu_mask to cpumask_var_t.

Impact: (future) size reduction for large NR_CPUS.

Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves
space for small nr_cpu_ids but big CONFIG_NR_CPUS.  cpumask_var_t
is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    |  1 +
 kernel/sched_rt.c | 13 +++++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 94fa333c1e7..f2be6187003 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8018,6 +8018,7 @@ void __init sched_init_smp(void)
 	free_cpumask_var(non_isolated_cpus);
 
 	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+	init_sched_rt_class();
 }
 #else
 void __init sched_init_smp(void)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 1fa13624293..1f0e99d1a8c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -962,7 +962,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 	return next;
 }
 
-static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
+static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
 
 static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
 {
@@ -982,7 +982,7 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
 static int find_lowest_rq(struct task_struct *task)
 {
 	struct sched_domain *sd;
-	cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
+	cpumask_t *lowest_mask = __get_cpu_var(local_cpu_mask);
 	int this_cpu = smp_processor_id();
 	int cpu      = task_cpu(task);
 
@@ -1551,3 +1551,12 @@ static void print_rt_stats(struct seq_file *m, int cpu)
 	rcu_read_unlock();
 }
 #endif /* CONFIG_SCHED_DEBUG */
+
+/* Note that this is never called for !SMP, but that's OK. */
+static inline void init_sched_rt_class(void)
+{
+	unsigned int i;
+
+	for_each_possible_cpu(i)
+		alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL);
+}
-- 
cgit v1.2.3


From 96f874e26428ab5d2db681c100210c254775e154 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:14 +1030
Subject: sched: convert remaining old-style cpumask operators

Impact: Trivial API conversion

  NR_CPUS -> nr_cpu_ids
  cpumask_t -> struct cpumask
  sizeof(cpumask_t) -> cpumask_size()
  cpumask_a = cpumask_b -> cpumask_copy(&cpumask_a, &cpumask_b)

  cpu_set() -> cpumask_set_cpu()
  first_cpu() -> cpumask_first()
  cpumask_of_cpu() -> cpumask_of()
  cpus_* -> cpumask_*

There are some FIXMEs where we all archs to complete infrastructure
(patches have been sent):

  cpu_coregroup_map -> cpu_coregroup_mask
  node_to_cpumask* -> cpumask_of_node

There is also one FIXME where we pass an array of cpumasks to
partition_sched_domains(): this implies knowing the definition of
'struct cpumask' and the size of a cpumask.  This will be fixed in a
future patch.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 212 ++++++++++++++++++++++++++++------------------------
 kernel/sched_fair.c |   4 +-
 kernel/sched_rt.c   |  18 ++---
 3 files changed, 124 insertions(+), 110 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f2be6187003..eba6a156d33 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2829,7 +2829,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
 	struct rq *rq;
 
 	rq = task_rq_lock(p, &flags);
-	if (!cpu_isset(dest_cpu, p->cpus_allowed)
+	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
 	    || unlikely(!cpu_active(dest_cpu)))
 		goto out;
 
@@ -2895,7 +2895,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 	 * 3) are cache-hot on their current CPU.
 	 */
-	if (!cpu_isset(this_cpu, p->cpus_allowed)) {
+	if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
 		schedstat_inc(p, se.nr_failed_migrations_affine);
 		return 0;
 	}
@@ -3070,7 +3070,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
 		   unsigned long *imbalance, enum cpu_idle_type idle,
-		   int *sd_idle, const cpumask_t *cpus, int *balance)
+		   int *sd_idle, const struct cpumask *cpus, int *balance)
 {
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -3387,7 +3387,7 @@ ret:
  */
 static struct rq *
 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
-		   unsigned long imbalance, const cpumask_t *cpus)
+		   unsigned long imbalance, const struct cpumask *cpus)
 {
 	struct rq *busiest = NULL, *rq;
 	unsigned long max_load = 0;
@@ -3396,7 +3396,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 	for_each_cpu(i, sched_group_cpus(group)) {
 		unsigned long wl;
 
-		if (!cpu_isset(i, *cpus))
+		if (!cpumask_test_cpu(i, cpus))
 			continue;
 
 		rq = cpu_rq(i);
@@ -3426,7 +3426,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
  */
 static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
-			int *balance, cpumask_t *cpus)
+			int *balance, struct cpumask *cpus)
 {
 	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
 	struct sched_group *group;
@@ -3434,7 +3434,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	struct rq *busiest;
 	unsigned long flags;
 
-	cpus_setall(*cpus);
+	cpumask_setall(cpus);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -3494,8 +3494,8 @@ redo:
 
 		/* All tasks on this runqueue were pinned by CPU affinity */
 		if (unlikely(all_pinned)) {
-			cpu_clear(cpu_of(busiest), *cpus);
-			if (!cpus_empty(*cpus))
+			cpumask_clear_cpu(cpu_of(busiest), cpus);
+			if (!cpumask_empty(cpus))
 				goto redo;
 			goto out_balanced;
 		}
@@ -3512,7 +3512,8 @@ redo:
 			/* don't kick the migration_thread, if the curr
 			 * task on busiest cpu can't be moved to this_cpu
 			 */
-			if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+			if (!cpumask_test_cpu(this_cpu,
+					      &busiest->curr->cpus_allowed)) {
 				spin_unlock_irqrestore(&busiest->lock, flags);
 				all_pinned = 1;
 				goto out_one_pinned;
@@ -3587,7 +3588,7 @@ out:
  */
 static int
 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
-			cpumask_t *cpus)
+			struct cpumask *cpus)
 {
 	struct sched_group *group;
 	struct rq *busiest = NULL;
@@ -3596,7 +3597,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
 	int sd_idle = 0;
 	int all_pinned = 0;
 
-	cpus_setall(*cpus);
+	cpumask_setall(cpus);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -3640,8 +3641,8 @@ redo:
 		double_unlock_balance(this_rq, busiest);
 
 		if (unlikely(all_pinned)) {
-			cpu_clear(cpu_of(busiest), *cpus);
-			if (!cpus_empty(*cpus))
+			cpumask_clear_cpu(cpu_of(busiest), cpus);
+			if (!cpumask_empty(cpus))
 				goto redo;
 		}
 	}
@@ -5376,7 +5377,7 @@ out_unlock:
 	return retval;
 }
 
-long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
 	cpumask_var_t cpus_allowed, new_mask;
 	struct task_struct *p;
@@ -5445,13 +5446,13 @@ out_put_task:
 }
 
 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
-			     cpumask_t *new_mask)
+			     struct cpumask *new_mask)
 {
-	if (len < sizeof(cpumask_t)) {
-		memset(new_mask, 0, sizeof(cpumask_t));
-	} else if (len > sizeof(cpumask_t)) {
-		len = sizeof(cpumask_t);
-	}
+	if (len < cpumask_size())
+		cpumask_clear(new_mask);
+	else if (len > cpumask_size())
+		len = cpumask_size();
+
 	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
 }
 
@@ -5477,7 +5478,7 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
 	return retval;
 }
 
-long sched_getaffinity(pid_t pid, cpumask_t *mask)
+long sched_getaffinity(pid_t pid, struct cpumask *mask)
 {
 	struct task_struct *p;
 	int retval;
@@ -5494,7 +5495,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
 	if (retval)
 		goto out_unlock;
 
-	cpus_and(*mask, p->cpus_allowed, cpu_online_map);
+	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
 
 out_unlock:
 	read_unlock(&tasklist_lock);
@@ -5872,7 +5873,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	idle->se.exec_start = sched_clock();
 
 	idle->prio = idle->normal_prio = MAX_PRIO;
-	idle->cpus_allowed = cpumask_of_cpu(cpu);
+	cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
 	__set_task_cpu(idle, cpu);
 
 	rq->curr = rq->idle = idle;
@@ -5956,7 +5957,7 @@ static inline void sched_init_granularity(void)
  * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  */
-int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
 	struct migration_req req;
 	unsigned long flags;
@@ -5964,13 +5965,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
 	int ret = 0;
 
 	rq = task_rq_lock(p, &flags);
-	if (!cpus_intersects(*new_mask, cpu_online_map)) {
+	if (!cpumask_intersects(new_mask, cpu_online_mask)) {
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
-		     !cpus_equal(p->cpus_allowed, *new_mask))) {
+		     !cpumask_equal(&p->cpus_allowed, new_mask))) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -5978,12 +5979,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
 	if (p->sched_class->set_cpus_allowed)
 		p->sched_class->set_cpus_allowed(p, new_mask);
 	else {
-		p->cpus_allowed = *new_mask;
-		p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
+		cpumask_copy(&p->cpus_allowed, new_mask);
+		p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
 	}
 
 	/* Can the task run on the task's current CPU? If so, we're done */
-	if (cpu_isset(task_cpu(p), *new_mask))
+	if (cpumask_test_cpu(task_cpu(p), new_mask))
 		goto out;
 
 	if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
@@ -6028,7 +6029,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	if (task_cpu(p) != src_cpu)
 		goto done;
 	/* Affinity changed (again). */
-	if (!cpu_isset(dest_cpu, p->cpus_allowed))
+	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
 		goto fail;
 
 	on_rq = p->se.on_rq;
@@ -6629,13 +6630,13 @@ early_initcall(migration_init);
 #ifdef CONFIG_SCHED_DEBUG
 
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
-				  cpumask_t *groupmask)
+				  struct cpumask *groupmask)
 {
 	struct sched_group *group = sd->groups;
 	char str[256];
 
 	cpulist_scnprintf(str, sizeof(str), *sched_domain_span(sd));
-	cpus_clear(*groupmask);
+	cpumask_clear(groupmask);
 
 	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
 
@@ -6936,24 +6937,25 @@ __setup("isolcpus=", isolated_cpu_setup);
 /*
  * init_sched_build_groups takes the cpumask we wish to span, and a pointer
  * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
- * (due to the fact that we keep track of groups covered with a cpumask_t).
+ * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
+ * (due to the fact that we keep track of groups covered with a struct cpumask).
  *
  * init_sched_build_groups will build a circular linked list of the groups
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
 static void
-init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
-			int (*group_fn)(int cpu, const cpumask_t *cpu_map,
+init_sched_build_groups(const struct cpumask *span,
+			const struct cpumask *cpu_map,
+			int (*group_fn)(int cpu, const struct cpumask *cpu_map,
 					struct sched_group **sg,
-					cpumask_t *tmpmask),
-			cpumask_t *covered, cpumask_t *tmpmask)
+					struct cpumask *tmpmask),
+			struct cpumask *covered, struct cpumask *tmpmask)
 {
 	struct sched_group *first = NULL, *last = NULL;
 	int i;
 
-	cpus_clear(*covered);
+	cpumask_clear(covered);
 
 	for_each_cpu(i, span) {
 		struct sched_group *sg;
@@ -6970,7 +6972,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
 			if (group_fn(j, cpu_map, NULL, tmpmask) != group)
 				continue;
 
-			cpu_set(j, *covered);
+			cpumask_set_cpu(j, covered);
 			cpumask_set_cpu(j, sched_group_cpus(sg));
 		}
 		if (!first)
@@ -7035,9 +7037,10 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
  * should be one that prevents unnecessary balancing, but also spreads tasks
  * out optimally.
  */
-static void sched_domain_node_span(int node, cpumask_t *span)
+static void sched_domain_node_span(int node, struct cpumask *span)
 {
 	nodemask_t used_nodes;
+	/* FIXME: use cpumask_of_node() */
 	node_to_cpumask_ptr(nodemask, node);
 	int i;
 
@@ -7081,8 +7084,8 @@ static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
 
 static int
-cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-		 cpumask_t *unused)
+cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
+		 struct sched_group **sg, struct cpumask *unused)
 {
 	if (sg)
 		*sg = &per_cpu(sched_group_cpus, cpu).sg;
@@ -7100,22 +7103,21 @@ static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
 
 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
 static int
-cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-		  cpumask_t *mask)
+cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+		  struct sched_group **sg, struct cpumask *mask)
 {
 	int group;
 
-	*mask = per_cpu(cpu_sibling_map, cpu);
-	cpus_and(*mask, *mask, *cpu_map);
-	group = first_cpu(*mask);
+	cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+	group = cpumask_first(mask);
 	if (sg)
 		*sg = &per_cpu(sched_group_core, group).sg;
 	return group;
 }
 #elif defined(CONFIG_SCHED_MC)
 static int
-cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-		  cpumask_t *unused)
+cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+		  struct sched_group **sg, struct cpumask *unused)
 {
 	if (sg)
 		*sg = &per_cpu(sched_group_core, cpu).sg;
@@ -7127,18 +7129,18 @@ static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
 
 static int
-cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-		  cpumask_t *mask)
+cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
+		  struct sched_group **sg, struct cpumask *mask)
 {
 	int group;
 #ifdef CONFIG_SCHED_MC
+	/* FIXME: Use cpu_coregroup_mask. */
 	*mask = cpu_coregroup_map(cpu);
 	cpus_and(*mask, *mask, *cpu_map);
-	group = first_cpu(*mask);
+	group = cpumask_first(mask);
 #elif defined(CONFIG_SCHED_SMT)
-	*mask = per_cpu(cpu_sibling_map, cpu);
-	cpus_and(*mask, *mask, *cpu_map);
-	group = first_cpu(*mask);
+	cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+	group = cpumask_first(mask);
 #else
 	group = cpu;
 #endif
@@ -7159,14 +7161,16 @@ static struct sched_group ***sched_group_nodes_bycpu;
 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
 
-static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
-				 struct sched_group **sg, cpumask_t *nodemask)
+static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
+				 struct sched_group **sg,
+				 struct cpumask *nodemask)
 {
 	int group;
+	/* FIXME: use cpumask_of_node */
 	node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));
 
-	cpus_and(*nodemask, *pnodemask, *cpu_map);
-	group = first_cpu(*nodemask);
+	cpumask_and(nodemask, pnodemask, cpu_map);
+	group = cpumask_first(nodemask);
 
 	if (sg)
 		*sg = &per_cpu(sched_group_allnodes, group).sg;
@@ -7202,7 +7206,8 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 
 #ifdef CONFIG_NUMA
 /* Free memory allocated for various sched_group structures */
-static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+static void free_sched_groups(const struct cpumask *cpu_map,
+			      struct cpumask *nodemask)
 {
 	int cpu, i;
 
@@ -7215,10 +7220,11 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 
 		for (i = 0; i < nr_node_ids; i++) {
 			struct sched_group *oldsg, *sg = sched_group_nodes[i];
+			/* FIXME: Use cpumask_of_node */
 			node_to_cpumask_ptr(pnodemask, i);
 
 			cpus_and(*nodemask, *pnodemask, *cpu_map);
-			if (cpus_empty(*nodemask))
+			if (cpumask_empty(nodemask))
 				continue;
 
 			if (sg == NULL)
@@ -7236,7 +7242,8 @@ next_sg:
 	}
 }
 #else /* !CONFIG_NUMA */
-static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+static void free_sched_groups(const struct cpumask *cpu_map,
+			      struct cpumask *nodemask)
 {
 }
 #endif /* CONFIG_NUMA */
@@ -7366,7 +7373,7 @@ static void set_domain_attribute(struct sched_domain *sd,
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
-static int __build_sched_domains(const cpumask_t *cpu_map,
+static int __build_sched_domains(const struct cpumask *cpu_map,
 				 struct sched_domain_attr *attr)
 {
 	int i, err = -ENOMEM;
@@ -7416,7 +7423,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	}
 
 #ifdef CONFIG_NUMA
-	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+	sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
 #endif
 
 	/*
@@ -7425,12 +7432,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	for_each_cpu(i, cpu_map) {
 		struct sched_domain *sd = NULL, *p;
 
+		/* FIXME: use cpumask_of_node */
 		*nodemask = node_to_cpumask(cpu_to_node(i));
 		cpus_and(*nodemask, *nodemask, *cpu_map);
 
 #ifdef CONFIG_NUMA
-		if (cpus_weight(*cpu_map) >
-				SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
+		if (cpumask_weight(cpu_map) >
+				SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
 			sd = &per_cpu(allnodes_domains, i);
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
@@ -7491,9 +7499,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #ifdef CONFIG_SCHED_SMT
 	/* Set up CPU (sibling) groups */
 	for_each_cpu(i, cpu_map) {
-		*this_sibling_map = per_cpu(cpu_sibling_map, i);
-		cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
-		if (i != first_cpu(*this_sibling_map))
+		cpumask_and(this_sibling_map,
+			    &per_cpu(cpu_sibling_map, i), cpu_map);
+		if (i != cpumask_first(this_sibling_map))
 			continue;
 
 		init_sched_build_groups(this_sibling_map, cpu_map,
@@ -7505,9 +7513,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #ifdef CONFIG_SCHED_MC
 	/* Set up multi-core groups */
 	for_each_cpu(i, cpu_map) {
+		/* FIXME: Use cpu_coregroup_mask */
 		*this_core_map = cpu_coregroup_map(i);
 		cpus_and(*this_core_map, *this_core_map, *cpu_map);
-		if (i != first_cpu(*this_core_map))
+		if (i != cpumask_first(this_core_map))
 			continue;
 
 		init_sched_build_groups(this_core_map, cpu_map,
@@ -7518,9 +7527,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 	/* Set up physical groups */
 	for (i = 0; i < nr_node_ids; i++) {
+		/* FIXME: Use cpumask_of_node */
 		*nodemask = node_to_cpumask(i);
 		cpus_and(*nodemask, *nodemask, *cpu_map);
-		if (cpus_empty(*nodemask))
+		if (cpumask_empty(nodemask))
 			continue;
 
 		init_sched_build_groups(nodemask, cpu_map,
@@ -7541,17 +7551,18 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		struct sched_group *sg, *prev;
 		int j;
 
+		/* FIXME: Use cpumask_of_node */
 		*nodemask = node_to_cpumask(i);
-		cpus_clear(*covered);
+		cpumask_clear(covered);
 
 		cpus_and(*nodemask, *nodemask, *cpu_map);
-		if (cpus_empty(*nodemask)) {
+		if (cpumask_empty(nodemask)) {
 			sched_group_nodes[i] = NULL;
 			continue;
 		}
 
 		sched_domain_node_span(i, domainspan);
-		cpus_and(*domainspan, *domainspan, *cpu_map);
+		cpumask_and(domainspan, domainspan, cpu_map);
 
 		sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
 				  GFP_KERNEL, i);
@@ -7570,21 +7581,22 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		sg->__cpu_power = 0;
 		cpumask_copy(sched_group_cpus(sg), nodemask);
 		sg->next = sg;
-		cpus_or(*covered, *covered, *nodemask);
+		cpumask_or(covered, covered, nodemask);
 		prev = sg;
 
 		for (j = 0; j < nr_node_ids; j++) {
 			int n = (i + j) % nr_node_ids;
+			/* FIXME: Use cpumask_of_node */
 			node_to_cpumask_ptr(pnodemask, n);
 
-			cpus_complement(*notcovered, *covered);
-			cpus_and(*tmpmask, *notcovered, *cpu_map);
-			cpus_and(*tmpmask, *tmpmask, *domainspan);
-			if (cpus_empty(*tmpmask))
+			cpumask_complement(notcovered, covered);
+			cpumask_and(tmpmask, notcovered, cpu_map);
+			cpumask_and(tmpmask, tmpmask, domainspan);
+			if (cpumask_empty(tmpmask))
 				break;
 
-			cpus_and(*tmpmask, *tmpmask, *pnodemask);
-			if (cpus_empty(*tmpmask))
+			cpumask_and(tmpmask, tmpmask, pnodemask);
+			if (cpumask_empty(tmpmask))
 				continue;
 
 			sg = kmalloc_node(sizeof(struct sched_group) +
@@ -7598,7 +7610,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			sg->__cpu_power = 0;
 			cpumask_copy(sched_group_cpus(sg), tmpmask);
 			sg->next = prev->next;
-			cpus_or(*covered, *covered, *tmpmask);
+			cpumask_or(covered, covered, tmpmask);
 			prev->next = sg;
 			prev = sg;
 		}
@@ -7634,7 +7646,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	if (sd_allnodes) {
 		struct sched_group *sg;
 
-		cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
+		cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
 								tmpmask);
 		init_numa_sched_groups_power(sg);
 	}
@@ -7690,12 +7702,12 @@ error:
 #endif
 }
 
-static int build_sched_domains(const cpumask_t *cpu_map)
+static int build_sched_domains(const struct cpumask *cpu_map)
 {
 	return __build_sched_domains(cpu_map, NULL);
 }
 
-static cpumask_t *doms_cur;	/* current sched domains */
+static struct cpumask *doms_cur;	/* current sched domains */
 static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
 static struct sched_domain_attr *dattr_cur;
 				/* attribues of custom domains in 'doms_cur' */
@@ -7716,13 +7728,13 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
  * For now this just excludes isolated cpus, but could be used to
  * exclude other special cases in the future.
  */
-static int arch_init_sched_domains(const cpumask_t *cpu_map)
+static int arch_init_sched_domains(const struct cpumask *cpu_map)
 {
 	int err;
 
 	arch_update_cpu_topology();
 	ndoms_cur = 1;
-	doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+	doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
 	if (!doms_cur)
 		doms_cur = fallback_doms;
 	cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
@@ -7733,8 +7745,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
 	return err;
 }
 
-static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
-				       cpumask_t *tmpmask)
+static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
+				       struct cpumask *tmpmask)
 {
 	free_sched_groups(cpu_map, tmpmask);
 }
@@ -7743,15 +7755,16 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
  * Detach sched domains from a group of cpus specified in cpu_map
  * These cpus will now be attached to the NULL domain
  */
-static void detach_destroy_domains(const cpumask_t *cpu_map)
+static void detach_destroy_domains(const struct cpumask *cpu_map)
 {
-	cpumask_t tmpmask;
+	/* Save because hotplug lock held. */
+	static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
 	int i;
 
 	for_each_cpu(i, cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
 	synchronize_sched();
-	arch_destroy_sched_domains(cpu_map, &tmpmask);
+	arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
 }
 
 /* handle null as "default" */
@@ -7776,7 +7789,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * doms_new[] to the current sched domain partitioning, doms_cur[].
  * It destroys each deleted domain and builds each new domain.
  *
- * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
+ * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
  * The masks don't intersect (don't overlap.) We should setup one
  * sched domain for each mask. CPUs not in any of the cpumasks will
  * not be load balanced. If the same cpumask appears both in the
@@ -7790,13 +7803,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * the single partition 'fallback_doms', it also forces the domains
  * to be rebuilt.
  *
- * If doms_new == NULL it will be replaced with cpu_online_map.
+ * If doms_new == NULL it will be replaced with cpu_online_mask.
  * ndoms_new == 0 is a special case for destroying existing domains,
  * and it will not create the default domain.
  *
  * Call with hotplug lock held
  */
-void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+/* FIXME: Change to struct cpumask *doms_new[] */
+void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
 			     struct sched_domain_attr *dattr_new)
 {
 	int i, j, n;
@@ -7811,7 +7825,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
 		for (j = 0; j < n; j++) {
-			if (cpus_equal(doms_cur[i], doms_new[j])
+			if (cpumask_equal(&doms_cur[i], &doms_new[j])
 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
 		}
@@ -7831,7 +7845,7 @@ match1:
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < ndoms_cur; j++) {
-			if (cpus_equal(doms_new[i], doms_cur[j])
+			if (cpumask_equal(&doms_new[i], &doms_cur[j])
 			    && dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
 		}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bba00402ed9..08ffffd4a41 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1017,7 +1017,7 @@ static void yield_task_fair(struct rq *rq)
  * search starts with cpus closest then further out as needed,
  * so we always favor a closer, idle cpu.
  * Domains may include CPUs that are not usable for migration,
- * hence we need to mask them out (cpu_active_map)
+ * hence we need to mask them out (cpu_active_mask)
  *
  * Returns the CPU we should wake onto.
  */
@@ -1244,7 +1244,7 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 		}
 	}
 
-	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+	if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
 		goto out;
 
 	/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 1f0e99d1a8c..fb3964579a8 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -923,7 +923,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
+	    (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
 	    (p->rt.nr_cpus_allowed > 1))
 		return 1;
 	return 0;
@@ -982,7 +982,7 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
 static int find_lowest_rq(struct task_struct *task)
 {
 	struct sched_domain *sd;
-	cpumask_t *lowest_mask = __get_cpu_var(local_cpu_mask);
+	struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
 	int this_cpu = smp_processor_id();
 	int cpu      = task_cpu(task);
 
@@ -997,7 +997,7 @@ static int find_lowest_rq(struct task_struct *task)
 	 * I guess we might want to change cpupri_find() to ignore those
 	 * in the first place.
 	 */
-	cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
+	cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
 
 	/*
 	 * At this point we have built a mask of cpus representing the
@@ -1007,7 +1007,7 @@ static int find_lowest_rq(struct task_struct *task)
 	 * We prioritize the last cpu that the task executed on since
 	 * it is most likely cache-hot in that location.
 	 */
-	if (cpu_isset(cpu, *lowest_mask))
+	if (cpumask_test_cpu(cpu, lowest_mask))
 		return cpu;
 
 	/*
@@ -1064,8 +1064,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 			 * Also make sure that it wasn't scheduled on its rq.
 			 */
 			if (unlikely(task_rq(task) != rq ||
-				     !cpu_isset(lowest_rq->cpu,
-						task->cpus_allowed) ||
+				     !cpumask_test_cpu(lowest_rq->cpu,
+						       &task->cpus_allowed) ||
 				     task_running(rq, task) ||
 				     !task->se.on_rq)) {
 
@@ -1315,9 +1315,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 }
 
 static void set_cpus_allowed_rt(struct task_struct *p,
-				const cpumask_t *new_mask)
+				const struct cpumask *new_mask)
 {
-	int weight = cpus_weight(*new_mask);
+	int weight = cpumask_weight(new_mask);
 
 	BUG_ON(!rt_task(p));
 
@@ -1338,7 +1338,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 		update_rt_migration(rq);
 	}
 
-	p->cpus_allowed    = *new_mask;
+	cpumask_copy(&p->cpus_allowed, new_mask);
 	p->rt.nr_cpus_allowed = weight;
 }
 
-- 
cgit v1.2.3


From 1acdac104668a0834cfa267de9946fac7764d486 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 20 Nov 2008 10:02:53 -0800
Subject: futex: make clock selectable for FUTEX_WAIT_BITSET

FUTEX_WAIT_BITSET could be used instead of FUTEX_WAIT by setting the
bit set to FUTEX_BITSET_MATCH_ANY, but FUTEX_WAIT uses CLOCK_REALTIME
while FUTEX_WAIT_BITSET uses CLOCK_MONOTONIC.

Add a flag to select CLOCK_REALTIME for FUTEX_WAIT_BITSET so glibc can
replace the FUTEX_WAIT logic which needs to do gettimeofday() calls
before and after the syscall to convert the absolute timeout to a
relative timeout for FUTEX_WAIT.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Ulrich Drepper <drepper@redhat.com>
---
 kernel/futex.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index e10c5c8786a..ba0d3b83c09 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1142,12 +1142,13 @@ handle_fault:
  * In case we must use restart_block to restart a futex_wait,
  * we encode in the 'flags' shared capability
  */
-#define FLAGS_SHARED  1
+#define FLAGS_SHARED		0x01
+#define FLAGS_CLOCKRT		0x02
 
 static long futex_wait_restart(struct restart_block *restart);
 
 static int futex_wait(u32 __user *uaddr, int fshared,
-		      u32 val, ktime_t *abs_time, u32 bitset)
+		      u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
 {
 	struct task_struct *curr = current;
 	DECLARE_WAITQUEUE(wait, curr);
@@ -1233,8 +1234,10 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 			slack = current->timer_slack_ns;
 			if (rt_task(current))
 				slack = 0;
-			hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC,
-						HRTIMER_MODE_ABS);
+			hrtimer_init_on_stack(&t.timer,
+					      clockrt ? CLOCK_REALTIME :
+					      CLOCK_MONOTONIC,
+					      HRTIMER_MODE_ABS);
 			hrtimer_init_sleeper(&t, current);
 			hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
 
@@ -1289,6 +1292,8 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 
 		if (fshared)
 			restart->futex.flags |= FLAGS_SHARED;
+		if (clockrt)
+			restart->futex.flags |= FLAGS_CLOCKRT;
 		return -ERESTART_RESTARTBLOCK;
 	}
 
@@ -1312,7 +1317,8 @@ static long futex_wait_restart(struct restart_block *restart)
 	if (restart->futex.flags & FLAGS_SHARED)
 		fshared = 1;
 	return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
-				restart->futex.bitset);
+				restart->futex.bitset,
+				restart->futex.flags & FLAGS_CLOCKRT);
 }
 
 
@@ -1905,18 +1911,22 @@ void exit_robust_list(struct task_struct *curr)
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		u32 __user *uaddr2, u32 val2, u32 val3)
 {
-	int ret = -ENOSYS;
+	int clockrt, ret = -ENOSYS;
 	int cmd = op & FUTEX_CMD_MASK;
 	int fshared = 0;
 
 	if (!(op & FUTEX_PRIVATE_FLAG))
 		fshared = 1;
 
+	clockrt = op & FUTEX_CLOCK_REALTIME;
+	if (clockrt && cmd != FUTEX_WAIT_BITSET)
+		return -ENOSYS;
+
 	switch (cmd) {
 	case FUTEX_WAIT:
 		val3 = FUTEX_BITSET_MATCH_ANY;
 	case FUTEX_WAIT_BITSET:
-		ret = futex_wait(uaddr, fshared, val, timeout, val3);
+		ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt);
 		break;
 	case FUTEX_WAKE:
 		val3 = FUTEX_BITSET_MATCH_ANY;
-- 
cgit v1.2.3


From 18b6e0414e42d95183f07d8177e3ff0241abd825 Mon Sep 17 00:00:00 2001
From: Serge Hallyn <serue@us.ibm.com>
Date: Wed, 15 Oct 2008 16:38:45 -0500
Subject: User namespaces: set of cleanups (v2)

The user_ns is moved from nsproxy to user_struct, so that a struct
cred by itself is sufficient to determine access (which it otherwise
would not be).  Corresponding ecryptfs fixes (by David Howells) are
here as well.

Fix refcounting.  The following rules now apply:
        1. The task pins the user struct.
        2. The user struct pins its user namespace.
        3. The user namespace pins the struct user which created it.

User namespaces are cloned during copy_creds().  Unsharing a new user_ns
is no longer possible.  (We could re-add that, but it'll cause code
duplication and doesn't seem useful if PAM doesn't need to clone user
namespaces).

When a user namespace is created, its first user (uid 0) gets empty
keyrings and a clean group_info.

This incorporates a previous patch by David Howells.  Here
is his original patch description:

>I suggest adding the attached incremental patch.  It makes the following
>changes:
>
> (1) Provides a current_user_ns() macro to wrap accesses to current's user
>     namespace.
>
> (2) Fixes eCryptFS.
>
> (3) Renames create_new_userns() to create_user_ns() to be more consistent
>     with the other associated functions and because the 'new' in the name is
>     superfluous.
>
> (4) Moves the argument and permission checks made for CLONE_NEWUSER to the
>     beginning of do_fork() so that they're done prior to making any attempts
>     at allocation.
>
> (5) Calls create_user_ns() after prepare_creds(), and gives it the new creds
>     to fill in rather than have it return the new root user.  I don't imagine
>     the new root user being used for anything other than filling in a cred
>     struct.
>
>     This also permits me to get rid of a get_uid() and a free_uid(), as the
>     reference the creds were holding on the old user_struct can just be
>     transferred to the new namespace's creator pointer.
>
> (6) Makes create_user_ns() reset the UIDs and GIDs of the creds under
>     preparation rather than doing it in copy_creds().
>
>David

>Signed-off-by: David Howells <dhowells@redhat.com>

Changelog:
	Oct 20: integrate dhowells comments
		1. leave thread_keyring alone
		2. use current_user_ns() in set_user()

Signed-off-by: Serge Hallyn <serue@us.ibm.com>
---
 kernel/cred.c           | 15 ++++++++--
 kernel/fork.c           | 19 +++++++++++--
 kernel/nsproxy.c        | 15 ++--------
 kernel/sys.c            |  4 +--
 kernel/user.c           | 47 +++++++++----------------------
 kernel/user_namespace.c | 75 ++++++++++++++++++++-----------------------------
 6 files changed, 76 insertions(+), 99 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index 13697ca2bb3..ff7bc071991 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -274,6 +274,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	struct thread_group_cred *tgcred;
 #endif
 	struct cred *new;
+	int ret;
 
 	mutex_init(&p->cred_exec_mutex);
 
@@ -293,6 +294,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	if (!new)
 		return -ENOMEM;
 
+	if (clone_flags & CLONE_NEWUSER) {
+		ret = create_user_ns(new);
+		if (ret < 0)
+			goto error_put;
+	}
+
 #ifdef CONFIG_KEYS
 	/* new threads get their own thread keyrings if their parent already
 	 * had one */
@@ -309,8 +316,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	if (!(clone_flags & CLONE_THREAD)) {
 		tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
 		if (!tgcred) {
-			put_cred(new);
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto error_put;
 		}
 		atomic_set(&tgcred->usage, 1);
 		spin_lock_init(&tgcred->lock);
@@ -325,6 +332,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	atomic_inc(&new->user->processes);
 	p->cred = p->real_cred = get_cred(new);
 	return 0;
+
+error_put:
+	put_cred(new);
+	return ret;
 }
 
 /**
diff --git a/kernel/fork.c b/kernel/fork.c
index 29c18c14812..1dd89451fae 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -976,7 +976,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (atomic_read(&p->real_cred->user->processes) >=
 			p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
 		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
-		    p->real_cred->user != current->nsproxy->user_ns->root_user)
+		    p->real_cred->user != INIT_USER)
 			goto bad_fork_free;
 	}
 
@@ -1334,6 +1334,20 @@ long do_fork(unsigned long clone_flags,
 	int trace = 0;
 	long nr;
 
+	/*
+	 * Do some preliminary argument and permissions checking before we
+	 * actually start allocating stuff
+	 */
+	if (clone_flags & CLONE_NEWUSER) {
+		if (clone_flags & CLONE_THREAD)
+			return -EINVAL;
+		/* hopefully this check will go away when userns support is
+		 * complete
+		 */
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+	}
+
 	/*
 	 * We hope to recycle these flags after 2.6.26
 	 */
@@ -1581,8 +1595,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 	err = -EINVAL;
 	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
 				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
-				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER|
-				CLONE_NEWNET))
+				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
 		goto bad_unshare_out;
 
 	/*
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 1d3ef29a258..63598dca2d0 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -80,12 +80,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 		goto out_pid;
 	}
 
-	new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns);
-	if (IS_ERR(new_nsp->user_ns)) {
-		err = PTR_ERR(new_nsp->user_ns);
-		goto out_user;
-	}
-
 	new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns);
 	if (IS_ERR(new_nsp->net_ns)) {
 		err = PTR_ERR(new_nsp->net_ns);
@@ -95,9 +89,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 	return new_nsp;
 
 out_net:
-	if (new_nsp->user_ns)
-		put_user_ns(new_nsp->user_ns);
-out_user:
 	if (new_nsp->pid_ns)
 		put_pid_ns(new_nsp->pid_ns);
 out_pid:
@@ -130,7 +121,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 	get_nsproxy(old_ns);
 
 	if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
-				CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET)))
+				CLONE_NEWPID | CLONE_NEWNET)))
 		return 0;
 
 	if (!capable(CAP_SYS_ADMIN)) {
@@ -173,8 +164,6 @@ void free_nsproxy(struct nsproxy *ns)
 		put_ipc_ns(ns->ipc_ns);
 	if (ns->pid_ns)
 		put_pid_ns(ns->pid_ns);
-	if (ns->user_ns)
-		put_user_ns(ns->user_ns);
 	put_net(ns->net_ns);
 	kmem_cache_free(nsproxy_cachep, ns);
 }
@@ -189,7 +178,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 	int err = 0;
 
 	if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
-			       CLONE_NEWUSER | CLONE_NEWNET)))
+			       CLONE_NEWNET)))
 		return 0;
 
 	if (!capable(CAP_SYS_ADMIN))
diff --git a/kernel/sys.c b/kernel/sys.c
index ab735040468..ebe65c2c987 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -565,13 +565,13 @@ static int set_user(struct cred *new)
 {
 	struct user_struct *new_user;
 
-	new_user = alloc_uid(current->nsproxy->user_ns, new->uid);
+	new_user = alloc_uid(current_user_ns(), new->uid);
 	if (!new_user)
 		return -EAGAIN;
 
 	if (atomic_read(&new_user->processes) >=
 				current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
-			new_user != current->nsproxy->user_ns->root_user) {
+			new_user != INIT_USER) {
 		free_uid(new_user);
 		return -EAGAIN;
 	}
diff --git a/kernel/user.c b/kernel/user.c
index d476307dd4b..c0ef3a46443 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -20,9 +20,9 @@
 
 struct user_namespace init_user_ns = {
 	.kref = {
-		.refcount	= ATOMIC_INIT(2),
+		.refcount	= ATOMIC_INIT(1),
 	},
-	.root_user = &root_user,
+	.creator = &root_user,
 };
 EXPORT_SYMBOL_GPL(init_user_ns);
 
@@ -48,12 +48,14 @@ static struct kmem_cache *uid_cachep;
  */
 static DEFINE_SPINLOCK(uidhash_lock);
 
+/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */
 struct user_struct root_user = {
-	.__count	= ATOMIC_INIT(1),
+	.__count	= ATOMIC_INIT(2),
 	.processes	= ATOMIC_INIT(1),
 	.files		= ATOMIC_INIT(0),
 	.sigpending	= ATOMIC_INIT(0),
 	.locked_shm     = 0,
+	.user_ns	= &init_user_ns,
 #ifdef CONFIG_USER_SCHED
 	.tg		= &init_task_group,
 #endif
@@ -314,12 +316,13 @@ done:
  * IRQ state (as stored in flags) is restored and uidhash_lock released
  * upon function exit.
  */
-static inline void free_user(struct user_struct *up, unsigned long flags)
+static void free_user(struct user_struct *up, unsigned long flags)
 {
 	/* restore back the count */
 	atomic_inc(&up->__count);
 	spin_unlock_irqrestore(&uidhash_lock, flags);
 
+	put_user_ns(up->user_ns);
 	INIT_WORK(&up->work, remove_user_sysfs_dir);
 	schedule_work(&up->work);
 }
@@ -335,13 +338,14 @@ static inline void uids_mutex_unlock(void) { }
  * IRQ state (as stored in flags) is restored and uidhash_lock released
  * upon function exit.
  */
-static inline void free_user(struct user_struct *up, unsigned long flags)
+static void free_user(struct user_struct *up, unsigned long flags)
 {
 	uid_hash_remove(up);
 	spin_unlock_irqrestore(&uidhash_lock, flags);
 	sched_destroy_user(up);
 	key_put(up->uid_keyring);
 	key_put(up->session_keyring);
+	put_user_ns(up->user_ns);
 	kmem_cache_free(uid_cachep, up);
 }
 
@@ -357,7 +361,7 @@ struct user_struct *find_user(uid_t uid)
 {
 	struct user_struct *ret;
 	unsigned long flags;
-	struct user_namespace *ns = current->nsproxy->user_ns;
+	struct user_namespace *ns = current_user()->user_ns;
 
 	spin_lock_irqsave(&uidhash_lock, flags);
 	ret = uid_hash_find(uid, uidhashentry(ns, uid));
@@ -404,6 +408,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 		if (sched_create_user(new) < 0)
 			goto out_free_user;
 
+		new->user_ns = get_user_ns(ns);
+
 		if (uids_user_create(new))
 			goto out_destoy_sched;
 
@@ -427,7 +433,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 			up = new;
 		}
 		spin_unlock_irq(&uidhash_lock);
-
 	}
 
 	uids_mutex_unlock();
@@ -436,6 +441,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 
 out_destoy_sched:
 	sched_destroy_user(new);
+	put_user_ns(new->user_ns);
 out_free_user:
 	kmem_cache_free(uid_cachep, new);
 out_unlock:
@@ -443,33 +449,6 @@ out_unlock:
 	return NULL;
 }
 
-#ifdef CONFIG_USER_NS
-void release_uids(struct user_namespace *ns)
-{
-	int i;
-	unsigned long flags;
-	struct hlist_head *head;
-	struct hlist_node *nd;
-
-	spin_lock_irqsave(&uidhash_lock, flags);
-	/*
-	 * collapse the chains so that the user_struct-s will
-	 * be still alive, but not in hashes. subsequent free_uid()
-	 * will free them.
-	 */
-	for (i = 0; i < UIDHASH_SZ; i++) {
-		head = ns->uidhash_table + i;
-		while (!hlist_empty(head)) {
-			nd = head->first;
-			hlist_del_init(nd);
-		}
-	}
-	spin_unlock_irqrestore(&uidhash_lock, flags);
-
-	free_uid(ns->root_user);
-}
-#endif
-
 static int __init uid_cache_init(void)
 {
 	int n;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 0d9c51d6733..79084311ee5 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,70 +9,55 @@
 #include <linux/nsproxy.h>
 #include <linux/slab.h>
 #include <linux/user_namespace.h>
+#include <linux/cred.h>
 
 /*
- * Clone a new ns copying an original user ns, setting refcount to 1
- * @old_ns: namespace to clone
- * Return NULL on error (failure to kmalloc), new ns otherwise
+ * Create a new user namespace, deriving the creator from the user in the
+ * passed credentials, and replacing that user with the new root user for the
+ * new namespace.
+ *
+ * This is called by copy_creds(), which will finish setting the target task's
+ * credentials.
  */
-static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
+int create_user_ns(struct cred *new)
 {
 	struct user_namespace *ns;
-	struct user_struct *new_user;
-	struct cred *new;
+	struct user_struct *root_user;
 	int n;
 
 	ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
 	if (!ns)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
 	kref_init(&ns->kref);
 
 	for (n = 0; n < UIDHASH_SZ; ++n)
 		INIT_HLIST_HEAD(ns->uidhash_table + n);
 
-	/* Insert new root user.  */
-	ns->root_user = alloc_uid(ns, 0);
-	if (!ns->root_user) {
+	/* Alloc new root user.  */
+	root_user = alloc_uid(ns, 0);
+	if (!root_user) {
 		kfree(ns);
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	}
 
-	/* Reset current->user with a new one */
-	new_user = alloc_uid(ns, current_uid());
-	if (!new_user) {
-		free_uid(ns->root_user);
-		kfree(ns);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	/* Install the new user */
-	new = prepare_creds();
-	if (!new) {
-		free_uid(new_user);
-		free_uid(ns->root_user);
-		kfree(ns);
-	}
-	free_uid(new->user);
-	new->user = new_user;
-	commit_creds(new);
-	return ns;
-}
-
-struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns)
-{
-	struct user_namespace *new_ns;
-
-	BUG_ON(!old_ns);
-	get_user_ns(old_ns);
-
-	if (!(flags & CLONE_NEWUSER))
-		return old_ns;
+	/* set the new root user in the credentials under preparation */
+	ns->creator = new->user;
+	new->user = root_user;
+	new->uid = new->euid = new->suid = new->fsuid = 0;
+	new->gid = new->egid = new->sgid = new->fsgid = 0;
+	put_group_info(new->group_info);
+	new->group_info = get_group_info(&init_groups);
+#ifdef CONFIG_KEYS
+	key_put(new->request_key_auth);
+	new->request_key_auth = NULL;
+#endif
+	/* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
 
-	new_ns = clone_user_ns(old_ns);
+	/* alloc_uid() incremented the userns refcount.  Just set it to 1 */
+	kref_set(&ns->kref, 1);
 
-	put_user_ns(old_ns);
-	return new_ns;
+	return 0;
 }
 
 void free_user_ns(struct kref *kref)
@@ -80,7 +65,7 @@ void free_user_ns(struct kref *kref)
 	struct user_namespace *ns;
 
 	ns = container_of(kref, struct user_namespace, kref);
-	release_uids(ns);
+	free_uid(ns->creator);
 	kfree(ns);
 }
 EXPORT_SYMBOL(free_user_ns);
-- 
cgit v1.2.3


From 6ded6ab9be4f6164aef1c527407c1b94f0929799 Mon Sep 17 00:00:00 2001
From: Serge Hallyn <serue@us.ibm.com>
Date: Mon, 24 Nov 2008 16:24:10 -0500
Subject: User namespaces: use the current_user_ns() macro

Fix up the last current_user()->user_ns instance to use
current_user_ns().

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
---
 kernel/user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index c0ef3a46443..97202cb29ad 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -361,7 +361,7 @@ struct user_struct *find_user(uid_t uid)
 {
 	struct user_struct *ret;
 	unsigned long flags;
-	struct user_namespace *ns = current_user()->user_ns;
+	struct user_namespace *ns = current_user_ns();
 
 	spin_lock_irqsave(&uidhash_lock, flags);
 	ret = uid_hash_find(uid, uidhashentry(ns, uid));
-- 
cgit v1.2.3


From 7807fafa52b990abb321f1212416c71e64523ecb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 25 Nov 2008 08:44:24 +0100
Subject: lockdep: fix unused function warning in kernel/lockdep.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: fix build warning

this warning:

  kernel/lockdep.c:584: warning: ‘print_lock_dependencies’ defined but not used

triggers because print_lock_dependencies() is only used if both
CONFIG_TRACE_IRQFLAGS and CONFIG_PROVE_LOCKING are enabled.

But adding #ifdefs is not an option here - it would spread out to 4-5
other helper functions and uglify the file. So mark this function
as __used - it's static and the compiler can eliminate it just fine.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index a4285830323..c137953420e 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -580,7 +580,8 @@ static void print_lock_class_header(struct lock_class *class, int depth)
 /*
  * printk all lock dependencies starting at <entry>:
  */
-static void print_lock_dependencies(struct lock_class *class, int depth)
+static void __used
+print_lock_dependencies(struct lock_class *class, int depth)
 {
 	struct lock_list *entry;
 
-- 
cgit v1.2.3


From ca109491f612aab5c8152207631c0444f63da97f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 25 Nov 2008 12:43:51 +0100
Subject: hrtimer: removing all ur callback modes

Impact: cleanup, move all hrtimer processing into hardirq context

This is an attempt at removing some of the hrtimer complexity by
reducing the number of callback modes to 1.

This means that all hrtimer callback functions will be ran from HARD-irq
context.

I went through all the 30 odd hrtimer callback functions in the kernel
and saw only one that I'm not quite sure of, which is the one in
net/can/bcm.c - hence I'm CC-ing the folks responsible for that code.

Furthermore, the hrtimer core now calls callbacks directly with IRQs
disabled in case you try to enqueue an expired timer. If this timer is a
periodic timer (which should use hrtimer_forward() to advance its time)
then it might be possible to end up in an inf. recursive loop due to the
fact that hrtimer_forward() doesn't round up to the next timer
granularity, and therefore keeps on calling the callback - obviously
this needs a fix.

Aside from that, this seems to compile and actually boot on my dual core
test box - although I'm sure there are some bugs in, me not hitting any
makes me certain :-)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hrtimer.c             | 280 +++++--------------------------------------
 kernel/sched.c               |   2 -
 kernel/time/ntp.c            |   4 +-
 kernel/time/tick-sched.c     |   1 -
 kernel/trace/trace_sysprof.c |   1 -
 5 files changed, 33 insertions(+), 255 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 47e63349d1b..efd6f41e1c1 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -442,22 +442,6 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
 static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
 #endif
 
-/*
- * Check, whether the timer is on the callback pending list
- */
-static inline int hrtimer_cb_pending(const struct hrtimer *timer)
-{
-	return timer->state & HRTIMER_STATE_PENDING;
-}
-
-/*
- * Remove a timer from the callback pending list
- */
-static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
-{
-	list_del_init(&timer->cb_entry);
-}
-
 /* High resolution timer related functions */
 #ifdef CONFIG_HIGH_RES_TIMERS
 
@@ -651,6 +635,8 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
 {
 }
 
+static void __run_hrtimer(struct hrtimer *timer);
+
 /*
  * When High resolution timers are active, try to reprogram. Note, that in case
  * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
@@ -661,31 +647,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 					    struct hrtimer_clock_base *base)
 {
 	if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
-
-		/* Timer is expired, act upon the callback mode */
-		switch(timer->cb_mode) {
-		case HRTIMER_CB_IRQSAFE_PERCPU:
-		case HRTIMER_CB_IRQSAFE_UNLOCKED:
-			/*
-			 * This is solely for the sched tick emulation with
-			 * dynamic tick support to ensure that we do not
-			 * restart the tick right on the edge and end up with
-			 * the tick timer in the softirq ! The calling site
-			 * takes care of this. Also used for hrtimer sleeper !
-			 */
-			debug_hrtimer_deactivate(timer);
-			return 1;
-		case HRTIMER_CB_SOFTIRQ:
-			/*
-			 * Move everything else into the softirq pending list !
-			 */
-			list_add_tail(&timer->cb_entry,
-				      &base->cpu_base->cb_pending);
-			timer->state = HRTIMER_STATE_PENDING;
-			return 1;
-		default:
-			BUG();
-		}
+		/*
+		 * XXX: recursion check?
+		 * hrtimer_forward() should round up with timer granularity
+		 * so that we never get into inf recursion here,
+		 * it doesn't do that though
+		 */
+		__run_hrtimer(timer);
+		return 1;
 	}
 	return 0;
 }
@@ -724,11 +693,6 @@ static int hrtimer_switch_to_hres(void)
 	return 1;
 }
 
-static inline void hrtimer_raise_softirq(void)
-{
-	raise_softirq(HRTIMER_SOFTIRQ);
-}
-
 #else
 
 static inline int hrtimer_hres_active(void) { return 0; }
@@ -747,7 +711,6 @@ static inline int hrtimer_reprogram(struct hrtimer *timer,
 {
 	return 0;
 }
-static inline void hrtimer_raise_softirq(void) { }
 
 #endif /* CONFIG_HIGH_RES_TIMERS */
 
@@ -890,10 +853,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
 			     struct hrtimer_clock_base *base,
 			     unsigned long newstate, int reprogram)
 {
-	/* High res. callback list. NOP for !HIGHRES */
-	if (hrtimer_cb_pending(timer))
-		hrtimer_remove_cb_pending(timer);
-	else {
+	if (timer->state & HRTIMER_STATE_ENQUEUED) {
 		/*
 		 * Remove the timer from the rbtree and replace the
 		 * first entry pointer if necessary.
@@ -953,7 +913,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
 {
 	struct hrtimer_clock_base *base, *new_base;
 	unsigned long flags;
-	int ret, raise;
+	int ret;
 
 	base = lock_hrtimer_base(timer, &flags);
 
@@ -988,26 +948,8 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
 	enqueue_hrtimer(timer, new_base,
 			new_base->cpu_base == &__get_cpu_var(hrtimer_bases));
 
-	/*
-	 * The timer may be expired and moved to the cb_pending
-	 * list. We can not raise the softirq with base lock held due
-	 * to a possible deadlock with runqueue lock.
-	 */
-	raise = timer->state == HRTIMER_STATE_PENDING;
-
-	/*
-	 * We use preempt_disable to prevent this task from migrating after
-	 * setting up the softirq and raising it. Otherwise, if me migrate
-	 * we will raise the softirq on the wrong CPU.
-	 */
-	preempt_disable();
-
 	unlock_hrtimer_base(timer, &flags);
 
-	if (raise)
-		hrtimer_raise_softirq();
-	preempt_enable();
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
@@ -1192,75 +1134,6 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
 }
 EXPORT_SYMBOL_GPL(hrtimer_get_res);
 
-static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
-{
-	spin_lock_irq(&cpu_base->lock);
-
-	while (!list_empty(&cpu_base->cb_pending)) {
-		enum hrtimer_restart (*fn)(struct hrtimer *);
-		struct hrtimer *timer;
-		int restart;
-		int emulate_hardirq_ctx = 0;
-
-		timer = list_entry(cpu_base->cb_pending.next,
-				   struct hrtimer, cb_entry);
-
-		debug_hrtimer_deactivate(timer);
-		timer_stats_account_hrtimer(timer);
-
-		fn = timer->function;
-		/*
-		 * A timer might have been added to the cb_pending list
-		 * when it was migrated during a cpu-offline operation.
-		 * Emulate hardirq context for such timers.
-		 */
-		if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
-		    timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED)
-			emulate_hardirq_ctx = 1;
-
-		__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
-		spin_unlock_irq(&cpu_base->lock);
-
-		if (unlikely(emulate_hardirq_ctx)) {
-			local_irq_disable();
-			restart = fn(timer);
-			local_irq_enable();
-		} else
-			restart = fn(timer);
-
-		spin_lock_irq(&cpu_base->lock);
-
-		timer->state &= ~HRTIMER_STATE_CALLBACK;
-		if (restart == HRTIMER_RESTART) {
-			BUG_ON(hrtimer_active(timer));
-			/*
-			 * Enqueue the timer, allow reprogramming of the event
-			 * device
-			 */
-			enqueue_hrtimer(timer, timer->base, 1);
-		} else if (hrtimer_active(timer)) {
-			/*
-			 * If the timer was rearmed on another CPU, reprogram
-			 * the event device.
-			 */
-			struct hrtimer_clock_base *base = timer->base;
-
-			if (base->first == &timer->node &&
-			    hrtimer_reprogram(timer, base)) {
-				/*
-				 * Timer is expired. Thus move it from tree to
-				 * pending list again.
-				 */
-				__remove_hrtimer(timer, base,
-						 HRTIMER_STATE_PENDING, 0);
-				list_add_tail(&timer->cb_entry,
-					      &base->cpu_base->cb_pending);
-			}
-		}
-	}
-	spin_unlock_irq(&cpu_base->lock);
-}
-
 static void __run_hrtimer(struct hrtimer *timer)
 {
 	struct hrtimer_clock_base *base = timer->base;
@@ -1268,25 +1141,21 @@ static void __run_hrtimer(struct hrtimer *timer)
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	int restart;
 
+	WARN_ON(!irqs_disabled());
+
 	debug_hrtimer_deactivate(timer);
 	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
 	timer_stats_account_hrtimer(timer);
-
 	fn = timer->function;
-	if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
-	    timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
-		/*
-		 * Used for scheduler timers, avoid lock inversion with
-		 * rq->lock and tasklist_lock.
-		 *
-		 * These timers are required to deal with enqueue expiry
-		 * themselves and are not allowed to migrate.
-		 */
-		spin_unlock(&cpu_base->lock);
-		restart = fn(timer);
-		spin_lock(&cpu_base->lock);
-	} else
-		restart = fn(timer);
+
+	/*
+	 * Because we run timers from hardirq context, there is no chance
+	 * they get migrated to another cpu, therefore its safe to unlock
+	 * the timer base.
+	 */
+	spin_unlock(&cpu_base->lock);
+	restart = fn(timer);
+	spin_lock(&cpu_base->lock);
 
 	/*
 	 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
@@ -1311,7 +1180,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
 	struct hrtimer_clock_base *base;
 	ktime_t expires_next, now;
-	int i, raise = 0;
+	int i;
 
 	BUG_ON(!cpu_base->hres_active);
 	cpu_base->nr_events++;
@@ -1360,16 +1229,6 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 				break;
 			}
 
-			/* Move softirq callbacks to the pending list */
-			if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
-				__remove_hrtimer(timer, base,
-						 HRTIMER_STATE_PENDING, 0);
-				list_add_tail(&timer->cb_entry,
-					      &base->cpu_base->cb_pending);
-				raise = 1;
-				continue;
-			}
-
 			__run_hrtimer(timer);
 		}
 		spin_unlock(&cpu_base->lock);
@@ -1383,10 +1242,6 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 		if (tick_program_event(expires_next, 0))
 			goto retry;
 	}
-
-	/* Raise softirq ? */
-	if (raise)
-		raise_softirq(HRTIMER_SOFTIRQ);
 }
 
 /**
@@ -1413,11 +1268,6 @@ void hrtimer_peek_ahead_timers(void)
 	local_irq_restore(flags);
 }
 
-static void run_hrtimer_softirq(struct softirq_action *h)
-{
-	run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
-}
-
 #endif	/* CONFIG_HIGH_RES_TIMERS */
 
 /*
@@ -1429,8 +1279,6 @@ static void run_hrtimer_softirq(struct softirq_action *h)
  */
 void hrtimer_run_pending(void)
 {
-	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
-
 	if (hrtimer_hres_active())
 		return;
 
@@ -1444,8 +1292,6 @@ void hrtimer_run_pending(void)
 	 */
 	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
 		hrtimer_switch_to_hres();
-
-	run_hrtimer_pending(cpu_base);
 }
 
 /*
@@ -1482,14 +1328,6 @@ void hrtimer_run_queues(void)
 					hrtimer_get_expires_tv64(timer))
 				break;
 
-			if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
-				__remove_hrtimer(timer, base,
-					HRTIMER_STATE_PENDING, 0);
-				list_add_tail(&timer->cb_entry,
-					&base->cpu_base->cb_pending);
-				continue;
-			}
-
 			__run_hrtimer(timer);
 		}
 		spin_unlock(&cpu_base->lock);
@@ -1516,9 +1354,6 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
 {
 	sl->timer.function = hrtimer_wakeup;
 	sl->task = task;
-#ifdef CONFIG_HIGH_RES_TIMERS
-	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
-#endif
 }
 
 static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
@@ -1655,36 +1490,22 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
 		cpu_base->clock_base[i].cpu_base = cpu_base;
 
-	INIT_LIST_HEAD(&cpu_base->cb_pending);
 	hrtimer_init_hres(cpu_base);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
 
-static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
-				struct hrtimer_clock_base *new_base, int dcpu)
+static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
+				 struct hrtimer_clock_base *new_base, int dcpu)
 {
 	struct hrtimer *timer;
 	struct rb_node *node;
-	int raise = 0;
 
 	while ((node = rb_first(&old_base->active))) {
 		timer = rb_entry(node, struct hrtimer, node);
 		BUG_ON(hrtimer_callback_running(timer));
 		debug_hrtimer_deactivate(timer);
 
-		/*
-		 * Should not happen. Per CPU timers should be
-		 * canceled _before_ the migration code is called
-		 */
-		if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
-			__remove_hrtimer(timer, old_base,
-					 HRTIMER_STATE_INACTIVE, 0);
-			WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
-			     timer, timer->function, dcpu);
-			continue;
-		}
-
 		/*
 		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
 		 * timer could be seen as !active and just vanish away
@@ -1708,48 +1529,19 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		 * otherwise we end up with a stale timer.
 		 */
 		if (timer->state == HRTIMER_STATE_MIGRATE) {
-			timer->state = HRTIMER_STATE_PENDING;
-			list_add_tail(&timer->cb_entry,
-				      &new_base->cpu_base->cb_pending);
-			raise = 1;
+			/* XXX: running on offline cpu */
+			__run_hrtimer(timer);
 		}
 #endif
 		/* Clear the migration state bit */
 		timer->state &= ~HRTIMER_STATE_MIGRATE;
 	}
-	return raise;
 }
 
-#ifdef CONFIG_HIGH_RES_TIMERS
-static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
-				   struct hrtimer_cpu_base *new_base)
-{
-	struct hrtimer *timer;
-	int raise = 0;
-
-	while (!list_empty(&old_base->cb_pending)) {
-		timer = list_entry(old_base->cb_pending.next,
-				   struct hrtimer, cb_entry);
-
-		__remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
-		timer->base = &new_base->clock_base[timer->base->index];
-		list_add_tail(&timer->cb_entry, &new_base->cb_pending);
-		raise = 1;
-	}
-	return raise;
-}
-#else
-static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
-				   struct hrtimer_cpu_base *new_base)
-{
-	return 0;
-}
-#endif
-
 static void migrate_hrtimers(int cpu)
 {
 	struct hrtimer_cpu_base *old_base, *new_base;
-	int i, raise = 0;
+	int i;
 
 	BUG_ON(cpu_online(cpu));
 	old_base = &per_cpu(hrtimer_bases, cpu);
@@ -1764,20 +1556,13 @@ static void migrate_hrtimers(int cpu)
 	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
-		if (migrate_hrtimer_list(&old_base->clock_base[i],
-					 &new_base->clock_base[i], cpu))
-			raise = 1;
+		migrate_hrtimer_list(&old_base->clock_base[i],
+				     &new_base->clock_base[i], cpu);
 	}
 
-	if (migrate_hrtimer_pending(old_base, new_base))
-		raise = 1;
-
 	spin_unlock(&old_base->lock);
 	spin_unlock_irq(&new_base->lock);
 	put_cpu_var(hrtimer_bases);
-
-	if (raise)
-		hrtimer_raise_softirq();
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
@@ -1817,9 +1602,6 @@ void __init hrtimers_init(void)
 	hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
 			  (void *)(long)smp_processor_id());
 	register_cpu_notifier(&hrtimers_nb);
-#ifdef CONFIG_HIGH_RES_TIMERS
-	open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
-#endif
 }
 
 /**
diff --git a/kernel/sched.c b/kernel/sched.c
index 9b1e79371c2..5ac5e953616 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -203,7 +203,6 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 	hrtimer_init(&rt_b->rt_period_timer,
 			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rt_b->rt_period_timer.function = sched_rt_period_timer;
-	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
 }
 
 static inline int rt_bandwidth_enabled(void)
@@ -1139,7 +1138,6 @@ static void init_rq_hrtick(struct rq *rq)
 
 	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rq->hrtick_timer.function = hrtick;
-	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 }
 #else	/* CONFIG_SCHED_HRTICK */
 static inline void hrtick_clear(struct rq *rq)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8ff15e5d486..f5f793d9241 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -131,7 +131,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
 {
 	enum hrtimer_restart res = HRTIMER_NORESTART;
 
-	write_seqlock_irq(&xtime_lock);
+	write_seqlock(&xtime_lock);
 
 	switch (time_state) {
 	case TIME_OK:
@@ -164,7 +164,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
 	}
 	update_vsyscall(&xtime, clock);
 
-	write_sequnlock_irq(&xtime_lock);
+	write_sequnlock(&xtime_lock);
 
 	return res;
 }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 342fc9ccab4..502a81e2639 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -681,7 +681,6 @@ void tick_setup_sched_timer(void)
 	 */
 	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 	ts->sched_timer.function = tick_sched_timer;
-	ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 
 	/* Get the next period (per cpu) */
 	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 9587d3bcba5..ae542e2e38d 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -202,7 +202,6 @@ static void start_stack_timer(int cpu)
 
 	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hrtimer->function = stack_trace_timer_fn;
-	hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 
 	hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
 }
-- 
cgit v1.2.3


From 8bba1bf5e2434c83f2fe8b1422604ace9bbe4cb8 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Tue, 25 Nov 2008 09:12:31 +0100
Subject: x86, ftrace: call trace->open() before stopping tracing; add
 trace->print_header()

Add a callback to allow an ftrace plug-in to write its own header.

Move the call to trace->open() up a few lines.

The changes are required by the BTS ftrace plug-in.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 11 +++++++----
 kernel/trace/trace.h |  1 +
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a45b59e53fb..8df8fdd69c9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2298,7 +2298,9 @@ static int s_show(struct seq_file *m, void *v)
 			seq_printf(m, "# tracer: %s\n", iter->trace->name);
 			seq_puts(m, "#\n");
 		}
-		if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+		if (iter->trace && iter->trace->print_header)
+			iter->trace->print_header(m);
+		else if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
 			/* print nothing if the buffers are empty */
 			if (trace_empty(iter))
 				return 0;
@@ -2350,6 +2352,10 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 	iter->trace = current_trace;
 	iter->pos = -1;
 
+	/* Notify the tracer early; before we stop tracing. */
+	if (iter->trace && iter->trace->open)
+			iter->trace->open(iter);
+
 	/* Annotate start of buffers if we had overruns */
 	if (ring_buffer_overruns(iter->tr->buffer))
 		iter->iter_flags |= TRACE_FILE_ANNOTATE;
@@ -2375,9 +2381,6 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 	/* stop the trace while dumping */
 	tracing_stop();
 
-	if (iter->trace && iter->trace->open)
-			iter->trace->open(iter);
-
 	mutex_unlock(&trace_types_lock);
 
  out:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 28c15c2ebc2..717f9f045c6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -311,6 +311,7 @@ struct tracer {
 	int			(*selftest)(struct tracer *trace,
 					    struct trace_array *tr);
 #endif
+	void			(*print_header)(struct seq_file *m);
 	enum print_line_t	(*print_line)(struct trace_iterator *iter);
 	/* If you handled the flag setting, return 0 */
 	int			(*set_flag)(u32 old_flags, u32 bit, int set);
-- 
cgit v1.2.3


From 1e9b51c28312f7334394aa30be56ff52c2b65b7e Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Tue, 25 Nov 2008 09:24:15 +0100
Subject: x86, bts, ftrace: a BTS ftrace plug-in prototype

Impact: add new ftrace plugin

A prototype for a BTS ftrace plug-in.

The tracer collects branch trace in a cyclic buffer for each cpu.

The tracer is not configurable and the trace for each snapshot is
appended when doing cat /debug/tracing/trace.

This is a proof of concept that will be extended with future patches
to become a (hopefully) useful tool.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig     |  11 ++
 kernel/trace/Makefile    |   1 +
 kernel/trace/trace.h     |  12 +++
 kernel/trace/trace_bts.c | 276 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 300 insertions(+)
 create mode 100644 kernel/trace/trace_bts.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 9cbf7761f49..620feadff67 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -28,6 +28,9 @@ config HAVE_DYNAMIC_FTRACE
 config HAVE_FTRACE_MCOUNT_RECORD
 	bool
 
+config HAVE_HW_BRANCH_TRACER
+	bool
+
 config TRACER_MAX_TRACE
 	bool
 
@@ -233,6 +236,14 @@ config STACK_TRACER
 
 	  Say N if unsure.
 
+config BTS_TRACER
+	depends on HAVE_HW_BRANCH_TRACER
+	bool "Trace branches"
+	select TRACING
+	help
+	  This tracer records all branches on the system in a circular
+	  buffer giving access to the last N branches for each cpu.
+
 config DYNAMIC_FTRACE
 	bool "enable/disable ftrace tracepoints dynamically"
 	depends on FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 1a8c9259dc6..cef4bcb4e82 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -31,5 +31,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
+obj-$(CONFIG_BTS_TRACER) += trace_bts.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 717f9f045c6..3abd645e8af 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -27,6 +27,7 @@ enum trace_type {
 	TRACE_BOOT_RET,
 	TRACE_FN_RET,
 	TRACE_USER_STACK,
+	TRACE_BTS,
 
 	__TRACE_LAST_TYPE
 };
@@ -153,6 +154,12 @@ struct trace_branch {
 	char			correct;
 };
 
+struct bts_entry {
+	struct trace_entry	ent;
+	unsigned long		from;
+	unsigned long		to;
+};
+
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
@@ -258,6 +265,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
 		IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
 		IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\
+		IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\
 		__ftrace_bad_type();					\
 	} while (0)
 
@@ -392,6 +400,10 @@ void trace_function(struct trace_array *tr,
 void
 trace_function_return(struct ftrace_retfunc *trace);
 
+void trace_bts(struct trace_array *tr,
+	       unsigned long from,
+	       unsigned long to);
+
 void tracing_start_cmdline_record(void);
 void tracing_stop_cmdline_record(void);
 void tracing_sched_switch_assign_trace(struct trace_array *tr);
diff --git a/kernel/trace/trace_bts.c b/kernel/trace/trace_bts.c
new file mode 100644
index 00000000000..23b76e4690e
--- /dev/null
+++ b/kernel/trace/trace_bts.c
@@ -0,0 +1,276 @@
+/*
+ * BTS tracer
+ *
+ * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/kallsyms.h>
+
+#include <asm/ds.h>
+
+#include "trace.h"
+
+
+#define SIZEOF_BTS (1 << 13)
+
+static DEFINE_PER_CPU(struct bts_tracer *, tracer);
+static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
+
+#define this_tracer per_cpu(tracer, smp_processor_id())
+#define this_buffer per_cpu(buffer, smp_processor_id())
+
+
+/*
+ * Information to interpret a BTS record.
+ * This will go into an in-kernel BTS interface.
+ */
+static unsigned char sizeof_field;
+static unsigned long debugctl_mask;
+
+#define sizeof_bts (3 * sizeof_field)
+
+static void bts_trace_cpuinit(struct cpuinfo_x86 *c)
+{
+	switch (c->x86) {
+	case 0x6:
+		switch (c->x86_model) {
+		case 0x0 ... 0xC:
+			break;
+		case 0xD:
+		case 0xE: /* Pentium M */
+			sizeof_field = sizeof(long);
+			debugctl_mask = (1<<6)|(1<<7);
+			break;
+		default:
+			sizeof_field = 8;
+			debugctl_mask = (1<<6)|(1<<7);
+			break;
+		}
+		break;
+	case 0xF:
+		switch (c->x86_model) {
+		case 0x0:
+		case 0x1:
+		case 0x2: /* Netburst */
+			sizeof_field = sizeof(long);
+			debugctl_mask = (1<<2)|(1<<3);
+			break;
+		default:
+			/* sorry, don't know about them */
+			break;
+		}
+		break;
+	default:
+		/* sorry, don't know about them */
+		break;
+	}
+}
+
+static inline void bts_enable(void)
+{
+	unsigned long debugctl;
+
+	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | debugctl_mask);
+}
+
+static inline void bts_disable(void)
+{
+	unsigned long debugctl;
+
+	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl & ~debugctl_mask);
+}
+
+static void bts_trace_reset(struct trace_array *tr)
+{
+	int cpu;
+
+	tr->time_start = ftrace_now(tr->cpu);
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr, cpu);
+}
+
+static void bts_trace_start_cpu(void *arg)
+{
+	this_tracer =
+		ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
+			       /* ovfl = */ NULL, /* th = */ (size_t)-1);
+	if (IS_ERR(this_tracer)) {
+		this_tracer = NULL;
+		return;
+	}
+
+	bts_enable();
+}
+
+static void bts_trace_start(struct trace_array *tr)
+{
+	int cpu;
+
+	bts_trace_reset(tr);
+
+	for_each_cpu_mask(cpu, cpu_possible_map)
+		smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
+}
+
+static void bts_trace_stop_cpu(void *arg)
+{
+	if (this_tracer) {
+		bts_disable();
+
+		ds_release_bts(this_tracer);
+		this_tracer = NULL;
+	}
+}
+
+static void bts_trace_stop(struct trace_array *tr)
+{
+	int cpu;
+
+	for_each_cpu_mask(cpu, cpu_possible_map)
+		smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
+}
+
+static int bts_trace_init(struct trace_array *tr)
+{
+	bts_trace_cpuinit(&boot_cpu_data);
+	bts_trace_reset(tr);
+	bts_trace_start(tr);
+
+	return 0;
+}
+
+static void bts_trace_print_header(struct seq_file *m)
+{
+#ifdef __i386__
+	seq_puts(m, "# CPU#    FROM           TO     FUNCTION\n");
+	seq_puts(m, "#  |       |             |         |\n");
+#else
+	seq_puts(m,
+		 "# CPU#        FROM                   TO         FUNCTION\n");
+	seq_puts(m,
+		 "#  |           |                     |             |\n");
+#endif
+}
+
+static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *seq = &iter->seq;
+	struct bts_entry *it;
+
+	trace_assign_type(it, entry);
+
+	if (entry->type == TRACE_BTS) {
+		int ret;
+#ifdef CONFIG_KALLSYMS
+		char function[KSYM_SYMBOL_LEN];
+		sprint_symbol(function, it->from);
+#else
+		char *function = "<unknown>";
+#endif
+
+		ret = trace_seq_printf(seq, "%4d  0x%lx -> 0x%lx [%s]\n",
+				       entry->cpu, it->from, it->to, function);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;;
+		return TRACE_TYPE_HANDLED;
+	}
+	return TRACE_TYPE_UNHANDLED;
+}
+
+void trace_bts(struct trace_array *tr, unsigned long from, unsigned long to)
+{
+	struct ring_buffer_event *event;
+	struct bts_entry *entry;
+	unsigned long irq;
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq);
+	if (!event)
+		return;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, from);
+	entry->ent.type = TRACE_BTS;
+	entry->ent.cpu = smp_processor_id();
+	entry->from = from;
+	entry->to   = to;
+	ring_buffer_unlock_commit(tr->buffer, event, irq);
+}
+
+static void trace_bts_at(struct trace_array *tr, size_t index)
+{
+	const void *raw = NULL;
+	unsigned long from, to;
+	int err;
+
+	err = ds_access_bts(this_tracer, index, &raw);
+	if (err < 0)
+		return;
+
+	from = *(const unsigned long *)raw;
+	to = *(const unsigned long *)((const char *)raw + sizeof_field);
+
+	trace_bts(tr, from, to);
+}
+
+static void trace_bts_cpu(void *arg)
+{
+	struct trace_array *tr = (struct trace_array *) arg;
+	size_t index = 0, end = 0, i;
+	int err;
+
+	if (!this_tracer)
+		return;
+
+	bts_disable();
+
+	err = ds_get_bts_index(this_tracer, &index);
+	if (err < 0)
+		goto out;
+
+	err = ds_get_bts_end(this_tracer, &end);
+	if (err < 0)
+		goto out;
+
+	for (i = index; i < end; i++)
+		trace_bts_at(tr, i);
+
+	for (i = 0; i < index; i++)
+		trace_bts_at(tr, i);
+
+out:
+	bts_enable();
+}
+
+static void trace_bts_prepare(struct trace_iterator *iter)
+{
+	int cpu;
+
+	for_each_cpu_mask(cpu, cpu_possible_map)
+		smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
+}
+
+struct tracer bts_tracer __read_mostly =
+{
+	.name		= "bts",
+	.init		= bts_trace_init,
+	.reset		= bts_trace_stop,
+	.print_header	= bts_trace_print_header,
+	.print_line	= bts_trace_print_line,
+	.start		= bts_trace_start,
+	.stop		= bts_trace_stop,
+	.open		= trace_bts_prepare
+};
+
+__init static int init_bts_trace(void)
+{
+	return register_tracer(&bts_tracer);
+}
+device_initcall(init_bts_trace);
-- 
cgit v1.2.3


From fb52607afcd0629776f1dc9e657647ceae81dd50 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 25 Nov 2008 21:07:04 +0100
Subject: tracing/function-return-tracer: change the name into
 function-graph-tracer

Impact: cleanup

This patch changes the name of the "return function tracer" into
function-graph-tracer which is a more suitable name for a tracing
which makes one able to retrieve the ordered call stack during
the code flow.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile                      |  2 +-
 kernel/fork.c                        |  4 +-
 kernel/sched.c                       |  2 +-
 kernel/trace/Kconfig                 | 19 ++++---
 kernel/trace/Makefile                |  2 +-
 kernel/trace/ftrace.c                | 26 +++++-----
 kernel/trace/trace.c                 | 18 +++----
 kernel/trace/trace.h                 | 12 ++---
 kernel/trace/trace_functions_graph.c | 98 ++++++++++++++++++++++++++++++++++++
 9 files changed, 142 insertions(+), 41 deletions(-)
 create mode 100644 kernel/trace/trace_functions_graph.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 03a45e7e87b..703cf3b7389 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,7 +21,7 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_sched.o = -pg
 endif
-ifdef CONFIG_FUNCTION_RET_TRACER
+ifdef CONFIG_FUNCTION_GRAPH_TRACER
 CFLAGS_REMOVE_extable.o = -pg # For __kernel_text_address()
 CFLAGS_REMOVE_module.o = -pg # For __module_text_address()
 endif
diff --git a/kernel/fork.c b/kernel/fork.c
index d6e1a3205f6..5f82a999c03 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -140,7 +140,7 @@ void free_task(struct task_struct *tsk)
 	prop_local_destroy_single(&tsk->dirties);
 	free_thread_info(tsk->stack);
 	rt_mutex_debug_task_free(tsk);
-	ftrace_retfunc_exit_task(tsk);
+	ftrace_graph_exit_task(tsk);
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -1271,7 +1271,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
-	ftrace_retfunc_init_task(p);
+	ftrace_graph_init_task(p);
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
 	return p;
diff --git a/kernel/sched.c b/kernel/sched.c
index 388d9db044a..52490bf6b88 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5901,7 +5901,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	 * The idle tasks have their own, simple scheduling class:
 	 */
 	idle->sched_class = &idle_sched_class;
-	ftrace_retfunc_init_task(idle);
+	ftrace_graph_init_task(idle);
 }
 
 /*
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 620feadff67..eb9b901e077 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -12,7 +12,7 @@ config NOP_TRACER
 config HAVE_FUNCTION_TRACER
 	bool
 
-config HAVE_FUNCTION_RET_TRACER
+config HAVE_FUNCTION_GRAPH_TRACER
 	bool
 
 config HAVE_FUNCTION_TRACE_MCOUNT_TEST
@@ -63,15 +63,18 @@ config FUNCTION_TRACER
 	  (the bootup default), then the overhead of the instructions is very
 	  small and not measurable even in micro-benchmarks.
 
-config FUNCTION_RET_TRACER
-	bool "Kernel Function return Tracer"
-	depends on HAVE_FUNCTION_RET_TRACER
+config FUNCTION_GRAPH_TRACER
+	bool "Kernel Function Graph Tracer"
+	depends on HAVE_FUNCTION_GRAPH_TRACER
 	depends on FUNCTION_TRACER
 	help
-	  Enable the kernel to trace a function at its return.
-	  It's first purpose is to trace the duration of functions.
-	  This is done by setting the current return address on the thread
-	  info structure of the current task.
+	  Enable the kernel to trace a function at both its return
+	  and its entry.
+	  It's first purpose is to trace the duration of functions and
+	  draw a call graph for each thread with some informations like
+	  the return value.
+	  This is done by setting the current return address on the current
+	  task structure into a stack of calls.
 
 config IRQSOFF_TRACER
 	bool "Interrupts-off Latency Tracer"
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index cef4bcb4e82..08c5fe6ddc0 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -29,7 +29,7 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o
 obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
-obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
 obj-$(CONFIG_BTS_TRACER) += trace_bts.o
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 53042f118f2..9e19976af72 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -395,11 +395,11 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 	unsigned long ip, fl;
 	unsigned long ftrace_addr;
 
-#ifdef CONFIG_FUNCTION_RET_TRACER
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	if (ftrace_tracing_type == FTRACE_TYPE_ENTER)
 		ftrace_addr = (unsigned long)ftrace_caller;
 	else
-		ftrace_addr = (unsigned long)ftrace_return_caller;
+		ftrace_addr = (unsigned long)ftrace_graph_caller;
 #else
 	ftrace_addr = (unsigned long)ftrace_caller;
 #endif
@@ -1496,13 +1496,13 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 	return ret;
 }
 
-#ifdef CONFIG_FUNCTION_RET_TRACER
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
 static atomic_t ftrace_retfunc_active;
 
 /* The callback that hooks the return of a function */
-trace_function_return_t ftrace_function_return =
-			(trace_function_return_t)ftrace_stub;
+trace_function_graph_t ftrace_graph_function =
+			(trace_function_graph_t)ftrace_stub;
 
 
 /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
@@ -1549,7 +1549,7 @@ free:
 }
 
 /* Allocate a return stack for each task */
-static int start_return_tracing(void)
+static int start_graph_tracing(void)
 {
 	struct ftrace_ret_stack **ret_stack_list;
 	int ret;
@@ -1569,7 +1569,7 @@ static int start_return_tracing(void)
 	return ret;
 }
 
-int register_ftrace_return(trace_function_return_t func)
+int register_ftrace_graph(trace_function_graph_t func)
 {
 	int ret = 0;
 
@@ -1584,13 +1584,13 @@ int register_ftrace_return(trace_function_return_t func)
 		goto out;
 	}
 	atomic_inc(&ftrace_retfunc_active);
-	ret = start_return_tracing();
+	ret = start_graph_tracing();
 	if (ret) {
 		atomic_dec(&ftrace_retfunc_active);
 		goto out;
 	}
 	ftrace_tracing_type = FTRACE_TYPE_RETURN;
-	ftrace_function_return = func;
+	ftrace_graph_function = func;
 	ftrace_startup();
 
 out:
@@ -1598,12 +1598,12 @@ out:
 	return ret;
 }
 
-void unregister_ftrace_return(void)
+void unregister_ftrace_graph(void)
 {
 	mutex_lock(&ftrace_sysctl_lock);
 
 	atomic_dec(&ftrace_retfunc_active);
-	ftrace_function_return = (trace_function_return_t)ftrace_stub;
+	ftrace_graph_function = (trace_function_graph_t)ftrace_stub;
 	ftrace_shutdown();
 	/* Restore normal tracing type */
 	ftrace_tracing_type = FTRACE_TYPE_ENTER;
@@ -1612,7 +1612,7 @@ void unregister_ftrace_return(void)
 }
 
 /* Allocate a return stack for newly created task */
-void ftrace_retfunc_init_task(struct task_struct *t)
+void ftrace_graph_init_task(struct task_struct *t)
 {
 	if (atomic_read(&ftrace_retfunc_active)) {
 		t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
@@ -1626,7 +1626,7 @@ void ftrace_retfunc_init_task(struct task_struct *t)
 		t->ret_stack = NULL;
 }
 
-void ftrace_retfunc_exit_task(struct task_struct *t)
+void ftrace_graph_exit_task(struct task_struct *t)
 {
 	struct ftrace_ret_stack	*ret_stack = t->ret_stack;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8df8fdd69c9..f21ab2c68fd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -878,15 +878,15 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
 }
 
-#ifdef CONFIG_FUNCTION_RET_TRACER
-static void __trace_function_return(struct trace_array *tr,
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static void __trace_function_graph(struct trace_array *tr,
 				struct trace_array_cpu *data,
-				struct ftrace_retfunc *trace,
+				struct ftrace_graph_ret *trace,
 				unsigned long flags,
 				int pc)
 {
 	struct ring_buffer_event *event;
-	struct ftrace_ret_entry *entry;
+	struct ftrace_graph_entry *entry;
 	unsigned long irq_flags;
 
 	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
@@ -1177,8 +1177,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 	local_irq_restore(flags);
 }
 
-#ifdef CONFIG_FUNCTION_RET_TRACER
-void trace_function_return(struct ftrace_retfunc *trace)
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+void trace_function_graph(struct ftrace_graph_ret *trace)
 {
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
@@ -1193,12 +1193,12 @@ void trace_function_return(struct ftrace_retfunc *trace)
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
-		__trace_function_return(tr, data, trace, flags, pc);
+		__trace_function_graph(tr, data, trace, flags, pc);
 	}
 	atomic_dec(&data->disabled);
 	raw_local_irq_restore(flags);
 }
-#endif /* CONFIG_FUNCTION_RET_TRACER */
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 static struct ftrace_ops trace_ops __read_mostly =
 {
@@ -2001,7 +2001,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 		break;
 	}
 	case TRACE_FN_RET: {
-		return print_return_function(iter);
+		return print_graph_function(iter);
 		break;
 	}
 	case TRACE_BRANCH: {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3abd645e8af..72b5ef86876 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -57,7 +57,7 @@ struct ftrace_entry {
 };
 
 /* Function return entry */
-struct ftrace_ret_entry {
+struct ftrace_graph_entry {
 	struct trace_entry	ent;
 	unsigned long		ip;
 	unsigned long		parent_ip;
@@ -264,7 +264,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
 		IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
 		IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
-		IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\
+		IF_ASSIGN(var, ent, struct ftrace_graph_entry, TRACE_FN_RET);\
 		IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\
 		__ftrace_bad_type();					\
 	} while (0)
@@ -398,7 +398,7 @@ void trace_function(struct trace_array *tr,
 		    unsigned long parent_ip,
 		    unsigned long flags, int pc);
 void
-trace_function_return(struct ftrace_retfunc *trace);
+trace_function_graph(struct ftrace_graph_ret *trace);
 
 void trace_bts(struct trace_array *tr,
 	       unsigned long from,
@@ -489,11 +489,11 @@ extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args);
 extern unsigned long trace_flags;
 
 /* Standard output formatting function used for function return traces */
-#ifdef CONFIG_FUNCTION_RET_TRACER
-extern enum print_line_t print_return_function(struct trace_iterator *iter);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+extern enum print_line_t print_graph_function(struct trace_iterator *iter);
 #else
 static inline enum print_line_t
-print_return_function(struct trace_iterator *iter)
+print_graph_function(struct trace_iterator *iter)
 {
 	return TRACE_TYPE_UNHANDLED;
 }
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
new file mode 100644
index 00000000000..f5bad4624d2
--- /dev/null
+++ b/kernel/trace/trace_functions_graph.c
@@ -0,0 +1,98 @@
+/*
+ *
+ * Function graph tracer.
+ * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ * Mostly borrowed from function tracer which
+ * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+
+#define TRACE_GRAPH_PRINT_OVERRUN	0x1
+static struct tracer_opt trace_opts[] = {
+	/* Display overruns or not */
+	{ TRACER_OPT(overrun, TRACE_GRAPH_PRINT_OVERRUN) },
+	{ } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+	.val = 0, /* Don't display overruns by default */
+	.opts = trace_opts
+};
+
+
+static int graph_trace_init(struct trace_array *tr)
+{
+	int cpu;
+	for_each_online_cpu(cpu)
+		tracing_reset(tr, cpu);
+
+	return register_ftrace_graph(&trace_function_graph);
+}
+
+static void graph_trace_reset(struct trace_array *tr)
+{
+		unregister_ftrace_graph();
+}
+
+
+enum print_line_t
+print_graph_function(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry = iter->ent;
+	struct ftrace_graph_entry *field;
+	int ret;
+
+	if (entry->type == TRACE_FN_RET) {
+		trace_assign_type(field, entry);
+		ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+
+		ret = seq_print_ip_sym(s, field->ip,
+					trace_flags & TRACE_ITER_SYM_MASK);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+
+		ret = trace_seq_printf(s, " (%llu ns)",
+					field->rettime - field->calltime);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+
+		if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
+			ret = trace_seq_printf(s, " (Overruns: %lu)",
+						field->overrun);
+			if (!ret)
+				return TRACE_TYPE_PARTIAL_LINE;
+		}
+
+		ret = trace_seq_printf(s, "\n");
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+
+		return TRACE_TYPE_HANDLED;
+	}
+	return TRACE_TYPE_UNHANDLED;
+}
+
+static struct tracer graph_trace __read_mostly = {
+	.name	     = "function-graph",
+	.init	     = graph_trace_init,
+	.reset	     = graph_trace_reset,
+	.print_line = print_graph_function,
+	.flags		= &tracer_flags,
+};
+
+static __init int init_graph_trace(void)
+{
+	return register_tracer(&graph_trace);
+}
+
+device_initcall(init_graph_trace);
-- 
cgit v1.2.3


From 287b6e68ca7209caec40b2f44f837c580a413bae Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 26 Nov 2008 00:57:25 +0100
Subject: tracing/function-return-tracer: set a more human readable output

Impact: feature

This patch sets a C-like output for the function graph tracing.
For this aim, we now call two handler for each function: one on the entry
and one other on return. This way we can draw a well-ordered call stack.

The pid of the previous trace is loosely stored to be compared against
the one of the current trace to see if there were a context switch.

Without this little feature, the call tree would seem broken at
some locations.
We could use the sched_tracer to capture these sched_events but this
way of processing is much more simpler.

2 spaces have been chosen for indentation to fit the screen while deep
calls. The time of execution in nanosecs is printed just after closed
braces, it seems more easy this way to find the corresponding function.
If the time was printed as a first column, it would be not so easy to
find the corresponding function if it is called on a deep depth.

I plan to output the return value but on 32 bits CPU, the return value
can be 32 or 64, and its difficult to guess on which case we are.
I don't know what would be the better solution on X86-32: only print
eax (low-part) or even edx (high-part).

Actually it's thee same problem when a function return a 8 bits value, the
high part of eax could contain junk values...

Here is an example of trace:

sys_read() {
  fget_light() {
  } 526
  vfs_read() {
    rw_verify_area() {
      security_file_permission() {
        cap_file_permission() {
        } 519
      } 1564
    } 2640
    do_sync_read() {
      pipe_read() {
        __might_sleep() {
        } 511
        pipe_wait() {
          prepare_to_wait() {
          } 760
          deactivate_task() {
            dequeue_task() {
              dequeue_task_fair() {
                dequeue_entity() {
                  update_curr() {
                    update_min_vruntime() {
                    } 504
                  } 1587
                  clear_buddies() {
                  } 512
                  add_cfs_task_weight() {
                  } 519
                  update_min_vruntime() {
                  } 511
                } 5602
                dequeue_entity() {
                  update_curr() {
                    update_min_vruntime() {
                    } 496
                  } 1631
                  clear_buddies() {
                  } 496
                  update_min_vruntime() {
                  } 527
                } 4580
                hrtick_update() {
                  hrtick_start_fair() {
                  } 488
                } 1489
              } 13700
            } 14949
          } 16016
          msecs_to_jiffies() {
          } 496
          put_prev_task_fair() {
          } 504
          pick_next_task_fair() {
          } 489
          pick_next_task_rt() {
          } 496
          pick_next_task_fair() {
          } 489
          pick_next_task_idle() {
          } 489

------------8<---------- thread 4 ------------8<----------

finish_task_switch() {
} 1203
do_softirq() {
  __do_softirq() {
    __local_bh_disable() {
    } 669
    rcu_process_callbacks() {
      __rcu_process_callbacks() {
        cpu_quiet() {
          rcu_start_batch() {
          } 503
        } 1647
      } 3128
      __rcu_process_callbacks() {
      } 542
    } 5362
    _local_bh_enable() {
    } 587
  } 8880
} 9986
kthread_should_stop() {
} 669
deactivate_task() {
  dequeue_task() {
    dequeue_task_fair() {
      dequeue_entity() {
        update_curr() {
          calc_delta_mine() {
          } 511
          update_min_vruntime() {
          } 511
        } 2813

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c                 |  30 +++++-----
 kernel/trace/trace.c                  |  67 ++++++++++++++++++----
 kernel/trace/trace.h                  |  28 +++++----
 kernel/trace/trace_functions_graph.c  | 104 ++++++++++++++++++++++++++--------
 kernel/trace/trace_functions_return.c |  98 --------------------------------
 5 files changed, 168 insertions(+), 159 deletions(-)
 delete mode 100644 kernel/trace/trace_functions_return.c

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9e19976af72..7e2d3b91692 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1498,12 +1498,13 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
-static atomic_t ftrace_retfunc_active;
-
-/* The callback that hooks the return of a function */
-trace_function_graph_t ftrace_graph_function =
-			(trace_function_graph_t)ftrace_stub;
+static atomic_t ftrace_graph_active;
 
+/* The callbacks that hook a function */
+trace_func_graph_ret_t ftrace_graph_return =
+			(trace_func_graph_ret_t)ftrace_stub;
+trace_func_graph_ent_t ftrace_graph_entry =
+			(trace_func_graph_ent_t)ftrace_stub;
 
 /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
 static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
@@ -1569,7 +1570,8 @@ static int start_graph_tracing(void)
 	return ret;
 }
 
-int register_ftrace_graph(trace_function_graph_t func)
+int register_ftrace_graph(trace_func_graph_ret_t retfunc,
+			trace_func_graph_ent_t entryfunc)
 {
 	int ret = 0;
 
@@ -1583,14 +1585,15 @@ int register_ftrace_graph(trace_function_graph_t func)
 		ret = -EBUSY;
 		goto out;
 	}
-	atomic_inc(&ftrace_retfunc_active);
+	atomic_inc(&ftrace_graph_active);
 	ret = start_graph_tracing();
 	if (ret) {
-		atomic_dec(&ftrace_retfunc_active);
+		atomic_dec(&ftrace_graph_active);
 		goto out;
 	}
 	ftrace_tracing_type = FTRACE_TYPE_RETURN;
-	ftrace_graph_function = func;
+	ftrace_graph_return = retfunc;
+	ftrace_graph_entry = entryfunc;
 	ftrace_startup();
 
 out:
@@ -1602,8 +1605,9 @@ void unregister_ftrace_graph(void)
 {
 	mutex_lock(&ftrace_sysctl_lock);
 
-	atomic_dec(&ftrace_retfunc_active);
-	ftrace_graph_function = (trace_function_graph_t)ftrace_stub;
+	atomic_dec(&ftrace_graph_active);
+	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
+	ftrace_graph_entry = (trace_func_graph_ent_t)ftrace_stub;
 	ftrace_shutdown();
 	/* Restore normal tracing type */
 	ftrace_tracing_type = FTRACE_TYPE_ENTER;
@@ -1614,7 +1618,7 @@ void unregister_ftrace_graph(void)
 /* Allocate a return stack for newly created task */
 void ftrace_graph_init_task(struct task_struct *t)
 {
-	if (atomic_read(&ftrace_retfunc_active)) {
+	if (atomic_read(&ftrace_graph_active)) {
 		t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
 				* sizeof(struct ftrace_ret_stack),
 				GFP_KERNEL);
@@ -1638,5 +1642,3 @@ void ftrace_graph_exit_task(struct task_struct *t)
 }
 #endif
 
-
-
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f21ab2c68fd..9d5f7c94f25 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -879,14 +879,38 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static void __trace_function_graph(struct trace_array *tr,
+static void __trace_graph_entry(struct trace_array *tr,
+				struct trace_array_cpu *data,
+				struct ftrace_graph_ent *trace,
+				unsigned long flags,
+				int pc)
+{
+	struct ring_buffer_event *event;
+	struct ftrace_graph_ent_entry *entry;
+	unsigned long irq_flags;
+
+	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+		return;
+
+	event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		return;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, flags, pc);
+	entry->ent.type			= TRACE_GRAPH_ENT;
+	entry->graph_ent			= *trace;
+	ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+}
+
+static void __trace_graph_return(struct trace_array *tr,
 				struct trace_array_cpu *data,
 				struct ftrace_graph_ret *trace,
 				unsigned long flags,
 				int pc)
 {
 	struct ring_buffer_event *event;
-	struct ftrace_graph_entry *entry;
+	struct ftrace_graph_ret_entry *entry;
 	unsigned long irq_flags;
 
 	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
@@ -898,12 +922,8 @@ static void __trace_function_graph(struct trace_array *tr,
 		return;
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type			= TRACE_FN_RET;
-	entry->ip			= trace->func;
-	entry->parent_ip	= trace->ret;
-	entry->rettime		= trace->rettime;
-	entry->calltime		= trace->calltime;
-	entry->overrun		= trace->overrun;
+	entry->ent.type			= TRACE_GRAPH_RET;
+	entry->ret				= *trace;
 	ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
 }
 #endif
@@ -1178,7 +1198,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-void trace_function_graph(struct ftrace_graph_ret *trace)
+void trace_graph_entry(struct ftrace_graph_ent *trace)
 {
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
@@ -1193,7 +1213,28 @@ void trace_function_graph(struct ftrace_graph_ret *trace)
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
-		__trace_function_graph(tr, data, trace, flags, pc);
+		__trace_graph_entry(tr, data, trace, flags, pc);
+	}
+	atomic_dec(&data->disabled);
+	raw_local_irq_restore(flags);
+}
+
+void trace_graph_return(struct ftrace_graph_ret *trace)
+{
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+	int pc;
+
+	raw_local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+	if (likely(disabled == 1)) {
+		pc = preempt_count();
+		__trace_graph_return(tr, data, trace, flags, pc);
 	}
 	atomic_dec(&data->disabled);
 	raw_local_irq_restore(flags);
@@ -2000,9 +2041,11 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 			trace_seq_print_cont(s, iter);
 		break;
 	}
-	case TRACE_FN_RET: {
+	case TRACE_GRAPH_RET: {
+		return print_graph_function(iter);
+	}
+	case TRACE_GRAPH_ENT: {
 		return print_graph_function(iter);
-		break;
 	}
 	case TRACE_BRANCH: {
 		struct trace_branch *field;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 72b5ef86876..ffe1bb1eb62 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -25,7 +25,8 @@ enum trace_type {
 	TRACE_BRANCH,
 	TRACE_BOOT_CALL,
 	TRACE_BOOT_RET,
-	TRACE_FN_RET,
+	TRACE_GRAPH_RET,
+	TRACE_GRAPH_ENT,
 	TRACE_USER_STACK,
 	TRACE_BTS,
 
@@ -56,14 +57,16 @@ struct ftrace_entry {
 	unsigned long		parent_ip;
 };
 
+/* Function call entry */
+struct ftrace_graph_ent_entry {
+	struct trace_entry			ent;
+	struct ftrace_graph_ent		graph_ent;
+};
+
 /* Function return entry */
-struct ftrace_graph_entry {
-	struct trace_entry	ent;
-	unsigned long		ip;
-	unsigned long		parent_ip;
-	unsigned long long	calltime;
-	unsigned long long	rettime;
-	unsigned long		overrun;
+struct ftrace_graph_ret_entry {
+	struct trace_entry			ent;
+	struct ftrace_graph_ret		ret;
 };
 extern struct tracer boot_tracer;
 
@@ -264,7 +267,10 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
 		IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
 		IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
-		IF_ASSIGN(var, ent, struct ftrace_graph_entry, TRACE_FN_RET);\
+		IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry,	\
+			  TRACE_GRAPH_ENT);		\
+		IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,	\
+			  TRACE_GRAPH_RET);		\
 		IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\
 		__ftrace_bad_type();					\
 	} while (0)
@@ -397,9 +403,9 @@ void trace_function(struct trace_array *tr,
 		    unsigned long ip,
 		    unsigned long parent_ip,
 		    unsigned long flags, int pc);
-void
-trace_function_graph(struct ftrace_graph_ret *trace);
 
+void trace_graph_return(struct ftrace_graph_ret *trace);
+void trace_graph_entry(struct ftrace_graph_ent *trace);
 void trace_bts(struct trace_array *tr,
 	       unsigned long from,
 	       unsigned long to);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index f5bad4624d2..b6f0cc2a00c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -13,6 +13,7 @@
 
 #include "trace.h"
 
+#define TRACE_GRAPH_INDENT	2
 
 #define TRACE_GRAPH_PRINT_OVERRUN	0x1
 static struct tracer_opt trace_opts[] = {
@@ -26,6 +27,8 @@ static struct tracer_flags tracer_flags = {
 	.opts = trace_opts
 };
 
+/* pid on the last trace processed */
+static pid_t last_pid = -1;
 
 static int graph_trace_init(struct trace_array *tr)
 {
@@ -33,7 +36,8 @@ static int graph_trace_init(struct trace_array *tr)
 	for_each_online_cpu(cpu)
 		tracing_reset(tr, cpu);
 
-	return register_ftrace_graph(&trace_function_graph);
+	return register_ftrace_graph(&trace_graph_return,
+					&trace_graph_entry);
 }
 
 static void graph_trace_reset(struct trace_array *tr)
@@ -41,45 +45,97 @@ static void graph_trace_reset(struct trace_array *tr)
 		unregister_ftrace_graph();
 }
 
+/* If the pid changed since the last trace, output this event */
+static int verif_pid(struct trace_seq *s, pid_t pid)
+{
+	if (last_pid != -1 && last_pid == pid)
+		return 1;
 
-enum print_line_t
-print_graph_function(struct trace_iterator *iter)
+	last_pid = pid;
+	return trace_seq_printf(s, "\n------------8<---------- thread %d"
+				    " ------------8<----------\n\n",
+				  pid);
+}
+
+static enum print_line_t
+print_graph_entry(struct ftrace_graph_ent *call, struct trace_seq *s,
+		struct trace_entry *ent)
 {
-	struct trace_seq *s = &iter->seq;
-	struct trace_entry *entry = iter->ent;
-	struct ftrace_graph_entry *field;
+	int i;
 	int ret;
 
-	if (entry->type == TRACE_FN_RET) {
-		trace_assign_type(field, entry);
-		ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
+	if (!verif_pid(s, ent->pid))
+		return TRACE_TYPE_PARTIAL_LINE;
 
-		ret = seq_print_ip_sym(s, field->ip,
-					trace_flags & TRACE_ITER_SYM_MASK);
+	for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
+		ret = trace_seq_printf(s, " ");
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	ret = seq_print_ip_sym(s, call->func, 0);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	ret = trace_seq_printf(s, "() {\n");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+	return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
+		   struct trace_entry *ent)
+{
+	int i;
+	int ret;
+
+	if (!verif_pid(s, ent->pid))
+		return TRACE_TYPE_PARTIAL_LINE;
 
-		ret = trace_seq_printf(s, " (%llu ns)",
-					field->rettime - field->calltime);
+	for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
+		ret = trace_seq_printf(s, " ");
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	ret = trace_seq_printf(s, "} ");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
 
-		if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
-			ret = trace_seq_printf(s, " (Overruns: %lu)",
-						field->overrun);
-			if (!ret)
-				return TRACE_TYPE_PARTIAL_LINE;
-		}
+	ret = trace_seq_printf(s, "%llu\n", trace->rettime - trace->calltime);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
 
-		ret = trace_seq_printf(s, "\n");
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
+		ret = trace_seq_printf(s, " (Overruns: %lu)\n",
+					trace->overrun);
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
+	}
+	return TRACE_TYPE_HANDLED;
+}
+
+enum print_line_t
+print_graph_function(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry = iter->ent;
 
-		return TRACE_TYPE_HANDLED;
+	switch (entry->type) {
+	case TRACE_GRAPH_ENT: {
+		struct ftrace_graph_ent_entry *field;
+		trace_assign_type(field, entry);
+		return print_graph_entry(&field->graph_ent, s, entry);
+	}
+	case TRACE_GRAPH_RET: {
+		struct ftrace_graph_ret_entry *field;
+		trace_assign_type(field, entry);
+		return print_graph_return(&field->ret, s, entry);
+	}
+	default:
+		return TRACE_TYPE_UNHANDLED;
 	}
-	return TRACE_TYPE_UNHANDLED;
 }
 
 static struct tracer graph_trace __read_mostly = {
diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c
deleted file mode 100644
index e00d64509c9..00000000000
--- a/kernel/trace/trace_functions_return.c
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- *
- * Function return tracer.
- * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com>
- * Mostly borrowed from function tracer which
- * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
- *
- */
-#include <linux/debugfs.h>
-#include <linux/uaccess.h>
-#include <linux/ftrace.h>
-#include <linux/fs.h>
-
-#include "trace.h"
-
-
-#define TRACE_RETURN_PRINT_OVERRUN	0x1
-static struct tracer_opt trace_opts[] = {
-	/* Display overruns or not */
-	{ TRACER_OPT(overrun, TRACE_RETURN_PRINT_OVERRUN) },
-	{ } /* Empty entry */
-};
-
-static struct tracer_flags tracer_flags = {
-	.val = 0, /* Don't display overruns by default */
-	.opts = trace_opts
-};
-
-
-static int return_trace_init(struct trace_array *tr)
-{
-	int cpu;
-	for_each_online_cpu(cpu)
-		tracing_reset(tr, cpu);
-
-	return register_ftrace_return(&trace_function_return);
-}
-
-static void return_trace_reset(struct trace_array *tr)
-{
-		unregister_ftrace_return();
-}
-
-
-enum print_line_t
-print_return_function(struct trace_iterator *iter)
-{
-	struct trace_seq *s = &iter->seq;
-	struct trace_entry *entry = iter->ent;
-	struct ftrace_ret_entry *field;
-	int ret;
-
-	if (entry->type == TRACE_FN_RET) {
-		trace_assign_type(field, entry);
-		ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-
-		ret = seq_print_ip_sym(s, field->ip,
-					trace_flags & TRACE_ITER_SYM_MASK);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-
-		ret = trace_seq_printf(s, " (%llu ns)",
-					field->rettime - field->calltime);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-
-		if (tracer_flags.val & TRACE_RETURN_PRINT_OVERRUN) {
-			ret = trace_seq_printf(s, " (Overruns: %lu)",
-						field->overrun);
-			if (!ret)
-				return TRACE_TYPE_PARTIAL_LINE;
-		}
-
-		ret = trace_seq_printf(s, "\n");
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-
-		return TRACE_TYPE_HANDLED;
-	}
-	return TRACE_TYPE_UNHANDLED;
-}
-
-static struct tracer return_trace __read_mostly = {
-	.name	     = "return",
-	.init	     = return_trace_init,
-	.reset	     = return_trace_reset,
-	.print_line = print_return_function,
-	.flags		= &tracer_flags,
-};
-
-static __init int init_return_trace(void)
-{
-	return register_tracer(&return_trace);
-}
-
-device_initcall(init_return_trace);
-- 
cgit v1.2.3


From df4fc31558dd2a3a30292ddb3a64c2a5befcec73 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 26 Nov 2008 00:16:23 -0500
Subject: ftrace: add function tracing to single thread

Impact: feature to function trace a single thread

This patch adds the ability to function trace a single thread.
The file:

  /debugfs/tracing/set_ftrace_pid

contains the pid to trace. Valid pids are any positive integer.
Writing any negative number to this file will disable the pid
tracing and the function tracer will go back to tracing all of
threads.

This feature works with both static and dynamic function tracing.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 209 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 183 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7e2d3b91692..00d98c65fad 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -47,6 +47,9 @@
 int ftrace_enabled __read_mostly;
 static int last_ftrace_enabled;
 
+/* ftrace_pid_trace >= 0 will only trace threads with this pid */
+static int ftrace_pid_trace = -1;
+
 /* Quick disabling of function tracer. */
 int function_trace_stop;
 
@@ -61,6 +64,7 @@ static int ftrace_disabled __read_mostly;
 
 static DEFINE_SPINLOCK(ftrace_lock);
 static DEFINE_MUTEX(ftrace_sysctl_lock);
+static DEFINE_MUTEX(ftrace_start_lock);
 
 static struct ftrace_ops ftrace_list_end __read_mostly =
 {
@@ -70,6 +74,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
 static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
 ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
 ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
+ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
 
 static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
 {
@@ -86,6 +91,21 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
 	};
 }
 
+static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
+{
+	if (current->pid != ftrace_pid_trace)
+		return;
+
+	ftrace_pid_function(ip, parent_ip);
+}
+
+static void set_ftrace_pid_function(ftrace_func_t func)
+{
+	/* do not set ftrace_pid_function to itself! */
+	if (func != ftrace_pid_func)
+		ftrace_pid_function = func;
+}
+
 /**
  * clear_ftrace_function - reset the ftrace function
  *
@@ -96,6 +116,7 @@ void clear_ftrace_function(void)
 {
 	ftrace_trace_function = ftrace_stub;
 	__ftrace_trace_function = ftrace_stub;
+	ftrace_pid_function = ftrace_stub;
 }
 
 #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
@@ -128,20 +149,26 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
 	ftrace_list = ops;
 
 	if (ftrace_enabled) {
+		ftrace_func_t func;
+
+		if (ops->next == &ftrace_list_end)
+			func = ops->func;
+		else
+			func = ftrace_list_func;
+
+		if (ftrace_pid_trace >= 0) {
+			set_ftrace_pid_function(func);
+			func = ftrace_pid_func;
+		}
+
 		/*
 		 * For one func, simply call it directly.
 		 * For more than one func, call the chain.
 		 */
 #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
-		if (ops->next == &ftrace_list_end)
-			ftrace_trace_function = ops->func;
-		else
-			ftrace_trace_function = ftrace_list_func;
+		ftrace_trace_function = func;
 #else
-		if (ops->next == &ftrace_list_end)
-			__ftrace_trace_function = ops->func;
-		else
-			__ftrace_trace_function = ftrace_list_func;
+		__ftrace_trace_function = func;
 		ftrace_trace_function = ftrace_test_stop_func;
 #endif
 	}
@@ -182,8 +209,19 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 
 	if (ftrace_enabled) {
 		/* If we only have one func left, then call that directly */
-		if (ftrace_list->next == &ftrace_list_end)
-			ftrace_trace_function = ftrace_list->func;
+		if (ftrace_list->next == &ftrace_list_end) {
+			ftrace_func_t func = ftrace_list->func;
+
+			if (ftrace_pid_trace >= 0) {
+				set_ftrace_pid_function(func);
+				func = ftrace_pid_func;
+			}
+#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
+			ftrace_trace_function = func;
+#else
+			__ftrace_trace_function = func;
+#endif
+		}
 	}
 
  out:
@@ -192,6 +230,38 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 	return ret;
 }
 
+static void ftrace_update_pid_func(void)
+{
+	ftrace_func_t func;
+
+	/* should not be called from interrupt context */
+	spin_lock(&ftrace_lock);
+
+	if (ftrace_trace_function == ftrace_stub)
+		goto out;
+
+	func = ftrace_trace_function;
+
+	if (ftrace_pid_trace >= 0) {
+		set_ftrace_pid_function(func);
+		func = ftrace_pid_func;
+	} else {
+		if (func != ftrace_pid_func)
+			goto out;
+
+		set_ftrace_pid_function(func);
+	}
+
+#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
+	ftrace_trace_function = func;
+#else
+	__ftrace_trace_function = func;
+#endif
+
+ out:
+	spin_unlock(&ftrace_lock);
+}
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 #ifndef CONFIG_FTRACE_MCOUNT_RECORD
 # error Dynamic ftrace depends on MCOUNT_RECORD
@@ -545,7 +615,19 @@ static void ftrace_run_update_code(int command)
 
 static ftrace_func_t saved_ftrace_func;
 static int ftrace_start_up;
-static DEFINE_MUTEX(ftrace_start_lock);
+
+static void ftrace_startup_enable(int command)
+{
+	if (saved_ftrace_func != ftrace_trace_function) {
+		saved_ftrace_func = ftrace_trace_function;
+		command |= FTRACE_UPDATE_TRACE_FUNC;
+	}
+
+	if (!command || !ftrace_enabled)
+		return;
+
+	ftrace_run_update_code(command);
+}
 
 static void ftrace_startup(void)
 {
@@ -558,16 +640,8 @@ static void ftrace_startup(void)
 	ftrace_start_up++;
 	command |= FTRACE_ENABLE_CALLS;
 
-	if (saved_ftrace_func != ftrace_trace_function) {
-		saved_ftrace_func = ftrace_trace_function;
-		command |= FTRACE_UPDATE_TRACE_FUNC;
-	}
-
-	if (!command || !ftrace_enabled)
-		goto out;
+	ftrace_startup_enable(command);
 
-	ftrace_run_update_code(command);
- out:
 	mutex_unlock(&ftrace_start_lock);
 }
 
@@ -1262,13 +1336,10 @@ static struct file_operations ftrace_notrace_fops = {
 	.release = ftrace_notrace_release,
 };
 
-static __init int ftrace_init_debugfs(void)
+static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
 {
-	struct dentry *d_tracer;
 	struct dentry *entry;
 
-	d_tracer = tracing_init_dentry();
-
 	entry = debugfs_create_file("available_filter_functions", 0444,
 				    d_tracer, NULL, &ftrace_avail_fops);
 	if (!entry)
@@ -1295,8 +1366,6 @@ static __init int ftrace_init_debugfs(void)
 	return 0;
 }
 
-fs_initcall(ftrace_init_debugfs);
-
 static int ftrace_convert_nops(struct module *mod,
 			       unsigned long *start,
 			       unsigned long *end)
@@ -1382,12 +1451,100 @@ static int __init ftrace_nodyn_init(void)
 }
 device_initcall(ftrace_nodyn_init);
 
+static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
+static inline void ftrace_startup_enable(int command) { }
 # define ftrace_startup()		do { } while (0)
 # define ftrace_shutdown()		do { } while (0)
 # define ftrace_startup_sysctl()	do { } while (0)
 # define ftrace_shutdown_sysctl()	do { } while (0)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
+static ssize_t
+ftrace_pid_read(struct file *file, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	int r;
+
+	if (ftrace_pid_trace >= 0)
+		r = sprintf(buf, "%u\n", ftrace_pid_trace);
+	else
+		r = sprintf(buf, "no pid\n");
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+ftrace_pid_write(struct file *filp, const char __user *ubuf,
+		   size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	long val;
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtol(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	mutex_lock(&ftrace_start_lock);
+	if (ret < 0) {
+		/* disable pid tracing */
+		if (ftrace_pid_trace < 0)
+			goto out;
+		ftrace_pid_trace = -1;
+
+	} else {
+
+		if (ftrace_pid_trace == val)
+			goto out;
+
+		ftrace_pid_trace = val;
+	}
+
+	/* update the function call */
+	ftrace_update_pid_func();
+	ftrace_startup_enable(0);
+
+ out:
+	mutex_unlock(&ftrace_start_lock);
+
+	return cnt;
+}
+
+static struct file_operations ftrace_pid_fops = {
+	.read = ftrace_pid_read,
+	.write = ftrace_pid_write,
+};
+
+static __init int ftrace_init_debugfs(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+	if (!d_tracer)
+		return 0;
+
+	ftrace_init_dyn_debugfs(d_tracer);
+
+	entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer,
+				    NULL, &ftrace_pid_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'set_ftrace_pid' entry\n");
+	return 0;
+}
+
+fs_initcall(ftrace_init_debugfs);
+
 /**
  * ftrace_kill - kill ftrace
  *
-- 
cgit v1.2.3


From 5a45cfe1c64862e8cd3b0d79d7c4ba71c3118915 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 26 Nov 2008 00:16:24 -0500
Subject: ftrace: use code patching for ftrace graph tracer

Impact: more efficient code for ftrace graph tracer

This patch uses the dynamic patching, when available, to patch
the function graph code into the kernel.

This patch will ease the way for letting both function tracing
and function graph tracing run together.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 00d98c65fad..5f7c8642d58 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -281,6 +281,8 @@ enum {
 	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2),
 	FTRACE_ENABLE_MCOUNT		= (1 << 3),
 	FTRACE_DISABLE_MCOUNT		= (1 << 4),
+	FTRACE_START_FUNC_RET		= (1 << 5),
+	FTRACE_STOP_FUNC_RET		= (1 << 6),
 };
 
 static int ftrace_filtered;
@@ -465,14 +467,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 	unsigned long ip, fl;
 	unsigned long ftrace_addr;
 
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	if (ftrace_tracing_type == FTRACE_TYPE_ENTER)
-		ftrace_addr = (unsigned long)ftrace_caller;
-	else
-		ftrace_addr = (unsigned long)ftrace_graph_caller;
-#else
 	ftrace_addr = (unsigned long)ftrace_caller;
-#endif
 
 	ip = rec->ip;
 
@@ -605,6 +600,11 @@ static int __ftrace_modify_code(void *data)
 	if (*command & FTRACE_UPDATE_TRACE_FUNC)
 		ftrace_update_ftrace_func(ftrace_trace_function);
 
+	if (*command & FTRACE_START_FUNC_RET)
+		ftrace_enable_ftrace_graph_caller();
+	else if (*command & FTRACE_STOP_FUNC_RET)
+		ftrace_disable_ftrace_graph_caller();
+
 	return 0;
 }
 
@@ -629,10 +629,8 @@ static void ftrace_startup_enable(int command)
 	ftrace_run_update_code(command);
 }
 
-static void ftrace_startup(void)
+static void ftrace_startup(int command)
 {
-	int command = 0;
-
 	if (unlikely(ftrace_disabled))
 		return;
 
@@ -645,10 +643,8 @@ static void ftrace_startup(void)
 	mutex_unlock(&ftrace_start_lock);
 }
 
-static void ftrace_shutdown(void)
+static void ftrace_shutdown(int command)
 {
-	int command = 0;
-
 	if (unlikely(ftrace_disabled))
 		return;
 
@@ -1453,8 +1449,9 @@ device_initcall(ftrace_nodyn_init);
 
 static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
 static inline void ftrace_startup_enable(int command) { }
-# define ftrace_startup()		do { } while (0)
-# define ftrace_shutdown()		do { } while (0)
+/* Keep as macros so we do not need to define the commands */
+# define ftrace_startup(command)	do { } while (0)
+# define ftrace_shutdown(command)	do { } while (0)
 # define ftrace_startup_sysctl()	do { } while (0)
 # define ftrace_shutdown_sysctl()	do { } while (0)
 #endif /* CONFIG_DYNAMIC_FTRACE */
@@ -1585,7 +1582,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
 	}
 
 	ret = __register_ftrace_function(ops);
-	ftrace_startup();
+	ftrace_startup(0);
 
 out:
 	mutex_unlock(&ftrace_sysctl_lock);
@@ -1604,7 +1601,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 
 	mutex_lock(&ftrace_sysctl_lock);
 	ret = __unregister_ftrace_function(ops);
-	ftrace_shutdown();
+	ftrace_shutdown(0);
 	mutex_unlock(&ftrace_sysctl_lock);
 
 	return ret;
@@ -1751,7 +1748,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 	ftrace_tracing_type = FTRACE_TYPE_RETURN;
 	ftrace_graph_return = retfunc;
 	ftrace_graph_entry = entryfunc;
-	ftrace_startup();
+	ftrace_startup(FTRACE_START_FUNC_RET);
 
 out:
 	mutex_unlock(&ftrace_sysctl_lock);
@@ -1765,7 +1762,7 @@ void unregister_ftrace_graph(void)
 	atomic_dec(&ftrace_graph_active);
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
 	ftrace_graph_entry = (trace_func_graph_ent_t)ftrace_stub;
-	ftrace_shutdown();
+	ftrace_shutdown(FTRACE_STOP_FUNC_RET);
 	/* Restore normal tracing type */
 	ftrace_tracing_type = FTRACE_TYPE_ENTER;
 
-- 
cgit v1.2.3


From e53a6319cca69111c1643dc9f18f4465d7f1cbf0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 26 Nov 2008 00:16:25 -0500
Subject: ftrace: let function tracing and function return run together

Impact: feature

This patch enables function tracing and function return to run together.
I've tested this by enabling the stack tracer and return tracer, where
both the function entry and function return are used together with
dynamic ftrace.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 22 ++--------------------
 1 file changed, 2 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5f7c8642d58..cbf8b09f63a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -53,9 +53,6 @@ static int ftrace_pid_trace = -1;
 /* Quick disabling of function tracer. */
 int function_trace_stop;
 
-/* By default, current tracing type is normal tracing. */
-enum ftrace_tracing_type_t ftrace_tracing_type = FTRACE_TYPE_ENTER;
-
 /*
  * ftrace_disabled is set when an anomaly is discovered.
  * ftrace_disabled is much stronger than ftrace_enabled.
@@ -1576,15 +1573,9 @@ int register_ftrace_function(struct ftrace_ops *ops)
 
 	mutex_lock(&ftrace_sysctl_lock);
 
-	if (ftrace_tracing_type == FTRACE_TYPE_RETURN) {
-		ret = -EBUSY;
-		goto out;
-	}
-
 	ret = __register_ftrace_function(ops);
 	ftrace_startup(0);
 
-out:
 	mutex_unlock(&ftrace_sysctl_lock);
 	return ret;
 }
@@ -1731,23 +1722,16 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 
 	mutex_lock(&ftrace_sysctl_lock);
 
-	/*
-	 * Don't launch return tracing if normal function
-	 * tracing is already running.
-	 */
-	if (ftrace_trace_function != ftrace_stub) {
-		ret = -EBUSY;
-		goto out;
-	}
 	atomic_inc(&ftrace_graph_active);
 	ret = start_graph_tracing();
 	if (ret) {
 		atomic_dec(&ftrace_graph_active);
 		goto out;
 	}
-	ftrace_tracing_type = FTRACE_TYPE_RETURN;
+
 	ftrace_graph_return = retfunc;
 	ftrace_graph_entry = entryfunc;
+
 	ftrace_startup(FTRACE_START_FUNC_RET);
 
 out:
@@ -1763,8 +1747,6 @@ void unregister_ftrace_graph(void)
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
 	ftrace_graph_entry = (trace_func_graph_ent_t)ftrace_stub;
 	ftrace_shutdown(FTRACE_STOP_FUNC_RET);
-	/* Restore normal tracing type */
-	ftrace_tracing_type = FTRACE_TYPE_ENTER;
 
 	mutex_unlock(&ftrace_sysctl_lock);
 }
-- 
cgit v1.2.3


From 660c7f9be96321fc80026d76411bd15e6f418a72 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 26 Nov 2008 00:16:26 -0500
Subject: ftrace: add thread comm to function graph tracer

Impact: enhancement to function graph tracer

Export the trace_find_cmdline so the function graph tracer can
use it to print the comms of the threads.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c                 |  2 +-
 kernel/trace/trace.h                 |  1 +
 kernel/trace/trace_functions_graph.c | 21 ++++++++++++++++-----
 3 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9d5f7c94f25..5811e0a5f73 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -804,7 +804,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
 	spin_unlock(&trace_cmdline_lock);
 }
 
-static char *trace_find_cmdline(int pid)
+char *trace_find_cmdline(int pid)
 {
 	char *cmdline = "<...>";
 	unsigned map;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ffe1bb1eb62..7adacf349ef 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -450,6 +450,7 @@ struct tracer_switch_ops {
 	struct tracer_switch_ops	*next;
 };
 
+char *trace_find_cmdline(int pid);
 #endif /* CONFIG_CONTEXT_SWITCH_TRACER */
 
 #ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b6f0cc2a00c..bbb81e7b6c4 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -32,29 +32,40 @@ static pid_t last_pid = -1;
 
 static int graph_trace_init(struct trace_array *tr)
 {
-	int cpu;
+	int cpu, ret;
+
 	for_each_online_cpu(cpu)
 		tracing_reset(tr, cpu);
 
-	return register_ftrace_graph(&trace_graph_return,
+	ret = register_ftrace_graph(&trace_graph_return,
 					&trace_graph_entry);
+	if (ret)
+		return ret;
+	tracing_start_cmdline_record();
+
+	return 0;
 }
 
 static void graph_trace_reset(struct trace_array *tr)
 {
-		unregister_ftrace_graph();
+	tracing_stop_cmdline_record();
+	unregister_ftrace_graph();
 }
 
 /* If the pid changed since the last trace, output this event */
 static int verif_pid(struct trace_seq *s, pid_t pid)
 {
+	char *comm;
+
 	if (last_pid != -1 && last_pid == pid)
 		return 1;
 
 	last_pid = pid;
-	return trace_seq_printf(s, "\n------------8<---------- thread %d"
+	comm = trace_find_cmdline(pid);
+
+	return trace_seq_printf(s, "\n------------8<---------- thread %s-%d"
 				    " ------------8<----------\n\n",
-				  pid);
+				    comm, pid);
 }
 
 static enum print_line_t
-- 
cgit v1.2.3


From 437f24fb897d409a9978eb71ecfaf279dcd94acd Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 26 Nov 2008 00:16:27 -0500
Subject: ftrace: add cpu annotation for function graph tracer

Impact: enhancement for function graph tracer

When run on a SMP box, the function graph tracer is confusing because
it shows the different CPUS as changes in the trace.

This patch adds the annotation of 'CPU[###]' where ### is a three digit
number. The output will look similar to this:

CPU[001]     dput() {
CPU[000] } 726
CPU[001]     } 487
CPU[000] do_softirq() {
CPU[001]   } 2221
CPU[000]   __do_softirq() {
CPU[000]     __local_bh_disable() {
CPU[001]   unroll_tree_refs() {
CPU[000]     } 569
CPU[001]   } 501
CPU[000]     rcu_process_callbacks() {
CPU[001]   kfree() {

What makes this nice is that now you can grep the file and produce
readable format for a particular CPU.

 # cat /debug/tracing/trace > /tmp/trace
 # grep '^CPU\[000\]' /tmp/trace > /tmp/trace0
 # grep '^CPU\[001\]' /tmp/trace > /tmp/trace1

Will give you:

 # head /tmp/trace0
CPU[000] ------------8<---------- thread sshd-3899 ------------8<----------
CPU[000]     inotify_dentry_parent_queue_event() {
CPU[000]     } 2531
CPU[000]     inotify_inode_queue_event() {
CPU[000]     } 505
CPU[000]   } 69626
CPU[000] } 73089
CPU[000] audit_syscall_exit() {
CPU[000]   path_put() {
CPU[000]     dput() {

 # head /tmp/trace1
CPU[001] ------------8<---------- thread pcscd-3446 ------------8<----------
CPU[001]               } 4186
CPU[001]               dput() {
CPU[001]               } 543
CPU[001]               vfs_permission() {
CPU[001]                 inode_permission() {
CPU[001]                   shmem_permission() {
CPU[001]                     generic_permission() {
CPU[001]                     } 501
CPU[001]                   } 2205

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions_graph.c | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index bbb81e7b6c4..d31d695174a 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -28,7 +28,7 @@ static struct tracer_flags tracer_flags = {
 };
 
 /* pid on the last trace processed */
-static pid_t last_pid = -1;
+static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
 
 static int graph_trace_init(struct trace_array *tr)
 {
@@ -53,29 +53,34 @@ static void graph_trace_reset(struct trace_array *tr)
 }
 
 /* If the pid changed since the last trace, output this event */
-static int verif_pid(struct trace_seq *s, pid_t pid)
+static int verif_pid(struct trace_seq *s, pid_t pid, int cpu)
 {
 	char *comm;
 
-	if (last_pid != -1 && last_pid == pid)
+	if (last_pid[cpu] != -1 && last_pid[cpu] == pid)
 		return 1;
 
-	last_pid = pid;
+	last_pid[cpu] = pid;
 	comm = trace_find_cmdline(pid);
 
-	return trace_seq_printf(s, "\n------------8<---------- thread %s-%d"
+	return trace_seq_printf(s, "\nCPU[%03d]"
+				    " ------------8<---------- thread %s-%d"
 				    " ------------8<----------\n\n",
-				    comm, pid);
+				    cpu, comm, pid);
 }
 
 static enum print_line_t
 print_graph_entry(struct ftrace_graph_ent *call, struct trace_seq *s,
-		struct trace_entry *ent)
+		  struct trace_entry *ent, int cpu)
 {
 	int i;
 	int ret;
 
-	if (!verif_pid(s, ent->pid))
+	if (!verif_pid(s, ent->pid, cpu))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	ret = trace_seq_printf(s, "CPU[%03d] ", cpu);
+	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
@@ -96,12 +101,16 @@ print_graph_entry(struct ftrace_graph_ent *call, struct trace_seq *s,
 
 static enum print_line_t
 print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
-		   struct trace_entry *ent)
+		   struct trace_entry *ent, int cpu)
 {
 	int i;
 	int ret;
 
-	if (!verif_pid(s, ent->pid))
+	if (!verif_pid(s, ent->pid, cpu))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	ret = trace_seq_printf(s, "CPU[%03d] ", cpu);
+	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
@@ -137,12 +146,13 @@ print_graph_function(struct trace_iterator *iter)
 	case TRACE_GRAPH_ENT: {
 		struct ftrace_graph_ent_entry *field;
 		trace_assign_type(field, entry);
-		return print_graph_entry(&field->graph_ent, s, entry);
+		return print_graph_entry(&field->graph_ent, s, entry,
+					 iter->cpu);
 	}
 	case TRACE_GRAPH_RET: {
 		struct ftrace_graph_ret_entry *field;
 		trace_assign_type(field, entry);
-		return print_graph_return(&field->ret, s, entry);
+		return print_graph_return(&field->ret, s, entry, iter->cpu);
 	}
 	default:
 		return TRACE_TYPE_UNHANDLED;
-- 
cgit v1.2.3


From bf4d83f66476086c6b50dc52aac00d71ad70494e Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 09:57:51 +1030
Subject: sched: convert nohz struct to cpumask_var_t, fix

Impact: build fix

Fix the !CONFIG_SMP case.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index eba6a156d33..1aa840a9f58 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8349,10 +8349,12 @@ void __init sched_init(void)
 
 	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
 	alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+#ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
 	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
 #endif
 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
+#endif /* SMP */
 
 	scheduler_running = 1;
 }
-- 
cgit v1.2.3


From 3d8cbdf8650f44d95333ca645d950832a0653f35 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 09:58:41 +1030
Subject: sched: convert local_cpu_mask to cpumask_var_t, fix

Impact: build fix for !CONFIG_SMP

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index fb3964579a8..94aab72f6a0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1381,6 +1381,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
 	if (!rq->rt.rt_nr_running)
 		pull_rt_task(rq);
 }
+
+static inline void init_sched_rt_class(void)
+{
+	unsigned int i;
+
+	for_each_possible_cpu(i)
+		alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL);
+}
 #endif /* CONFIG_SMP */
 
 /*
@@ -1552,11 +1560,3 @@ static void print_rt_stats(struct seq_file *m, int cpu)
 }
 #endif /* CONFIG_SCHED_DEBUG */
 
-/* Note that this is never called for !SMP, but that's OK. */
-static inline void init_sched_rt_class(void)
-{
-	unsigned int i;
-
-	for_each_possible_cpu(i)
-		alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL);
-}
-- 
cgit v1.2.3


From 1224e376f2a7e3c7ab19ef37099a78597978a696 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 09:59:20 +1030
Subject: sched: avoid stack var in move_task_off_dead_cpu, fix

Impact: locking fix

We can't call cpuset_cpus_allowed_locked() with the rq lock held.
However, the rq lock merely protects us from (1) cpu_online_mask changing
and (2) someone else changing p->cpus_allowed.

The first can't happen because we're being called from a cpu hotplug
notifier.  The second doesn't really matter: we are forcing the task off
a CPU it was affine to, so we're not doing very well anyway.

So we remove the rq lock from this path, and all is good.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 1aa840a9f58..3f5bfdc3d94 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6126,8 +6126,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
  */
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
-	unsigned long flags;
-	struct rq *rq;
 	int dest_cpu;
 	/* FIXME: Use cpumask_of_node here. */
 	cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu));
@@ -6146,10 +6144,8 @@ again:
 
 	/* No more Mr. Nice Guy. */
 	if (dest_cpu >= nr_cpu_ids) {
-		rq = task_rq_lock(p, &flags);
 		cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
 		dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
-		task_rq_unlock(rq, &flags);
 
 		/*
 		 * Don't tell them about moving exiting tasks or
-- 
cgit v1.2.3


From f3f47a6768a29448866da4422b6f6bee485c947f Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Sun, 23 Nov 2008 16:49:58 -0800
Subject: tracing: add "power-tracer": C/P state tracer to help power
 optimization

Impact: new "power-tracer" ftrace plugin

This patch adds a C/P-state ftrace plugin that will generate
detailed statistics about the C/P-states that are being used,
so that we can look at detailed decisions that the C/P-state
code is making, rather than the too high level "average"
that we have today.

An example way of using this is:

 mount -t debugfs none /sys/kernel/debug
 echo cstate > /sys/kernel/debug/tracing/current_tracer
 echo 1 > /sys/kernel/debug/tracing/tracing_enabled
 sleep 1
 echo 0 > /sys/kernel/debug/tracing/tracing_enabled
 cat /sys/kernel/debug/tracing/trace | perl scripts/trace/cstate.pl > out.svg

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig       |  11 +++
 kernel/trace/Makefile      |   1 +
 kernel/trace/trace.h       |   7 ++
 kernel/trace/trace_power.c | 179 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 198 insertions(+)
 create mode 100644 kernel/trace/trace_power.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 620feadff67..d151aab48ed 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -217,6 +217,17 @@ config BRANCH_TRACER
 
 	  Say N if unsure.
 
+config POWER_TRACER
+	bool "Trace power consumption behavior"
+	depends on DEBUG_KERNEL
+	depends on X86
+	select TRACING
+	help
+	  This tracer helps developers to analyze and optimize the kernels
+	  power management decisions, specifically the C-state and P-state
+	  behavior.
+
+
 config STACK_TRACER
 	bool "Trace max stack"
 	depends on HAVE_FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index cef4bcb4e82..acaa06553ec 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -32,5 +32,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
 obj-$(CONFIG_BTS_TRACER) += trace_bts.o
+obj-$(CONFIG_POWER_TRACER) += trace_power.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3abd645e8af..4c453778a6a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -28,6 +28,7 @@ enum trace_type {
 	TRACE_FN_RET,
 	TRACE_USER_STACK,
 	TRACE_BTS,
+	TRACE_POWER,
 
 	__TRACE_LAST_TYPE
 };
@@ -160,6 +161,11 @@ struct bts_entry {
 	unsigned long		to;
 };
 
+struct trace_power {
+	struct trace_entry	ent;
+	struct power_trace	state_data;
+};
+
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
@@ -266,6 +272,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
 		IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\
 		IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\
+ 		IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
 		__ftrace_bad_type();					\
 	} while (0)
 
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
new file mode 100644
index 00000000000..a7172a352f6
--- /dev/null
+++ b/kernel/trace/trace_power.c
@@ -0,0 +1,179 @@
+/*
+ * ring buffer based C-state tracer
+ *
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * Much is borrowed from trace_boot.c which is
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+
+#include "trace.h"
+
+static struct trace_array *power_trace;
+static int __read_mostly trace_power_enabled;
+
+
+static void start_power_trace(struct trace_array *tr)
+{
+	trace_power_enabled = 1;
+}
+
+static void stop_power_trace(struct trace_array *tr)
+{
+	trace_power_enabled = 0;
+}
+
+
+static int power_trace_init(struct trace_array *tr)
+{
+	int cpu;
+	power_trace = tr;
+
+	trace_power_enabled = 1;
+
+	for_each_cpu_mask(cpu, cpu_possible_map)
+		tracing_reset(tr, cpu);
+	return 0;
+}
+
+static enum print_line_t power_print_line(struct trace_iterator *iter)
+{
+	int ret = 0;
+	struct trace_entry *entry = iter->ent;
+	struct trace_power *field ;
+	struct power_trace *it;
+	struct trace_seq *s = &iter->seq;
+	struct timespec stamp;
+	struct timespec duration;
+
+	trace_assign_type(field, entry);
+	it = &field->state_data;
+	stamp = ktime_to_timespec(it->stamp);
+	duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
+
+	if (entry->type == TRACE_POWER) {
+		if (it->type == POWER_CSTATE)
+			ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
+					  stamp.tv_sec,
+					  stamp.tv_nsec,
+					  it->state, iter->cpu,
+					  duration.tv_sec,
+					  duration.tv_nsec);
+		if (it->type == POWER_PSTATE)
+			ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
+					  stamp.tv_sec,
+					  stamp.tv_nsec,
+					  it->state, iter->cpu);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+		return TRACE_TYPE_HANDLED;
+	}
+	return TRACE_TYPE_UNHANDLED;
+}
+
+static struct tracer power_tracer __read_mostly =
+{
+	.name		= "power",
+	.init		= power_trace_init,
+	.start		= start_power_trace,
+	.stop		= stop_power_trace,
+	.reset		= stop_power_trace,
+	.print_line	= power_print_line,
+};
+
+static int init_power_trace(void)
+{
+	return register_tracer(&power_tracer);
+}
+device_initcall(init_power_trace);
+
+void trace_power_start(struct power_trace *it, unsigned int type,
+			 unsigned int level)
+{
+	if (!trace_power_enabled)
+		return;
+
+	memset(it, 0, sizeof(struct power_trace));
+	it->state = level;
+	it->type = type;
+	it->stamp = ktime_get();
+}
+EXPORT_SYMBOL_GPL(trace_power_start);
+
+
+void trace_power_end(struct power_trace *it)
+{
+	struct ring_buffer_event *event;
+	struct trace_power *entry;
+	struct trace_array_cpu *data;
+	unsigned long irq_flags;
+	struct trace_array *tr = power_trace;
+
+	if (!trace_power_enabled)
+		return;
+
+	preempt_disable();
+	it->end = ktime_get();
+	data = tr->data[smp_processor_id()];
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+	entry->ent.type = TRACE_POWER;
+	entry->state_data = *it;
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+	trace_wake_up();
+
+ out:
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(trace_power_end);
+
+void trace_power_mark(struct power_trace *it, unsigned int type,
+			 unsigned int level)
+{
+	struct ring_buffer_event *event;
+	struct trace_power *entry;
+	struct trace_array_cpu *data;
+	unsigned long irq_flags;
+	struct trace_array *tr = power_trace;
+
+	if (!trace_power_enabled)
+		return;
+
+	memset(it, 0, sizeof(struct power_trace));
+	it->state = level;
+	it->type = type;
+	it->stamp = ktime_get();
+	preempt_disable();
+	it->end = it->stamp;
+	data = tr->data[smp_processor_id()];
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+	entry->ent.type = TRACE_POWER;
+	entry->state_data = *it;
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+	trace_wake_up();
+
+ out:
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(trace_power_mark);
-- 
cgit v1.2.3


From 83a8df618eb04bd2819a758f3b409b1449862434 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 27 Nov 2008 01:46:33 +0100
Subject: tracing/function-graph-tracer: enhancements for the trace output

Impact: enhance the output of the graph-tracer

This patch applies some ideas of Ingo Molnar and Steven Rostedt.

* Output leaf functions in one line with parenthesis, semicolon and duration
  output.

* Add a second column (after cpu) for an overhead sign.
  if duration > 100 us, "!"
  if duration > 10 us, "+"
  else " "

* Print output in us with remaining nanosec: u.n

* Print duration on the right end, following the indentation of the functions.
  Use also visual clues: "-" on entry call (no duration to output) and "+" on
  return (duration output).

The name of the tracer has been fixed as well: function-branch becomes
function_branch.

Here is an example of the new output:

CPU[000]           dequeue_entity() {                    -
CPU[000]             update_curr() {                    -
CPU[000]               update_min_vruntime();                    + 0.512 us
CPU[000]             }                                + 1.504 us
CPU[000]             clear_buddies();                    + 0.481 us
CPU[000]             update_min_vruntime();                    + 0.504 us
CPU[000]           }                                + 4.557 us
CPU[000]           hrtick_update() {                    -
CPU[000]             hrtick_start_fair();                    + 0.489 us
CPU[000]           }                                + 1.443 us
CPU[000] +       }                                + 14.655 us
CPU[000] +     }                                + 15.678 us
CPU[000] +   }                                + 16.686 us
CPU[000]     msecs_to_jiffies();                    + 0.481 us
CPU[000]     put_prev_task_fair();                    + 0.504 us
CPU[000]     pick_next_task_fair();                    + 0.482 us
CPU[000]     pick_next_task_rt();                    + 0.504 us
CPU[000]     pick_next_task_fair();                    + 0.481 us
CPU[000]     pick_next_task_idle();                    + 0.489 us
CPU[000]     _spin_trylock();                    + 0.655 us
CPU[000]     _spin_unlock();                    + 0.609 us

CPU[000]  ------------8<---------- thread bash-2794 ------------8<----------

CPU[000]               finish_task_switch() {                    -
CPU[000]                 _spin_unlock_irq();                    + 0.722 us
CPU[000]               }                                + 2.369 us
CPU[000] !           }                                + 501972.605 us
CPU[000] !         }                                + 501973.763 us
CPU[000]           copy_from_read_buf() {                    -
CPU[000]             _spin_lock_irqsave();                    + 0.670 us
CPU[000]             _spin_unlock_irqrestore();                    + 0.699 us
CPU[000]             copy_to_user() {                    -
CPU[000]               might_fault() {                    -
CPU[000]                 __might_sleep();                    + 0.503 us
CPU[000]               }                                + 1.632 us
CPU[000]               __copy_to_user_ll();                    + 0.542 us
CPU[000]             }                                + 3.858 us
CPU[000]             tty_audit_add_data() {                    -
CPU[000]               _spin_lock_irq();                    + 0.609 us
CPU[000]               _spin_unlock_irq();                    + 0.624 us
CPU[000]             }                                + 3.196 us
CPU[000]             _spin_lock_irqsave();                    + 0.624 us
CPU[000]             _spin_unlock_irqrestore();                    + 0.625 us
CPU[000] +         }                                + 13.611 us
CPU[000]           copy_from_read_buf() {                    -
CPU[000]             _spin_lock_irqsave();                    + 0.624 us
CPU[000]             _spin_unlock_irqrestore();                    + 0.616 us
CPU[000]           }                                + 2.820 us
CPU[000]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions_graph.c | 168 +++++++++++++++++++++++++++++++++--
 1 file changed, 159 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d31d695174a..b958d60377b 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -14,6 +14,10 @@
 #include "trace.h"
 
 #define TRACE_GRAPH_INDENT	2
+/* Spaces between function call and time duration */
+#define TRACE_GRAPH_TIMESPACE_ENTRY	"                    "
+/* Spaces between function call and closing braces */
+#define TRACE_GRAPH_TIMESPACE_RET	"                               "
 
 #define TRACE_GRAPH_PRINT_OVERRUN	0x1
 static struct tracer_opt trace_opts[] = {
@@ -63,26 +67,130 @@ static int verif_pid(struct trace_seq *s, pid_t pid, int cpu)
 	last_pid[cpu] = pid;
 	comm = trace_find_cmdline(pid);
 
-	return trace_seq_printf(s, "\nCPU[%03d]"
+	return trace_seq_printf(s, "\nCPU[%03d] "
 				    " ------------8<---------- thread %s-%d"
 				    " ------------8<----------\n\n",
 				    cpu, comm, pid);
 }
 
+static bool
+trace_branch_is_leaf(struct trace_iterator *iter,
+		struct ftrace_graph_ent_entry *curr)
+{
+	struct ring_buffer_iter *ring_iter;
+	struct ring_buffer_event *event;
+	struct ftrace_graph_ret_entry *next;
+
+	ring_iter = iter->buffer_iter[iter->cpu];
+
+	if (!ring_iter)
+		return false;
+
+	event = ring_buffer_iter_peek(iter->buffer_iter[iter->cpu], NULL);
+
+	if (!event)
+		return false;
+
+	next = ring_buffer_event_data(event);
+
+	if (next->ent.type != TRACE_GRAPH_RET)
+		return false;
+
+	if (curr->ent.pid != next->ent.pid ||
+			curr->graph_ent.func != next->ret.func)
+		return false;
+
+	return true;
+}
+
+
+static inline int
+print_graph_duration(unsigned long long duration, struct trace_seq *s)
+{
+	unsigned long nsecs_rem = do_div(duration, 1000);
+	return trace_seq_printf(s, "+ %llu.%lu us\n", duration, nsecs_rem);
+}
+
+/* Signal a overhead of time execution to the output */
+static int
+print_graph_overhead(unsigned long long duration, struct trace_seq *s)
+{
+	/* Duration exceeded 100 msecs */
+	if (duration > 100000ULL)
+		return trace_seq_printf(s, "! ");
+
+	/* Duration exceeded 10 msecs */
+	if (duration > 10000ULL)
+		return trace_seq_printf(s, "+ ");
+
+	return trace_seq_printf(s, "  ");
+}
+
+/* Case of a leaf function on its call entry */
 static enum print_line_t
-print_graph_entry(struct ftrace_graph_ent *call, struct trace_seq *s,
-		  struct trace_entry *ent, int cpu)
+print_graph_entry_leaf(struct trace_iterator *iter,
+		struct ftrace_graph_ent_entry *entry, struct trace_seq *s)
 {
+	struct ftrace_graph_ret_entry *ret_entry;
+	struct ftrace_graph_ret *graph_ret;
+	struct ring_buffer_event *event;
+	struct ftrace_graph_ent *call;
+	unsigned long long duration;
 	int i;
 	int ret;
 
-	if (!verif_pid(s, ent->pid, cpu))
+	event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
+	ret_entry = ring_buffer_event_data(event);
+	graph_ret = &ret_entry->ret;
+	call = &entry->graph_ent;
+	duration = graph_ret->rettime - graph_ret->calltime;
+
+	/* Overhead */
+	ret = print_graph_overhead(duration, s);
+	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	ret = trace_seq_printf(s, "CPU[%03d] ", cpu);
+	/* Function */
+	for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
+		ret = trace_seq_printf(s, " ");
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	ret = seq_print_ip_sym(s, call->func, 0);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	ret = trace_seq_printf(s, "();");
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
+	/* Duration */
+	ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_ENTRY);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	ret = print_graph_duration(duration, s);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
+			struct trace_seq *s)
+{
+	int i;
+	int ret;
+	struct ftrace_graph_ent *call = &entry->graph_ent;
+
+	/* No overhead */
+	ret = trace_seq_printf(s, "  ");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Function */
 	for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
 		ret = trace_seq_printf(s, " ");
 		if (!ret)
@@ -93,26 +201,62 @@ print_graph_entry(struct ftrace_graph_ent *call, struct trace_seq *s,
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	ret = trace_seq_printf(s, "() {\n");
+	ret = trace_seq_printf(s, "() {");
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* No duration to print at this state */
+	ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_ENTRY "-\n");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
 	return TRACE_TYPE_HANDLED;
 }
 
+static enum print_line_t
+print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
+			struct trace_iterator *iter, int cpu)
+{
+	int ret;
+	struct trace_entry *ent = iter->ent;
+
+	if (!verif_pid(s, ent->pid, cpu))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	ret = trace_seq_printf(s, "CPU[%03d] ", cpu);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	if (trace_branch_is_leaf(iter, field))
+		return print_graph_entry_leaf(iter, field, s);
+	else
+		return print_graph_entry_nested(field, s);
+
+}
+
 static enum print_line_t
 print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 		   struct trace_entry *ent, int cpu)
 {
 	int i;
 	int ret;
+	unsigned long long duration = trace->rettime - trace->calltime;
 
+	/* Pid */
 	if (!verif_pid(s, ent->pid, cpu))
 		return TRACE_TYPE_PARTIAL_LINE;
 
+	/* Cpu */
 	ret = trace_seq_printf(s, "CPU[%03d] ", cpu);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
+	/* Overhead */
+	ret = print_graph_overhead(duration, s);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Closing brace */
 	for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
 		ret = trace_seq_printf(s, " ");
 		if (!ret)
@@ -123,10 +267,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	ret = trace_seq_printf(s, "%llu\n", trace->rettime - trace->calltime);
+	/* Duration */
+	ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_RET);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	ret = print_graph_duration(duration, s);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
+	/* Overrun */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
 		ret = trace_seq_printf(s, " (Overruns: %lu)\n",
 					trace->overrun);
@@ -146,7 +296,7 @@ print_graph_function(struct trace_iterator *iter)
 	case TRACE_GRAPH_ENT: {
 		struct ftrace_graph_ent_entry *field;
 		trace_assign_type(field, entry);
-		return print_graph_entry(&field->graph_ent, s, entry,
+		return print_graph_entry(field, s, iter,
 					 iter->cpu);
 	}
 	case TRACE_GRAPH_RET: {
@@ -160,7 +310,7 @@ print_graph_function(struct trace_iterator *iter)
 }
 
 static struct tracer graph_trace __read_mostly = {
-	.name	     = "function-graph",
+	.name	     = "function_graph",
 	.init	     = graph_trace_init,
 	.reset	     = graph_trace_reset,
 	.print_line = print_graph_function,
-- 
cgit v1.2.3


From 1a056155edd458eb93ef383fa8e5741d7e7c6360 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 28 Nov 2008 00:42:46 +0100
Subject: tracing/function-graph-tracer: adjustments of the trace informations

Impact: increase the visual qualities of the call-graph-tracer output

This patch applies various trace output formatting changes:

 - CPU is now a decimal number, followed by a parenthesis.

 - Overhead is now on the second column (gives a good visibility)

 - Cost is now on the third column, can't exceed 9999.99 us. It is
   followed by a virtual line based on a "|" character.

 - Functions calls are now the last column on the right. This way, we
   haven't dynamic column (which flow is harder to follow) on its right.

 - CPU and Overhead have their own option flag. They are default-on but you
   can disable them easily:

      echo nofuncgraph-cpu > trace_options
      echo nofuncgraph-overhead > trace_options

TODO:

_ Refactoring of the thread switch output.
_ Give a default-off option to output the thread and its pid on each row.
_ Provide headers
_ ....

Here is an example of the new trace style:

0)           |             mutex_unlock() {
0)      0.639 us |           __mutex_unlock_slowpath();
0)      1.607 us |         }
0)           |             remove_wait_queue() {
0)      0.616 us |           _spin_lock_irqsave();
0)      0.616 us |           _spin_unlock_irqrestore();
0)      2.779 us |         }
0)      0.495 us |         n_tty_set_room();
0) ! 9999.999 us |       }
0)           |           tty_ldisc_deref() {
0)      0.615 us |         _spin_lock_irqsave();
0)      0.616 us |         _spin_unlock_irqrestore();
0)      2.793 us |       }
0)           |           current_fs_time() {
0)      0.488 us |         current_kernel_time();
0)      0.495 us |         timespec_trunc();
0)      2.486 us |       }
0) ! 9999.999 us |     }
0) ! 9999.999 us |   }
0) ! 9999.999 us | }
0)           |     sys_read() {
0)      0.796 us |   fget_light();
0)           |       vfs_read() {
0)           |         rw_verify_area() {
0)           |           security_file_permission() {
0)      0.488 us |         cap_file_permission();
0)      1.720 us |       }
0)      3.  4 us |     }
0)           |         tty_read() {
0)      0.488 us |       tty_paranoia_check();
0)           |           tty_ldisc_ref_wait() {
0)           |             tty_ldisc_try() {
0)      0.615 us |           _spin_lock_irqsave();
0)      0.615 us |           _spin_unlock_irqrestore();
0)      5.436 us |         }
0)      6.427 us |       }

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions_graph.c | 142 +++++++++++++++++++++++------------
 1 file changed, 93 insertions(+), 49 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b958d60377b..596a3ee4305 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -14,20 +14,25 @@
 #include "trace.h"
 
 #define TRACE_GRAPH_INDENT	2
-/* Spaces between function call and time duration */
-#define TRACE_GRAPH_TIMESPACE_ENTRY	"                    "
-/* Spaces between function call and closing braces */
-#define TRACE_GRAPH_TIMESPACE_RET	"                               "
 
+/* Flag options */
 #define TRACE_GRAPH_PRINT_OVERRUN	0x1
+#define TRACE_GRAPH_PRINT_CPU		0x2
+#define TRACE_GRAPH_PRINT_OVERHEAD	0x4
+
 static struct tracer_opt trace_opts[] = {
-	/* Display overruns or not */
-	{ TRACER_OPT(overrun, TRACE_GRAPH_PRINT_OVERRUN) },
+	/* Display overruns ? */
+	{ TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
+	/* Display CPU ? */
+	{ TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) },
+	/* Display Overhead ? */
+	{ TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) },
 	{ } /* Empty entry */
 };
 
 static struct tracer_flags tracer_flags = {
-	.val = 0, /* Don't display overruns by default */
+	/* Don't display overruns by default */
+	.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD,
 	.opts = trace_opts
 };
 
@@ -56,6 +61,36 @@ static void graph_trace_reset(struct trace_array *tr)
 	unregister_ftrace_graph();
 }
 
+static inline int log10_cpu(int nb)
+{
+	if (nb / 100)
+		return 3;
+	if (nb / 10)
+		return 2;
+	return 1;
+}
+
+static enum print_line_t
+print_graph_cpu(struct trace_seq *s, int cpu)
+{
+	int i;
+	int ret;
+	int log10_this = log10_cpu(cpu);
+	int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map));
+
+
+	for (i = 0; i < log10_all - log10_this; i++) {
+		ret = trace_seq_printf(s, " ");
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+	ret = trace_seq_printf(s, "%d) ", cpu);
+	if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	return TRACE_TYPE_HANDLED;
+}
+
+
 /* If the pid changed since the last trace, output this event */
 static int verif_pid(struct trace_seq *s, pid_t pid, int cpu)
 {
@@ -67,8 +102,7 @@ static int verif_pid(struct trace_seq *s, pid_t pid, int cpu)
 	last_pid[cpu] = pid;
 	comm = trace_find_cmdline(pid);
 
-	return trace_seq_printf(s, "\nCPU[%03d] "
-				    " ------------8<---------- thread %s-%d"
+	return trace_seq_printf(s, "\n------------8<---------- thread %s-%d"
 				    " ------------8<----------\n\n",
 				    cpu, comm, pid);
 }
@@ -86,7 +120,7 @@ trace_branch_is_leaf(struct trace_iterator *iter,
 	if (!ring_iter)
 		return false;
 
-	event = ring_buffer_iter_peek(iter->buffer_iter[iter->cpu], NULL);
+	event = ring_buffer_iter_peek(ring_iter, NULL);
 
 	if (!event)
 		return false;
@@ -108,7 +142,7 @@ static inline int
 print_graph_duration(unsigned long long duration, struct trace_seq *s)
 {
 	unsigned long nsecs_rem = do_div(duration, 1000);
-	return trace_seq_printf(s, "+ %llu.%lu us\n", duration, nsecs_rem);
+	return trace_seq_printf(s, "%4llu.%3lu us | ", duration, nsecs_rem);
 }
 
 /* Signal a overhead of time execution to the output */
@@ -136,8 +170,8 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 	struct ring_buffer_event *event;
 	struct ftrace_graph_ent *call;
 	unsigned long long duration;
-	int i;
 	int ret;
+	int i;
 
 	event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
 	ret_entry = ring_buffer_event_data(event);
@@ -145,8 +179,19 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 	call = &entry->graph_ent;
 	duration = graph_ret->rettime - graph_ret->calltime;
 
+	/* Must not exceed 8 characters: 9999.999 us */
+	if (duration > 10000000ULL)
+		duration = 9999999ULL;
+
 	/* Overhead */
-	ret = print_graph_overhead(duration, s);
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+		ret = print_graph_overhead(duration, s);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	/* Duration */
+	ret = print_graph_duration(duration, s);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -161,16 +206,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	ret = trace_seq_printf(s, "();");
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Duration */
-	ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_ENTRY);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	ret = print_graph_duration(duration, s);
+	ret = trace_seq_printf(s, "();\n");
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -186,9 +222,14 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
 	struct ftrace_graph_ent *call = &entry->graph_ent;
 
 	/* No overhead */
-	ret = trace_seq_printf(s, "  ");
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+		ret = trace_seq_printf(s, "  ");
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	/* No time */
+	ret = trace_seq_printf(s, "        |     ");
 
 	/* Function */
 	for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
@@ -201,12 +242,7 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	ret = trace_seq_printf(s, "() {");
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* No duration to print at this state */
-	ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_ENTRY "-\n");
+	ret = trace_seq_printf(s, "() {\n");
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -220,12 +256,16 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 	int ret;
 	struct trace_entry *ent = iter->ent;
 
+	/* Pid */
 	if (!verif_pid(s, ent->pid, cpu))
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	ret = trace_seq_printf(s, "CPU[%03d] ", cpu);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
+	/* Cpu */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+		ret = print_graph_cpu(s, cpu);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
 
 	if (trace_branch_is_leaf(iter, field))
 		return print_graph_entry_leaf(iter, field, s);
@@ -242,17 +282,30 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 	int ret;
 	unsigned long long duration = trace->rettime - trace->calltime;
 
+	/* Must not exceed 8 characters: xxxx.yyy us */
+	if (duration > 10000000ULL)
+		duration = 9999999ULL;
+
 	/* Pid */
 	if (!verif_pid(s, ent->pid, cpu))
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Cpu */
-	ret = trace_seq_printf(s, "CPU[%03d] ", cpu);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+		ret = print_graph_cpu(s, cpu);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
 
 	/* Overhead */
-	ret = print_graph_overhead(duration, s);
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+		ret = print_graph_overhead(duration, s);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	/* Duration */
+	ret = print_graph_duration(duration, s);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -263,16 +316,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
-	ret = trace_seq_printf(s, "} ");
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Duration */
-	ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_RET);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	ret = print_graph_duration(duration, s);
+	ret = trace_seq_printf(s, "}\n");
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-- 
cgit v1.2.3


From d51090b34602a20984ab0312ef04e47069c0aec6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 28 Nov 2008 09:55:16 +0100
Subject: tracing/function-graph-tracer: more output tweaks

Impact: prettify the output some more

Before:

0)           |     sys_read() {
0)      0.796 us |   fget_light();
0)           |       vfs_read() {
0)           |         rw_verify_area() {
0)           |           security_file_permission() {
------------8<---------- thread sshd-1755 ------------8<----------

After:

 0)               |  sys_read() {
 0)      0.796 us |    fget_light();
 0)               |    vfs_read() {
 0)               |      rw_verify_area() {
 0)               |        security_file_permission() {
 ------------------------------------------
 | 1)  migration/0--1  =>  sshd-1755
 ------------------------------------------

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions_graph.c | 45 ++++++++++++++++++++++++++++++------
 1 file changed, 38 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 596a3ee4305..894b50bca31 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -79,6 +79,19 @@ print_graph_cpu(struct trace_seq *s, int cpu)
 	int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map));
 
 
+	/*
+	 * Start with a space character - to make it stand out
+	 * to the right a bit when trace output is pasted into
+	 * email:
+	 */
+	ret = trace_seq_printf(s, " ");
+
+	/*
+	 * Tricky - we space the CPU field according to the max
+	 * number of online CPUs. On a 2-cpu system it would take
+	 * a maximum of 1 digit - on a 128 cpu system it would
+	 * take up to 3 digits:
+	 */
 	for (i = 0; i < log10_all - log10_this; i++) {
 		ret = trace_seq_printf(s, " ");
 		if (!ret)
@@ -86,7 +99,8 @@ print_graph_cpu(struct trace_seq *s, int cpu)
 	}
 	ret = trace_seq_printf(s, "%d) ", cpu);
 	if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
+		return TRACE_TYPE_PARTIAL_LINE;
+
 	return TRACE_TYPE_HANDLED;
 }
 
@@ -94,17 +108,34 @@ print_graph_cpu(struct trace_seq *s, int cpu)
 /* If the pid changed since the last trace, output this event */
 static int verif_pid(struct trace_seq *s, pid_t pid, int cpu)
 {
-	char *comm;
+	char *comm, *prev_comm;
+	pid_t prev_pid;
+	int ret;
 
 	if (last_pid[cpu] != -1 && last_pid[cpu] == pid)
 		return 1;
 
+	prev_pid = last_pid[cpu];
 	last_pid[cpu] = pid;
+
 	comm = trace_find_cmdline(pid);
+	prev_comm = trace_find_cmdline(prev_pid);
 
-	return trace_seq_printf(s, "\n------------8<---------- thread %s-%d"
-				    " ------------8<----------\n\n",
-				    cpu, comm, pid);
+/*
+ * Context-switch trace line:
+
+ ------------------------------------------
+ | 1)  migration/0--1  =>  sshd-1755
+ ------------------------------------------
+
+ */
+	ret = trace_seq_printf(s,
+		" ------------------------------------------\n");
+	ret += trace_seq_printf(s, " | %d)  %s-%d  =>  %s-%d\n",
+				  cpu, prev_comm, prev_pid, comm, pid);
+	ret += trace_seq_printf(s,
+		" ------------------------------------------\n\n");
+	return ret;
 }
 
 static bool
@@ -142,7 +173,7 @@ static inline int
 print_graph_duration(unsigned long long duration, struct trace_seq *s)
 {
 	unsigned long nsecs_rem = do_div(duration, 1000);
-	return trace_seq_printf(s, "%4llu.%3lu us | ", duration, nsecs_rem);
+	return trace_seq_printf(s, "%4llu.%3lu us |  ", duration, nsecs_rem);
 }
 
 /* Signal a overhead of time execution to the output */
@@ -229,7 +260,7 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
 	}
 
 	/* No time */
-	ret = trace_seq_printf(s, "        |     ");
+	ret = trace_seq_printf(s, "            |  ");
 
 	/* Function */
 	for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
-- 
cgit v1.2.3


From c7425acb42fff1e723b05fbf4ea11e9a455d95dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= <edwintorok@gmail.com>
Date: Fri, 28 Nov 2008 11:17:56 +0200
Subject: tracing, alpha: fix build: add missing #ifdef CONFIG_STACKTRACE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are architectures that still have no stacktrace support.

Signed-off-by: Török Edwin <edwintorok@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5811e0a5f73..91887a280ab 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -983,6 +983,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 		   struct trace_array_cpu *data,
 		   unsigned long flags, int pc)
 {
+#ifdef CONFIG_STACKTRACE
 	struct ring_buffer_event *event;
 	struct userstack_entry *entry;
 	struct stack_trace trace;
@@ -1008,6 +1009,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 
 	save_stack_trace_user(&trace);
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+#endif
 }
 
 void __trace_userstack(struct trace_array *tr,
-- 
cgit v1.2.3


From 50cdaf08a8ec1d7f43987705da7aff7cf949708f Mon Sep 17 00:00:00 2001
From: Liming Wang <liming.wang@windriver.com>
Date: Fri, 28 Nov 2008 12:13:21 +0800
Subject: ftrace: improve seq_operation of ftrace

Impact: make ftrace position computing more sane

First remove useless ->pos field. Then we needn't check seq_printf
in .show like other place.

Signed-off-by: Liming Wang <liming.wang@windriver.com>
Reviewed-by: Bruce Ashfield <bruce.ashfield@windriver.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 27 ++++++++++-----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index cbf8b09f63a..08b536a2614 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -786,7 +786,6 @@ enum {
 #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
 
 struct ftrace_iterator {
-	loff_t			pos;
 	struct ftrace_page	*pg;
 	unsigned		idx;
 	unsigned		flags;
@@ -811,6 +810,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 			iter->pg = iter->pg->next;
 			iter->idx = 0;
 			goto retry;
+		} else {
+			iter->idx = -1;
 		}
 	} else {
 		rec = &iter->pg->records[iter->idx++];
@@ -833,8 +834,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 	}
 	spin_unlock(&ftrace_lock);
 
-	iter->pos = *pos;
-
 	return rec;
 }
 
@@ -842,13 +841,15 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 {
 	struct ftrace_iterator *iter = m->private;
 	void *p = NULL;
-	loff_t l = -1;
 
-	if (*pos > iter->pos)
-		*pos = iter->pos;
+	if (*pos > 0) {
+		if (iter->idx < 0)
+			return p;
+		(*pos)--;
+		iter->idx--;
+	}
 
-	l = *pos;
-	p = t_next(m, p, &l);
+	p = t_next(m, p, pos);
 
 	return p;
 }
@@ -859,21 +860,15 @@ static void t_stop(struct seq_file *m, void *p)
 
 static int t_show(struct seq_file *m, void *v)
 {
-	struct ftrace_iterator *iter = m->private;
 	struct dyn_ftrace *rec = v;
 	char str[KSYM_SYMBOL_LEN];
-	int ret = 0;
 
 	if (!rec)
 		return 0;
 
 	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
 
-	ret = seq_printf(m, "%s\n", str);
-	if (ret < 0) {
-		iter->pos--;
-		iter->idx--;
-	}
+	seq_printf(m, "%s\n", str);
 
 	return 0;
 }
@@ -899,7 +894,6 @@ ftrace_avail_open(struct inode *inode, struct file *file)
 		return -ENOMEM;
 
 	iter->pg = ftrace_pages_start;
-	iter->pos = 0;
 
 	ret = seq_open(file, &show_ftrace_seq_ops);
 	if (!ret) {
@@ -986,7 +980,6 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
 
 	if (file->f_mode & FMODE_READ) {
 		iter->pg = ftrace_pages_start;
-		iter->pos = 0;
 		iter->flags = enable ? FTRACE_ITER_FILTER :
 			FTRACE_ITER_NOTRACE;
 
-- 
cgit v1.2.3


From 8b752e3ef6e3f5cde87afc649dd51d92b1e549c1 Mon Sep 17 00:00:00 2001
From: Liming Wang <liming.wang@windriver.com>
Date: Fri, 28 Nov 2008 09:52:40 +0800
Subject: softirq: remove useless function __local_bh_enable

Impact: remove unused code

__local_bh_enable has been replaced with _local_bh_enable.
As comments says "it always nests inside local_bh_enable() sections"
has not been valid now. Also there is no reason to use __local_bh_enable
anywhere, so we can remove this useless function.

Signed-off-by: Liming Wang <liming.wang@windriver.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/softirq.c | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index e7c69a720d6..8d9934b4162 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -102,20 +102,6 @@ void local_bh_disable(void)
 
 EXPORT_SYMBOL(local_bh_disable);
 
-void __local_bh_enable(void)
-{
-	WARN_ON_ONCE(in_irq());
-
-	/*
-	 * softirqs should never be enabled by __local_bh_enable(),
-	 * it always nests inside local_bh_enable() sections:
-	 */
-	WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET);
-
-	sub_preempt_count(SOFTIRQ_OFFSET);
-}
-EXPORT_SYMBOL_GPL(__local_bh_enable);
-
 /*
  * Special-case - softirqs can safely be enabled in
  * cond_resched_softirq(), or by __do_softirq(),
-- 
cgit v1.2.3


From 74853dba2f7a1a9b0905a09abcf65c1f3ce0b14f Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 28 Nov 2008 08:35:25 -0800
Subject: debug warnings: consolidate warn_slowpath and warn_on_slowpath

Impact: cleanup, code reduction

warn_slowpath is a superset of warn_on_slowpath; just have
warn_on_slowpath call warn_slowpath with a NULL 3rd argument.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/panic.c | 32 ++++++++++++--------------------
 1 file changed, 12 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 6513aac8e99..6bbf7b905c7 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -320,23 +320,6 @@ void oops_exit(void)
 }
 
 #ifdef WANT_WARN_ON_SLOWPATH
-void warn_on_slowpath(const char *file, int line)
-{
-	char function[KSYM_SYMBOL_LEN];
-	unsigned long caller = (unsigned long) __builtin_return_address(0);
-	sprint_symbol(function, caller);
-
-	printk(KERN_WARNING "------------[ cut here ]------------\n");
-	printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
-		line, function);
-	print_modules();
-	dump_stack();
-	print_oops_end_marker();
-	add_taint(TAINT_WARN);
-}
-EXPORT_SYMBOL(warn_on_slowpath);
-
-
 void warn_slowpath(const char *file, int line, const char *fmt, ...)
 {
 	va_list args;
@@ -347,9 +330,12 @@ void warn_slowpath(const char *file, int line, const char *fmt, ...)
 	printk(KERN_WARNING "------------[ cut here ]------------\n");
 	printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
 		line, function);
-	va_start(args, fmt);
-	vprintk(fmt, args);
-	va_end(args);
+
+	if (fmt) {
+		va_start(args, fmt);
+		vprintk(fmt, args);
+		va_end(args);
+	}
 
 	print_modules();
 	dump_stack();
@@ -357,6 +343,12 @@ void warn_slowpath(const char *file, int line, const char *fmt, ...)
 	add_taint(TAINT_WARN);
 }
 EXPORT_SYMBOL(warn_slowpath);
+
+void warn_on_slowpath(const char *file, int line)
+{
+	warn_slowpath(file, line, NULL);
+}
+EXPORT_SYMBOL(warn_on_slowpath);
 #endif
 
 #ifdef CONFIG_CC_STACKPROTECTOR
-- 
cgit v1.2.3


From bd89bb29a01503c5cffa367eccb0b356f910cb8d Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 28 Nov 2008 08:36:09 -0800
Subject: debug warnings: print the DMI board info name in a WARN/WARN_ON

Impact: extend WARN_ON() output with DMI_PRODUCT_NAME

It's very useful for many low level WARN_ON's to find out which
motherboard has the broken BIOS etc... this patch adds a printk
to the WARN_ON code for this.

On architectures without DMI, gcc should optimize the code out.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/panic.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 6bbf7b905c7..73d365199c3 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -21,6 +21,7 @@
 #include <linux/debug_locks.h>
 #include <linux/random.h>
 #include <linux/kallsyms.h>
+#include <linux/dmi.h>
 
 int panic_on_oops;
 static unsigned long tainted_mask;
@@ -325,11 +326,16 @@ void warn_slowpath(const char *file, int line, const char *fmt, ...)
 	va_list args;
 	char function[KSYM_SYMBOL_LEN];
 	unsigned long caller = (unsigned long)__builtin_return_address(0);
+	const char *board;
+
 	sprint_symbol(function, caller);
 
 	printk(KERN_WARNING "------------[ cut here ]------------\n");
 	printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
 		line, function);
+	board = dmi_get_system_info(DMI_PRODUCT_NAME);
+	if (board)
+		printk(KERN_WARNING "Hardware name: %s\n", board);
 
 	if (fmt) {
 		va_start(args, fmt);
-- 
cgit v1.2.3


From ec5679e513305f1411753e5f5489935bd638af23 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 28 Nov 2008 17:56:14 +0100
Subject: debug warnings: eliminate warn_on_slowpath()

Impact: cleanup, eliminate code

now that warn_on_slowpath() uses warn_slowpath(...,NULL), we can
eliminate warn_on_slowpath() altogether and use warn_slowpath().

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/panic.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 73d365199c3..50349a41fba 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -349,12 +349,6 @@ void warn_slowpath(const char *file, int line, const char *fmt, ...)
 	add_taint(TAINT_WARN);
 }
 EXPORT_SYMBOL(warn_slowpath);
-
-void warn_on_slowpath(const char *file, int line)
-{
-	warn_slowpath(file, line, NULL);
-}
-EXPORT_SYMBOL(warn_on_slowpath);
 #endif
 
 #ifdef CONFIG_CC_STACKPROTECTOR
-- 
cgit v1.2.3


From 70574a996fc7a70c5586eb56bd92a544eccf18b6 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 28 Nov 2008 22:08:00 +0300
Subject: sched: move double_unlock_balance() higher

Move double_lock_balance()/double_unlock_balance() higher to fix the following
with gcc-3.4.6:

   CC      kernel/sched.o
 In file included from kernel/sched.c:1605:
 kernel/sched_rt.c: In function `find_lock_lowest_rq':
 kernel/sched_rt.c:914: sorry, unimplemented: inlining failed in call to 'double_unlock_balance': function body not available
 kernel/sched_rt.c:1077: sorry, unimplemented: called from here
 make[2]: *** [kernel/sched.o] Error 1

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    | 67 +++++++++++++++++++++++++++----------------------------
 kernel/sched_rt.c |  4 ----
 2 files changed, 33 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3d1ee429219..6a99703e0eb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1581,6 +1581,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(this_rq->lock)
+	__acquires(busiest->lock)
+	__acquires(this_rq->lock)
+{
+	int ret = 0;
+
+	if (unlikely(!irqs_disabled())) {
+		/* printk() doesn't work good under rq->lock */
+		spin_unlock(&this_rq->lock);
+		BUG_ON(1);
+	}
+	if (unlikely(!spin_trylock(&busiest->lock))) {
+		if (busiest < this_rq) {
+			spin_unlock(&this_rq->lock);
+			spin_lock(&busiest->lock);
+			spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
+			ret = 1;
+		} else
+			spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
+	}
+	return ret;
+}
+
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(busiest->lock)
+{
+	spin_unlock(&busiest->lock);
+	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -2780,40 +2813,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 		__release(rq2->lock);
 }
 
-/*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
- */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
-	__releases(this_rq->lock)
-	__acquires(busiest->lock)
-	__acquires(this_rq->lock)
-{
-	int ret = 0;
-
-	if (unlikely(!irqs_disabled())) {
-		/* printk() doesn't work good under rq->lock */
-		spin_unlock(&this_rq->lock);
-		BUG_ON(1);
-	}
-	if (unlikely(!spin_trylock(&busiest->lock))) {
-		if (busiest < this_rq) {
-			spin_unlock(&this_rq->lock);
-			spin_lock(&busiest->lock);
-			spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
-			ret = 1;
-		} else
-			spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
-	}
-	return ret;
-}
-
-static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
-	__releases(busiest->lock)
-{
-	spin_unlock(&busiest->lock);
-	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
-}
-
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 2bdd4442359..587a16e2a8f 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -909,10 +909,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 /* Only try algorithms three times */
 #define RT_MAX_TRIES 3
 
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
-static inline void double_unlock_balance(struct rq *this_rq,
-						struct rq *busiest);
-
 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
-- 
cgit v1.2.3


From 65c6dc6adbe7ee0acf207445243400a68c77af15 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 29 Nov 2008 04:12:46 +0100
Subject: tracing/branch-tracer: include missing irqflags.h

Impact: fix build error on branch tracer

This should fix a build error reported on alpha in linux-next:

 CC      kernel/trace/trace_branch.o
  kernel/trace/trace_branch.c: In function 'probe_likely_condition':
  kernel/trace/trace_branch.c:44: error: implicit declaration of function 'raw_local_irq_save'
  kernel/trace/trace_branch.c:76: error: implicit declaration of function 'raw_local_irq_restore'

Unfortunately, I can't test it since I don't have any Alpha build environment.

Reported-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_branch.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 877ee88e6a7..bc972753568 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -6,6 +6,7 @@
 #include <linux/kallsyms.h>
 #include <linux/seq_file.h>
 #include <linux/spinlock.h>
+#include <linux/irqflags.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/module.h>
-- 
cgit v1.2.3


From 6c415b9234a8c71f290e5d4fddc467f103f32719 Mon Sep 17 00:00:00 2001
From: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
Date: Mon, 1 Dec 2008 20:49:05 +0530
Subject: sched: add uid information to sched_debug for CONFIG_USER_SCHED

Impact: extend information in /proc/sched_debug

This patch adds uid information in sched_debug for CONFIG_USER_SCHED

Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c       | 10 ++++++++++
 kernel/sched_debug.c |  6 +++++-
 kernel/user.c        |  2 ++
 3 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6a99703e0eb..4c7388ef5be 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -261,6 +261,10 @@ struct task_group {
 	struct cgroup_subsys_state css;
 #endif
 
+#ifdef CONFIG_USER_SCHED
+	uid_t uid;
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* schedulable entities of this group on each cpu */
 	struct sched_entity **se;
@@ -286,6 +290,12 @@ struct task_group {
 
 #ifdef CONFIG_USER_SCHED
 
+/* Helper function to pass uid information to create_sched_user() */
+void set_tg_uid(struct user_struct *user)
+{
+	user->tg->uid = user->uid;
+}
+
 /*
  * Root task group.
  * 	Every UID task group (including init_task_group aka UID-0) will
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index baf2f17af46..4293cfa9681 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -160,10 +160,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	cgroup_path(tg->css.cgroup, path, sizeof(path));
 
 	SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
+#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
+	{
+		uid_t uid = cfs_rq->tg->uid;
+		SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
+	}
 #else
 	SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
 #endif
-
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
 			SPLIT_NS(cfs_rq->exec_clock));
 
diff --git a/kernel/user.c b/kernel/user.c
index 39d6159fae4..cec2224bc9f 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -101,6 +101,8 @@ static int sched_create_user(struct user_struct *up)
 	if (IS_ERR(up->tg))
 		rc = -ENOMEM;
 
+	set_tg_uid(up);
+
 	return rc;
 }
 
-- 
cgit v1.2.3


From 66eafebc1086014709dc38f52ddcb3d67d9b346c Mon Sep 17 00:00:00 2001
From: Liming Wang <liming.wang@windriver.com>
Date: Tue, 2 Dec 2008 10:33:08 +0800
Subject: function trace: fix a bug of single thread function trace

Impact: fix "no output from tracer" bug caused by ftrace_update_pid_func()

When disabling single thread function trace using
"echo -1 > set_ftrace_pid", the normal function trace
has to restore to original function, otherwise the normal
function trace will not work well.

Without this commit, something like below:

	$ ps |grep 850
	  850 root      2556 S    -/bin/sh
	$ echo 850 > /debug/tracing/set_ftrace_pid
	$ echo function > /debug/tracing/current_tracer
	$ echo 1 > /debug/tracing/tracing_enabled
	$ sleep 1
	$ echo 0 > /debug/tracing/tracing_enabled
	$ cat /debug/tracing/trace_pipe |wc -l
	59704
	$ echo -1 > /debug/tracing/set_ftrace_pid
	$ echo 1 > /debug/tracing/tracing_enabled
	$ sleep 1
	$ echo 0 > /debug/tracing/tracing_enabled
	$ more /debug/tracing/trace_pipe
		<====== nothing output now!
			it should output trace record.

Signed-off-by: Liming Wang <liming.wang@windriver.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 08b536a2614..6d89ab46c6e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -243,10 +243,8 @@ static void ftrace_update_pid_func(void)
 		set_ftrace_pid_function(func);
 		func = ftrace_pid_func;
 	} else {
-		if (func != ftrace_pid_func)
-			goto out;
-
-		set_ftrace_pid_function(func);
+		if (func == ftrace_pid_func)
+			func = ftrace_pid_function;
 	}
 
 #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
-- 
cgit v1.2.3


From 48d68b20d00865035b8b65e69af343d0f53fac9d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 2 Dec 2008 00:20:39 +0100
Subject: tracing/function-graph-tracer: support for x86-64

Impact: extend and enable the function graph tracer to 64-bit x86

This patch implements the support for function graph tracer under x86-64.
Both static and dynamic tracing are supported.

This causes some small CPP conditional asm on arch/x86/kernel/ftrace.c I
wanted to use probe_kernel_read/write to make the return address
saving/patching code more generic but it causes tracing recursion.

That would be perhaps useful to implement a notrace version of these
function for other archs ports.

Note that arch/x86/process_64.c is not traced, as in X86-32. I first
thought __switch_to() was responsible of crashes during tracing because I
believed current task were changed inside but that's actually not the
case (actually yes, but not the "current" pointer).

So I will have to investigate to find the functions that harm here, to
enable tracing of the other functions inside (but there is no issue at
this time, while process_64.c stays out of -pg flags).

A little possible race condition is fixed inside this patch too. When the
tracer allocate a return stack dynamically, the current depth is not
initialized before but after. An interrupt could occur at this time and,
after seeing that the return stack is allocated, the tracer could try to
trace it with a random uninitialized depth. It's a prevention, even if I
hadn't problems with it.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim Bird <tim.bird@am.sony.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 08b536a2614..f7249962752 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1673,8 +1673,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
 		}
 
 		if (t->ret_stack == NULL) {
-			t->ret_stack = ret_stack_list[start++];
 			t->curr_ret_stack = -1;
+			/* Make sure IRQs see the -1 first: */
+			barrier();
+			t->ret_stack = ret_stack_list[start++];
 			atomic_set(&t->trace_overrun, 0);
 		}
 	} while_each_thread(g, t);
-- 
cgit v1.2.3


From 470c66239ef0336429b35345f3f615d47341e13b Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Mon, 1 Dec 2008 14:31:37 -0800
Subject: genirq: warn when IRQF_DISABLED may be ignored

Impact: emit new warning

We periodically waste time tracking down problems from the genirq
framework not respecting IRQF_DISABLED for some shared IRQ cases.  Linus
views this as "will not fix", but we're still left with the bugs caused by
this misbehavior.

This patch adds a nag message in request_irq(), so that drivers can fix
their IRQ handlers to avoid this problem.

Note that developers will never see the relevant bugs when they run with
LOCKDEP, so it's no wonder these bugs are hard to find.  (That also means
LOCKDEP is overlooking some IRQ-related bugs involving IRQ handlers that
don't set IRQF_DISABLED...)

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/manage.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c498a1b8c62..7fd891c3a33 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -635,6 +635,18 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 	struct irq_desc *desc;
 	int retval;
 
+	/*
+	 * handle_IRQ_event() always ignores IRQF_DISABLED except for
+	 * the _first_ irqaction (sigh).  That can cause oopsing, but
+	 * the behavior is classified as "will not fix" so we need to
+	 * start nudging drivers away from using that idiom.
+	 */
+	if ((irqflags & (IRQF_SHARED|IRQF_DISABLED))
+			== (IRQF_SHARED|IRQF_DISABLED))
+		pr_warning("IRQ %d/%s: IRQF_DISABLED is not "
+				"guaranteed on shared IRQs\n",
+				irq, devname);
+
 #ifdef CONFIG_LOCKDEP
 	/*
 	 * Lockdep wants atomic interrupt handlers:
-- 
cgit v1.2.3


From a5e25883a445dce94a087ca479b21a5959cd5c18 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Dec 2008 15:34:05 -0500
Subject: ftrace: replace raw_local_irq_save with local_irq_save

Impact: fix for lockdep and ftrace

The raw_local_irq_save/restore confuses lockdep. This patch
converts them to the local_irq_save/restore variants.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c            |  1 +
 kernel/trace/trace.c        | 12 ++++++------
 kernel/trace/trace_branch.c |  4 ++--
 kernel/trace/trace_stack.c  |  8 ++++----
 4 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 46a404173db..74b1878b8bb 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -25,6 +25,7 @@
  * Thanks to Arjan van de Ven for coming up with the initial idea of
  * mapping lock dependencies runtime.
  */
+#define DISABLE_BRANCH_PROFILING
 #include <linux/mutex.h>
 #include <linux/sched.h>
 #include <linux/delay.h>
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 91887a280ab..380de630ebc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1209,7 +1209,7 @@ void trace_graph_entry(struct ftrace_graph_ent *trace)
 	int cpu;
 	int pc;
 
-	raw_local_irq_save(flags);
+	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
@@ -1218,7 +1218,7 @@ void trace_graph_entry(struct ftrace_graph_ent *trace)
 		__trace_graph_entry(tr, data, trace, flags, pc);
 	}
 	atomic_dec(&data->disabled);
-	raw_local_irq_restore(flags);
+	local_irq_restore(flags);
 }
 
 void trace_graph_return(struct ftrace_graph_ret *trace)
@@ -1230,7 +1230,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 	int cpu;
 	int pc;
 
-	raw_local_irq_save(flags);
+	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
@@ -1239,7 +1239,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 		__trace_graph_return(tr, data, trace, flags, pc);
 	}
 	atomic_dec(&data->disabled);
-	raw_local_irq_restore(flags);
+	local_irq_restore(flags);
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
@@ -2645,7 +2645,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
 	if (err)
 		goto err_unlock;
 
-	raw_local_irq_disable();
+	local_irq_disable();
 	__raw_spin_lock(&ftrace_max_lock);
 	for_each_tracing_cpu(cpu) {
 		/*
@@ -2662,7 +2662,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
 		}
 	}
 	__raw_spin_unlock(&ftrace_max_lock);
-	raw_local_irq_enable();
+	local_irq_enable();
 
 	tracing_cpumask = tracing_cpumask_new;
 
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index bc972753568..6c00feb3bac 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -42,7 +42,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	if (unlikely(!tr))
 		return;
 
-	raw_local_irq_save(flags);
+	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
 		goto out;
@@ -74,7 +74,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 
  out:
 	atomic_dec(&tr->data[cpu]->disabled);
-	raw_local_irq_restore(flags);
+	local_irq_restore(flags);
 }
 
 static inline
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index fde3be15c64..06a16115be0 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -48,7 +48,7 @@ static inline void check_stack(void)
 	if (!object_is_on_stack(&this_size))
 		return;
 
-	raw_local_irq_save(flags);
+	local_irq_save(flags);
 	__raw_spin_lock(&max_stack_lock);
 
 	/* a race could have already updated it */
@@ -96,7 +96,7 @@ static inline void check_stack(void)
 
  out:
 	__raw_spin_unlock(&max_stack_lock);
-	raw_local_irq_restore(flags);
+	local_irq_restore(flags);
 }
 
 static void
@@ -162,11 +162,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
 	if (ret < 0)
 		return ret;
 
-	raw_local_irq_save(flags);
+	local_irq_save(flags);
 	__raw_spin_lock(&max_stack_lock);
 	*ptr = val;
 	__raw_spin_unlock(&max_stack_lock);
-	raw_local_irq_restore(flags);
+	local_irq_restore(flags);
 
 	return count;
 }
-- 
cgit v1.2.3


From abc9b56d66fbd4d93302ef4bf6fa726e1b8255f9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Dec 2008 15:34:06 -0500
Subject: ring-buffer: move some metadata into buffer page

Impact: get ready for splice changes

This patch moves the commit and timestamp into the beginning of each
data page of the buffer. This change will allow the page to be moved
to another location (disk, network, etc) and still have information
in the page to be able to read it.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 63 ++++++++++++++++++++++++++--------------------
 1 file changed, 36 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e206951603c..8619c534588 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -195,20 +195,24 @@ void *ring_buffer_event_data(struct ring_buffer_event *event)
 #define TS_MASK		((1ULL << TS_SHIFT) - 1)
 #define TS_DELTA_TEST	(~TS_MASK)
 
-/*
- * This hack stolen from mm/slob.c.
- * We can store per page timing information in the page frame of the page.
- * Thanks to Peter Zijlstra for suggesting this idea.
- */
-struct buffer_page {
+struct buffer_data_page {
 	u64		 time_stamp;	/* page time stamp */
-	local_t		 write;		/* index for next write */
 	local_t		 commit;	/* write commited index */
+	unsigned char	 data[];	/* data of buffer page */
+};
+
+struct buffer_page {
+	local_t		 write;		/* index for next write */
 	unsigned	 read;		/* index for next read */
 	struct list_head list;		/* list of free pages */
-	void *page;			/* Actual data page */
+	struct buffer_data_page *page;	/* Actual data page */
 };
 
+static void rb_init_page(struct buffer_data_page *page)
+{
+	local_set(&page->commit, 0);
+}
+
 /*
  * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
  * this issue out.
@@ -230,7 +234,7 @@ static inline int test_time_stamp(u64 delta)
 	return 0;
 }
 
-#define BUF_PAGE_SIZE PAGE_SIZE
+#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page))
 
 /*
  * head_page == tail_page && head == tail then buffer is empty.
@@ -333,6 +337,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
 		if (!addr)
 			goto free_pages;
 		page->page = (void *)addr;
+		rb_init_page(page->page);
 	}
 
 	list_splice(&pages, head);
@@ -378,6 +383,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 	if (!addr)
 		goto fail_free_reader;
 	page->page = (void *)addr;
+	rb_init_page(page->page);
 
 	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
 
@@ -647,6 +653,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 			if (!addr)
 				goto free_pages;
 			page->page = (void *)addr;
+			rb_init_page(page->page);
 		}
 	}
 
@@ -682,7 +689,7 @@ static inline int rb_null_event(struct ring_buffer_event *event)
 
 static inline void *__rb_page_index(struct buffer_page *page, unsigned index)
 {
-	return page->page + index;
+	return page->page->data + index;
 }
 
 static inline struct ring_buffer_event *
@@ -712,7 +719,7 @@ static inline unsigned rb_page_write(struct buffer_page *bpage)
 
 static inline unsigned rb_page_commit(struct buffer_page *bpage)
 {
-	return local_read(&bpage->commit);
+	return local_read(&bpage->page->commit);
 }
 
 /* Size is determined by what has been commited */
@@ -804,14 +811,15 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
 		if (RB_WARN_ON(cpu_buffer,
 			  cpu_buffer->commit_page == cpu_buffer->tail_page))
 			return;
-		cpu_buffer->commit_page->commit =
+		cpu_buffer->commit_page->page->commit =
 			cpu_buffer->commit_page->write;
 		rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
-		cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp;
+		cpu_buffer->write_stamp =
+			cpu_buffer->commit_page->page->time_stamp;
 	}
 
 	/* Now set the commit to the event's index */
-	local_set(&cpu_buffer->commit_page->commit, index);
+	local_set(&cpu_buffer->commit_page->page->commit, index);
 }
 
 static inline void
@@ -826,16 +834,17 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
 	 * assign the commit to the tail.
 	 */
 	while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
-		cpu_buffer->commit_page->commit =
+		cpu_buffer->commit_page->page->commit =
 			cpu_buffer->commit_page->write;
 		rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
-		cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp;
+		cpu_buffer->write_stamp =
+			cpu_buffer->commit_page->page->time_stamp;
 		/* add barrier to keep gcc from optimizing too much */
 		barrier();
 	}
 	while (rb_commit_index(cpu_buffer) !=
 	       rb_page_write(cpu_buffer->commit_page)) {
-		cpu_buffer->commit_page->commit =
+		cpu_buffer->commit_page->page->commit =
 			cpu_buffer->commit_page->write;
 		barrier();
 	}
@@ -843,7 +852,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
 
 static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp;
+	cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
 	cpu_buffer->reader_page->read = 0;
 }
 
@@ -862,7 +871,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter)
 	else
 		rb_inc_page(cpu_buffer, &iter->head_page);
 
-	iter->read_stamp = iter->head_page->time_stamp;
+	iter->read_stamp = iter->head_page->page->time_stamp;
 	iter->head = 0;
 }
 
@@ -998,12 +1007,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		 */
 		if (tail_page == cpu_buffer->tail_page) {
 			local_set(&next_page->write, 0);
-			local_set(&next_page->commit, 0);
+			local_set(&next_page->page->commit, 0);
 			cpu_buffer->tail_page = next_page;
 
 			/* reread the time stamp */
 			*ts = ring_buffer_time_stamp(cpu_buffer->cpu);
-			cpu_buffer->tail_page->time_stamp = *ts;
+			cpu_buffer->tail_page->page->time_stamp = *ts;
 		}
 
 		/*
@@ -1048,7 +1057,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	 * this page's time stamp.
 	 */
 	if (!tail && rb_is_commit(cpu_buffer, event))
-		cpu_buffer->commit_page->time_stamp = *ts;
+		cpu_buffer->commit_page->page->time_stamp = *ts;
 
 	return event;
 
@@ -1099,7 +1108,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 			event->time_delta = *delta & TS_MASK;
 			event->array[0] = *delta >> TS_SHIFT;
 		} else {
-			cpu_buffer->commit_page->time_stamp = *ts;
+			cpu_buffer->commit_page->page->time_stamp = *ts;
 			event->time_delta = 0;
 			event->array[0] = 0;
 		}
@@ -1552,7 +1561,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
 	if (iter->head)
 		iter->read_stamp = cpu_buffer->read_stamp;
 	else
-		iter->read_stamp = iter->head_page->time_stamp;
+		iter->read_stamp = iter->head_page->page->time_stamp;
 }
 
 /**
@@ -1696,7 +1705,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->reader_page->list.prev = reader->list.prev;
 
 	local_set(&cpu_buffer->reader_page->write, 0);
-	local_set(&cpu_buffer->reader_page->commit, 0);
+	local_set(&cpu_buffer->reader_page->page->commit, 0);
 
 	/* Make the reader page now replace the head */
 	reader->list.prev->next = &cpu_buffer->reader_page->list;
@@ -2088,7 +2097,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->head_page
 		= list_entry(cpu_buffer->pages.next, struct buffer_page, list);
 	local_set(&cpu_buffer->head_page->write, 0);
-	local_set(&cpu_buffer->head_page->commit, 0);
+	local_set(&cpu_buffer->head_page->page->commit, 0);
 
 	cpu_buffer->head_page->read = 0;
 
@@ -2097,7 +2106,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 
 	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
 	local_set(&cpu_buffer->reader_page->write, 0);
-	local_set(&cpu_buffer->reader_page->commit, 0);
+	local_set(&cpu_buffer->reader_page->page->commit, 0);
 	cpu_buffer->reader_page->read = 0;
 
 	cpu_buffer->overrun = 0;
-- 
cgit v1.2.3


From 8789a9e7df6bf9b93739c4c7d4e380725bc9e936 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Dec 2008 15:34:07 -0500
Subject: ring-buffer: read page interface

Impact: new API to ring buffer

This patch adds a new interface into the ring buffer that allows a
page to be read from the ring buffer on a given CPU. For every page
read, one must also be given to allow for a "swap" of the pages.

 rpage = ring_buffer_alloc_read_page(buffer);
 if (!rpage)
	goto err;
 ret = ring_buffer_read_page(buffer, &rpage, cpu, full);
 if (!ret)
	goto empty;
 process_page(rpage);
 ring_buffer_free_read_page(rpage);

The caller of these functions must handle any waits that are
needed to wait for new data. The ring_buffer_read_page will simply
return 0 if there is no data, or if "full" is set and the writer
is still on the current page.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 166 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 166 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8619c534588..50b74d3a5c3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -687,6 +687,12 @@ static inline int rb_null_event(struct ring_buffer_event *event)
 	return event->type == RINGBUF_TYPE_PADDING;
 }
 
+static inline void *
+__rb_data_page_index(struct buffer_data_page *page, unsigned index)
+{
+	return page->data + index;
+}
+
 static inline void *__rb_page_index(struct buffer_page *page, unsigned index)
 {
 	return page->page->data + index;
@@ -2232,6 +2238,166 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 	return 0;
 }
 
+static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
+			      struct buffer_data_page *page)
+{
+	struct ring_buffer_event *event;
+	unsigned long head;
+
+	__raw_spin_lock(&cpu_buffer->lock);
+	for (head = 0; head < local_read(&page->commit);
+	     head += rb_event_length(event)) {
+
+		event = __rb_data_page_index(page, head);
+		if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
+			return;
+		/* Only count data entries */
+		if (event->type != RINGBUF_TYPE_DATA)
+			continue;
+		cpu_buffer->entries--;
+	}
+	__raw_spin_unlock(&cpu_buffer->lock);
+}
+
+/**
+ * ring_buffer_alloc_read_page - allocate a page to read from buffer
+ * @buffer: the buffer to allocate for.
+ *
+ * This function is used in conjunction with ring_buffer_read_page.
+ * When reading a full page from the ring buffer, these functions
+ * can be used to speed up the process. The calling function should
+ * allocate a few pages first with this function. Then when it
+ * needs to get pages from the ring buffer, it passes the result
+ * of this function into ring_buffer_read_page, which will swap
+ * the page that was allocated, with the read page of the buffer.
+ *
+ * Returns:
+ *  The page allocated, or NULL on error.
+ */
+void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
+{
+	unsigned long addr;
+	struct buffer_data_page *page;
+
+	addr = __get_free_page(GFP_KERNEL);
+	if (!addr)
+		return NULL;
+
+	page = (void *)addr;
+
+	return page;
+}
+
+/**
+ * ring_buffer_free_read_page - free an allocated read page
+ * @buffer: the buffer the page was allocate for
+ * @data: the page to free
+ *
+ * Free a page allocated from ring_buffer_alloc_read_page.
+ */
+void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
+{
+	free_page((unsigned long)data);
+}
+
+/**
+ * ring_buffer_read_page - extract a page from the ring buffer
+ * @buffer: buffer to extract from
+ * @data_page: the page to use allocated from ring_buffer_alloc_read_page
+ * @cpu: the cpu of the buffer to extract
+ * @full: should the extraction only happen when the page is full.
+ *
+ * This function will pull out a page from the ring buffer and consume it.
+ * @data_page must be the address of the variable that was returned
+ * from ring_buffer_alloc_read_page. This is because the page might be used
+ * to swap with a page in the ring buffer.
+ *
+ * for example:
+ *	rpage = ring_buffer_alloc_page(buffer);
+ *	if (!rpage)
+ *		return error;
+ *	ret = ring_buffer_read_page(buffer, &rpage, cpu, 0);
+ *	if (ret)
+ *		process_page(rpage);
+ *
+ * When @full is set, the function will not return true unless
+ * the writer is off the reader page.
+ *
+ * Note: it is up to the calling functions to handle sleeps and wakeups.
+ *  The ring buffer can be used anywhere in the kernel and can not
+ *  blindly call wake_up. The layer that uses the ring buffer must be
+ *  responsible for that.
+ *
+ * Returns:
+ *  1 if data has been transferred
+ *  0 if no data has been transferred.
+ */
+int ring_buffer_read_page(struct ring_buffer *buffer,
+			    void **data_page, int cpu, int full)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+	struct ring_buffer_event *event;
+	struct buffer_data_page *page;
+	unsigned long flags;
+	int ret = 0;
+
+	if (!data_page)
+		return 0;
+
+	page = *data_page;
+	if (!page)
+		return 0;
+
+	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+	/*
+	 * rb_buffer_peek will get the next ring buffer if
+	 * the current reader page is empty.
+	 */
+	event = rb_buffer_peek(buffer, cpu, NULL);
+	if (!event)
+		goto out;
+
+	/* check for data */
+	if (!local_read(&cpu_buffer->reader_page->page->commit))
+		goto out;
+	/*
+	 * If the writer is already off of the read page, then simply
+	 * switch the read page with the given page. Otherwise
+	 * we need to copy the data from the reader to the writer.
+	 */
+	if (cpu_buffer->reader_page == cpu_buffer->commit_page) {
+		unsigned int read = cpu_buffer->reader_page->read;
+
+		if (full)
+			goto out;
+		/* The writer is still on the reader page, we must copy */
+		page = cpu_buffer->reader_page->page;
+		memcpy(page->data,
+		       cpu_buffer->reader_page->page->data + read,
+		       local_read(&page->commit) - read);
+
+		/* consume what was read */
+		cpu_buffer->reader_page += read;
+
+	} else {
+		/* swap the pages */
+		rb_init_page(page);
+		page = cpu_buffer->reader_page->page;
+		cpu_buffer->reader_page->page = *data_page;
+		cpu_buffer->reader_page->read = 0;
+		*data_page = page;
+	}
+	ret = 1;
+
+	/* update the entry counter */
+	rb_remove_entries(cpu_buffer, page);
+ out:
+	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	return ret;
+}
+
 static ssize_t
 rb_simple_read(struct file *filp, char __user *ubuf,
 	       size_t cnt, loff_t *ppos)
-- 
cgit v1.2.3


From 14a866c567e040ccf6240d68b083dd1dbbde63e6 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Dec 2008 23:50:02 -0500
Subject: ftrace: add ftrace_graph_stop()

Impact: new ftrace_graph_stop function

While developing more features of function graph, I hit a bug that
caused the WARN_ON to trigger in the prepare_ftrace_return function.
Well, it was hard for me to find out that was happening because the
bug would not print, it would just cause a hard lockup or reboot.
The reason is that it is not safe to call printk from this function.

Looking further, I also found that it calls unregister_ftrace_graph,
which grabs a mutex and calls kstop machine. This would definitely
lock the box up if it were to trigger.

This patch adds a fast and safe ftrace_graph_stop() which will
stop the function tracer. Then it is safe to call the WARN ON.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2e78628443e..a44af05ae2d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1769,5 +1769,10 @@ void ftrace_graph_exit_task(struct task_struct *t)
 
 	kfree(ret_stack);
 }
+
+void ftrace_graph_stop(void)
+{
+	ftrace_stop();
+}
 #endif
 
-- 
cgit v1.2.3


From 044fa782ebb9472cf5253e95d9a625fd4c0bdd99 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Dec 2008 23:50:03 -0500
Subject: ring-buffer: change "page" variable names to "bpage"

Impact: clean up

Andrew Morton pointed out that the kernel convention of a variable
named page should be of type page struct. The ring buffer uses
a variable named "page" for a pointer to something else.

This patch converts those to be called "bpage" (as in "buffer page").

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 130 ++++++++++++++++++++++-----------------------
 1 file changed, 65 insertions(+), 65 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 50b74d3a5c3..7f69cfeaadf 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -208,9 +208,9 @@ struct buffer_page {
 	struct buffer_data_page *page;	/* Actual data page */
 };
 
-static void rb_init_page(struct buffer_data_page *page)
+static void rb_init_page(struct buffer_data_page *bpage)
 {
-	local_set(&page->commit, 0);
+	local_set(&bpage->commit, 0);
 }
 
 /*
@@ -298,19 +298,19 @@ struct ring_buffer_iter {
 static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	struct list_head *head = &cpu_buffer->pages;
-	struct buffer_page *page, *tmp;
+	struct buffer_page *bpage, *tmp;
 
 	if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
 		return -1;
 	if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
 		return -1;
 
-	list_for_each_entry_safe(page, tmp, head, list) {
+	list_for_each_entry_safe(bpage, tmp, head, list) {
 		if (RB_WARN_ON(cpu_buffer,
-			       page->list.next->prev != &page->list))
+			       bpage->list.next->prev != &bpage->list))
 			return -1;
 		if (RB_WARN_ON(cpu_buffer,
-			       page->list.prev->next != &page->list))
+			       bpage->list.prev->next != &bpage->list))
 			return -1;
 	}
 
@@ -321,23 +321,23 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
 			     unsigned nr_pages)
 {
 	struct list_head *head = &cpu_buffer->pages;
-	struct buffer_page *page, *tmp;
+	struct buffer_page *bpage, *tmp;
 	unsigned long addr;
 	LIST_HEAD(pages);
 	unsigned i;
 
 	for (i = 0; i < nr_pages; i++) {
-		page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
+		bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
 				    GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
-		if (!page)
+		if (!bpage)
 			goto free_pages;
-		list_add(&page->list, &pages);
+		list_add(&bpage->list, &pages);
 
 		addr = __get_free_page(GFP_KERNEL);
 		if (!addr)
 			goto free_pages;
-		page->page = (void *)addr;
-		rb_init_page(page->page);
+		bpage->page = (void *)addr;
+		rb_init_page(bpage->page);
 	}
 
 	list_splice(&pages, head);
@@ -347,9 +347,9 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
 	return 0;
 
  free_pages:
-	list_for_each_entry_safe(page, tmp, &pages, list) {
-		list_del_init(&page->list);
-		free_buffer_page(page);
+	list_for_each_entry_safe(bpage, tmp, &pages, list) {
+		list_del_init(&bpage->list);
+		free_buffer_page(bpage);
 	}
 	return -ENOMEM;
 }
@@ -358,7 +358,7 @@ static struct ring_buffer_per_cpu *
 rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	struct buffer_page *page;
+	struct buffer_page *bpage;
 	unsigned long addr;
 	int ret;
 
@@ -373,17 +373,17 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 	cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 	INIT_LIST_HEAD(&cpu_buffer->pages);
 
-	page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
+	bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
 			    GFP_KERNEL, cpu_to_node(cpu));
-	if (!page)
+	if (!bpage)
 		goto fail_free_buffer;
 
-	cpu_buffer->reader_page = page;
+	cpu_buffer->reader_page = bpage;
 	addr = __get_free_page(GFP_KERNEL);
 	if (!addr)
 		goto fail_free_reader;
-	page->page = (void *)addr;
-	rb_init_page(page->page);
+	bpage->page = (void *)addr;
+	rb_init_page(bpage->page);
 
 	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
 
@@ -408,14 +408,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	struct list_head *head = &cpu_buffer->pages;
-	struct buffer_page *page, *tmp;
+	struct buffer_page *bpage, *tmp;
 
 	list_del_init(&cpu_buffer->reader_page->list);
 	free_buffer_page(cpu_buffer->reader_page);
 
-	list_for_each_entry_safe(page, tmp, head, list) {
-		list_del_init(&page->list);
-		free_buffer_page(page);
+	list_for_each_entry_safe(bpage, tmp, head, list) {
+		list_del_init(&bpage->list);
+		free_buffer_page(bpage);
 	}
 	kfree(cpu_buffer);
 }
@@ -512,7 +512,7 @@ static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
 static void
 rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
 {
-	struct buffer_page *page;
+	struct buffer_page *bpage;
 	struct list_head *p;
 	unsigned i;
 
@@ -523,9 +523,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
 		if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
 			return;
 		p = cpu_buffer->pages.next;
-		page = list_entry(p, struct buffer_page, list);
-		list_del_init(&page->list);
-		free_buffer_page(page);
+		bpage = list_entry(p, struct buffer_page, list);
+		list_del_init(&bpage->list);
+		free_buffer_page(bpage);
 	}
 	if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
 		return;
@@ -542,7 +542,7 @@ static void
 rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
 		struct list_head *pages, unsigned nr_pages)
 {
-	struct buffer_page *page;
+	struct buffer_page *bpage;
 	struct list_head *p;
 	unsigned i;
 
@@ -553,9 +553,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
 		if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
 			return;
 		p = pages->next;
-		page = list_entry(p, struct buffer_page, list);
-		list_del_init(&page->list);
-		list_add_tail(&page->list, &cpu_buffer->pages);
+		bpage = list_entry(p, struct buffer_page, list);
+		list_del_init(&bpage->list);
+		list_add_tail(&bpage->list, &cpu_buffer->pages);
 	}
 	rb_reset_cpu(cpu_buffer);
 
@@ -582,7 +582,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	unsigned nr_pages, rm_pages, new_pages;
-	struct buffer_page *page, *tmp;
+	struct buffer_page *bpage, *tmp;
 	unsigned long buffer_size;
 	unsigned long addr;
 	LIST_HEAD(pages);
@@ -643,17 +643,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 
 	for_each_buffer_cpu(buffer, cpu) {
 		for (i = 0; i < new_pages; i++) {
-			page = kzalloc_node(ALIGN(sizeof(*page),
+			bpage = kzalloc_node(ALIGN(sizeof(*bpage),
 						  cache_line_size()),
 					    GFP_KERNEL, cpu_to_node(cpu));
-			if (!page)
+			if (!bpage)
 				goto free_pages;
-			list_add(&page->list, &pages);
+			list_add(&bpage->list, &pages);
 			addr = __get_free_page(GFP_KERNEL);
 			if (!addr)
 				goto free_pages;
-			page->page = (void *)addr;
-			rb_init_page(page->page);
+			bpage->page = (void *)addr;
+			rb_init_page(bpage->page);
 		}
 	}
 
@@ -674,9 +674,9 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 	return size;
 
  free_pages:
-	list_for_each_entry_safe(page, tmp, &pages, list) {
-		list_del_init(&page->list);
-		free_buffer_page(page);
+	list_for_each_entry_safe(bpage, tmp, &pages, list) {
+		list_del_init(&bpage->list);
+		free_buffer_page(bpage);
 	}
 	mutex_unlock(&buffer->mutex);
 	return -ENOMEM;
@@ -688,14 +688,14 @@ static inline int rb_null_event(struct ring_buffer_event *event)
 }
 
 static inline void *
-__rb_data_page_index(struct buffer_data_page *page, unsigned index)
+__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
 {
-	return page->data + index;
+	return bpage->data + index;
 }
 
-static inline void *__rb_page_index(struct buffer_page *page, unsigned index)
+static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
 {
-	return page->page->data + index;
+	return bpage->page->data + index;
 }
 
 static inline struct ring_buffer_event *
@@ -771,14 +771,14 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
 }
 
 static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
-			       struct buffer_page **page)
+			       struct buffer_page **bpage)
 {
-	struct list_head *p = (*page)->list.next;
+	struct list_head *p = (*bpage)->list.next;
 
 	if (p == &cpu_buffer->pages)
 		p = p->next;
 
-	*page = list_entry(p, struct buffer_page, list);
+	*bpage = list_entry(p, struct buffer_page, list);
 }
 
 static inline unsigned
@@ -2239,16 +2239,16 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 }
 
 static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
-			      struct buffer_data_page *page)
+			      struct buffer_data_page *bpage)
 {
 	struct ring_buffer_event *event;
 	unsigned long head;
 
 	__raw_spin_lock(&cpu_buffer->lock);
-	for (head = 0; head < local_read(&page->commit);
+	for (head = 0; head < local_read(&bpage->commit);
 	     head += rb_event_length(event)) {
 
-		event = __rb_data_page_index(page, head);
+		event = __rb_data_page_index(bpage, head);
 		if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
 			return;
 		/* Only count data entries */
@@ -2277,15 +2277,15 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
 void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
 {
 	unsigned long addr;
-	struct buffer_data_page *page;
+	struct buffer_data_page *bpage;
 
 	addr = __get_free_page(GFP_KERNEL);
 	if (!addr)
 		return NULL;
 
-	page = (void *)addr;
+	bpage = (void *)addr;
 
-	return page;
+	return bpage;
 }
 
 /**
@@ -2337,15 +2337,15 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 	struct ring_buffer_event *event;
-	struct buffer_data_page *page;
+	struct buffer_data_page *bpage;
 	unsigned long flags;
 	int ret = 0;
 
 	if (!data_page)
 		return 0;
 
-	page = *data_page;
-	if (!page)
+	bpage = *data_page;
+	if (!bpage)
 		return 0;
 
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
@@ -2372,26 +2372,26 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		if (full)
 			goto out;
 		/* The writer is still on the reader page, we must copy */
-		page = cpu_buffer->reader_page->page;
-		memcpy(page->data,
+		bpage = cpu_buffer->reader_page->page;
+		memcpy(bpage->data,
 		       cpu_buffer->reader_page->page->data + read,
-		       local_read(&page->commit) - read);
+		       local_read(&bpage->commit) - read);
 
 		/* consume what was read */
 		cpu_buffer->reader_page += read;
 
 	} else {
 		/* swap the pages */
-		rb_init_page(page);
-		page = cpu_buffer->reader_page->page;
+		rb_init_page(bpage);
+		bpage = cpu_buffer->reader_page->page;
 		cpu_buffer->reader_page->page = *data_page;
 		cpu_buffer->reader_page->read = 0;
-		*data_page = page;
+		*data_page = bpage;
 	}
 	ret = 1;
 
 	/* update the entry counter */
-	rb_remove_entries(cpu_buffer, page);
+	rb_remove_entries(cpu_buffer, bpage);
  out:
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-- 
cgit v1.2.3


From e49dc19c6a19ea112fcb94b7c62ec62cdd5c08aa Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Dec 2008 23:50:05 -0500
Subject: ftrace: function graph return for function entry

Impact: feature, let entry function decide to trace or not

This patch lets the graph tracer entry function decide if the tracing
should be done at the end as well. This requires all function graph
entry functions return 1 if it should trace, or 0 if the return should
not be traced.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 10 +++++++---
 kernel/trace/trace.c  |  4 +++-
 kernel/trace/trace.h  |  2 +-
 3 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a44af05ae2d..65b9e863056 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1636,11 +1636,15 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 
 static atomic_t ftrace_graph_active;
 
+int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
+{
+	return 0;
+}
+
 /* The callbacks that hook a function */
 trace_func_graph_ret_t ftrace_graph_return =
 			(trace_func_graph_ret_t)ftrace_stub;
-trace_func_graph_ent_t ftrace_graph_entry =
-			(trace_func_graph_ent_t)ftrace_stub;
+trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
 
 /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
 static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
@@ -1738,7 +1742,7 @@ void unregister_ftrace_graph(void)
 
 	atomic_dec(&ftrace_graph_active);
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
-	ftrace_graph_entry = (trace_func_graph_ent_t)ftrace_stub;
+	ftrace_graph_entry = ftrace_graph_entry_stub;
 	ftrace_shutdown(FTRACE_STOP_FUNC_RET);
 
 	mutex_unlock(&ftrace_sysctl_lock);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 380de630ebc..8b6409a62b5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1200,7 +1200,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-void trace_graph_entry(struct ftrace_graph_ent *trace)
+int trace_graph_entry(struct ftrace_graph_ent *trace)
 {
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
@@ -1219,6 +1219,8 @@ void trace_graph_entry(struct ftrace_graph_ent *trace)
 	}
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
+
+	return 1;
 }
 
 void trace_graph_return(struct ftrace_graph_ret *trace)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f96f4e787ff..0565ae9a221 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -412,7 +412,7 @@ void trace_function(struct trace_array *tr,
 		    unsigned long flags, int pc);
 
 void trace_graph_return(struct ftrace_graph_ret *trace);
-void trace_graph_entry(struct ftrace_graph_ent *trace);
+int trace_graph_entry(struct ftrace_graph_ent *trace);
 void trace_bts(struct trace_array *tr,
 	       unsigned long from,
 	       unsigned long to);
-- 
cgit v1.2.3


From 11e84acc400921743cc8d488e4a265cd98a655c7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 3 Dec 2008 02:30:37 +0100
Subject: tracing/function-graph-tracer: display unified style cmdline and pid

Impact: extend function-graph output: let one know which thread called a function

This patch implements a helper function to print the couple cmdline/pid.
Its output is provided during task switching and on each row if the new
"funcgraph-proc" defualt-off option is set through trace_options file.

The output is center aligned and never exceeds 14 characters. The cmdline
is truncated over 7 chars.
But note that if the pid exceeds 6 characters, the column will overflow (but
the situation is abnormal).

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions_graph.c | 113 ++++++++++++++++++++++++++++++-----
 1 file changed, 99 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 894b50bca31..23e19d2dbd0 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -19,6 +19,7 @@
 #define TRACE_GRAPH_PRINT_OVERRUN	0x1
 #define TRACE_GRAPH_PRINT_CPU		0x2
 #define TRACE_GRAPH_PRINT_OVERHEAD	0x4
+#define TRACE_GRAPH_PRINT_PROC		0x8
 
 static struct tracer_opt trace_opts[] = {
 	/* Display overruns ? */
@@ -27,11 +28,13 @@ static struct tracer_opt trace_opts[] = {
 	{ TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) },
 	/* Display Overhead ? */
 	{ TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) },
+	/* Display proc name/pid */
+	{ TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) },
 	{ } /* Empty entry */
 };
 
 static struct tracer_flags tracer_flags = {
-	/* Don't display overruns by default */
+	/* Don't display overruns and proc by default */
 	.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD,
 	.opts = trace_opts
 };
@@ -104,23 +107,63 @@ print_graph_cpu(struct trace_seq *s, int cpu)
 	return TRACE_TYPE_HANDLED;
 }
 
+#define TRACE_GRAPH_PROCINFO_LENGTH	14
+
+static enum print_line_t
+print_graph_proc(struct trace_seq *s, pid_t pid)
+{
+	int i;
+	int ret;
+	int len;
+	char comm[8];
+	int spaces = 0;
+	/* sign + log10(MAX_INT) + '\0' */
+	char pid_str[11];
+
+	strncpy(comm, trace_find_cmdline(pid), 7);
+	comm[7] = '\0';
+	sprintf(pid_str, "%d", pid);
+
+	/* 1 stands for the "-" character */
+	len = strlen(comm) + strlen(pid_str) + 1;
+
+	if (len < TRACE_GRAPH_PROCINFO_LENGTH)
+		spaces = TRACE_GRAPH_PROCINFO_LENGTH - len;
+
+	/* First spaces to align center */
+	for (i = 0; i < spaces / 2; i++) {
+		ret = trace_seq_printf(s, " ");
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	ret = trace_seq_printf(s, "%s-%s", comm, pid_str);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Last spaces to align center */
+	for (i = 0; i < spaces - (spaces / 2); i++) {
+		ret = trace_seq_printf(s, " ");
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+	return TRACE_TYPE_HANDLED;
+}
+
 
 /* If the pid changed since the last trace, output this event */
-static int verif_pid(struct trace_seq *s, pid_t pid, int cpu)
+static enum print_line_t
+verif_pid(struct trace_seq *s, pid_t pid, int cpu)
 {
-	char *comm, *prev_comm;
 	pid_t prev_pid;
 	int ret;
 
 	if (last_pid[cpu] != -1 && last_pid[cpu] == pid)
-		return 1;
+		return TRACE_TYPE_HANDLED;
 
 	prev_pid = last_pid[cpu];
 	last_pid[cpu] = pid;
 
-	comm = trace_find_cmdline(pid);
-	prev_comm = trace_find_cmdline(prev_pid);
-
 /*
  * Context-switch trace line:
 
@@ -130,11 +173,31 @@ static int verif_pid(struct trace_seq *s, pid_t pid, int cpu)
 
  */
 	ret = trace_seq_printf(s,
-		" ------------------------------------------\n");
-	ret += trace_seq_printf(s, " | %d)  %s-%d  =>  %s-%d\n",
-				  cpu, prev_comm, prev_pid, comm, pid);
-	ret += trace_seq_printf(s,
-		" ------------------------------------------\n\n");
+		"\n ------------------------------------------\n |");
+	if (!ret)
+		TRACE_TYPE_PARTIAL_LINE;
+
+	ret = print_graph_cpu(s, cpu);
+	if (ret == TRACE_TYPE_PARTIAL_LINE)
+		TRACE_TYPE_PARTIAL_LINE;
+
+	ret = print_graph_proc(s, prev_pid);
+	if (ret == TRACE_TYPE_PARTIAL_LINE)
+		TRACE_TYPE_PARTIAL_LINE;
+
+	ret = trace_seq_printf(s, " => ");
+	if (!ret)
+		TRACE_TYPE_PARTIAL_LINE;
+
+	ret = print_graph_proc(s, pid);
+	if (ret == TRACE_TYPE_PARTIAL_LINE)
+		TRACE_TYPE_PARTIAL_LINE;
+
+	ret = trace_seq_printf(s,
+		"\n ------------------------------------------\n\n");
+	if (!ret)
+		TRACE_TYPE_PARTIAL_LINE;
+
 	return ret;
 }
 
@@ -288,12 +351,23 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 	struct trace_entry *ent = iter->ent;
 
 	/* Pid */
-	if (!verif_pid(s, ent->pid, cpu))
+	if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Cpu */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
 		ret = print_graph_cpu(s, cpu);
+		if (ret == TRACE_TYPE_PARTIAL_LINE)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	/* Proc */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
+		ret = print_graph_proc(s, ent->pid);
+		if (ret == TRACE_TYPE_PARTIAL_LINE)
+			return TRACE_TYPE_PARTIAL_LINE;
+
+		ret = trace_seq_printf(s, " | ");
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
@@ -318,12 +392,23 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 		duration = 9999999ULL;
 
 	/* Pid */
-	if (!verif_pid(s, ent->pid, cpu))
+	if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Cpu */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
 		ret = print_graph_cpu(s, cpu);
+		if (ret == TRACE_TYPE_PARTIAL_LINE)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	/* Proc */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
+		ret = print_graph_proc(s, ent->pid);
+		if (ret == TRACE_TYPE_PARTIAL_LINE)
+			return TRACE_TYPE_PARTIAL_LINE;
+
+		ret = trace_seq_printf(s, " | ");
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
-- 
cgit v1.2.3


From 166d3c7994d79ab3f78f420607283361ff5cce79 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 3 Dec 2008 02:32:12 +0100
Subject: tracing/function-graph-tracer: improve duration output

Impact: better trace output of duration for long calls

The old duration output didn't exceeded 9999.999 us to fit the column
and the nanosecs were always 3 numbers. As Ingo suggested, it's better
to have the whole microseconds elapsed time and shift the nanosecs precision
if needed to fit the maximum 7 numbers. And usec need more number, the case
should be rare and important enough to break a bit the column alignment to
show it.

So, depending of the duration value, we now have these patterns:

    u.nnn us
   uu.nnn us
  uuu.nnn us
 uuuu.nnn us
 uuuuu.nn us
 uuuuuu.n us
 uuuuuuuu..... us

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions_graph.c | 55 ++++++++++++++++++++++++++++--------
 1 file changed, 43 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 23e19d2dbd0..c66578f2fdc 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -232,11 +232,50 @@ trace_branch_is_leaf(struct trace_iterator *iter,
 }
 
 
-static inline int
+static enum print_line_t
 print_graph_duration(unsigned long long duration, struct trace_seq *s)
 {
 	unsigned long nsecs_rem = do_div(duration, 1000);
-	return trace_seq_printf(s, "%4llu.%3lu us |  ", duration, nsecs_rem);
+	/* log10(ULONG_MAX) + '\0' */
+	char msecs_str[21];
+	char nsecs_str[5];
+	int ret, len;
+	int i;
+
+	sprintf(msecs_str, "%lu", (unsigned long) duration);
+
+	/* Print msecs */
+	ret = trace_seq_printf(s, msecs_str);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	len = strlen(msecs_str);
+
+	/* Print nsecs (we don't want to exceed 7 numbers) */
+	if (len < 7) {
+		snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem);
+		ret = trace_seq_printf(s, ".%s", nsecs_str);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+		len += strlen(nsecs_str);
+	}
+
+	ret = trace_seq_printf(s, " us ");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Print remaining spaces to fit the row's width */
+	for (i = len; i < 7; i++) {
+		ret = trace_seq_printf(s, " ");
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	ret = trace_seq_printf(s, "|  ");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+	return TRACE_TYPE_HANDLED;
+
 }
 
 /* Signal a overhead of time execution to the output */
@@ -273,10 +312,6 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 	call = &entry->graph_ent;
 	duration = graph_ret->rettime - graph_ret->calltime;
 
-	/* Must not exceed 8 characters: 9999.999 us */
-	if (duration > 10000000ULL)
-		duration = 9999999ULL;
-
 	/* Overhead */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
 		ret = print_graph_overhead(duration, s);
@@ -286,7 +321,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 
 	/* Duration */
 	ret = print_graph_duration(duration, s);
-	if (!ret)
+	if (ret == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Function */
@@ -387,10 +422,6 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 	int ret;
 	unsigned long long duration = trace->rettime - trace->calltime;
 
-	/* Must not exceed 8 characters: xxxx.yyy us */
-	if (duration > 10000000ULL)
-		duration = 9999999ULL;
-
 	/* Pid */
 	if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
@@ -422,7 +453,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 
 	/* Duration */
 	ret = print_graph_duration(duration, s);
-	if (!ret)
+	if (ret == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Closing brace */
-- 
cgit v1.2.3


From 764f3b95131a7ce5c992e3d00caf590fcada2f7b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Dec 2008 10:33:58 +0100
Subject: tracing/function-graph-tracer: enabled by default

CONFIG_FUNCTION_GRAPH_TRACER depends on FUNCTION_TRACER already,
(turning it non-default) so it so making it default-n is pointless.

So enable it by default - it's a nice extension of the function tracer.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8b6b673b4d6..bde6f03512d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -67,6 +67,7 @@ config FUNCTION_GRAPH_TRACER
 	bool "Kernel Function Graph Tracer"
 	depends on HAVE_FUNCTION_GRAPH_TRACER
 	depends on FUNCTION_TRACER
+	default y
 	help
 	  Enable the kernel to trace a function at both its return
 	  and its entry.
-- 
cgit v1.2.3


From 0a37119d963e876ca86912497346ec50dea2541b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 3 Dec 2008 11:04:50 -0500
Subject: trace: fix output of stack trace

Impact: fix to output of stack trace

If a function is not found in the stack of the stack tracer, the
number printed is quite strange. This fixes the algorithm to handle
missing functions better.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_stack.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 06a16115be0..0b863f2cbc8 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -78,6 +78,7 @@ static inline void check_stack(void)
 	 * on a new max, so it is far from a fast path.
 	 */
 	while (i < max_stack_trace.nr_entries) {
+		int found = 0;
 
 		stack_dump_index[i] = this_size;
 		p = start;
@@ -86,12 +87,14 @@ static inline void check_stack(void)
 			if (*p == stack_dump_trace[i]) {
 				this_size = stack_dump_index[i++] =
 					(top - p) * sizeof(unsigned long);
+				found = 1;
 				/* Start the search from here */
 				start = p + 1;
 			}
 		}
 
-		i++;
+		if (!found)
+			i++;
 	}
 
  out:
-- 
cgit v1.2.3


From e8e1abe92fd7ea9d823a3aaf81d10e2cba593b6b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 3 Dec 2008 11:04:51 -0500
Subject: ftrace: fix race in function graph during fork

Impact: graph tracer race/crash fix

There is a nasy race in startup of a new process running the
function graph tracer. In fork.c:

	total_forks++;
	spin_unlock(&current->sighand->siglock);
	write_unlock_irq(&tasklist_lock);
	ftrace_graph_init_task(p);
	proc_fork_connector(p);
	cgroup_post_fork(p);
	return p;

The new task is free to run as soon as the tasklist_lock is released.
This is before the ftrace_graph_init_task. If the task does run
it will be using the same ret_stack and curr_ret_stack as the parent.
This will cause crashes that are difficult to debug.

This patch moves the ftrace_graph_init_task to just after the alloc_pid
code. This fixes the above race.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/fork.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 5f82a999c03..7407ab31987 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1137,6 +1137,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		}
 	}
 
+	ftrace_graph_init_task(p);
+
 	p->pid = pid_nr(pid);
 	p->tgid = p->pid;
 	if (clone_flags & CLONE_THREAD)
@@ -1145,7 +1147,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (current->nsproxy != p->nsproxy) {
 		retval = ns_cgroup_clone(p, pid);
 		if (retval)
-			goto bad_fork_free_pid;
+			goto bad_fork_free_graph;
 	}
 
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1238,7 +1240,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		spin_unlock(&current->sighand->siglock);
 		write_unlock_irq(&tasklist_lock);
 		retval = -ERESTARTNOINTR;
-		goto bad_fork_free_pid;
+		goto bad_fork_free_graph;
 	}
 
 	if (clone_flags & CLONE_THREAD) {
@@ -1271,11 +1273,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
-	ftrace_graph_init_task(p);
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
 	return p;
 
+bad_fork_free_graph:
+	ftrace_graph_exit_task(p);
 bad_fork_free_pid:
 	if (pid != &init_struct_pid)
 		free_pid(pid);
-- 
cgit v1.2.3


From ea4e2bc4d9f7370e57a343ccb5e7c0ad3222ec3c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 3 Dec 2008 15:36:57 -0500
Subject: ftrace: graph of a single function

This patch adds the file:

   /debugfs/tracing/set_graph_function

which can be used along with the function graph tracer.

When this file is empty, the function graph tracer will act as
usual. When the file has a function in it, the function graph
tracer will only trace that function.

For example:

 # echo blk_unplug > /debugfs/tracing/set_graph_function
 # cat /debugfs/tracing/trace
 [...]
 ------------------------------------------
 | 2)  make-19003  =>  kjournald-2219
 ------------------------------------------

 2)               |  blk_unplug() {
 2)               |    dm_unplug_all() {
 2)               |      dm_get_table() {
 2)      1.381 us |        _read_lock();
 2)      0.911 us |        dm_table_get();
 2)      1. 76 us |        _read_unlock();
 2) +   12.912 us |      }
 2)               |      dm_table_unplug_all() {
 2)               |        blk_unplug() {
 2)      0.778 us |          generic_unplug_device();
 2)      2.409 us |        }
 2)      5.992 us |      }
 2)      0.813 us |      dm_table_put();
 2) +   29. 90 us |    }
 2) +   34.532 us |  }

You can add up to 32 functions into this file. Currently we limit it
to 32, but this may change with later improvements.

To add another function, use the append '>>':

  # echo sys_read >> /debugfs/tracing/set_graph_function
  # cat /debugfs/tracing/set_graph_function
  blk_unplug
  sys_read

Using the '>' will clear out the function and write anew:

  # echo sys_write > /debug/tracing/set_graph_function
  # cat /debug/tracing/set_graph_function
  sys_write

Note, if you have function graph running while doing this, the small
time between clearing it and updating it will cause the graph to
record all functions. This should not be an issue because after
it sets the filter, only those functions will be recorded from then on.
If you need to only record a particular function then set this
file first before starting the function graph tracer. In the future
this side effect may be corrected.

The set_graph_function file is similar to the set_ftrace_filter but
it does not take wild cards nor does it allow for more than one
function to be set with a single write. There is no technical reason why
this is the case, I just do not have the time yet to implement that.

Note, dynamic ftrace must be enabled for this to appear because it
uses the dynamic ftrace records to match the name to the mcount
call sites.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 227 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.c  |   8 ++
 kernel/trace/trace.h  |  30 ++++++-
 3 files changed, 264 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 65b9e863056..b17a30350f0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1320,6 +1320,224 @@ static struct file_operations ftrace_notrace_fops = {
 	.release = ftrace_notrace_release,
 };
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+static DEFINE_MUTEX(graph_lock);
+
+int ftrace_graph_count;
+unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
+
+static void *
+g_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	unsigned long *array = m->private;
+	int index = *pos;
+
+	(*pos)++;
+
+	if (index >= ftrace_graph_count)
+		return NULL;
+
+	return &array[index];
+}
+
+static void *g_start(struct seq_file *m, loff_t *pos)
+{
+	void *p = NULL;
+
+	mutex_lock(&graph_lock);
+
+	p = g_next(m, p, pos);
+
+	return p;
+}
+
+static void g_stop(struct seq_file *m, void *p)
+{
+	mutex_unlock(&graph_lock);
+}
+
+static int g_show(struct seq_file *m, void *v)
+{
+	unsigned long *ptr = v;
+	char str[KSYM_SYMBOL_LEN];
+
+	if (!ptr)
+		return 0;
+
+	kallsyms_lookup(*ptr, NULL, NULL, NULL, str);
+
+	seq_printf(m, "%s\n", str);
+
+	return 0;
+}
+
+static struct seq_operations ftrace_graph_seq_ops = {
+	.start = g_start,
+	.next = g_next,
+	.stop = g_stop,
+	.show = g_show,
+};
+
+static int
+ftrace_graph_open(struct inode *inode, struct file *file)
+{
+	int ret = 0;
+
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
+	mutex_lock(&graph_lock);
+	if ((file->f_mode & FMODE_WRITE) &&
+	    !(file->f_flags & O_APPEND)) {
+		ftrace_graph_count = 0;
+		memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
+	}
+
+	if (file->f_mode & FMODE_READ) {
+		ret = seq_open(file, &ftrace_graph_seq_ops);
+		if (!ret) {
+			struct seq_file *m = file->private_data;
+			m->private = ftrace_graph_funcs;
+		}
+	} else
+		file->private_data = ftrace_graph_funcs;
+	mutex_unlock(&graph_lock);
+
+	return ret;
+}
+
+static ssize_t
+ftrace_graph_read(struct file *file, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	if (file->f_mode & FMODE_READ)
+		return seq_read(file, ubuf, cnt, ppos);
+	else
+		return -EPERM;
+}
+
+static int
+ftrace_set_func(unsigned long *array, int idx, char *buffer)
+{
+	char str[KSYM_SYMBOL_LEN];
+	struct dyn_ftrace *rec;
+	struct ftrace_page *pg;
+	int found = 0;
+	int i;
+
+	if (ftrace_disabled)
+		return -ENODEV;
+
+	/* should not be called from interrupt context */
+	spin_lock(&ftrace_lock);
+
+	for (pg = ftrace_pages_start; pg; pg = pg->next) {
+		for (i = 0; i < pg->index; i++) {
+			rec = &pg->records[i];
+
+			if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
+				continue;
+
+			kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+			if (strcmp(str, buffer) == 0) {
+				found = 1;
+				array[idx] = rec->ip;
+				break;
+			}
+		}
+	}
+	spin_unlock(&ftrace_lock);
+
+	return found ? 0 : -EINVAL;
+}
+
+static ssize_t
+ftrace_graph_write(struct file *file, const char __user *ubuf,
+		   size_t cnt, loff_t *ppos)
+{
+	unsigned char buffer[FTRACE_BUFF_MAX+1];
+	unsigned long *array;
+	size_t read = 0;
+	ssize_t ret;
+	int index = 0;
+	char ch;
+
+	if (!cnt || cnt < 0)
+		return 0;
+
+	mutex_lock(&graph_lock);
+
+	if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	if (file->f_mode & FMODE_READ) {
+		struct seq_file *m = file->private_data;
+		array = m->private;
+	} else
+		array = file->private_data;
+
+	ret = get_user(ch, ubuf++);
+	if (ret)
+		goto out;
+	read++;
+	cnt--;
+
+	/* skip white space */
+	while (cnt && isspace(ch)) {
+		ret = get_user(ch, ubuf++);
+		if (ret)
+			goto out;
+		read++;
+		cnt--;
+	}
+
+	if (isspace(ch)) {
+		*ppos += read;
+		ret = read;
+		goto out;
+	}
+
+	while (cnt && !isspace(ch)) {
+		if (index < FTRACE_BUFF_MAX)
+			buffer[index++] = ch;
+		else {
+			ret = -EINVAL;
+			goto out;
+		}
+		ret = get_user(ch, ubuf++);
+		if (ret)
+			goto out;
+		read++;
+		cnt--;
+	}
+	buffer[index] = 0;
+
+	/* we allow only one at a time */
+	ret = ftrace_set_func(array, ftrace_graph_count, buffer);
+	if (ret)
+		goto out;
+
+	ftrace_graph_count++;
+
+	file->f_pos += read;
+
+	ret = read;
+ out:
+	mutex_unlock(&graph_lock);
+
+	return ret;
+}
+
+static const struct file_operations ftrace_graph_fops = {
+	.open = ftrace_graph_open,
+	.read = ftrace_graph_read,
+	.write = ftrace_graph_write,
+};
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
 static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
 {
 	struct dentry *entry;
@@ -1347,6 +1565,15 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
 		pr_warning("Could not create debugfs "
 			   "'set_ftrace_notrace' entry\n");
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	entry = debugfs_create_file("set_graph_function", 0444, d_tracer,
+				    NULL,
+				    &ftrace_graph_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'set_graph_function' entry\n");
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
 	return 0;
 }
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8b6409a62b5..710b39acd81 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1209,6 +1209,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	int cpu;
 	int pc;
 
+	if (!ftrace_graph_addr(trace->func))
+		return 0;
+
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
@@ -1217,6 +1220,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 		pc = preempt_count();
 		__trace_graph_entry(tr, data, trace, flags, pc);
 	}
+	/* Only do the atomic if it is not already set */
+	if (!test_tsk_trace_graph(current))
+		set_tsk_trace_graph(current);
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
 
@@ -1240,6 +1246,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 		pc = preempt_count();
 		__trace_graph_return(tr, data, trace, flags, pc);
 	}
+	if (!trace->depth)
+		clear_tsk_trace_graph(current);
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
 }
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 0565ae9a221..41f026bfc9e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -505,13 +505,41 @@ extern unsigned long trace_flags;
 /* Standard output formatting function used for function return traces */
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 extern enum print_line_t print_graph_function(struct trace_iterator *iter);
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+/* TODO: make this variable */
+#define FTRACE_GRAPH_MAX_FUNCS		32
+extern int ftrace_graph_count;
+extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
+
+static inline int ftrace_graph_addr(unsigned long addr)
+{
+	int i;
+
+	if (!ftrace_graph_count || test_tsk_trace_graph(current))
+		return 1;
+
+	for (i = 0; i < ftrace_graph_count; i++) {
+		if (addr == ftrace_graph_funcs[i])
+			return 1;
+	}
+
+	return 0;
+}
 #else
+static inline int ftrace_trace_addr(unsigned long addr)
+{
+	return 1
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#else /* CONFIG_FUNCTION_GRAPH_TRACER */
 static inline enum print_line_t
 print_graph_function(struct trace_iterator *iter)
 {
 	return TRACE_TYPE_UNHANDLED;
 }
-#endif
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 /*
  * trace_iterator_flags is an enumeration that defines bit
-- 
cgit v1.2.3


From 0ef8cde56ab92ab3f65221246dc1622c6b5068b3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 3 Dec 2008 15:36:58 -0500
Subject: ftrace: use task struct trace flag to filter on pid

Impact: clean up

Use the new task struct trace flags to determine if a process should be
traced or not.

Note: this moves the searching of the pid to the slow path of setting
the pid field. This needs to be converted to the pid name space.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b17a30350f0..c5049f54a27 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -47,7 +47,7 @@
 int ftrace_enabled __read_mostly;
 static int last_ftrace_enabled;
 
-/* ftrace_pid_trace >= 0 will only trace threads with this pid */
+/* set when tracing only a pid */
 static int ftrace_pid_trace = -1;
 
 /* Quick disabling of function tracer. */
@@ -90,7 +90,7 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
 
 static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
 {
-	if (current->pid != ftrace_pid_trace)
+	if (!test_tsk_trace_trace(current))
 		return;
 
 	ftrace_pid_function(ip, parent_ip);
@@ -1714,11 +1714,33 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
 		ftrace_pid_trace = -1;
 
 	} else {
+		struct task_struct *p;
+		int found = 0;
 
 		if (ftrace_pid_trace == val)
 			goto out;
 
-		ftrace_pid_trace = val;
+		/*
+		 * Find the task that matches this pid.
+		 * TODO: use pid namespaces instead.
+		 */
+		rcu_read_lock();
+		for_each_process(p) {
+			if (p->pid == val) {
+				found = 1;
+				set_tsk_trace_trace(p);
+			} else if (test_tsk_trace_trace(p))
+				clear_tsk_trace_trace(p);
+		}
+		rcu_read_unlock();
+
+		if (found)
+			ftrace_pid_trace = val;
+		else {
+			if (ftrace_pid_trace < 0)
+				goto out;
+			ftrace_pid_trace = -1;
+		}
 	}
 
 	/* update the function call */
-- 
cgit v1.2.3


From 804a685162a7080386714166776f57255a75238e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 3 Dec 2008 15:36:59 -0500
Subject: ftrace: trace single pid for function graph tracer

Impact: New feature

This patch makes the changes to set_ftrace_pid apply to the function
graph tracer.

  # echo $$ > /debugfs/tracing/set_ftrace_pid
  # echo function_graph > /debugfs/tracing/current_tracer

Will cause only the current task to be traced. Note, the trace flags are
also inherited by child processes, so the children of the shell
will also be traced.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c |  2 +-
 kernel/trace/trace.c  |  3 +++
 kernel/trace/trace.h  | 10 ++++++++++
 3 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c5049f54a27..57592a9dd63 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -48,7 +48,7 @@ int ftrace_enabled __read_mostly;
 static int last_ftrace_enabled;
 
 /* set when tracing only a pid */
-static int ftrace_pid_trace = -1;
+int ftrace_pid_trace = -1;
 
 /* Quick disabling of function tracer. */
 int function_trace_stop;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 710b39acd81..1bd9574404e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1209,6 +1209,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	int cpu;
 	int pc;
 
+	if (!ftrace_trace_task(current))
+		return 0;
+
 	if (!ftrace_graph_addr(trace->func))
 		return 0;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 41f026bfc9e..95fff37ed97 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -541,6 +541,16 @@ print_graph_function(struct trace_iterator *iter)
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
+extern int ftrace_pid_trace;
+
+static inline int ftrace_trace_task(struct task_struct *task)
+{
+	if (ftrace_pid_trace < 0)
+		return 1;
+
+	return test_tsk_trace_trace(task);
+}
+
 /*
  * trace_iterator_flags is an enumeration that defines bit
  * positions into trace_flags that controls the output.
-- 
cgit v1.2.3


From 978f3a45d9499c7a447ca7615455cefb63d44165 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 4 Dec 2008 00:26:40 -0500
Subject: ftrace: use struct pid

Impact: clean up, extend PID filtering to PID namespaces

Eric Biederman suggested using the struct pid for filtering on
pids in the kernel. This patch is based off of a demonstration
of an implementation that Eric sent me in an email.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 76 +++++++++++++++++++++++++++++----------------------
 kernel/trace/trace.h  |  4 +--
 2 files changed, 46 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 57592a9dd63..10b1d7c1b1d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -48,7 +48,7 @@ int ftrace_enabled __read_mostly;
 static int last_ftrace_enabled;
 
 /* set when tracing only a pid */
-int ftrace_pid_trace = -1;
+struct pid *ftrace_pid_trace;
 
 /* Quick disabling of function tracer. */
 int function_trace_stop;
@@ -153,7 +153,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
 		else
 			func = ftrace_list_func;
 
-		if (ftrace_pid_trace >= 0) {
+		if (ftrace_pid_trace) {
 			set_ftrace_pid_function(func);
 			func = ftrace_pid_func;
 		}
@@ -209,7 +209,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 		if (ftrace_list->next == &ftrace_list_end) {
 			ftrace_func_t func = ftrace_list->func;
 
-			if (ftrace_pid_trace >= 0) {
+			if (ftrace_pid_trace) {
 				set_ftrace_pid_function(func);
 				func = ftrace_pid_func;
 			}
@@ -239,7 +239,7 @@ static void ftrace_update_pid_func(void)
 
 	func = ftrace_trace_function;
 
-	if (ftrace_pid_trace >= 0) {
+	if (ftrace_pid_trace) {
 		set_ftrace_pid_function(func);
 		func = ftrace_pid_func;
 	} else {
@@ -1678,18 +1678,40 @@ ftrace_pid_read(struct file *file, char __user *ubuf,
 	char buf[64];
 	int r;
 
-	if (ftrace_pid_trace >= 0)
-		r = sprintf(buf, "%u\n", ftrace_pid_trace);
+	if (ftrace_pid_trace)
+		r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace));
 	else
 		r = sprintf(buf, "no pid\n");
 
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
+static void clear_ftrace_pid_task(struct pid **pid)
+{
+	struct task_struct *p;
+
+	do_each_pid_task(*pid, PIDTYPE_PID, p) {
+		clear_tsk_trace_trace(p);
+	} while_each_pid_task(*pid, PIDTYPE_PID, p);
+	put_pid(*pid);
+
+	*pid = NULL;
+}
+
+static void set_ftrace_pid_task(struct pid *pid)
+{
+	struct task_struct *p;
+
+	do_each_pid_task(pid, PIDTYPE_PID, p) {
+		set_tsk_trace_trace(p);
+	} while_each_pid_task(pid, PIDTYPE_PID, p);
+}
+
 static ssize_t
 ftrace_pid_write(struct file *filp, const char __user *ubuf,
 		   size_t cnt, loff_t *ppos)
 {
+	struct pid *pid;
 	char buf[64];
 	long val;
 	int ret;
@@ -1707,40 +1729,30 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
 		return ret;
 
 	mutex_lock(&ftrace_start_lock);
-	if (ret < 0) {
+	if (val < 0) {
 		/* disable pid tracing */
-		if (ftrace_pid_trace < 0)
+		if (!ftrace_pid_trace)
 			goto out;
-		ftrace_pid_trace = -1;
+
+		clear_ftrace_pid_task(&ftrace_pid_trace);
 
 	} else {
-		struct task_struct *p;
-		int found = 0;
+		pid = find_get_pid(val);
 
-		if (ftrace_pid_trace == val)
+		if (pid == ftrace_pid_trace) {
+			put_pid(pid);
 			goto out;
-
-		/*
-		 * Find the task that matches this pid.
-		 * TODO: use pid namespaces instead.
-		 */
-		rcu_read_lock();
-		for_each_process(p) {
-			if (p->pid == val) {
-				found = 1;
-				set_tsk_trace_trace(p);
-			} else if (test_tsk_trace_trace(p))
-				clear_tsk_trace_trace(p);
 		}
-		rcu_read_unlock();
 
-		if (found)
-			ftrace_pid_trace = val;
-		else {
-			if (ftrace_pid_trace < 0)
-				goto out;
-			ftrace_pid_trace = -1;
-		}
+		if (ftrace_pid_trace)
+			clear_ftrace_pid_task(&ftrace_pid_trace);
+
+		if (!pid)
+			goto out;
+
+		ftrace_pid_trace = pid;
+
+		set_ftrace_pid_task(ftrace_pid_trace);
 	}
 
 	/* update the function call */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 95fff37ed97..8b81b4d727b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -541,11 +541,11 @@ print_graph_function(struct trace_iterator *iter)
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
-extern int ftrace_pid_trace;
+extern struct pid *ftrace_pid_trace;
 
 static inline int ftrace_trace_task(struct task_struct *task)
 {
-	if (ftrace_pid_trace < 0)
+	if (ftrace_pid_trace)
 		return 1;
 
 	return test_tsk_trace_trace(task);
-- 
cgit v1.2.3


From e32d89569128e76bdf84867be0928902ca9f7555 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 4 Dec 2008 00:26:41 -0500
Subject: ftrace: add ability to only trace swapper tasks

Impact: new feature

This patch lets the swapper tasks of all CPUS be filtered by the
set_ftrace_pid file.

If '0' is echoed into this file, then all the idle tasks (aka swapper)
is flagged to be traced.  This affects all CPU idle tasks.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 74 +++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 63 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 10b1d7c1b1d..eb57dc1ea09 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -49,6 +49,7 @@ static int last_ftrace_enabled;
 
 /* set when tracing only a pid */
 struct pid *ftrace_pid_trace;
+static struct pid * const ftrace_swapper_pid = (struct pid *)1;
 
 /* Quick disabling of function tracer. */
 int function_trace_stop;
@@ -1678,7 +1679,9 @@ ftrace_pid_read(struct file *file, char __user *ubuf,
 	char buf[64];
 	int r;
 
-	if (ftrace_pid_trace)
+	if (ftrace_pid_trace == ftrace_swapper_pid)
+		r = sprintf(buf, "swapper tasks\n");
+	else if (ftrace_pid_trace)
 		r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace));
 	else
 		r = sprintf(buf, "no pid\n");
@@ -1686,19 +1689,43 @@ ftrace_pid_read(struct file *file, char __user *ubuf,
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
-static void clear_ftrace_pid_task(struct pid **pid)
+static void clear_ftrace_swapper(void)
 {
 	struct task_struct *p;
+	int cpu;
 
-	do_each_pid_task(*pid, PIDTYPE_PID, p) {
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		p = idle_task(cpu);
 		clear_tsk_trace_trace(p);
-	} while_each_pid_task(*pid, PIDTYPE_PID, p);
-	put_pid(*pid);
+	}
+	put_online_cpus();
+}
 
-	*pid = NULL;
+static void set_ftrace_swapper(void)
+{
+	struct task_struct *p;
+	int cpu;
+
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		p = idle_task(cpu);
+		set_tsk_trace_trace(p);
+	}
+	put_online_cpus();
 }
 
-static void set_ftrace_pid_task(struct pid *pid)
+static void clear_ftrace_pid(struct pid *pid)
+{
+	struct task_struct *p;
+
+	do_each_pid_task(pid, PIDTYPE_PID, p) {
+		clear_tsk_trace_trace(p);
+	} while_each_pid_task(pid, PIDTYPE_PID, p);
+	put_pid(pid);
+}
+
+static void set_ftrace_pid(struct pid *pid)
 {
 	struct task_struct *p;
 
@@ -1707,6 +1734,24 @@ static void set_ftrace_pid_task(struct pid *pid)
 	} while_each_pid_task(pid, PIDTYPE_PID, p);
 }
 
+static void clear_ftrace_pid_task(struct pid **pid)
+{
+	if (*pid == ftrace_swapper_pid)
+		clear_ftrace_swapper();
+	else
+		clear_ftrace_pid(*pid);
+
+	*pid = NULL;
+}
+
+static void set_ftrace_pid_task(struct pid *pid)
+{
+	if (pid == ftrace_swapper_pid)
+		set_ftrace_swapper();
+	else
+		set_ftrace_pid(pid);
+}
+
 static ssize_t
 ftrace_pid_write(struct file *filp, const char __user *ubuf,
 		   size_t cnt, loff_t *ppos)
@@ -1737,11 +1782,18 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
 		clear_ftrace_pid_task(&ftrace_pid_trace);
 
 	} else {
-		pid = find_get_pid(val);
+		/* swapper task is special */
+		if (!val) {
+			pid = ftrace_swapper_pid;
+			if (pid == ftrace_pid_trace)
+				goto out;
+		} else {
+			pid = find_get_pid(val);
 
-		if (pid == ftrace_pid_trace) {
-			put_pid(pid);
-			goto out;
+			if (pid == ftrace_pid_trace) {
+				put_pid(pid);
+				goto out;
+			}
 		}
 
 		if (ftrace_pid_trace)
-- 
cgit v1.2.3


From 6b2539302bee8e88c99e3c7d80c16a04dbe5e2ad Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Dec 2008 09:18:28 +0100
Subject: tracing: fix typo and missing inline function

Impact: fix build bugs

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8b81b4d727b..b4b7b735184 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -529,7 +529,11 @@ static inline int ftrace_graph_addr(unsigned long addr)
 #else
 static inline int ftrace_trace_addr(unsigned long addr)
 {
-	return 1
+	return 1;
+}
+static inline int ftrace_graph_addr(unsigned long addr)
+{
+	return 1;
 }
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
-- 
cgit v1.2.3


From faec2ec505d397e9426754722b6e80d519c4938f Mon Sep 17 00:00:00 2001
From: Liming Wang <liming.wang@windriver.com>
Date: Thu, 4 Dec 2008 14:24:49 +0800
Subject: ftrace: avoid duplicated function when writing set_graph_function

Impact: fix a bug in function filter setting

when writing function to set_graph_function, we should check whether it
has existed in set_graph_function to avoid duplicating.

Signed-off-by: Liming Wang <liming.wang@windriver.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eb57dc1ea09..d2b15653816 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1425,7 +1425,7 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer)
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
 	int found = 0;
-	int i;
+	int i, j;
 
 	if (ftrace_disabled)
 		return -ENODEV;
@@ -1443,7 +1443,13 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer)
 			kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
 			if (strcmp(str, buffer) == 0) {
 				found = 1;
-				array[idx] = rec->ip;
+				for (j = 0; j < idx; j++)
+					if (array[j] == rec->ip) {
+						found = 0;
+						break;
+					}
+				if (found)
+					array[idx] = rec->ip;
 				break;
 			}
 		}
-- 
cgit v1.2.3


From 00ef9f7348dfd2fc223ec42aceb30836e86b367f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Dec 2008 09:00:17 +0100
Subject: lockdep: change a held lock's class

Impact: introduce new lockdep API

Allow to change a held lock's class. Basically the same as the existing
code to change a subclass therefore reuse all that.

The XFS code will be able to use this to annotate their inode locking.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 90f3fb64dbc..4fa6eeb4e8a 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -291,14 +291,12 @@ void lockdep_off(void)
 {
 	current->lockdep_recursion++;
 }
-
 EXPORT_SYMBOL(lockdep_off);
 
 void lockdep_on(void)
 {
 	current->lockdep_recursion--;
 }
-
 EXPORT_SYMBOL(lockdep_on);
 
 /*
@@ -2513,7 +2511,6 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
 	if (subclass)
 		register_lock_class(lock, subclass, 1);
 }
-
 EXPORT_SYMBOL_GPL(lockdep_init_map);
 
 /*
@@ -2694,8 +2691,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
 }
 
 static int
-__lock_set_subclass(struct lockdep_map *lock,
-		    unsigned int subclass, unsigned long ip)
+__lock_set_class(struct lockdep_map *lock, const char *name,
+		 struct lock_class_key *key, unsigned int subclass,
+		 unsigned long ip)
 {
 	struct task_struct *curr = current;
 	struct held_lock *hlock, *prev_hlock;
@@ -2722,6 +2720,7 @@ __lock_set_subclass(struct lockdep_map *lock,
 	return print_unlock_inbalance_bug(curr, lock, ip);
 
 found_it:
+	lockdep_init_map(lock, name, key, 0);
 	class = register_lock_class(lock, subclass, 0);
 	hlock->class_idx = class - lock_classes + 1;
 
@@ -2906,9 +2905,9 @@ static void check_flags(unsigned long flags)
 #endif
 }
 
-void
-lock_set_subclass(struct lockdep_map *lock,
-		  unsigned int subclass, unsigned long ip)
+void lock_set_class(struct lockdep_map *lock, const char *name,
+		    struct lock_class_key *key, unsigned int subclass,
+		    unsigned long ip)
 {
 	unsigned long flags;
 
@@ -2918,13 +2917,12 @@ lock_set_subclass(struct lockdep_map *lock,
 	raw_local_irq_save(flags);
 	current->lockdep_recursion = 1;
 	check_flags(flags);
-	if (__lock_set_subclass(lock, subclass, ip))
+	if (__lock_set_class(lock, name, key, subclass, ip))
 		check_chain_key(current);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }
-
-EXPORT_SYMBOL_GPL(lock_set_subclass);
+EXPORT_SYMBOL_GPL(lock_set_class);
 
 /*
  * We are not always called with irqs disabled - do that here,
@@ -2948,7 +2946,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }
-
 EXPORT_SYMBOL_GPL(lock_acquire);
 
 void lock_release(struct lockdep_map *lock, int nested,
@@ -2966,7 +2963,6 @@ void lock_release(struct lockdep_map *lock, int nested,
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }
-
 EXPORT_SYMBOL_GPL(lock_release);
 
 #ifdef CONFIG_LOCK_STAT
@@ -3451,7 +3447,6 @@ retry:
 	if (unlock)
 		read_unlock(&tasklist_lock);
 }
-
 EXPORT_SYMBOL_GPL(debug_show_all_locks);
 
 /*
@@ -3472,7 +3467,6 @@ void debug_show_held_locks(struct task_struct *task)
 {
 		__debug_show_held_locks(task);
 }
-
 EXPORT_SYMBOL_GPL(debug_show_held_locks);
 
 void lockdep_sys_exit(void)
-- 
cgit v1.2.3


From 1fd8f2a3f9a91b287a876cef830b21baafc8a799 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 3 Dec 2008 23:45:11 +0100
Subject: tracing/function-graph-tracer: handle ftrace_printk entries

Handle the TRACE_PRINT entries from the function grapg tracer
and output them as a C comment just below the function that called
it, as if it was a comment inside this function.

Example with an ftrace_printk inside might_sleep() function:

void __might_sleep(char *file, int line)
{
	static unsigned long prev_jiffy;	/* ratelimiting */

	ftrace_printk("Hi I'm a comment in might_sleep() :-)");

A chunk of a resulting trace:

 0)               |        _reiserfs_free_block() {
 0)               |          reiserfs_read_bitmap_block() {
 0)               |            __bread() {
 0)               |              __getblk() {
 0)               |                __find_get_block() {
 0)   0.698 us    |                  mark_page_accessed();
 0)   2.267 us    |                }
 0)               |                __might_sleep() {
 0)               |                  /* Hi I'm a comment in might_sleep() :-) */
 0)   1.321 us    |                }
 0)   5.872 us    |              }
 0)   7.313 us    |            }
 0)   8.718 us    |          }

And this patch brings two minor fixes:

- The newline after a switch-out task has disappeared
- The "|" sign just before the cpu number on task-switch has been deleted.

 0)   0.616 us    |                pick_next_task_rt();
 0)   1.457 us    |                _spin_trylock();
 0)   0.653 us    |                _spin_unlock();
 0)   0.728 us    |                _spin_trylock();
 0)   0.631 us    |                _spin_unlock();
 0)   0.729 us    |                native_load_sp0();
 0)   0.593 us    |                native_load_tls();
 ------------------------------------------
 0)    cat-2834    =>   migrati-3
 ------------------------------------------

 0)               |    finish_task_switch() {
 0)   0.841 us    |      _spin_unlock_irq();
 0)   0.616 us    |      post_schedule_rt();
 0)   3.882 us    |    }

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c                 | 28 +++++++++++---
 kernel/trace/trace.h                 |  4 +-
 kernel/trace/trace_functions_graph.c | 72 +++++++++++++++++++++++++++++++++++-
 kernel/trace/trace_mmiotrace.c       |  2 +-
 4 files changed, 97 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8b6409a62b5..1ca74c0cee6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3335,7 +3335,7 @@ static int mark_printk(const char *fmt, ...)
 	int ret;
 	va_list args;
 	va_start(args, fmt);
-	ret = trace_vprintk(0, fmt, args);
+	ret = trace_vprintk(0, -1, fmt, args);
 	va_end(args);
 	return ret;
 }
@@ -3564,9 +3564,16 @@ static __init int tracer_init_debugfs(void)
 	return 0;
 }
 
-int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
+int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 {
-	static DEFINE_SPINLOCK(trace_buf_lock);
+	/*
+	 * Raw Spinlock because a normal spinlock would be traced here
+	 * and append an irrelevant couple spin_lock_irqsave/
+	 * spin_unlock_irqrestore traced by ftrace around this
+	 * TRACE_PRINTK trace.
+	 */
+	static raw_spinlock_t trace_buf_lock =
+				(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 	static char trace_buf[TRACE_BUF_SIZE];
 
 	struct ring_buffer_event *event;
@@ -3587,7 +3594,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 	if (unlikely(atomic_read(&data->disabled)))
 		goto out;
 
-	spin_lock_irqsave(&trace_buf_lock, flags);
+	local_irq_save(flags);
+	__raw_spin_lock(&trace_buf_lock);
 	len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
 
 	len = min(len, TRACE_BUF_SIZE-1);
@@ -3601,13 +3609,15 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 	tracing_generic_entry_update(&entry->ent, flags, pc);
 	entry->ent.type			= TRACE_PRINT;
 	entry->ip			= ip;
+	entry->depth			= depth;
 
 	memcpy(&entry->buf, trace_buf, len);
 	entry->buf[len] = 0;
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
 
  out_unlock:
-	spin_unlock_irqrestore(&trace_buf_lock, flags);
+	__raw_spin_unlock(&trace_buf_lock);
+	local_irq_restore(flags);
 
  out:
 	preempt_enable_notrace();
@@ -3625,7 +3635,13 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...)
 		return 0;
 
 	va_start(ap, fmt);
-	ret = trace_vprintk(ip, fmt, ap);
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	ret = trace_vprintk(ip, current->curr_ret_stack, fmt, ap);
+#else
+	ret = trace_vprintk(ip, -1, fmt, ap);
+#endif
+
 	va_end(ap);
 	return ret;
 }
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 0565ae9a221..fce98898205 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -117,6 +117,7 @@ struct userstack_entry {
 struct print_entry {
 	struct trace_entry	ent;
 	unsigned long		ip;
+	int			depth;
 	char			buf[];
 };
 
@@ -498,7 +499,8 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
 extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
 				 size_t cnt);
 extern long ns2usecs(cycle_t nsec);
-extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args);
+extern int
+trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);
 
 extern unsigned long trace_flags;
 
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index c66578f2fdc..32b7fb9a19d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -173,7 +173,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu)
 
  */
 	ret = trace_seq_printf(s,
-		"\n ------------------------------------------\n |");
+		" ------------------------------------------\n");
 	if (!ret)
 		TRACE_TYPE_PARTIAL_LINE;
 
@@ -477,6 +477,71 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 	return TRACE_TYPE_HANDLED;
 }
 
+static enum print_line_t
+print_graph_comment(struct print_entry *trace, struct trace_seq *s,
+		   struct trace_entry *ent, struct trace_iterator *iter)
+{
+	int i;
+	int ret;
+
+	/* Pid */
+	if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Cpu */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+		ret = print_graph_cpu(s, iter->cpu);
+		if (ret == TRACE_TYPE_PARTIAL_LINE)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	/* Proc */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
+		ret = print_graph_proc(s, ent->pid);
+		if (ret == TRACE_TYPE_PARTIAL_LINE)
+			return TRACE_TYPE_PARTIAL_LINE;
+
+		ret = trace_seq_printf(s, " | ");
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	/* No overhead */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+		ret = trace_seq_printf(s, "  ");
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	/* No time */
+	ret = trace_seq_printf(s, "            |  ");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Indentation */
+	if (trace->depth > 0)
+		for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) {
+			ret = trace_seq_printf(s, " ");
+			if (!ret)
+				return TRACE_TYPE_PARTIAL_LINE;
+		}
+
+	/* The comment */
+	ret = trace_seq_printf(s, "/* %s", trace->buf);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	if (ent->flags & TRACE_FLAG_CONT)
+		trace_seq_print_cont(s, iter);
+
+	ret = trace_seq_printf(s, " */\n");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+
 enum print_line_t
 print_graph_function(struct trace_iterator *iter)
 {
@@ -495,6 +560,11 @@ print_graph_function(struct trace_iterator *iter)
 		trace_assign_type(field, entry);
 		return print_graph_return(&field->ret, s, entry, iter->cpu);
 	}
+	case TRACE_PRINT: {
+		struct print_entry *field;
+		trace_assign_type(field, entry);
+		return print_graph_comment(field, s, entry, iter);
+	}
 	default:
 		return TRACE_TYPE_UNHANDLED;
 	}
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 2a98a206acc..2fb6da6523b 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -366,5 +366,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
 
 int mmio_trace_printk(const char *fmt, va_list args)
 {
-	return trace_vprintk(0, fmt, args);
+	return trace_vprintk(0, -1, fmt, args);
 }
-- 
cgit v1.2.3


From 37810659ea7d9572c5ac284ade272f806ef8f788 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 4 Dec 2008 11:17:10 +0100
Subject: hrtimer: removing all ur callback modes, fix hotplug

Impact: fix hrtimer locking (reported by lockdep) in the CPU hotplug case

This addition fixes the hotplug locking issue on my machine

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hrtimer.c | 65 ++++++++++++++++++++++++++++++++------------------------
 1 file changed, 37 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index efd6f41e1c1..b09c7a27631 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1496,7 +1496,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 #ifdef CONFIG_HOTPLUG_CPU
 
 static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
-				 struct hrtimer_clock_base *new_base, int dcpu)
+				struct hrtimer_clock_base *new_base)
 {
 	struct hrtimer *timer;
 	struct rb_node *node;
@@ -1514,40 +1514,34 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
 		timer->base = new_base;
 		/*
-		 * Enqueue the timer. Allow reprogramming of the event device
+		 * Enqueue the timers on the new cpu, but do not reprogram 
+		 * the timer as that would enable a deadlock between
+		 * hrtimer_enqueue_reprogramm() running the timer and us still
+		 * holding a nested base lock.
+		 *
+		 * Instead we tickle the hrtimer interrupt after the migration
+		 * is done, which will run all expired timers and re-programm
+		 * the timer device.
 		 */
-		enqueue_hrtimer(timer, new_base, 1);
+		enqueue_hrtimer(timer, new_base, 0);
 
-#ifdef CONFIG_HIGH_RES_TIMERS
-		/*
-		 * Happens with high res enabled when the timer was
-		 * already expired and the callback mode is
-		 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
-		 * enqueue code does not move them to the soft irq
-		 * pending list for performance/latency reasons, but
-		 * in the migration state, we need to do that
-		 * otherwise we end up with a stale timer.
-		 */
-		if (timer->state == HRTIMER_STATE_MIGRATE) {
-			/* XXX: running on offline cpu */
-			__run_hrtimer(timer);
-		}
-#endif
 		/* Clear the migration state bit */
 		timer->state &= ~HRTIMER_STATE_MIGRATE;
 	}
 }
 
-static void migrate_hrtimers(int cpu)
+static int migrate_hrtimers(int scpu)
 {
 	struct hrtimer_cpu_base *old_base, *new_base;
-	int i;
+	int dcpu, i;
 
-	BUG_ON(cpu_online(cpu));
-	old_base = &per_cpu(hrtimer_bases, cpu);
+	BUG_ON(cpu_online(scpu));
+	old_base = &per_cpu(hrtimer_bases, scpu);
 	new_base = &get_cpu_var(hrtimer_bases);
 
-	tick_cancel_sched_timer(cpu);
+	dcpu = smp_processor_id();
+
+	tick_cancel_sched_timer(scpu);
 	/*
 	 * The caller is globally serialized and nobody else
 	 * takes two locks at once, deadlock is not possible.
@@ -1557,32 +1551,47 @@ static void migrate_hrtimers(int cpu)
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		migrate_hrtimer_list(&old_base->clock_base[i],
-				     &new_base->clock_base[i], cpu);
+				     &new_base->clock_base[i]);
 	}
 
 	spin_unlock(&old_base->lock);
 	spin_unlock_irq(&new_base->lock);
 	put_cpu_var(hrtimer_bases);
+
+	return dcpu;
+}
+
+static void tickle_timers(void *arg)
+{
+	hrtimer_peek_ahead_timers();
 }
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
 					unsigned long action, void *hcpu)
 {
-	unsigned int cpu = (long)hcpu;
+	int dcpu = -1, scpu = (long)hcpu;
 
 	switch (action) {
 
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
-		init_hrtimers_cpu(cpu);
+		init_hrtimers_cpu(scpu);
 		break;
 
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu);
-		migrate_hrtimers(cpu);
+		clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
+		dcpu = migrate_hrtimers(scpu);
+		break;
+
+	case CPU_POST_DEAD:
+		if (dcpu == -1)
+			break;
+
+		smp_call_function_single(dcpu, tickle_timers, NULL, 0);
 		break;
 #endif
 
-- 
cgit v1.2.3


From 0871420fad5844cb63cfcf85508c17bd9b15c08f Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 16 Nov 2008 23:49:24 -0800
Subject: sparc64: Add tsb-ratio sysctl.

Add a sysctl to tweak the RSS limit used to decide when to grow
the TSB for an address space.

In order to avoid expensive divides and multiplies only simply
positive and negative powers of two are supported.

The function computed takes the number of TSB translations that will
fit at one time in the TSB of a given size, and either adds or
subtracts a percentage of entries.  This final value is the
RSS limit.

See tsb_size_to_rss_limit().

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/sysctl.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3d56fe7570d..4e2ac0aec9b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -121,6 +121,10 @@ extern int sg_big_buff;
 #include <asm/system.h>
 #endif
 
+#ifdef CONFIG_SPARC64
+extern int sysctl_tsb_ratio;
+#endif
+
 #ifdef __hppa__
 extern int pwrsw_enabled;
 extern int unaligned_enabled;
@@ -451,6 +455,16 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+#ifdef CONFIG_SPARC64
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "tsb-ratio",
+		.data		= &sysctl_tsb_ratio,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 #ifdef __hppa__
 	{
 		.ctl_name	= KERN_HPPA_PWRSW,
-- 
cgit v1.2.3


From ff32504fdc56407654584ef187b20022c94a3486 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 4 Dec 2008 23:47:35 +0100
Subject: tracing/ftrace: don't insert TRACE_PRINT during selftests

Impact: fix tracer selfstests false results

After setting a ftrace_printk somewhere in th kernel, I saw the
Function tracer selftest failing.

When a selftest occurs, the ring buffer is lurked to see if
some entries were inserted. But concurrent insertion such as
ftrace_printk could occured at the same time and could give
false positive or negative results.

This patch prevent prevent from TRACE_PRINT entries insertion
during selftests.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ea38652d631..5dca6ef1fbe 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -44,6 +44,14 @@
 unsigned long __read_mostly	tracing_max_latency = (cycle_t)ULONG_MAX;
 unsigned long __read_mostly	tracing_thresh;
 
+/* We need to change this state when a selftest is running.
+ * A selftest will lurk into the ring-buffer to count the
+ * entries inserted during the selftest although some concurrent
+ * insertions into the ring-buffer such as ftrace_printk could occurred
+ * at the same time, giving false positive or negative results.
+ */
+static atomic_t tracing_selftest_running = ATOMIC_INIT(0);
+
 /* For tracers that don't implement custom flags */
 static struct tracer_opt dummy_tracer_opt[] = {
 	{ }
@@ -589,6 +597,8 @@ int register_tracer(struct tracer *type)
 		struct tracer *saved_tracer = current_trace;
 		struct trace_array *tr = &global_trace;
 		int i;
+
+		atomic_set(&tracing_selftest_running, 1);
 		/*
 		 * Run a selftest on this tracer.
 		 * Here we reset the trace buffer, and set the current
@@ -603,6 +613,7 @@ int register_tracer(struct tracer *type)
 		/* the test is responsible for initializing and enabling */
 		pr_info("Testing tracer %s: ", type->name);
 		ret = type->selftest(type, tr);
+		atomic_set(&tracing_selftest_running, 0);
 		/* the test is responsible for resetting too */
 		current_trace = saved_tracer;
 		if (ret) {
@@ -3594,7 +3605,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	unsigned long flags, irq_flags;
 	int cpu, len = 0, size, pc;
 
-	if (tracing_disabled)
+	if (tracing_disabled || atomic_read(&tracing_selftest_running))
 		return 0;
 
 	pc = preempt_count();
-- 
cgit v1.2.3


From 77d683f3e0258d522c5506e7b5fd05c9411184d9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 4 Dec 2008 23:49:47 +0100
Subject: tracing/ftrace: fix the check of ftrace_trace_task

Impact: fix default empty traces on function-graph-tracer

The actual ftrace_trace_task() checks if ftrace_pid_trace is allocated
and return 1 if it is true.
If it is NULL, it will check the bit of pid tracing flag for the current
task (which are not set by default).
So by default, a task is not traced.
Actually all tasks should be traced by default and filter_by_pid when
ftrace_pid_trace is allocated.

The appropriate condition should be to return 1 if filter_by_pid is
set.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acke-dby: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a71bbe0a363..5ac697065a4 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -551,7 +551,7 @@ extern struct pid *ftrace_pid_trace;
 
 static inline int ftrace_trace_task(struct task_struct *task)
 {
-	if (ftrace_pid_trace)
+	if (!ftrace_pid_trace)
 		return 1;
 
 	return test_tsk_trace_trace(task);
-- 
cgit v1.2.3


From 21a8c466f99063eeb8567318b4e305eda9015408 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 4 Dec 2008 23:51:23 +0100
Subject: tracing/ftrace: provide the macro task_curr_ret_stack()

Impact: cleanup

As suggested by Steven Rostedt, this patch provide a new macro
task_curr_ret_stack() to move the cpp conditionnal CONFIG into
the linux/ftrace.h headers.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5dca6ef1fbe..7a93c663e52 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3657,13 +3657,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...)
 		return 0;
 
 	va_start(ap, fmt);
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	ret = trace_vprintk(ip, current->curr_ret_stack, fmt, ap);
-#else
-	ret = trace_vprintk(ip, -1, fmt, ap);
-#endif
-
+	ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
 	va_end(ap);
 	return ret;
 }
-- 
cgit v1.2.3


From 21bbecdaaef3a6acc19905ab88c0587817318870 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 4 Dec 2008 23:30:56 -0500
Subject: ftrace: use init_struct_pid as swapper pid

Impact: clean up

Using (struct pid *)-1 as the pointer for ftrace_swapper_pid is
a little confusing for others. This patch uses the address of the
actual init pid structure instead. This change is only for
clarity. It does not affect the code itself. Hopefully soon the
swapper tasks will all have their own pid structure and then
we can clean up the code a bit more.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d2b15653816..2971fe48f55 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -49,7 +49,7 @@ static int last_ftrace_enabled;
 
 /* set when tracing only a pid */
 struct pid *ftrace_pid_trace;
-static struct pid * const ftrace_swapper_pid = (struct pid *)1;
+static struct pid * const ftrace_swapper_pid = &init_struct_pid;
 
 /* Quick disabling of function tracer. */
 int function_trace_stop;
-- 
cgit v1.2.3


From c37bbb0fdcc01610fd55604eb6927210a1d20044 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Wed, 3 Dec 2008 13:17:06 -0600
Subject: user namespaces: let user_ns be cloned with fairsched

(These two patches are in the next-unacked branch of
git://git.kernel.org/pub/scm/linux/kernel/git/sergeh/userns-2.6.
If they get some ACKs, then I hope to feed this into security-next.
After these two, I think we're ready to tackle userns+capabilities)

Fairsched creates a per-uid directory under /sys/kernel/uids/.
So when you clone(CLONE_NEWUSER), it tries to create
/sys/kernel/uids/0, which already exists, and you get back
-ENOMEM.

This was supposed to be fixed by sysfs tagging, but that
was postponed (ok, rejected until sysfs locking is fixed).
So, just as with network namespaces, we just don't create
those directories for user namespaces other than the init.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/user.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index 97202cb29ad..6c924bc48c0 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -246,6 +246,8 @@ static int uids_user_create(struct user_struct *up)
 	int error;
 
 	memset(kobj, 0, sizeof(struct kobject));
+	if (up->user_ns != &init_user_ns)
+		return 0;
 	kobj->kset = uids_kset;
 	error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
 	if (error) {
@@ -281,6 +283,8 @@ static void remove_user_sysfs_dir(struct work_struct *w)
 	unsigned long flags;
 	int remove_user = 0;
 
+	if (up->user_ns != &init_user_ns)
+		return;
 	/* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
 	 * atomic.
 	 */
-- 
cgit v1.2.3


From 7657d90497f98426af17f0ac633a9b335bb7a8fb Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Wed, 3 Dec 2008 13:17:33 -0600
Subject: user namespaces: require cap_set{ug}id for CLONE_NEWUSER

While ideally CLONE_NEWUSER will eventually require no
privilege, the required permission checks are currently
not there.  As a result, CLONE_NEWUSER has the same effect
as a setuid(0)+setgroups(1,"0").  While we already require
CAP_SYS_ADMIN, requiring CAP_SETUID and CAP_SETGID seems
appropriate.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/fork.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 1dd89451fae..e3a85b33107 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1344,7 +1344,8 @@ long do_fork(unsigned long clone_flags,
 		/* hopefully this check will go away when userns support is
 		 * complete
 		 */
-		if (!capable(CAP_SYS_ADMIN))
+		if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
+				!capable(CAP_SETGID))
 			return -EPERM;
 	}
 
-- 
cgit v1.2.3


From decbec3838d10ecd7aabdb4c0e05aac0e5f5dc0c Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 8 Dec 2008 01:56:06 +0100
Subject: tracing/function-graph-tracer: implement a print_headers function

Impact: provide trace headers to explain a bit the output

This patch implements the print_headers callback for the function graph
tracer. These headers are output according to the current trace options.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions_graph.c | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 32b7fb9a19d..af60eef4cbc 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -570,11 +570,36 @@ print_graph_function(struct trace_iterator *iter)
 	}
 }
 
+static void print_graph_headers(struct seq_file *s)
+{
+	/* 1st line */
+	seq_printf(s, "# ");
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
+		seq_printf(s, "CPU ");
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
+		seq_printf(s, "TASK/PID     ");
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD)
+		seq_printf(s, "OVERHEAD/");
+	seq_printf(s, "DURATION            FUNCTION CALLS\n");
+
+	/* 2nd line */
+	seq_printf(s, "# ");
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
+		seq_printf(s, "|   ");
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
+		seq_printf(s, "|      |     ");
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+		seq_printf(s, "|        ");
+		seq_printf(s, "|                   |   |   |   |\n");
+	} else
+		seq_printf(s, "    |               |   |   |   |\n");
+}
 static struct tracer graph_trace __read_mostly = {
-	.name	     = "function_graph",
-	.init	     = graph_trace_init,
-	.reset	     = graph_trace_reset,
-	.print_line = print_graph_function,
+	.name	     	= "function_graph",
+	.init	     	= graph_trace_init,
+	.reset	     	= graph_trace_reset,
+	.print_line	= print_graph_function,
+	.print_header	= print_graph_headers,
 	.flags		= &tracer_flags,
 };
 
-- 
cgit v1.2.3


From 5436499e6098759c2340f8b906ea52f993dc4efb Mon Sep 17 00:00:00 2001
From: Ken Chen <kenchen@google.com>
Date: Sun, 7 Dec 2008 18:47:37 -0800
Subject: sched: fix sd_parent_degenerate on non-numa smp machine

Impact: optimize the sched domains tree some more

The addition of SD_SERIALIZE flag added to SD_NODE_INIT prevented top level
dummy numa sched_domain to be properly degenerated on non-numa smp machine.
The reason is that in sd_parent_degenerate(), it found that the child and
parent does not have comon sched_domain flags due to SD_SERIALIZE.  However,
for non-numa smp box, the top level is a dummy with a single sched_group.

Filter out SD_SERIALIZE if it is on non-numa machine to properly degenerate
top level node sched_domain.  this will cut back some of the sd domain walk
in the load balancer code.

Signed-off-by: Ken Chen <kenchen@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 152828239ef..74498c840f9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6768,6 +6768,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 				SD_BALANCE_EXEC |
 				SD_SHARE_CPUPOWER |
 				SD_SHARE_PKG_RESOURCES);
+		if (nr_node_ids == 1)
+			pflags &= ~SD_SERIALIZE;
 	}
 	if (~cflags & pflags)
 		return 0;
-- 
cgit v1.2.3


From 8b96f0119818964e4944fd1c423bf6770027d3ac Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 6 Dec 2008 03:40:00 +0100
Subject: tracing/function-graph-tracer: introduce __notrace_funcgraph to
 filter special functions

Impact: trace more functions

When the function graph tracer is configured, three more files are not
traced to prevent only four functions to be traced. And this impacts the
normal function tracer too.

arch/x86/kernel/process_64/32.c:

I had crashes when I let this file traced. After some debugging, I saw
that the "current" task point was changed inside__swtich_to(), ie:
"write_pda(pcurrent, next_p);" inside process_64.c Since the tracer store
the original return address of the function inside current, we had
crashes. Only __switch_to() has to be excluded from tracing.

kernel/module.c and kernel/extable.c:

Because of a function used internally by the function graph tracer:
__kernel_text_address()

To let the other functions inside these files to be traced, this patch
introduces the __notrace_funcgraph function prefix which is __notrace if
function graph tracer is configured and nothing if not.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile  | 4 ----
 kernel/extable.c | 5 +++--
 kernel/module.c  | 2 +-
 3 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 703cf3b7389..19fad003b19 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,10 +21,6 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_sched.o = -pg
 endif
-ifdef CONFIG_FUNCTION_GRAPH_TRACER
-CFLAGS_REMOVE_extable.o = -pg # For __kernel_text_address()
-CFLAGS_REMOVE_module.o = -pg # For __module_text_address()
-endif
 
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
diff --git a/kernel/extable.c b/kernel/extable.c
index a26cb2e1702..feb0317cf09 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -17,6 +17,7 @@
 */
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/ftrace.h>
 #include <asm/uaccess.h>
 #include <asm/sections.h>
 
@@ -40,7 +41,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
 	return e;
 }
 
-int core_kernel_text(unsigned long addr)
+__notrace_funcgraph int core_kernel_text(unsigned long addr)
 {
 	if (addr >= (unsigned long)_stext &&
 	    addr <= (unsigned long)_etext)
@@ -53,7 +54,7 @@ int core_kernel_text(unsigned long addr)
 	return 0;
 }
 
-int __kernel_text_address(unsigned long addr)
+__notrace_funcgraph int __kernel_text_address(unsigned long addr)
 {
 	if (core_kernel_text(addr))
 		return 1;
diff --git a/kernel/module.c b/kernel/module.c
index 89bcf7c1327..dd2a54155b5 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2704,7 +2704,7 @@ int is_module_address(unsigned long addr)
 
 
 /* Is this a valid kernel address? */
-struct module *__module_text_address(unsigned long addr)
+__notrace_funcgraph struct module *__module_text_address(unsigned long addr)
 {
 	struct module *mod;
 
-- 
cgit v1.2.3


From 8e1b82e0866befaa0b2920be296c6e4c3fc7f422 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 6 Dec 2008 03:41:33 +0100
Subject: tracing/function-graph-tracer: turn tracing_selftest_running into an
 int

Impact: cleanup

Apply some suggestions of Steven Rostedt:

_turn tracing_selftest_running into a simple int (no need of an atomic_t)
_set it __read_mostly
_fix a comment style

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7a93c663e52..33549537f30 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -44,13 +44,14 @@
 unsigned long __read_mostly	tracing_max_latency = (cycle_t)ULONG_MAX;
 unsigned long __read_mostly	tracing_thresh;
 
-/* We need to change this state when a selftest is running.
+/*
+ * We need to change this state when a selftest is running.
  * A selftest will lurk into the ring-buffer to count the
  * entries inserted during the selftest although some concurrent
  * insertions into the ring-buffer such as ftrace_printk could occurred
  * at the same time, giving false positive or negative results.
  */
-static atomic_t tracing_selftest_running = ATOMIC_INIT(0);
+static bool __read_mostly tracing_selftest_running;
 
 /* For tracers that don't implement custom flags */
 static struct tracer_opt dummy_tracer_opt[] = {
@@ -574,6 +575,8 @@ int register_tracer(struct tracer *type)
 	unlock_kernel();
 	mutex_lock(&trace_types_lock);
 
+	tracing_selftest_running = true;
+
 	for (t = trace_types; t; t = t->next) {
 		if (strcmp(type->name, t->name) == 0) {
 			/* already found */
@@ -598,7 +601,6 @@ int register_tracer(struct tracer *type)
 		struct trace_array *tr = &global_trace;
 		int i;
 
-		atomic_set(&tracing_selftest_running, 1);
 		/*
 		 * Run a selftest on this tracer.
 		 * Here we reset the trace buffer, and set the current
@@ -613,7 +615,6 @@ int register_tracer(struct tracer *type)
 		/* the test is responsible for initializing and enabling */
 		pr_info("Testing tracer %s: ", type->name);
 		ret = type->selftest(type, tr);
-		atomic_set(&tracing_selftest_running, 0);
 		/* the test is responsible for resetting too */
 		current_trace = saved_tracer;
 		if (ret) {
@@ -635,6 +636,7 @@ int register_tracer(struct tracer *type)
 		max_tracer_type_len = len;
 
  out:
+	tracing_selftest_running = false;
 	mutex_unlock(&trace_types_lock);
 	lock_kernel();
 
@@ -3605,7 +3607,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	unsigned long flags, irq_flags;
 	int cpu, len = 0, size, pc;
 
-	if (tracing_disabled || atomic_read(&tracing_selftest_running))
+	if (tracing_disabled || tracing_selftest_running)
 		return 0;
 
 	pc = preempt_count();
-- 
cgit v1.2.3


From 380c4b1411ccd6885f92b2c8ceb08433a720f44e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 6 Dec 2008 03:43:41 +0100
Subject: tracing/function-graph-tracer: append the tracing_graph_flag

Impact: Provide a way to pause the function graph tracer

As suggested by Steven Rostedt, the previous patch that prevented from
spinlock function tracing shouldn't use the raw_spinlock to fix it.
It's much better to follow lockdep with normal spinlock, so this patch
adds a new flag for each task to make the function graph tracer able
to be paused. We also can send an ftrace_printk whithout worrying of
the irrelevant traced spinlock during insertion.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c |  2 ++
 kernel/trace/trace.c  | 18 +++++-------------
 2 files changed, 7 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2971fe48f55..a12f80efcea 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1998,6 +1998,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
 			/* Make sure IRQs see the -1 first: */
 			barrier();
 			t->ret_stack = ret_stack_list[start++];
+			atomic_set(&t->tracing_graph_pause, 0);
 			atomic_set(&t->trace_overrun, 0);
 		}
 	} while_each_thread(g, t);
@@ -2077,6 +2078,7 @@ void ftrace_graph_init_task(struct task_struct *t)
 		if (!t->ret_stack)
 			return;
 		t->curr_ret_stack = -1;
+		atomic_set(&t->tracing_graph_pause, 0);
 		atomic_set(&t->trace_overrun, 0);
 	} else
 		t->ret_stack = NULL;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 33549537f30..0b8659bd5ad 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3590,14 +3590,7 @@ static __init int tracer_init_debugfs(void)
 
 int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 {
-	/*
-	 * Raw Spinlock because a normal spinlock would be traced here
-	 * and append an irrelevant couple spin_lock_irqsave/
-	 * spin_unlock_irqrestore traced by ftrace around this
-	 * TRACE_PRINTK trace.
-	 */
-	static raw_spinlock_t trace_buf_lock =
-				(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(trace_buf_lock);
 	static char trace_buf[TRACE_BUF_SIZE];
 
 	struct ring_buffer_event *event;
@@ -3618,8 +3611,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	if (unlikely(atomic_read(&data->disabled)))
 		goto out;
 
-	local_irq_save(flags);
-	__raw_spin_lock(&trace_buf_lock);
+	pause_graph_tracing();
+	spin_lock_irqsave(&trace_buf_lock, irq_flags);
 	len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
 
 	len = min(len, TRACE_BUF_SIZE-1);
@@ -3640,9 +3633,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
 
  out_unlock:
-	__raw_spin_unlock(&trace_buf_lock);
-	local_irq_restore(flags);
-
+	spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
+	unpause_graph_tracing();
  out:
 	preempt_enable_notrace();
 
-- 
cgit v1.2.3


From efbe027e95dc13ac343b6130948418d7ead7ddf1 Mon Sep 17 00:00:00 2001
From: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Date: Mon, 8 Dec 2008 20:52:49 +0530
Subject: sched: idle_balance() does not call load_balance_newidle()

Impact: fix SD_BALANCE_NEWIDLEand broaden its use

load_balance_newidle() does not get called if SD_BALANCE_NEWIDLE is
set at higher level domain (3-CPU) and not in low level domain (2-MC).

pulled_task is initialised to -1 and checked for non-zero which is
always true if the lowest level sched_domain does not have
SD_BALANCE_NEWIDLE flag set.

Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 74498c840f9..bb9c6384d07 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3685,7 +3685,7 @@ out_balanced:
 static void idle_balance(int this_cpu, struct rq *this_rq)
 {
 	struct sched_domain *sd;
-	int pulled_task = -1;
+	int pulled_task = 0;
 	unsigned long next_balance = jiffies + HZ;
 	cpumask_t tmpmask;
 
-- 
cgit v1.2.3


From e726f5f91effd8944c76475a2688093a03ba0d10 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 8 Dec 2008 16:55:53 +0100
Subject: tracing/function-graph-tracer: fix 'flags' variable mismatch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

this warning:

 kernel/trace/trace.c: In function ‘trace_vprintk’:
 kernel/trace/trace.c:3626: warning: ‘flags’ may be used uninitialized in this function

shows some confusion about irq_flags / flags use here. We already have
irq_flags so remove the extra flags variable.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0b8659bd5ad..8ebe0070c47 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3596,9 +3596,9 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	struct ring_buffer_event *event;
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
-	struct print_entry *entry;
-	unsigned long flags, irq_flags;
 	int cpu, len = 0, size, pc;
+	struct print_entry *entry;
+	unsigned long irq_flags;
 
 	if (tracing_disabled || tracing_selftest_running)
 		return 0;
@@ -3623,7 +3623,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	if (!event)
 		goto out_unlock;
 	entry = ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, flags, pc);
+	tracing_generic_entry_update(&entry->ent, irq_flags, pc);
 	entry->ent.type			= TRACE_PRINT;
 	entry->ip			= ip;
 	entry->depth			= depth;
-- 
cgit v1.2.3


From a0a99b227da57f81319dd239bc4de811b0f530ec Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 8 Dec 2008 17:13:02 +0100
Subject: hrtimer: removing all ur callback modes, fix

> Ingo, this addition fixes the hotplug issue on my machine

And because we're all human...

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hrtimer.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b09c7a27631..b741f850426 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1571,7 +1571,7 @@ static void tickle_timers(void *arg)
 static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
 					unsigned long action, void *hcpu)
 {
-	int dcpu = -1, scpu = (long)hcpu;
+	int dcpu, scpu = (long)hcpu;
 
 	switch (action) {
 
@@ -1585,12 +1585,6 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
 	case CPU_DEAD_FROZEN:
 		clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
 		dcpu = migrate_hrtimers(scpu);
-		break;
-
-	case CPU_POST_DEAD:
-		if (dcpu == -1)
-			break;
-
 		smp_call_function_single(dcpu, tickle_timers, NULL, 0);
 		break;
 #endif
-- 
cgit v1.2.3


From 94d6a5f7341ebaff53d4e41cc81fab37f0d9fbed Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Mon, 8 Dec 2008 15:52:21 -0600
Subject: user namespaces: document CFS behavior

Documented the currently bogus state of support for CFS user groups with
user namespaces.  In particular, all users in a user namespace should be
children of the user which created the user namespace.  This is yet to
be implemented.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/user.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index 6c924bc48c0..6608a3d8ca6 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -239,7 +239,13 @@ static struct kobj_type uids_ktype = {
 	.release = uids_release,
 };
 
-/* create /sys/kernel/uids/<uid>/cpu_share file for this user */
+/*
+ * Create /sys/kernel/uids/<uid>/cpu_share file for this user
+ * We do not create this file for users in a user namespace (until
+ * sysfs tagging is implemented).
+ *
+ * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
+ */
 static int uids_user_create(struct user_struct *up)
 {
 	struct kobject *kobj = &up->kobj;
-- 
cgit v1.2.3


From 68814b58c52077da9561b544089fe532a0842f71 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 24 Nov 2008 12:24:12 +0100
Subject: ring_buffer: update description for ring_buffer_alloc()

Trivial patch.

Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 668bbb5ef2b..c8996d239e4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -381,7 +381,7 @@ extern int ring_buffer_page_too_big(void);
 
 /**
  * ring_buffer_alloc - allocate a new ring_buffer
- * @size: the size in bytes that is needed.
+ * @size: the size in bytes per cpu that is needed.
  * @flags: attributes to set for the ring buffer.
  *
  * Currently the only flag that is available is the RB_FL_OVERWRITE
-- 
cgit v1.2.3


From e2ac8ef576e45d9db7264abc51383e68d26067bb Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 12 Nov 2008 12:59:32 +0100
Subject: ftrace: remove unused function arg in trace_iterator_increment()

This removes the unused cpu function parameter.

Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d86e3252f30..a96b335fe75 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -914,7 +914,7 @@ enum trace_file_type {
 	TRACE_FILE_LAT_FMT	= 1,
 };
 
-static void trace_iterator_increment(struct trace_iterator *iter, int cpu)
+static void trace_iterator_increment(struct trace_iterator *iter)
 {
 	/* Don't allow ftrace to trace into the ring buffers */
 	ftrace_disable_cpu();
@@ -993,7 +993,7 @@ static void *find_next_entry_inc(struct trace_iterator *iter)
 	iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts);
 
 	if (iter->ent)
-		trace_iterator_increment(iter, iter->cpu);
+		trace_iterator_increment(iter);
 
 	return iter->ent ? iter : NULL;
 }
-- 
cgit v1.2.3


From c4f50183f90fb1fd99aa5941f01b90cd1b882d2e Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Thu, 11 Dec 2008 16:49:22 +0100
Subject: ring_buffer: adding EXPORT_SYMBOLs

I added EXPORT_SYMBOL_GPLs for all functions part of the API
(ring_buffer.h). This is required since oprofile is using the ring
buffer and the compilation as modules would fail otherwise.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c8996d239e4..30d57dd01a8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -31,6 +31,7 @@ void tracing_on(void)
 {
 	ring_buffers_off = 0;
 }
+EXPORT_SYMBOL_GPL(tracing_on);
 
 /**
  * tracing_off - turn off all tracing buffers
@@ -44,6 +45,7 @@ void tracing_off(void)
 {
 	ring_buffers_off = 1;
 }
+EXPORT_SYMBOL_GPL(tracing_off);
 
 /* Up this if you want to test the TIME_EXTENTS and normalization */
 #define DEBUG_SHIFT 0
@@ -60,12 +62,14 @@ u64 ring_buffer_time_stamp(int cpu)
 
 	return time;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
 
 void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
 {
 	/* Just stupid testing the normalize function and deltas */
 	*ts >>= DEBUG_SHIFT;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
 
 #define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
 #define RB_ALIGNMENT_SHIFT	2
@@ -115,6 +119,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
 {
 	return rb_event_length(event);
 }
+EXPORT_SYMBOL_GPL(ring_buffer_event_length);
 
 /* inline for ring buffer fast paths */
 static inline void *
@@ -136,6 +141,7 @@ void *ring_buffer_event_data(struct ring_buffer_event *event)
 {
 	return rb_event_data(event);
 }
+EXPORT_SYMBOL_GPL(ring_buffer_event_data);
 
 #define for_each_buffer_cpu(buffer, cpu)		\
 	for_each_cpu_mask(cpu, buffer->cpumask)
@@ -444,6 +450,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 	kfree(buffer);
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_alloc);
 
 /**
  * ring_buffer_free - free a ring buffer.
@@ -459,6 +466,7 @@ ring_buffer_free(struct ring_buffer *buffer)
 
 	kfree(buffer);
 }
+EXPORT_SYMBOL_GPL(ring_buffer_free);
 
 static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
 
@@ -620,6 +628,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 	mutex_unlock(&buffer->mutex);
 	return -ENOMEM;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_resize);
 
 static inline int rb_null_event(struct ring_buffer_event *event)
 {
@@ -1220,6 +1229,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
 		preempt_enable_notrace();
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
 
 static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
 		      struct ring_buffer_event *event)
@@ -1269,6 +1279,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
 /**
  * ring_buffer_write - write data to the buffer without reserving
@@ -1334,6 +1345,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_write);
 
 static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
 {
@@ -1360,6 +1372,7 @@ void ring_buffer_record_disable(struct ring_buffer *buffer)
 {
 	atomic_inc(&buffer->record_disabled);
 }
+EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
 
 /**
  * ring_buffer_record_enable - enable writes to the buffer
@@ -1372,6 +1385,7 @@ void ring_buffer_record_enable(struct ring_buffer *buffer)
 {
 	atomic_dec(&buffer->record_disabled);
 }
+EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
 
 /**
  * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
@@ -1393,6 +1407,7 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
 	cpu_buffer = buffer->buffers[cpu];
 	atomic_inc(&cpu_buffer->record_disabled);
 }
+EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
 
 /**
  * ring_buffer_record_enable_cpu - enable writes to the buffer
@@ -1412,6 +1427,7 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
 	cpu_buffer = buffer->buffers[cpu];
 	atomic_dec(&cpu_buffer->record_disabled);
 }
+EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
 
 /**
  * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
@@ -1428,6 +1444,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
 	cpu_buffer = buffer->buffers[cpu];
 	return cpu_buffer->entries;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
 
 /**
  * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
@@ -1444,6 +1461,7 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
 	cpu_buffer = buffer->buffers[cpu];
 	return cpu_buffer->overrun;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
 
 /**
  * ring_buffer_entries - get the number of entries in a buffer
@@ -1466,6 +1484,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
 
 	return entries;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_entries);
 
 /**
  * ring_buffer_overrun_cpu - get the number of overruns in buffer
@@ -1488,6 +1507,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
 
 	return overruns;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_overruns);
 
 /**
  * ring_buffer_iter_reset - reset an iterator
@@ -1513,6 +1533,7 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
 	else
 		iter->read_stamp = iter->head_page->time_stamp;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
 
 /**
  * ring_buffer_iter_empty - check if an iterator has no more to read
@@ -1527,6 +1548,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
 	return iter->head_page == cpu_buffer->commit_page &&
 		iter->head == rb_commit_index(cpu_buffer);
 }
+EXPORT_SYMBOL_GPL(ring_buffer_iter_empty);
 
 static void
 rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
@@ -1797,6 +1819,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_peek);
 
 /**
  * ring_buffer_iter_peek - peek at the next event to be read
@@ -1867,6 +1890,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
 
 /**
  * ring_buffer_consume - return an event and consume it
@@ -1894,6 +1918,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
 
 	return event;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_consume);
 
 /**
  * ring_buffer_read_start - start a non consuming read of the buffer
@@ -1934,6 +1959,7 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
 
 	return iter;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_read_start);
 
 /**
  * ring_buffer_finish - finish reading the iterator of the buffer
@@ -1950,6 +1976,7 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter)
 	atomic_dec(&cpu_buffer->record_disabled);
 	kfree(iter);
 }
+EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
 
 /**
  * ring_buffer_read - read the next item in the ring buffer by the iterator
@@ -1971,6 +1998,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
 
 	return event;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_read);
 
 /**
  * ring_buffer_size - return the size of the ring buffer (in bytes)
@@ -1980,6 +2008,7 @@ unsigned long ring_buffer_size(struct ring_buffer *buffer)
 {
 	return BUF_PAGE_SIZE * buffer->pages;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_size);
 
 static void
 rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
@@ -2022,6 +2051,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 
 	spin_unlock_irqrestore(&cpu_buffer->lock, flags);
 }
+EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
 
 /**
  * ring_buffer_reset - reset a ring buffer
@@ -2034,6 +2064,7 @@ void ring_buffer_reset(struct ring_buffer *buffer)
 	for_each_buffer_cpu(buffer, cpu)
 		ring_buffer_reset_cpu(buffer, cpu);
 }
+EXPORT_SYMBOL_GPL(ring_buffer_reset);
 
 /**
  * rind_buffer_empty - is the ring buffer empty?
@@ -2052,6 +2083,7 @@ int ring_buffer_empty(struct ring_buffer *buffer)
 	}
 	return 1;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_empty);
 
 /**
  * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
@@ -2068,6 +2100,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
 	cpu_buffer = buffer->buffers[cpu];
 	return rb_per_cpu_empty(cpu_buffer);
 }
+EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
 
 /**
  * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
@@ -2117,6 +2150,7 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
 
 static ssize_t
 rb_simple_read(struct file *filp, char __user *ubuf,
-- 
cgit v1.2.3


From fa116ea35ec7f40e890972324409e99eed008d56 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 11 Dec 2008 17:04:11 +0100
Subject: nohz: no softirq pending warnings for offline cpus

Impact: remove false positive warning

After a cpu was taken down during cpu hotplug (read: disabled for interrupts)
it still might have pending softirqs. However take_cpu_down makes sure
that the idle task will run next instead of ksoftirqd on the taken down cpu.
The idle task will call tick_nohz_stop_sched_tick which might warn about
pending softirqs just before the cpu kills itself completely.

However the pending softirqs on the dead cpu aren't a problem because they
will be moved to an online cpu during CPU_DEAD handling.

So make sure we warn only for online cpus.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 342fc9ccab4..dc17ffcf191 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -247,7 +247,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 	if (need_resched())
 		goto end;
 
-	if (unlikely(local_softirq_pending())) {
+	if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
 		static int ratelimit;
 
 		if (ratelimit < 10) {
-- 
cgit v1.2.3


From a93751cab71d63126687551823ed3e70cd85854a Mon Sep 17 00:00:00 2001
From: Markus Metzger <markut.t.metzger@intel.com>
Date: Thu, 11 Dec 2008 13:53:26 +0100
Subject: x86, bts, ftrace: adapt the hw-branch-tracer to the ds.c interface

Impact: restructure code, cleanup

Remove BTS bits from the hw-branch-tracer (renamed from bts-tracer) and
use the ds interface.

Signed-off-by: Markus Metzger <markut.t.metzger@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig             |   4 +-
 kernel/trace/Makefile            |   2 +-
 kernel/trace/trace.c             |   2 +-
 kernel/trace/trace.h             |  14 +-
 kernel/trace/trace_bts.c         | 276 ---------------------------------------
 kernel/trace/trace_hw_branches.c | 205 +++++++++++++++++++++++++++++
 6 files changed, 215 insertions(+), 288 deletions(-)
 delete mode 100644 kernel/trace/trace_bts.c
 create mode 100644 kernel/trace/trace_hw_branches.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index bde6f03512d..d8bae6f4219 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -251,9 +251,9 @@ config STACK_TRACER
 
 	  Say N if unsure.
 
-config BTS_TRACER
+config HW_BRANCH_TRACER
 	depends on HAVE_HW_BRANCH_TRACER
-	bool "Trace branches"
+	bool "Trace hw branches"
 	select TRACING
 	help
 	  This tracer records all branches on the system in a circular
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 62dc561b667..349d5a93653 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -31,7 +31,7 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
-obj-$(CONFIG_BTS_TRACER) += trace_bts.o
+obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
 obj-$(CONFIG_POWER_TRACER) += trace_power.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8ebe0070c47..639344a4d3a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2425,7 +2425,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 
 	/* Notify the tracer early; before we stop tracing. */
 	if (iter->trace && iter->trace->open)
-			iter->trace->open(iter);
+		iter->trace->open(iter);
 
 	/* Annotate start of buffers if we had overruns */
 	if (ring_buffer_overruns(iter->tr->buffer))
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5ac697065a4..f07c246dd73 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -28,7 +28,7 @@ enum trace_type {
 	TRACE_GRAPH_RET,
 	TRACE_GRAPH_ENT,
 	TRACE_USER_STACK,
-	TRACE_BTS,
+	TRACE_HW_BRANCHES,
 	TRACE_POWER,
 
 	__TRACE_LAST_TYPE
@@ -159,10 +159,10 @@ struct trace_branch {
 	char			correct;
 };
 
-struct bts_entry {
+struct hw_branch_entry {
 	struct trace_entry	ent;
-	unsigned long		from;
-	unsigned long		to;
+	u64			from;
+	u64			to;
 };
 
 struct trace_power {
@@ -278,7 +278,7 @@ extern void __ftrace_bad_type(void);
 			  TRACE_GRAPH_ENT);		\
 		IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,	\
 			  TRACE_GRAPH_RET);		\
-		IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\
+		IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
  		IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
 		__ftrace_bad_type();					\
 	} while (0)
@@ -414,9 +414,7 @@ void trace_function(struct trace_array *tr,
 
 void trace_graph_return(struct ftrace_graph_ret *trace);
 int trace_graph_entry(struct ftrace_graph_ent *trace);
-void trace_bts(struct trace_array *tr,
-	       unsigned long from,
-	       unsigned long to);
+void trace_hw_branch(struct trace_array *tr, u64 from, u64 to);
 
 void tracing_start_cmdline_record(void);
 void tracing_stop_cmdline_record(void);
diff --git a/kernel/trace/trace_bts.c b/kernel/trace/trace_bts.c
deleted file mode 100644
index 23b76e4690e..00000000000
--- a/kernel/trace/trace_bts.c
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * BTS tracer
- *
- * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com>
- *
- */
-
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/debugfs.h>
-#include <linux/ftrace.h>
-#include <linux/kallsyms.h>
-
-#include <asm/ds.h>
-
-#include "trace.h"
-
-
-#define SIZEOF_BTS (1 << 13)
-
-static DEFINE_PER_CPU(struct bts_tracer *, tracer);
-static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
-
-#define this_tracer per_cpu(tracer, smp_processor_id())
-#define this_buffer per_cpu(buffer, smp_processor_id())
-
-
-/*
- * Information to interpret a BTS record.
- * This will go into an in-kernel BTS interface.
- */
-static unsigned char sizeof_field;
-static unsigned long debugctl_mask;
-
-#define sizeof_bts (3 * sizeof_field)
-
-static void bts_trace_cpuinit(struct cpuinfo_x86 *c)
-{
-	switch (c->x86) {
-	case 0x6:
-		switch (c->x86_model) {
-		case 0x0 ... 0xC:
-			break;
-		case 0xD:
-		case 0xE: /* Pentium M */
-			sizeof_field = sizeof(long);
-			debugctl_mask = (1<<6)|(1<<7);
-			break;
-		default:
-			sizeof_field = 8;
-			debugctl_mask = (1<<6)|(1<<7);
-			break;
-		}
-		break;
-	case 0xF:
-		switch (c->x86_model) {
-		case 0x0:
-		case 0x1:
-		case 0x2: /* Netburst */
-			sizeof_field = sizeof(long);
-			debugctl_mask = (1<<2)|(1<<3);
-			break;
-		default:
-			/* sorry, don't know about them */
-			break;
-		}
-		break;
-	default:
-		/* sorry, don't know about them */
-		break;
-	}
-}
-
-static inline void bts_enable(void)
-{
-	unsigned long debugctl;
-
-	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | debugctl_mask);
-}
-
-static inline void bts_disable(void)
-{
-	unsigned long debugctl;
-
-	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl & ~debugctl_mask);
-}
-
-static void bts_trace_reset(struct trace_array *tr)
-{
-	int cpu;
-
-	tr->time_start = ftrace_now(tr->cpu);
-
-	for_each_online_cpu(cpu)
-		tracing_reset(tr, cpu);
-}
-
-static void bts_trace_start_cpu(void *arg)
-{
-	this_tracer =
-		ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
-			       /* ovfl = */ NULL, /* th = */ (size_t)-1);
-	if (IS_ERR(this_tracer)) {
-		this_tracer = NULL;
-		return;
-	}
-
-	bts_enable();
-}
-
-static void bts_trace_start(struct trace_array *tr)
-{
-	int cpu;
-
-	bts_trace_reset(tr);
-
-	for_each_cpu_mask(cpu, cpu_possible_map)
-		smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
-}
-
-static void bts_trace_stop_cpu(void *arg)
-{
-	if (this_tracer) {
-		bts_disable();
-
-		ds_release_bts(this_tracer);
-		this_tracer = NULL;
-	}
-}
-
-static void bts_trace_stop(struct trace_array *tr)
-{
-	int cpu;
-
-	for_each_cpu_mask(cpu, cpu_possible_map)
-		smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
-}
-
-static int bts_trace_init(struct trace_array *tr)
-{
-	bts_trace_cpuinit(&boot_cpu_data);
-	bts_trace_reset(tr);
-	bts_trace_start(tr);
-
-	return 0;
-}
-
-static void bts_trace_print_header(struct seq_file *m)
-{
-#ifdef __i386__
-	seq_puts(m, "# CPU#    FROM           TO     FUNCTION\n");
-	seq_puts(m, "#  |       |             |         |\n");
-#else
-	seq_puts(m,
-		 "# CPU#        FROM                   TO         FUNCTION\n");
-	seq_puts(m,
-		 "#  |           |                     |             |\n");
-#endif
-}
-
-static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
-{
-	struct trace_entry *entry = iter->ent;
-	struct trace_seq *seq = &iter->seq;
-	struct bts_entry *it;
-
-	trace_assign_type(it, entry);
-
-	if (entry->type == TRACE_BTS) {
-		int ret;
-#ifdef CONFIG_KALLSYMS
-		char function[KSYM_SYMBOL_LEN];
-		sprint_symbol(function, it->from);
-#else
-		char *function = "<unknown>";
-#endif
-
-		ret = trace_seq_printf(seq, "%4d  0x%lx -> 0x%lx [%s]\n",
-				       entry->cpu, it->from, it->to, function);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;;
-		return TRACE_TYPE_HANDLED;
-	}
-	return TRACE_TYPE_UNHANDLED;
-}
-
-void trace_bts(struct trace_array *tr, unsigned long from, unsigned long to)
-{
-	struct ring_buffer_event *event;
-	struct bts_entry *entry;
-	unsigned long irq;
-
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq);
-	if (!event)
-		return;
-	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, from);
-	entry->ent.type = TRACE_BTS;
-	entry->ent.cpu = smp_processor_id();
-	entry->from = from;
-	entry->to   = to;
-	ring_buffer_unlock_commit(tr->buffer, event, irq);
-}
-
-static void trace_bts_at(struct trace_array *tr, size_t index)
-{
-	const void *raw = NULL;
-	unsigned long from, to;
-	int err;
-
-	err = ds_access_bts(this_tracer, index, &raw);
-	if (err < 0)
-		return;
-
-	from = *(const unsigned long *)raw;
-	to = *(const unsigned long *)((const char *)raw + sizeof_field);
-
-	trace_bts(tr, from, to);
-}
-
-static void trace_bts_cpu(void *arg)
-{
-	struct trace_array *tr = (struct trace_array *) arg;
-	size_t index = 0, end = 0, i;
-	int err;
-
-	if (!this_tracer)
-		return;
-
-	bts_disable();
-
-	err = ds_get_bts_index(this_tracer, &index);
-	if (err < 0)
-		goto out;
-
-	err = ds_get_bts_end(this_tracer, &end);
-	if (err < 0)
-		goto out;
-
-	for (i = index; i < end; i++)
-		trace_bts_at(tr, i);
-
-	for (i = 0; i < index; i++)
-		trace_bts_at(tr, i);
-
-out:
-	bts_enable();
-}
-
-static void trace_bts_prepare(struct trace_iterator *iter)
-{
-	int cpu;
-
-	for_each_cpu_mask(cpu, cpu_possible_map)
-		smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
-}
-
-struct tracer bts_tracer __read_mostly =
-{
-	.name		= "bts",
-	.init		= bts_trace_init,
-	.reset		= bts_trace_stop,
-	.print_header	= bts_trace_print_header,
-	.print_line	= bts_trace_print_line,
-	.start		= bts_trace_start,
-	.stop		= bts_trace_stop,
-	.open		= trace_bts_prepare
-};
-
-__init static int init_bts_trace(void)
-{
-	return register_tracer(&bts_tracer);
-}
-device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
new file mode 100644
index 00000000000..ee29e012aa9
--- /dev/null
+++ b/kernel/trace/trace_hw_branches.c
@@ -0,0 +1,205 @@
+/*
+ * h/w branch tracer for x86 based on bts
+ *
+ * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/kallsyms.h>
+
+#include <asm/ds.h>
+
+#include "trace.h"
+
+
+#define SIZEOF_BTS (1 << 13)
+
+static DEFINE_PER_CPU(struct bts_tracer *, tracer);
+static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
+
+#define this_tracer per_cpu(tracer, smp_processor_id())
+#define this_buffer per_cpu(buffer, smp_processor_id())
+
+
+static void bts_trace_reset(struct trace_array *tr)
+{
+	int cpu;
+
+	tr->time_start = ftrace_now(tr->cpu);
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr, cpu);
+}
+
+static void bts_trace_start_cpu(void *arg)
+{
+	if (this_tracer)
+		ds_release_bts(this_tracer);
+
+	this_tracer =
+		ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
+			       /* ovfl = */ NULL, /* th = */ (size_t)-1,
+			       BTS_KERNEL);
+	if (IS_ERR(this_tracer)) {
+		this_tracer = NULL;
+		return;
+	}
+}
+
+static void bts_trace_start(struct trace_array *tr)
+{
+	int cpu;
+
+	bts_trace_reset(tr);
+
+	for_each_cpu_mask(cpu, cpu_possible_map)
+		smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
+}
+
+static void bts_trace_stop_cpu(void *arg)
+{
+	if (this_tracer) {
+		ds_release_bts(this_tracer);
+		this_tracer = NULL;
+	}
+}
+
+static void bts_trace_stop(struct trace_array *tr)
+{
+	int cpu;
+
+	for_each_cpu_mask(cpu, cpu_possible_map)
+		smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
+}
+
+static int bts_trace_init(struct trace_array *tr)
+{
+	bts_trace_reset(tr);
+	bts_trace_start(tr);
+
+	return 0;
+}
+
+static void bts_trace_print_header(struct seq_file *m)
+{
+	seq_puts(m,
+		 "# CPU#        FROM                   TO         FUNCTION\n");
+	seq_puts(m,
+		 "#  |           |                     |             |\n");
+}
+
+static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *seq = &iter->seq;
+	struct hw_branch_entry *it;
+
+	trace_assign_type(it, entry);
+
+	if (entry->type == TRACE_HW_BRANCHES) {
+		if (trace_seq_printf(seq, "%4d  ", entry->cpu) &&
+		    trace_seq_printf(seq, "0x%016llx -> 0x%016llx ",
+				     it->from, it->to) &&
+		    (!it->from ||
+		     seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) &&
+		    trace_seq_printf(seq, "\n"))
+			return TRACE_TYPE_HANDLED;
+		return TRACE_TYPE_PARTIAL_LINE;;
+	}
+	return TRACE_TYPE_UNHANDLED;
+}
+
+void trace_hw_branch(struct trace_array *tr, u64 from, u64 to)
+{
+	struct ring_buffer_event *event;
+	struct hw_branch_entry *entry;
+	unsigned long irq;
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq);
+	if (!event)
+		return;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, from);
+	entry->ent.type = TRACE_HW_BRANCHES;
+	entry->ent.cpu = smp_processor_id();
+	entry->from = from;
+	entry->to   = to;
+	ring_buffer_unlock_commit(tr->buffer, event, irq);
+}
+
+static void trace_bts_at(struct trace_array *tr,
+			 const struct bts_trace *trace, void *at)
+{
+	struct bts_struct bts;
+	int err = 0;
+
+	WARN_ON_ONCE(!trace->read);
+	if (!trace->read)
+		return;
+
+	err = trace->read(this_tracer, at, &bts);
+	if (err < 0)
+		return;
+
+	switch (bts.qualifier) {
+	case BTS_BRANCH:
+		trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to);
+		break;
+	}
+}
+
+static void trace_bts_cpu(void *arg)
+{
+	struct trace_array *tr = (struct trace_array *) arg;
+	const struct bts_trace *trace;
+	unsigned char *at;
+
+	if (!this_tracer)
+		return;
+
+	ds_suspend_bts(this_tracer);
+	trace = ds_read_bts(this_tracer);
+	if (!trace)
+		goto out;
+
+	for (at = trace->ds.top; (void *)at < trace->ds.end;
+	     at += trace->ds.size)
+		trace_bts_at(tr, trace, at);
+
+	for (at = trace->ds.begin; (void *)at < trace->ds.top;
+	     at += trace->ds.size)
+		trace_bts_at(tr, trace, at);
+
+out:
+	ds_resume_bts(this_tracer);
+}
+
+static void trace_bts_prepare(struct trace_iterator *iter)
+{
+	int cpu;
+
+	for_each_cpu_mask(cpu, cpu_possible_map)
+		smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
+}
+
+struct tracer bts_tracer __read_mostly =
+{
+	.name		= "hw-branch-tracer",
+	.init		= bts_trace_init,
+	.reset		= bts_trace_stop,
+	.print_header	= bts_trace_print_header,
+	.print_line	= bts_trace_print_line,
+	.start		= bts_trace_start,
+	.stop		= bts_trace_stop,
+	.open		= trace_bts_prepare
+};
+
+__init static int init_bts_trace(void)
+{
+	return register_tracer(&bts_tracer);
+}
+device_initcall(init_bts_trace);
-- 
cgit v1.2.3


From f8b755ac8e0cc3f330269e4c4504514f987167a2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 9 Dec 2008 23:55:25 +0100
Subject: tracing/function-graph-tracer: Output arrows signal on hardirq
 call/return

Impact: make more obvious the hardirq calls in the output

When a hardirq is triggered inside the codeflow on output, we have
now two arrows that indicate the entry and return of the hardirq.

 0)               |          bit_waitqueue() {
 0)   0.880 us    |            __phys_addr();
 0)   2.699 us    |          }
 0)               |          __wake_up_bit() {
 0)   ==========> |          smp_apic_timer_interrupt() {
 0)   0.797 us    |            native_apic_mem_write();
 0)   0.715 us    |            exit_idle();
 0)               |            irq_enter() {
 0)   0.722 us    |              idle_cpu();
 0)   5.519 us    |            }
 0)               |            hrtimer_interrupt() {
 0)               |              ktime_get() {
 0)               |                ktime_get_ts() {
 0)   0.805 us    |                  getnstimeofday();

 [...]

 0) ! 108.528 us  |            }
 0)               |            irq_exit() {
 0)               |              do_softirq() {
 0)               |                __do_softirq() {
 0)   0.895 us    |                  __local_bh_disable();
 0)               |                  run_timer_softirq() {
 0)   0.827 us    |                    hrtimer_run_pending();
 0)   1.226 us    |                    _spin_lock_irq();
 0)               |                    _spin_unlock_irq() {
 0)   6.550 us    |                  }
 0)   0.924 us    |                  _local_bh_enable();
 0) + 12.129 us   |                }
 0) + 13.911 us   |              }
 0)   0.707 us    |              idle_cpu();
 0) + 17.009 us   |            }
 0) ! 137.419 us  |          }
 0)   <========== |
 0)   1.045 us    |          }
 0) ! 148.908 us  |        }
 0) ! 151.022 us  |      }
 0) ! 153.022 us  |    }
 0)   0.963 us    |    journal_mark_dirty();
 0)   0.925 us    |    __brelse();

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions_graph.c | 66 +++++++++++++++++++++++++++++++++---
 1 file changed, 62 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index af60eef4cbc..4bf39fcae97 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -231,6 +231,49 @@ trace_branch_is_leaf(struct trace_iterator *iter,
 	return true;
 }
 
+static enum print_line_t
+print_graph_irq(struct trace_seq *s, unsigned long addr,
+				enum trace_type type, int cpu, pid_t pid)
+{
+	int ret;
+
+	if (addr < (unsigned long)__irqentry_text_start ||
+		addr >= (unsigned long)__irqentry_text_end)
+		return TRACE_TYPE_UNHANDLED;
+
+	if (type == TRACE_GRAPH_ENT) {
+		ret = trace_seq_printf(s, "==========> |  ");
+	} else {
+		/* Cpu */
+		if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+			ret = print_graph_cpu(s, cpu);
+			if (ret == TRACE_TYPE_PARTIAL_LINE)
+				return TRACE_TYPE_PARTIAL_LINE;
+		}
+		/* Proc */
+		if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
+			ret = print_graph_proc(s, pid);
+			if (ret == TRACE_TYPE_PARTIAL_LINE)
+				return TRACE_TYPE_PARTIAL_LINE;
+
+			ret = trace_seq_printf(s, " | ");
+			if (!ret)
+				return TRACE_TYPE_PARTIAL_LINE;
+		}
+
+		/* No overhead */
+		if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+			ret = trace_seq_printf(s, "  ");
+			if (!ret)
+				return TRACE_TYPE_PARTIAL_LINE;
+		}
+
+		ret = trace_seq_printf(s, "<========== |\n");
+	}
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+	return TRACE_TYPE_HANDLED;
+}
 
 static enum print_line_t
 print_graph_duration(unsigned long long duration, struct trace_seq *s)
@@ -344,7 +387,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 
 static enum print_line_t
 print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
-			struct trace_seq *s)
+			struct trace_seq *s, pid_t pid, int cpu)
 {
 	int i;
 	int ret;
@@ -357,8 +400,18 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
-	/* No time */
-	ret = trace_seq_printf(s, "            |  ");
+	/* Interrupt */
+	ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid);
+	if (ret == TRACE_TYPE_UNHANDLED) {
+		/* No time */
+		ret = trace_seq_printf(s, "            |  ");
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	} else {
+		if (ret == TRACE_TYPE_PARTIAL_LINE)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
 
 	/* Function */
 	for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
@@ -410,7 +463,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 	if (trace_branch_is_leaf(iter, field))
 		return print_graph_entry_leaf(iter, field, s);
 	else
-		return print_graph_entry_nested(field, s);
+		return print_graph_entry_nested(field, s, iter->ent->pid, cpu);
 
 }
 
@@ -474,6 +527,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
+
+	ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid);
+	if (ret == TRACE_TYPE_PARTIAL_LINE)
+		return TRACE_TYPE_PARTIAL_LINE;
+
 	return TRACE_TYPE_HANDLED;
 }
 
-- 
cgit v1.2.3


From cbc34ed1ac36690f75fd272e19e7b4fc29aae5a2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 10 Dec 2008 08:08:22 +0100
Subject: sched: fix tracepoints in scheduler

The trace point only caught one of many places where a task changes cpu,
put it in the right place to we get all of them.

Change the signature while we're at it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 7729c4bbc8b..d377097572f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1851,6 +1851,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 	clock_offset = old_rq->clock - new_rq->clock;
 
+	trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+
 #ifdef CONFIG_SCHEDSTATS
 	if (p->se.wait_start)
 		p->se.wait_start -= clock_offset;
@@ -2868,7 +2870,6 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
 	    || unlikely(!cpu_active(dest_cpu)))
 		goto out;
 
-	trace_sched_migrate_task(rq, p, dest_cpu);
 	/* force the process onto the specified CPU */
 	if (migrate_task(p, dest_cpu, &req)) {
 		/* Need to wait for migration thread (might exit: take ref). */
-- 
cgit v1.2.3


From ee79d1bdb6a10499e53f80b1e8d14110215178ba Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 9 Dec 2008 18:49:50 +0100
Subject: sched: let arch_update_cpu_topology indicate if topology changed

Change arch_update_cpu_topology so it returns 1 if the cpu topology changed
and 0 if it didn't change. This will be useful for the next patch which adds
a call to this function in partition_sched_domains.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ef212da928e..fcfbbd9dbd6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7675,8 +7675,14 @@ static struct sched_domain_attr *dattr_cur;
  */
 static cpumask_t fallback_doms;
 
-void __attribute__((weak)) arch_update_cpu_topology(void)
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __attribute__((weak)) arch_update_cpu_topology(void)
 {
+	return 0;
 }
 
 /*
-- 
cgit v1.2.3


From d65bd5ecb2bd166cea4952a59b7e16cc3ad6ef6c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 9 Dec 2008 18:49:51 +0100
Subject: sched: add missing arch_update_cpu_topology() call

arch_reinit_sched_domains() used to call arch_update_cpu_topology()
via arch_init_sched_domains(). This call got lost with
e761b7725234276a802322549cee5255305a0930 ("cpu hotplug, sched: Introduce
cpu_active_map and redo sched domain managment (take 2)".

So we might end up with outdated and missing cpus in the cpu core
maps (architecture used to call arch_reinit_sched_domains if cpu
topology changed).

This adds a call to arch_update_cpu_topology in partition_sched_domains
which gets called whenever scheduling domains get updated. Which is
what is supposed to happen when cpu topology changes.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index fcfbbd9dbd6..ad7b93be569 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7774,17 +7774,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 			     struct sched_domain_attr *dattr_new)
 {
 	int i, j, n;
+	int new_topology;
 
 	mutex_lock(&sched_domains_mutex);
 
 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();
 
+	/* Let architecture update cpu core mappings. */
+	new_topology = arch_update_cpu_topology();
+
 	n = doms_new ? ndoms_new : 0;
 
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
-		for (j = 0; j < n; j++) {
+		for (j = 0; j < n && !new_topology; j++) {
 			if (cpus_equal(doms_cur[i], doms_new[j])
 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
@@ -7804,7 +7808,7 @@ match1:
 
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
-		for (j = 0; j < ndoms_cur; j++) {
+		for (j = 0; j < ndoms_cur && !new_topology; j++) {
 			if (cpus_equal(doms_new[i], doms_cur[j])
 			    && dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
-- 
cgit v1.2.3


From 001474491fabeca233168a8598f721c808040f90 Mon Sep 17 00:00:00 2001
From: "Woodruff, Richard" <r-woodruff2@ti.com>
Date: Mon, 1 Dec 2008 14:18:11 -0800
Subject: nohz: suppress needless timer reprogramming

In my device I get many interrupts from a high speed USB device in a very
short period of time.  The system spends a lot of time reprogramming the
hardware timer which is in a slower timing domain as compared to the CPU.
This results in the CPU spending a huge amount of time waiting for the
timer posting to be done.  All of this reprogramming is useless as the
wake up time has not changed.

As measured using ETM trace this drops my reprogramming penalty from
almost 60% CPU load down to 15% during high interrupt rate.  I can send
traces to show this.

Suppress setting of duplicate timer event when timer already stopped.
Timer programming can be very costly and can result in long cpu stall/wait
times.

[akpm@linux-foundation.org: coding-style fixes]
[tglx@linutronix.de: move the check to the right place and avoid raising
		     the softirq for nothing]

Signed-off-by: Richard Woodruff <r-woodruff2@ti.com>
Cc: johnstul@us.ibm.com
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-sched.c | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index dc17ffcf191..87fc34f21db 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -282,8 +282,31 @@ void tick_nohz_stop_sched_tick(int inidle)
 	/* Schedule the tick, if we are at least one jiffie off */
 	if ((long)delta_jiffies >= 1) {
 
+		/*
+		* calculate the expiry time for the next timer wheel
+		* timer
+		*/
+		expires = ktime_add_ns(last_update, tick_period.tv64 *
+				   delta_jiffies);
+
+		/*
+		 * If this cpu is the one which updates jiffies, then
+		 * give up the assignment and let it be taken by the
+		 * cpu which runs the tick timer next, which might be
+		 * this cpu as well. If we don't drop this here the
+		 * jiffies might be stale and do_timer() never
+		 * invoked.
+		 */
+		if (cpu == tick_do_timer_cpu)
+			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+
 		if (delta_jiffies > 1)
 			cpu_set(cpu, nohz_cpu_mask);
+
+		/* Skip reprogram of event if its not changed */
+		if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
+			goto out;
+
 		/*
 		 * nohz_stop_sched_tick can be called several times before
 		 * the nohz_restart_sched_tick is called. This happens when
@@ -306,17 +329,6 @@ void tick_nohz_stop_sched_tick(int inidle)
 			rcu_enter_nohz();
 		}
 
-		/*
-		 * If this cpu is the one which updates jiffies, then
-		 * give up the assignment and let it be taken by the
-		 * cpu which runs the tick timer next, which might be
-		 * this cpu as well. If we don't drop this here the
-		 * jiffies might be stale and do_timer() never
-		 * invoked.
-		 */
-		if (cpu == tick_do_timer_cpu)
-			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
-
 		ts->idle_sleeps++;
 
 		/*
@@ -332,12 +344,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 			goto out;
 		}
 
-		/*
-		 * calculate the expiry time for the next timer wheel
-		 * timer
-		 */
-		expires = ktime_add_ns(last_update, tick_period.tv64 *
-				       delta_jiffies);
+		/* Mark expiries */
 		ts->idle_expires = expires;
 
 		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
-- 
cgit v1.2.3


From 27af4245b6ce99e08c6a7c38825383bf51119cc9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 1 Dec 2008 14:18:13 -0800
Subject: posix-timers: use "struct pid*" instead of "struct task_struct*"

Impact: restructure, clean up code

k_itimer holds the ref to the ->it_process until sys_timer_delete(). This
allows to pin up to RLIMIT_SIGPENDING dead task_struct's. Change the code
to use "struct pid *" instead.

The patch doesn't kill ->it_process, it places ->it_pid into the union.
->it_process is still used by do_cpu_nanosleep() as before. It would be
trivial to change the nanosleep code as well, but since it uses it_process
in a special way I think it is better to keep this field for grep.

The patch bloats the kernel by 104 bytes and it also adds the new pointer,
->it_signal, to k_itimer. It is used by lock_timer() to verify that the
found timer was not created by another process. It is not clear why do we
use the global database (and thus the global idr_lock) for posix timers.
We still need the signal_struct->posix_timers which contains all useable
timers, perhaps it is better to use some form of per-process array
instead.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/posix-timers.c | 43 +++++++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 5e79c662294..42a39afd694 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -116,7 +116,7 @@ static DEFINE_SPINLOCK(idr_lock);
  *	    must supply functions here, even if the function just returns
  *	    ENOSYS.  The standard POSIX timer management code assumes the
  *	    following: 1.) The k_itimer struct (sched.h) is used for the
- *	    timer.  2.) The list, it_lock, it_clock, it_id and it_process
+ *	    timer.  2.) The list, it_lock, it_clock, it_id and it_pid
  *	    fields are not modified by timer code.
  *
  *          At this time all functions EXCEPT clock_nanosleep can be
@@ -313,7 +313,8 @@ void do_schedule_next_timer(struct siginfo *info)
 
 int posix_timer_event(struct k_itimer *timr, int si_private)
 {
-	int shared, ret;
+	struct task_struct *task;
+	int shared, ret = -1;
 	/*
 	 * FIXME: if ->sigq is queued we can race with
 	 * dequeue_signal()->do_schedule_next_timer().
@@ -327,8 +328,13 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
 	 */
 	timr->sigq->info.si_sys_private = si_private;
 
-	shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
-	ret = send_sigqueue(timr->sigq, timr->it_process, shared);
+	rcu_read_lock();
+	task = pid_task(timr->it_pid, PIDTYPE_PID);
+	if (task) {
+		shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
+		ret = send_sigqueue(timr->sigq, task, shared);
+	}
+	rcu_read_unlock();
 	/* If we failed to send the signal the timer stops. */
 	return ret > 0;
 }
@@ -405,7 +411,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
 	return ret;
 }
 
-static struct task_struct * good_sigevent(sigevent_t * event)
+static struct pid *good_sigevent(sigevent_t * event)
 {
 	struct task_struct *rtn = current->group_leader;
 
@@ -419,7 +425,7 @@ static struct task_struct * good_sigevent(sigevent_t * event)
 	    ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
 		return NULL;
 
-	return rtn;
+	return task_pid(rtn);
 }
 
 void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock)
@@ -471,7 +477,7 @@ sys_timer_create(const clockid_t which_clock,
 {
 	struct k_itimer *new_timer;
 	int error, new_timer_id;
-	struct task_struct *process;
+	struct pid *it_pid;
 	sigevent_t event;
 	int it_id_set = IT_ID_NOT_SET;
 
@@ -525,11 +531,9 @@ sys_timer_create(const clockid_t which_clock,
 			goto out;
 		}
 		rcu_read_lock();
-		process = good_sigevent(&event);
-		if (process)
-			get_task_struct(process);
+		it_pid = get_pid(good_sigevent(&event));
 		rcu_read_unlock();
-		if (!process) {
+		if (!it_pid) {
 			error = -EINVAL;
 			goto out;
 		}
@@ -537,8 +541,7 @@ sys_timer_create(const clockid_t which_clock,
 		event.sigev_notify = SIGEV_SIGNAL;
 		event.sigev_signo = SIGALRM;
 		event.sigev_value.sival_int = new_timer->it_id;
-		process = current->group_leader;
-		get_task_struct(process);
+		it_pid = get_pid(task_tgid(current));
 	}
 
 	new_timer->it_sigev_notify     = event.sigev_notify;
@@ -548,7 +551,8 @@ sys_timer_create(const clockid_t which_clock,
 	new_timer->sigq->info.si_code  = SI_TIMER;
 
 	spin_lock_irq(&current->sighand->siglock);
-	new_timer->it_process = process;
+	new_timer->it_pid = it_pid;
+	new_timer->it_signal = current->signal;
 	list_add(&new_timer->list, &current->signal->posix_timers);
 	spin_unlock_irq(&current->sighand->siglock);
 
@@ -583,8 +587,7 @@ static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
 	timr = idr_find(&posix_timers_id, (int)timer_id);
 	if (timr) {
 		spin_lock(&timr->it_lock);
-		if (timr->it_process &&
-		    same_thread_group(timr->it_process, current)) {
+		if (timr->it_pid && timr->it_signal == current->signal) {
 			spin_unlock(&idr_lock);
 			return timr;
 		}
@@ -831,8 +834,8 @@ retry_delete:
 	 * This keeps any tasks waiting on the spin lock from thinking
 	 * they got something (see the lock code above).
 	 */
-	put_task_struct(timer->it_process);
-	timer->it_process = NULL;
+	put_pid(timer->it_pid);
+	timer->it_pid = NULL;
 
 	unlock_timer(timer, flags);
 	release_posix_timer(timer, IT_ID_SET);
@@ -858,8 +861,8 @@ retry_delete:
 	 * This keeps any tasks waiting on the spin lock from thinking
 	 * they got something (see the lock code above).
 	 */
-	put_task_struct(timer->it_process);
-	timer->it_process = NULL;
+	put_pid(timer->it_pid);
+	timer->it_pid = NULL;
 
 	unlock_timer(timer, flags);
 	release_posix_timer(timer, IT_ID_SET);
-- 
cgit v1.2.3


From 899921025b406a71a8aeb2d7855658ea0cf0ed23 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 1 Dec 2008 14:18:15 -0800
Subject: posix-timers: check ->it_signal instead of ->it_pid to validate the
 timer

Impact: clean up, speed up

->it_pid (was ->it_process) has also a special meaning: if it is NULL,
the timer is under deletion or it wasn't initialized yet. We can check
->it_signal != NULL instead, this way we can

	- simplify sys_timer_create() a bit

	- remove yet another check from lock_timer()

	- move put_pid(->it_pid) into release_posix_timer() which
	  runs outside of ->it_lock

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/posix-timers.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 42a39afd694..aa922bbb640 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -464,6 +464,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
 		idr_remove(&posix_timers_id, tmr->it_id);
 		spin_unlock_irqrestore(&idr_lock, flags);
 	}
+	put_pid(tmr->it_pid);
 	sigqueue_free(tmr->sigq);
 	kmem_cache_free(posix_timers_cache, tmr);
 }
@@ -477,7 +478,6 @@ sys_timer_create(const clockid_t which_clock,
 {
 	struct k_itimer *new_timer;
 	int error, new_timer_id;
-	struct pid *it_pid;
 	sigevent_t event;
 	int it_id_set = IT_ID_NOT_SET;
 
@@ -531,9 +531,9 @@ sys_timer_create(const clockid_t which_clock,
 			goto out;
 		}
 		rcu_read_lock();
-		it_pid = get_pid(good_sigevent(&event));
+		new_timer->it_pid = get_pid(good_sigevent(&event));
 		rcu_read_unlock();
-		if (!it_pid) {
+		if (!new_timer->it_pid) {
 			error = -EINVAL;
 			goto out;
 		}
@@ -541,7 +541,7 @@ sys_timer_create(const clockid_t which_clock,
 		event.sigev_notify = SIGEV_SIGNAL;
 		event.sigev_signo = SIGALRM;
 		event.sigev_value.sival_int = new_timer->it_id;
-		it_pid = get_pid(task_tgid(current));
+		new_timer->it_pid = get_pid(task_tgid(current));
 	}
 
 	new_timer->it_sigev_notify     = event.sigev_notify;
@@ -551,7 +551,6 @@ sys_timer_create(const clockid_t which_clock,
 	new_timer->sigq->info.si_code  = SI_TIMER;
 
 	spin_lock_irq(&current->sighand->siglock);
-	new_timer->it_pid = it_pid;
 	new_timer->it_signal = current->signal;
 	list_add(&new_timer->list, &current->signal->posix_timers);
 	spin_unlock_irq(&current->sighand->siglock);
@@ -587,7 +586,7 @@ static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
 	timr = idr_find(&posix_timers_id, (int)timer_id);
 	if (timr) {
 		spin_lock(&timr->it_lock);
-		if (timr->it_pid && timr->it_signal == current->signal) {
+		if (timr->it_signal == current->signal) {
 			spin_unlock(&idr_lock);
 			return timr;
 		}
@@ -834,8 +833,7 @@ retry_delete:
 	 * This keeps any tasks waiting on the spin lock from thinking
 	 * they got something (see the lock code above).
 	 */
-	put_pid(timer->it_pid);
-	timer->it_pid = NULL;
+	timer->it_signal = NULL;
 
 	unlock_timer(timer, flags);
 	release_posix_timer(timer, IT_ID_SET);
@@ -861,8 +859,7 @@ retry_delete:
 	 * This keeps any tasks waiting on the spin lock from thinking
 	 * they got something (see the lock code above).
 	 */
-	put_pid(timer->it_pid);
-	timer->it_pid = NULL;
+	timer->it_signal = NULL;
 
 	unlock_timer(timer, flags);
 	release_posix_timer(timer, IT_ID_SET);
-- 
cgit v1.2.3


From 98a79d6a50181ca1ecf7400eda01d5dc1bc0dbf0 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sat, 13 Dec 2008 21:19:41 +1030
Subject: cpumask: centralize cpu_online_map and cpu_possible_map

Impact: cleanup

Each SMP arch defines these themselves.  Move them to a central
location.

Twists:
1) Some archs (m32, parisc, s390) set possible_map to all 1, so we add a
   CONFIG_INIT_ALL_POSSIBLE for this rather than break them.

2) mips and sparc32 '#define cpu_possible_map phys_cpu_present_map'.
   Those archs simply have phys_cpu_present_map replaced everywhere.

3) Alpha defined cpu_possible_map to cpu_present_map; this is tricky
   so I just manipulate them both in sync.

4) IA64, cris and m32r have gratuitous 'extern cpumask_t cpu_possible_map'
   declarations.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Reviewed-by: Grant Grundler <grundler@parisc-linux.org>
Tested-by: Tony Luck <tony.luck@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Mike Travis <travis@sgi.com>
Cc: ink@jurassic.park.msu.ru
Cc: rmk@arm.linux.org.uk
Cc: starvik@axis.com
Cc: tony.luck@intel.com
Cc: takata@linux-m32r.org
Cc: ralf@linux-mips.org
Cc: grundler@parisc-linux.org
Cc: paulus@samba.org
Cc: schwidefsky@de.ibm.com
Cc: lethal@linux-sh.org
Cc: wli@holomorphy.com
Cc: davem@davemloft.net
Cc: jdike@addtoit.com
Cc: mingo@redhat.com
---
 kernel/cpu.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8ea32e8d68b..bae131a1211 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -24,19 +24,20 @@
 cpumask_t cpu_present_map __read_mostly;
 EXPORT_SYMBOL(cpu_present_map);
 
-#ifndef CONFIG_SMP
-
 /*
  * Represents all cpu's that are currently online.
  */
-cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+cpumask_t cpu_online_map __read_mostly;
 EXPORT_SYMBOL(cpu_online_map);
 
+#ifdef CONFIG_INIT_ALL_POSSIBLE
 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+#else
+cpumask_t cpu_possible_map __read_mostly;
+#endif
 EXPORT_SYMBOL(cpu_possible_map);
 
-#else /* CONFIG_SMP */
-
+#ifdef CONFIG_SMP
 /* Serializes the updates to cpu_online_map, cpu_present_map */
 static DEFINE_MUTEX(cpu_add_remove_lock);
 
-- 
cgit v1.2.3


From 29c0177e6a4ac094302bed54a1d4bbb6b740a9ef Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sat, 13 Dec 2008 21:20:25 +1030
Subject: cpumask: change cpumask_scnprintf, cpumask_parse_user, cpulist_parse,
 and cpulist_scnprintf to take pointers.

Impact: change calling convention of existing cpumask APIs

Most cpumask functions started with cpus_: these have been replaced by
cpumask_ ones which take struct cpumask pointers as expected.

These four functions don't have good replacement names; fortunately
they're rarely used, so we just change them over.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: paulus@samba.org
Cc: mingo@redhat.com
Cc: tony.luck@intel.com
Cc: ralf@linux-mips.org
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: cl@linux-foundation.org
Cc: srostedt@redhat.com
---
 kernel/cpuset.c      | 4 ++--
 kernel/irq/proc.c    | 4 ++--
 kernel/profile.c     | 4 ++--
 kernel/sched.c       | 4 ++--
 kernel/sched_stats.h | 2 +-
 kernel/taskstats.c   | 2 +-
 kernel/trace/trace.c | 4 ++--
 7 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 96c0ba13b8c..39c1a4c1c5a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -896,7 +896,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
 	if (!*buf) {
 		cpus_clear(trialcs.cpus_allowed);
 	} else {
-		retval = cpulist_parse(buf, trialcs.cpus_allowed);
+		retval = cpulist_parse(buf, &trialcs.cpus_allowed);
 		if (retval < 0)
 			return retval;
 
@@ -1482,7 +1482,7 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 	mask = cs->cpus_allowed;
 	mutex_unlock(&callback_mutex);
 
-	return cpulist_scnprintf(page, PAGE_SIZE, mask);
+	return cpulist_scnprintf(page, PAGE_SIZE, &mask);
 }
 
 static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index d257e7d6a8a..f293349d49d 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -47,7 +47,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
 	    irq_balancing_disabled(irq))
 		return -EIO;
 
-	err = cpumask_parse_user(buffer, count, new_value);
+	err = cpumask_parse_user(buffer, count, &new_value);
 	if (err)
 		return err;
 
@@ -95,7 +95,7 @@ static ssize_t default_affinity_write(struct file *file,
 	cpumask_t new_value;
 	int err;
 
-	err = cpumask_parse_user(buffer, count, new_value);
+	err = cpumask_parse_user(buffer, count, &new_value);
 	if (err)
 		return err;
 
diff --git a/kernel/profile.c b/kernel/profile.c
index dc41827fbfe..7d620dfdde5 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -442,7 +442,7 @@ void profile_tick(int type)
 static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
 			int count, int *eof, void *data)
 {
-	int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
+	int len = cpumask_scnprintf(page, count, (cpumask_t *)data);
 	if (count - len < 2)
 		return -EINVAL;
 	len += sprintf(page + len, "\n");
@@ -456,7 +456,7 @@ static int prof_cpu_mask_write_proc(struct file *file,
 	unsigned long full_count = count, err;
 	cpumask_t new_value;
 
-	err = cpumask_parse_user(buffer, count, new_value);
+	err = cpumask_parse_user(buffer, count, &new_value);
 	if (err)
 		return err;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index e4bb1dd7b30..d2d16d1273b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6666,7 +6666,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 	struct sched_group *group = sd->groups;
 	char str[256];
 
-	cpulist_scnprintf(str, sizeof(str), sd->span);
+	cpulist_scnprintf(str, sizeof(str), &sd->span);
 	cpus_clear(*groupmask);
 
 	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
@@ -6720,7 +6720,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 
 		cpus_or(*groupmask, *groupmask, group->cpumask);
 
-		cpulist_scnprintf(str, sizeof(str), group->cpumask);
+		cpulist_scnprintf(str, sizeof(str), &group->cpumask);
 		printk(KERN_CONT " %s", str);
 
 		group = group->next;
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 7dbf72a2b02..6beff1e4eea 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -42,7 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 		for_each_domain(cpu, sd) {
 			enum cpu_idle_type itype;
 
-			cpumask_scnprintf(mask_str, mask_len, sd->span);
+			cpumask_scnprintf(mask_str, mask_len, &sd->span);
 			seq_printf(seq, "domain%d %s", dcount++, mask_str);
 			for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
 					itype++) {
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index bd6be76303c..6d7dc4ec4aa 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -352,7 +352,7 @@ static int parse(struct nlattr *na, cpumask_t *mask)
 	if (!data)
 		return -ENOMEM;
 	nla_strlcpy(data, na, len);
-	ret = cpulist_parse(data, *mask);
+	ret = cpulist_parse(data, mask);
 	kfree(data);
 	return ret;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d86e3252f30..d2e75479dc5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2126,7 +2126,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf,
 
 	mutex_lock(&tracing_cpumask_update_lock);
 
-	len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
+	len = cpumask_scnprintf(mask_str, count, &tracing_cpumask);
 	if (count - len < 2) {
 		count = -EINVAL;
 		goto out_err;
@@ -2147,7 +2147,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
 	int err, cpu;
 
 	mutex_lock(&tracing_cpumask_update_lock);
-	err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
+	err = cpumask_parse_user(ubuf, count, &tracing_cpumask_new);
 	if (err)
 		goto err_unlock;
 
-- 
cgit v1.2.3


From 0de26520c7cabf36e1de090ea8092f011a6106ce Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sat, 13 Dec 2008 21:20:26 +1030
Subject: cpumask: make irq_set_affinity() take a const struct cpumask

Impact: change existing irq_chip API

Not much point with gentle transition here: the struct irq_chip's
setaffinity method signature needs to change.

Fortunately, not widely used code, but hits a few architectures.

Note: In irq_select_affinity() I save a temporary in by mangling
irq_desc[irq].affinity directly.  Ingo, does this break anything?

(Folded in fix from KOSAKI Motohiro)

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Grant Grundler <grundler@parisc-linux.org>
Acked-by: Ingo Molnar <mingo@redhat.com>
Cc: ralf@linux-mips.org
Cc: grundler@parisc-linux.org
Cc: jeremy@xensource.com
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
 kernel/irq/chip.c         |  2 +-
 kernel/irq/manage.c       | 22 ++++++++++------------
 kernel/irq/migration.c    | 14 +++++++-------
 kernel/irq/proc.c         | 29 +++++++++++++++++++----------
 kernel/time/tick-common.c |  6 +++---
 5 files changed, 40 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 10b5092e9bf..58d8e31daa4 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -45,7 +45,7 @@ void dynamic_irq_init(unsigned int irq)
 	desc->irq_count = 0;
 	desc->irqs_unhandled = 0;
 #ifdef CONFIG_SMP
-	cpus_setall(desc->affinity);
+	cpumask_setall(&desc->affinity);
 #endif
 	spin_unlock_irqrestore(&desc->lock, flags);
 }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 801addda3c4..10ad2f87ed9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -79,7 +79,7 @@ int irq_can_set_affinity(unsigned int irq)
  *	@cpumask:	cpumask
  *
  */
-int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
+int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
@@ -91,14 +91,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
-		desc->affinity = cpumask;
+		cpumask_copy(&desc->affinity, cpumask);
 		desc->chip->set_affinity(irq, cpumask);
 	} else {
 		desc->status |= IRQ_MOVE_PENDING;
-		desc->pending_mask = cpumask;
+		cpumask_copy(&desc->pending_mask, cpumask);
 	}
 #else
-	desc->affinity = cpumask;
+	cpumask_copy(&desc->affinity, cpumask);
 	desc->chip->set_affinity(irq, cpumask);
 #endif
 	desc->status |= IRQ_AFFINITY_SET;
@@ -112,26 +112,24 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
  */
 int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
 {
-	cpumask_t mask;
-
 	if (!irq_can_set_affinity(irq))
 		return 0;
 
-	cpus_and(mask, cpu_online_map, irq_default_affinity);
-
 	/*
 	 * Preserve an userspace affinity setup, but make sure that
 	 * one of the targets is online.
 	 */
 	if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
-		if (cpus_intersects(desc->affinity, cpu_online_map))
-			mask = desc->affinity;
+		if (cpumask_any_and(&desc->affinity, cpu_online_mask)
+		    < nr_cpu_ids)
+			goto set_affinity;
 		else
 			desc->status &= ~IRQ_AFFINITY_SET;
 	}
 
-	desc->affinity = mask;
-	desc->chip->set_affinity(irq, mask);
+	cpumask_and(&desc->affinity, cpu_online_mask, &irq_default_affinity);
+set_affinity:
+	desc->chip->set_affinity(irq, &desc->affinity);
 
 	return 0;
 }
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 9db681d9581..bd72329e630 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,7 +4,6 @@
 void move_masked_irq(int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
-	cpumask_t tmp;
 
 	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
 		return;
@@ -19,7 +18,7 @@ void move_masked_irq(int irq)
 
 	desc->status &= ~IRQ_MOVE_PENDING;
 
-	if (unlikely(cpus_empty(desc->pending_mask)))
+	if (unlikely(cpumask_empty(&desc->pending_mask)))
 		return;
 
 	if (!desc->chip->set_affinity)
@@ -27,8 +26,6 @@ void move_masked_irq(int irq)
 
 	assert_spin_locked(&desc->lock);
 
-	cpus_and(tmp, desc->pending_mask, cpu_online_map);
-
 	/*
 	 * If there was a valid mask to work with, please
 	 * do the disable, re-program, enable sequence.
@@ -41,10 +38,13 @@ void move_masked_irq(int irq)
 	 * For correct operation this depends on the caller
 	 * masking the irqs.
 	 */
-	if (likely(!cpus_empty(tmp))) {
-		desc->chip->set_affinity(irq,tmp);
+	if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask)
+		   < nr_cpu_ids)) {
+		cpumask_and(&desc->affinity,
+			    &desc->pending_mask, cpu_online_mask);
+		desc->chip->set_affinity(irq, &desc->affinity);
 	}
-	cpus_clear(desc->pending_mask);
+	cpumask_clear(&desc->pending_mask);
 }
 
 void move_native_irq(int irq)
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index f293349d49d..8e91c976252 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -40,33 +40,42 @@ static ssize_t irq_affinity_proc_write(struct file *file,
 		const char __user *buffer, size_t count, loff_t *pos)
 {
 	unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
-	cpumask_t new_value;
+	cpumask_var_t new_value;
 	int err;
 
 	if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
 	    irq_balancing_disabled(irq))
 		return -EIO;
 
-	err = cpumask_parse_user(buffer, count, &new_value);
+	if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+		return -ENOMEM;
+
+	err = cpumask_parse_user(buffer, count, new_value);
 	if (err)
-		return err;
+		goto free_cpumask;
 
-	if (!is_affinity_mask_valid(new_value))
-		return -EINVAL;
+	if (!is_affinity_mask_valid(*new_value)) {
+		err = -EINVAL;
+		goto free_cpumask;
+	}
 
 	/*
 	 * Do not allow disabling IRQs completely - it's a too easy
 	 * way to make the system unusable accidentally :-) At least
 	 * one online CPU still has to be targeted.
 	 */
-	if (!cpus_intersects(new_value, cpu_online_map))
+	if (!cpumask_intersects(new_value, cpu_online_mask)) {
 		/* Special case for empty set - allow the architecture
 		   code to set default SMP affinity. */
-		return irq_select_affinity_usr(irq) ? -EINVAL : count;
-
-	irq_set_affinity(irq, new_value);
+		err = irq_select_affinity_usr(irq) ? -EINVAL : count;
+	} else {
+		irq_set_affinity(irq, new_value);
+		err = count;
+	}
 
-	return count;
+free_cpumask:
+	free_cpumask_var(new_value);
+	return err;
 }
 
 static int irq_affinity_proc_open(struct inode *inode, struct file *file)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index df12434b43c..ab65d217583 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -136,7 +136,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
  */
 static void tick_setup_device(struct tick_device *td,
 			      struct clock_event_device *newdev, int cpu,
-			      const cpumask_t *cpumask)
+			      const struct cpumask *cpumask)
 {
 	ktime_t next_event;
 	void (*handler)(struct clock_event_device *) = NULL;
@@ -171,8 +171,8 @@ static void tick_setup_device(struct tick_device *td,
 	 * When the device is not per cpu, pin the interrupt to the
 	 * current cpu:
 	 */
-	if (!cpus_equal(newdev->cpumask, *cpumask))
-		irq_set_affinity(newdev->irq, *cpumask);
+	if (!cpumask_equal(&newdev->cpumask, cpumask))
+		irq_set_affinity(newdev->irq, cpumask);
 
 	/*
 	 * When global broadcasting is active, check if the current
-- 
cgit v1.2.3


From 320ab2b0b1e08e3805a3e1084a2f0eb1938d5d67 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sat, 13 Dec 2008 21:20:26 +1030
Subject: cpumask: convert struct clock_event_device to cpumask pointers.

Impact: change calling convention of existing clock_event APIs

struct clock_event_timer's cpumask field gets changed to take pointer,
as does the ->broadcast function.

Another single-patch change.  For safety, we BUG_ON() in
clockevents_register_device() if it's not set.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/clockevents.c    | 2 ++
 kernel/time/tick-broadcast.c | 2 +-
 kernel/time/tick-common.c    | 8 ++++----
 3 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index f8d968063ce..ea2f48af83c 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -166,6 +166,8 @@ static void clockevents_notify_released(void)
 void clockevents_register_device(struct clock_event_device *dev)
 {
 	BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+	BUG_ON(!dev->cpumask);
+
 	/*
 	 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
 	 * on it, so fix it up and emit a warning:
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f98a1b7b16e..9590af2327b 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -150,7 +150,7 @@ static void tick_do_broadcast(cpumask_t mask)
 		 */
 		cpu = first_cpu(mask);
 		td = &per_cpu(tick_cpu_device, cpu);
-		td->evtdev->broadcast(mask);
+		td->evtdev->broadcast(&mask);
 	}
 }
 
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index ab65d217583..f8372be7412 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -171,7 +171,7 @@ static void tick_setup_device(struct tick_device *td,
 	 * When the device is not per cpu, pin the interrupt to the
 	 * current cpu:
 	 */
-	if (!cpumask_equal(&newdev->cpumask, cpumask))
+	if (!cpumask_equal(newdev->cpumask, cpumask))
 		irq_set_affinity(newdev->irq, cpumask);
 
 	/*
@@ -202,14 +202,14 @@ static int tick_check_new_device(struct clock_event_device *newdev)
 	spin_lock_irqsave(&tick_device_lock, flags);
 
 	cpu = smp_processor_id();
-	if (!cpu_isset(cpu, newdev->cpumask))
+	if (!cpumask_test_cpu(cpu, newdev->cpumask))
 		goto out_bc;
 
 	td = &per_cpu(tick_cpu_device, cpu);
 	curdev = td->evtdev;
 
 	/* cpu local device ? */
-	if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) {
+	if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) {
 
 		/*
 		 * If the cpu affinity of the device interrupt can not
@@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
 		 * If we have a cpu local device already, do not replace it
 		 * by a non cpu local device
 		 */
-		if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu)))
+		if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
 			goto out_bc;
 	}
 
-- 
cgit v1.2.3


From 03e89e4574a680af15f59329b061f35d9813aff4 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 16 Dec 2008 08:45:30 +0100
Subject: sched: fix wakeup preemption clock

Impact: sharpen the wakeup-granularity to always be against current scheduler time

It was possible to do the preemption check against an old time stamp.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 2 +-
 kernel/sched_fair.c | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ad7b93be569..88215066efa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2266,6 +2266,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 	smp_wmb();
 	rq = task_rq_lock(p, &flags);
+	update_rq_clock(rq);
 	old_state = p->state;
 	if (!(old_state & state))
 		goto out;
@@ -2323,7 +2324,6 @@ out_activate:
 		schedstat_inc(p, se.nr_wakeups_local);
 	else
 		schedstat_inc(p, se.nr_wakeups_remote);
-	update_rq_clock(rq);
 	activate_task(rq, p, 1);
 	success = 1;
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 98345e45b05..928cd74cff0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1345,12 +1345,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_entity *se = &curr->se, *pse = &p->se;
+	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 
-	if (unlikely(rt_prio(p->prio))) {
-		struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+	update_curr(cfs_rq);
 
-		update_rq_clock(rq);
-		update_curr(cfs_rq);
+	if (unlikely(rt_prio(p->prio))) {
 		resched_task(curr);
 		return;
 	}
-- 
cgit v1.2.3


From 34f28ecd0f4bdc733c681294d02d9fab5880591b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 16 Dec 2008 08:45:31 +0100
Subject: sched: optimize update_curr()

Impact: micro-optimization

Skip the hard work when there is none.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 928cd74cff0..5ad4440f0fc 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -492,6 +492,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	 * overflow on 32 bits):
 	 */
 	delta_exec = (unsigned long)(now - curr->exec_start);
+	if (!delta_exec)
+		return;
 
 	__update_curr(cfs_rq, curr, delta_exec);
 	curr->exec_start = now;
-- 
cgit v1.2.3


From 720f54988e17b33f3f477010b3a68ee872d20d5a Mon Sep 17 00:00:00 2001
From: Ken Chen <kenchen@google.com>
Date: Mon, 15 Dec 2008 22:02:01 -0800
Subject: sched, cpuacct: refactoring cpuusage_read / cpuusage_write

Impact: micro-optimize the code on 64-bit architectures

In the thread regarding to 'export percpu cpuacct cgroup stats'
http://lkml.org/lkml/2008/12/7/13

akpm pointed out that current cpuacct code is inefficient.  This patch
refactoring the following:

* make cpu_rq locking only on 32-bit
* change iterator to each_present_cpu instead of each_possible_cpu to
  make it hotplug friendly.

It's a bit of code churn, but I was rewarded with 160 byte code size saving
on x86-64 arch and zero code size change on i386.

Signed-off-by: Ken Chen <kenchen@google.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 56 +++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 39 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 88215066efa..41b7e2d524d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9275,6 +9275,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	kfree(ca);
 }
 
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+	u64 data;
+
+#ifndef CONFIG_64BIT
+	/*
+	 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+	 */
+	spin_lock_irq(&cpu_rq(cpu)->lock);
+	data = *cpuusage;
+	spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+	data = *cpuusage;
+#endif
+
+	return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+	/*
+	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+	 */
+	spin_lock_irq(&cpu_rq(cpu)->lock);
+	*cpuusage = val;
+	spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+	*cpuusage = val;
+#endif
+}
+
 /* return total cpu usage (in nanoseconds) of a group */
 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
 {
@@ -9282,17 +9317,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
 	u64 totalcpuusage = 0;
 	int i;
 
-	for_each_possible_cpu(i) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
-
-		/*
-		 * Take rq->lock to make 64-bit addition safe on 32-bit
-		 * platforms.
-		 */
-		spin_lock_irq(&cpu_rq(i)->lock);
-		totalcpuusage += *cpuusage;
-		spin_unlock_irq(&cpu_rq(i)->lock);
-	}
+	for_each_present_cpu(i)
+		totalcpuusage += cpuacct_cpuusage_read(ca, i);
 
 	return totalcpuusage;
 }
@@ -9309,13 +9335,9 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
 		goto out;
 	}
 
-	for_each_possible_cpu(i) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+	for_each_present_cpu(i)
+		cpuacct_cpuusage_write(ca, i, 0);
 
-		spin_lock_irq(&cpu_rq(i)->lock);
-		*cpuusage = 0;
-		spin_unlock_irq(&cpu_rq(i)->lock);
-	}
 out:
 	return err;
 }
-- 
cgit v1.2.3


From e9515c3c9feecd74174c2998add0db51e02abb8d Mon Sep 17 00:00:00 2001
From: Ken Chen <kenchen@google.com>
Date: Mon, 15 Dec 2008 22:04:15 -0800
Subject: sched, cpuacct: export percpu cpuacct cgroup stats

This patch export per-cpu CPU cycle usage for a given cpuacct cgroup.
There is a need for a user space monitor daemon to track group CPU
usage on per-cpu base.  It is also useful for monitoring CFS load
balancer behavior by tracking per CPU group usage.

Signed-off-by: Ken Chen <kenchen@google.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 41b7e2d524d..f53e2b8ef52 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9342,12 +9342,32 @@ out:
 	return err;
 }
 
+static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+				   struct seq_file *m)
+{
+	struct cpuacct *ca = cgroup_ca(cgroup);
+	u64 percpu;
+	int i;
+
+	for_each_present_cpu(i) {
+		percpu = cpuacct_cpuusage_read(ca, i);
+		seq_printf(m, "%llu ", (unsigned long long) percpu);
+	}
+	seq_printf(m, "\n");
+	return 0;
+}
+
 static struct cftype files[] = {
 	{
 		.name = "usage",
 		.read_u64 = cpuusage_read,
 		.write_u64 = cpuusage_write,
 	},
+	{
+		.name = "usage_percpu",
+		.read_seq_string = cpuacct_percpu_seq_read,
+	},
+
 };
 
 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
-- 
cgit v1.2.3


From 80f40ee4a07530cc3acbc239a9299ec47025825b Mon Sep 17 00:00:00 2001
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
Date: Mon, 15 Dec 2008 11:56:48 +0530
Subject: sched: use RCU variant of list traversal in for_each_leaf_rt_rq()

Impact: fix potential of rare crash

for_each_leaf_rt_rq() walks an RCU protected list (rq->leaf_rt_rq_list),
but doesn't use list_for_each_entry_rcu(). Fix this.

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d9ba9d5f99d..7bdf84c85cc 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -77,7 +77,7 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 }
 
 #define for_each_leaf_rt_rq(rt_rq, rq) \
-	list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
+	list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 
 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 {
-- 
cgit v1.2.3


From 3ac52669c7a24b93663acfcab606d1065ed1accd Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 13 Dec 2008 09:15:27 -0800
Subject: resources: skip sanity check of busy resources

Impact: reduce false positives in iomem_map_sanity_check()

Some drivers (vesafb) only map/reserve a portion of a resource.
If then some other driver comes in and maps the whole resource,
the current code WARN_ON's. This is not the intent of the checks
in iomem_map_sanity_check(); rather these checks want to
warn when crossing *hardware* resources only.

This patch skips BUSY resources as suggested by Linus.

Note: having two drivers talk to the same hardware at the same
time is obviously not optimal behavior, but that's a separate story.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/resource.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 4337063663e..e633106b12f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -853,6 +853,15 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
 		if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
 		    PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
 			continue;
+		/*
+		 * if a resource is "BUSY", it's not a hardware resource
+		 * but a driver mapping of such a resource; we don't want
+		 * to warn for those; some drivers legitimately map only
+		 * partial hardware resources. (example: vesafb)
+		 */
+		if (p->flags & IORESOURCE_BUSY)
+			continue;
+
 		printk(KERN_WARNING "resource map sanity check conflict: "
 		       "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
 		       (unsigned long long)addr,
-- 
cgit v1.2.3


From 343e9099c8152daff20e10d6269edec21da44fc0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 15 Dec 2008 16:13:07 -0800
Subject: rcu: fix rcutorture behavior during reboot

Impact: fix very rare reboot hang

Because rcutorture ignored all signals, it does not terminate in
response to the signals sent at shutdown time.  This can cause strange
failures due to its continuing to make use of kernel function too late
in the shutdown sequence.  This patch therefore adds a shutdown notifier
to rcutorture, causing it to shut down in response to a reboot or an
orderly shutdown.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutorture.c | 66 +++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 56 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 85cb90588a5..b3106552210 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -39,6 +39,7 @@
 #include <linux/moduleparam.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
+#include <linux/reboot.h>
 #include <linux/freezer.h>
 #include <linux/cpu.h>
 #include <linux/delay.h>
@@ -108,7 +109,6 @@ struct rcu_torture {
 	int rtort_mbtest;
 };
 
-static int fullstop = 0;	/* stop generating callbacks at test end. */
 static LIST_HEAD(rcu_torture_freelist);
 static struct rcu_torture *rcu_torture_current = NULL;
 static long rcu_torture_current_version = 0;
@@ -136,6 +136,30 @@ static int stutter_pause_test = 0;
 #endif
 int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
 
+#define FULLSTOP_SIGNALED 1	/* Bail due to signal. */
+#define FULLSTOP_CLEANUP  2	/* Orderly shutdown. */
+static int fullstop;		/* stop generating callbacks at test end. */
+DEFINE_MUTEX(fullstop_mutex);	/* protect fullstop transitions and */
+				/*  spawning of kthreads. */
+
+/*
+ * Detect and respond to a signal-based shutdown.
+ */
+static int
+rcutorture_shutdown_notify(struct notifier_block *unused1,
+			   unsigned long unused2, void *unused3)
+{
+	if (fullstop)
+		return NOTIFY_DONE;
+	if (signal_pending(current)) {
+		mutex_lock(&fullstop_mutex);
+		if (!ACCESS_ONCE(fullstop))
+			fullstop = FULLSTOP_SIGNALED;
+		mutex_unlock(&fullstop_mutex);
+	}
+	return NOTIFY_DONE;
+}
+
 /*
  * Allocate an element from the rcu_tortures pool.
  */
@@ -199,11 +223,12 @@ rcu_random(struct rcu_random_state *rrsp)
 static void
 rcu_stutter_wait(void)
 {
-	while (stutter_pause_test || !rcutorture_runnable)
+	while ((stutter_pause_test || !rcutorture_runnable) && !fullstop) {
 		if (rcutorture_runnable)
 			schedule_timeout_interruptible(1);
 		else
 			schedule_timeout_interruptible(round_jiffies_relative(HZ));
+	}
 }
 
 /*
@@ -599,7 +624,7 @@ rcu_torture_writer(void *arg)
 		rcu_stutter_wait();
 	} while (!kthread_should_stop() && !fullstop);
 	VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
-	while (!kthread_should_stop())
+	while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
 		schedule_timeout_uninterruptible(1);
 	return 0;
 }
@@ -624,7 +649,7 @@ rcu_torture_fakewriter(void *arg)
 	} while (!kthread_should_stop() && !fullstop);
 
 	VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
-	while (!kthread_should_stop())
+	while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
 		schedule_timeout_uninterruptible(1);
 	return 0;
 }
@@ -734,7 +759,7 @@ rcu_torture_reader(void *arg)
 	VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
 	if (irqreader && cur_ops->irqcapable)
 		del_timer_sync(&t);
-	while (!kthread_should_stop())
+	while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
 		schedule_timeout_uninterruptible(1);
 	return 0;
 }
@@ -831,7 +856,7 @@ rcu_torture_stats(void *arg)
 	do {
 		schedule_timeout_interruptible(stat_interval * HZ);
 		rcu_torture_stats_print();
-	} while (!kthread_should_stop());
+	} while (!kthread_should_stop() && !fullstop);
 	VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
 	return 0;
 }
@@ -899,7 +924,7 @@ rcu_torture_shuffle(void *arg)
 	do {
 		schedule_timeout_interruptible(shuffle_interval * HZ);
 		rcu_torture_shuffle_tasks();
-	} while (!kthread_should_stop());
+	} while (!kthread_should_stop() && !fullstop);
 	VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
 	return 0;
 }
@@ -914,10 +939,10 @@ rcu_torture_stutter(void *arg)
 	do {
 		schedule_timeout_interruptible(stutter * HZ);
 		stutter_pause_test = 1;
-		if (!kthread_should_stop())
+		if (!kthread_should_stop() && !fullstop)
 			schedule_timeout_interruptible(stutter * HZ);
 		stutter_pause_test = 0;
-	} while (!kthread_should_stop());
+	} while (!kthread_should_stop() && !fullstop);
 	VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
 	return 0;
 }
@@ -934,12 +959,27 @@ rcu_torture_print_module_parms(char *tag)
 		stutter, irqreader);
 }
 
+static struct notifier_block rcutorture_nb = {
+	.notifier_call = rcutorture_shutdown_notify,
+};
+
 static void
 rcu_torture_cleanup(void)
 {
 	int i;
 
-	fullstop = 1;
+	mutex_lock(&fullstop_mutex);
+	if (!fullstop) {
+		/* If being signaled, let it happen, then exit. */
+		mutex_unlock(&fullstop_mutex);
+		schedule_timeout_interruptible(10 * HZ);
+		if (cur_ops->cb_barrier != NULL)
+			cur_ops->cb_barrier();
+		return;
+	}
+	fullstop = FULLSTOP_CLEANUP;
+	mutex_unlock(&fullstop_mutex);
+	unregister_reboot_notifier(&rcutorture_nb);
 	if (stutter_task) {
 		VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
 		kthread_stop(stutter_task);
@@ -1015,6 +1055,8 @@ rcu_torture_init(void)
 		{ &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
 		  &srcu_ops, &sched_ops, &sched_ops_sync, };
 
+	mutex_lock(&fullstop_mutex);
+
 	/* Process args and tell the world that the torturer is on the job. */
 	for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
 		cur_ops = torture_ops[i];
@@ -1024,6 +1066,7 @@ rcu_torture_init(void)
 	if (i == ARRAY_SIZE(torture_ops)) {
 		printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
 		       torture_type);
+		mutex_unlock(&fullstop_mutex);
 		return (-EINVAL);
 	}
 	if (cur_ops->init)
@@ -1146,9 +1189,12 @@ rcu_torture_init(void)
 			goto unwind;
 		}
 	}
+	register_reboot_notifier(&rcutorture_nb);
+	mutex_unlock(&fullstop_mutex);
 	return 0;
 
 unwind:
+	mutex_unlock(&fullstop_mutex);
 	rcu_torture_cleanup();
 	return firsterr;
 }
-- 
cgit v1.2.3


From 2c2d7329d8afa9efa3ec24e19a53e7be9d14f242 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 16 Dec 2008 22:08:58 +0100
Subject: tracing/ftrace: use preempt_enable_no_resched_notrace in
 ring_buffer_time_stamp()

Impact: prevent a trace recursion

After some tests with function graph tracer under x86-32, I saw some recursions
caused by ring_buffer_time_stamp() that calls preempt_enable_no_notrace() which
calls preempt_schedule() which is traced itself.

This patch re-enables preemption without rescheduling.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7f69cfeaadf..eab81f918f6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -107,7 +107,7 @@ u64 ring_buffer_time_stamp(int cpu)
 	preempt_disable_notrace();
 	/* shift to debug/test normalization and TIME_EXTENTS */
 	time = sched_clock() << DEBUG_SHIFT;
-	preempt_enable_notrace();
+	preempt_enable_no_resched_notrace();
 
 	return time;
 }
-- 
cgit v1.2.3


From 66896a85cf2890b6bbbc4c9ccdcd296600ffbf89 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 13 Dec 2008 20:18:13 +0100
Subject: tracing/ftrace: add the printk-msg-only option

Impact: display ftrace_printk messages "as is"

By default, ftrace_printk() messages find their output with some other
informations like pid, caller, ...
Sometimes a developer just want to have the ftrace_printk left "as is", without
other information.

This is done by providing a default-off option called printk-msg-only.
To enable it, just do `echo printk-msg-only > /debugfs/tracing/trace_options`

Before the patch:

           <...>-2739  [000]   145.692153: __might_sleep: I'm an ftrace_printk msg in __might_sleep
           <...>-2739  [000]   145.692155: __might_sleep: I'm another ftrace_printk msg in __might_sleep

After the patch and the printk-msg-only option enabled:

I'm an ftrace_printk msg in __might_sleep
I'm another ftrace_printk msg in __might_sleep

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 25 +++++++++++++++++++++++++
 kernel/trace/trace.h |  3 ++-
 2 files changed, 27 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 639344a4d3a..1a3d6b32978 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -287,6 +287,7 @@ static const char *trace_options[] = {
 	"annotate",
 	"userstacktrace",
 	"sym-userobj",
+	"printk-msg-only",
 	NULL
 };
 
@@ -2265,6 +2266,25 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 	return TRACE_TYPE_HANDLED;
 }
 
+static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry = iter->ent;
+	struct print_entry *field;
+	int ret;
+
+	trace_assign_type(field, entry);
+
+	ret = trace_seq_printf(s, field->buf);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	if (entry->flags & TRACE_FLAG_CONT)
+		trace_seq_print_cont(s, iter);
+
+	return TRACE_TYPE_HANDLED;
+}
+
 static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
@@ -2345,6 +2365,11 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
 			return ret;
 	}
 
+	if (iter->ent->type == TRACE_PRINT &&
+			trace_flags & TRACE_ITER_PRINTK &&
+			trace_flags & TRACE_ITER_PRINTK_MSGONLY)
+		return print_printk_msg_only(iter);
+
 	if (trace_flags & TRACE_ITER_BIN)
 		return print_bin_fmt(iter);
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f07c246dd73..fc75dce7a66 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -578,7 +578,8 @@ enum trace_iterator_flags {
 	TRACE_ITER_BRANCH		= 0x1000,
 	TRACE_ITER_ANNOTATE		= 0x2000,
 	TRACE_ITER_USERSTACKTRACE       = 0x4000,
-	TRACE_ITER_SYM_USEROBJ          = 0x8000
+	TRACE_ITER_SYM_USEROBJ          = 0x8000,
+	TRACE_ITER_PRINTK_MSGONLY	= 0x10000
 };
 
 /*
-- 
cgit v1.2.3


From 73500ac545d24610eb2cf8579ffc88957e9c5847 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Wed, 17 Dec 2008 17:29:56 -0800
Subject: futex: rename field in futex_q to clarify single waiter semantics

Impact: simplify code

I've tripped over the naming of this field a couple times.

The futex_q uses a "waiters" list to represent a single blocked task and
then calles wake_up_all().

This can lead to confusion in trying to understand the intent of the code,
which is to have a single futex_q for every task waiting on a futex.

This patch corrects the problem, using a single pointer to the waiting
task, and an appropriate call to wake_up, rather than wake_up_all.

Compile and boot tested on an 8way x86_64 machine.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index ba0d3b83c09..99f8acce08b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -92,11 +92,12 @@ struct futex_pi_state {
  * A futex_q has a woken state, just like tasks have TASK_RUNNING.
  * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
  * The order of wakup is always to make the first condition true, then
- * wake up q->waiters, then make the second condition true.
+ * wake up q->waiter, then make the second condition true.
  */
 struct futex_q {
 	struct plist_node list;
-	wait_queue_head_t waiters;
+	/* There can only be a single waiter */
+	wait_queue_head_t waiter;
 
 	/* Which hash list lock to use: */
 	spinlock_t *lock_ptr;
@@ -573,7 +574,7 @@ static void wake_futex(struct futex_q *q)
 	 * The lock in wake_up_all() is a crucial memory barrier after the
 	 * plist_del() and also before assigning to q->lock_ptr.
 	 */
-	wake_up_all(&q->waiters);
+	wake_up(&q->waiter);
 	/*
 	 * The waiting task can free the futex_q as soon as this is written,
 	 * without taking any locks.  This must come last.
@@ -930,7 +931,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 {
 	struct futex_hash_bucket *hb;
 
-	init_waitqueue_head(&q->waiters);
+	init_waitqueue_head(&q->waiter);
 
 	get_futex_key_refs(&q->key);
 	hb = hash_futex(&q->key);
@@ -1221,7 +1222,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 
 	/* add_wait_queue is the barrier after __set_current_state. */
 	__set_current_state(TASK_INTERRUPTIBLE);
-	add_wait_queue(&q.waiters, &wait);
+	add_wait_queue(&q.waiter, &wait);
 	/*
 	 * !plist_node_empty() is safe here without any lock.
 	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
-- 
cgit v1.2.3


From f38f1d2aa5a3520cf05da7cd6bd12fe2b0c509b7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 16 Dec 2008 23:06:40 -0500
Subject: trace: add a way to enable or disable the stack tracer

Impact: enhancement to stack tracer

The stack tracer currently is either on when configured in or
off when it is not. It can not be disabled when it is configured on.
(besides disabling the function tracer that it uses)

This patch adds a way to enable or disable the stack tracer at
run time. It defaults off on bootup, but a kernel parameter 'stacktrace'
has been added to enable it on bootup.

A new sysctl has been added "kernel.stack_tracer_enabled" to let
the user enable or disable the stack tracer at run time.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sysctl.c            | 10 +++++++++
 kernel/trace/Kconfig       | 13 ++++++++----
 kernel/trace/trace_stack.c | 52 ++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 67 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c83f566e940..6ac501a2dcc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -487,6 +487,16 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &ftrace_enable_sysctl,
 	},
 #endif
+#ifdef CONFIG_STACK_TRACER
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "stack_tracer_enabled",
+		.data		= &stack_tracer_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &stack_trace_sysctl,
+	},
+#endif
 #ifdef CONFIG_TRACING
 	{
 		.ctl_name	= CTL_UNNUMBERED,
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d8bae6f4219..e2a4ff6fc3a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -244,10 +244,15 @@ config STACK_TRACER
 
 	  This tracer works by hooking into every function call that the
 	  kernel executes, and keeping a maximum stack depth value and
-	  stack-trace saved. Because this logic has to execute in every
-	  kernel function, all the time, this option can slow down the
-	  kernel measurably and is generally intended for kernel
-	  developers only.
+	  stack-trace saved.  If this is configured with DYNAMIC_FTRACE
+	  then it will not have any overhead while the stack tracer
+	  is disabled.
+
+	  To enable the stack tracer on bootup, pass in 'stacktrace'
+	  on the kernel command line.
+
+	  The stack tracer can also be enabled or disabled via the
+	  sysctl kernel.stack_tracer_enabled
 
 	  Say N if unsure.
 
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0b863f2cbc8..4842c969c78 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -10,6 +10,7 @@
 #include <linux/debugfs.h>
 #include <linux/ftrace.h>
 #include <linux/module.h>
+#include <linux/sysctl.h>
 #include <linux/init.h>
 #include <linux/fs.h>
 #include "trace.h"
@@ -31,6 +32,10 @@ static raw_spinlock_t max_stack_lock =
 
 static int stack_trace_disabled __read_mostly;
 static DEFINE_PER_CPU(int, trace_active);
+static DEFINE_MUTEX(stack_sysctl_mutex);
+
+int stack_tracer_enabled;
+static int last_stack_tracer_enabled;
 
 static inline void check_stack(void)
 {
@@ -174,7 +179,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
 	return count;
 }
 
-static struct file_operations stack_max_size_fops = {
+static const struct file_operations stack_max_size_fops = {
 	.open		= tracing_open_generic,
 	.read		= stack_max_size_read,
 	.write		= stack_max_size_write,
@@ -272,7 +277,7 @@ static int t_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct seq_operations stack_trace_seq_ops = {
+static const struct seq_operations stack_trace_seq_ops = {
 	.start		= t_start,
 	.next		= t_next,
 	.stop		= t_stop,
@@ -288,12 +293,48 @@ static int stack_trace_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
-static struct file_operations stack_trace_fops = {
+static const struct file_operations stack_trace_fops = {
 	.open		= stack_trace_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 };
 
+int
+stack_trace_sysctl(struct ctl_table *table, int write,
+		   struct file *file, void __user *buffer, size_t *lenp,
+		   loff_t *ppos)
+{
+	int ret;
+
+	mutex_lock(&stack_sysctl_mutex);
+
+	ret  = proc_dointvec(table, write, file, buffer, lenp, ppos);
+
+	if (ret || !write ||
+	    (last_stack_tracer_enabled == stack_tracer_enabled))
+		goto out;
+
+	last_stack_tracer_enabled = stack_tracer_enabled;
+
+	if (stack_tracer_enabled)
+		register_ftrace_function(&trace_ops);
+	else
+		unregister_ftrace_function(&trace_ops);
+
+ out:
+	mutex_unlock(&stack_sysctl_mutex);
+	return ret;
+}
+
+static int start_stack_trace __initdata;
+
+static __init int enable_stacktrace(char *str)
+{
+	start_stack_trace = 1;
+	return 1;
+}
+__setup("stacktrace", enable_stacktrace);
+
 static __init int stack_trace_init(void)
 {
 	struct dentry *d_tracer;
@@ -311,7 +352,10 @@ static __init int stack_trace_init(void)
 	if (!entry)
 		pr_warning("Could not create debugfs 'stack_trace' entry\n");
 
-	register_ftrace_function(&trace_ops);
+	if (start_stack_trace) {
+		register_ftrace_function(&trace_ops);
+		stack_tracer_enabled = 1;
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From e05a43b744fb9518cbf8539a7ef33164ac60a70f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 17 Dec 2008 09:43:00 -0500
Subject: trace: better use of stack_trace_enabled for boot up code

Impact: clean up

Andrew Morton suggested to use the stack_tracer_enabled variable
to decide whether or not to start stack tracing on bootup.
This lets us remove the start_stack_trace variable.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_stack.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 4842c969c78..d0871bc0aca 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -308,7 +308,7 @@ stack_trace_sysctl(struct ctl_table *table, int write,
 
 	mutex_lock(&stack_sysctl_mutex);
 
-	ret  = proc_dointvec(table, write, file, buffer, lenp, ppos);
+	ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
 
 	if (ret || !write ||
 	    (last_stack_tracer_enabled == stack_tracer_enabled))
@@ -326,11 +326,10 @@ stack_trace_sysctl(struct ctl_table *table, int write,
 	return ret;
 }
 
-static int start_stack_trace __initdata;
-
 static __init int enable_stacktrace(char *str)
 {
-	start_stack_trace = 1;
+	stack_tracer_enabled = 1;
+	last_stack_tracer_enabled = 1;
 	return 1;
 }
 __setup("stacktrace", enable_stacktrace);
@@ -352,10 +351,8 @@ static __init int stack_trace_init(void)
 	if (!entry)
 		pr_warning("Could not create debugfs 'stack_trace' entry\n");
 
-	if (start_stack_trace) {
+	if (stack_tracer_enabled)
 		register_ftrace_function(&trace_ops);
-		stack_tracer_enabled = 1;
-	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From ea3a6d6d60b2504c573fe3415f6617e8310c0236 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 17 Dec 2008 15:05:36 -0500
Subject: ftrace: add not to regex on filtering functions

Impact: enhancement

Ingo Molnar has asked about a way to remove items from the filter
lists. Currently, you can only add or replace items. The way
items are added to the list is through opening one of the list
files (set_ftrace_filter or set_ftrace_notrace) via append.
If the file is opened for truncate, the list is cleared.

  echo spin_lock > /debug/tracing/set_ftrace_filter

The above will replace the list with only spin_lock

  echo spin_lock >> /debug/tracing/set_ftrace_filter

The above will add spin_lock to the list.

Now this patch adds:

  echo '!spin_lock' >> /debug/tracing/set_ftrace_filter

This will remove spin_lock from the list.

The limited glob features of these lists also can be notted.

  echo '!spin_*' >> /debug/tracing/set_ftrace_filter

This will remove all functions that start with 'spin_'

Note:

  echo '!spin_*' > /debug/tracing/set_ftrace_filter

will simply clear out the list (notice the '>' instead of '>>')

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a12f80efcea..2f32969c09d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1047,6 +1047,13 @@ ftrace_match(unsigned char *buff, int len, int enable)
 	int type = MATCH_FULL;
 	unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
 	unsigned i, match = 0, search_len = 0;
+	int not = 0;
+
+	if (buff[0] == '!') {
+		not = 1;
+		buff++;
+		len--;
+	}
 
 	for (i = 0; i < len; i++) {
 		if (buff[i] == '*') {
@@ -1100,8 +1107,12 @@ ftrace_match(unsigned char *buff, int len, int enable)
 					matched = 1;
 				break;
 			}
-			if (matched)
-				rec->flags |= flag;
+			if (matched) {
+				if (not)
+					rec->flags &= ~flag;
+				else
+					rec->flags |= flag;
+			}
 		}
 		pg = pg->next;
 	}
-- 
cgit v1.2.3


From 3d9101e92529e1ff6014f95a69afc82f37b9b13a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 17 Dec 2008 22:34:13 +0100
Subject: trace: fix task state printout

Impact: fix occasionally incorrect trace output

The tracing code has interesting varieties of printing out task state.

Unfortunalely only one of the instances is correct as it copies the
code from sched.c:sched_show_task(). The others are plain wrong as
they treatthe bitfield as an integer offset into the character
array. Also the size check of the character array is wrong as it
includes the trailing \0.

Use a common state decoder inline which does the Right Thing.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 39 +++++++++++++++++----------------------
 1 file changed, 17 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d86e3252f30..803100518f1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1301,6 +1301,13 @@ lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
 
 static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
 
+static int task_state_char(unsigned long state)
+{
+	int bit = state ? __ffs(state) + 1 : 0;
+
+	return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
+}
+
 /*
  * The message is supposed to contain an ending newline.
  * If the printing stops prematurely, try to add a newline of our own.
@@ -1396,12 +1403,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 
 		trace_assign_type(field, entry);
 
-		T = field->next_state < sizeof(state_to_char) ?
-			state_to_char[field->next_state] : 'X';
-
-		state = field->prev_state ?
-			__ffs(field->prev_state) + 1 : 0;
-		S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
+		T = task_state_char(field->next_state);
+		S = task_state_char(field->prev_state);
 		comm = trace_find_cmdline(field->next_pid);
 		trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
 				 field->prev_pid,
@@ -1519,10 +1522,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 
 		trace_assign_type(field, entry);
 
-		S = field->prev_state < sizeof(state_to_char) ?
-			state_to_char[field->prev_state] : 'X';
-		T = field->next_state < sizeof(state_to_char) ?
-			state_to_char[field->next_state] : 'X';
+		T = task_state_char(field->next_state);
+		S = task_state_char(field->prev_state);
 		ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
 				       field->prev_pid,
 				       field->prev_prio,
@@ -1621,12 +1622,9 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 
 		trace_assign_type(field, entry);
 
-		S = field->prev_state < sizeof(state_to_char) ?
-			state_to_char[field->prev_state] : 'X';
-		T = field->next_state < sizeof(state_to_char) ?
-			state_to_char[field->next_state] : 'X';
-		if (entry->type == TRACE_WAKE)
-			S = '+';
+		T = task_state_char(field->next_state);
+		S = entry->type == TRACE_WAKE ? '+' :
+			task_state_char(field->prev_state);
 		ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
 				       field->prev_pid,
 				       field->prev_prio,
@@ -1712,12 +1710,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 
 		trace_assign_type(field, entry);
 
-		S = field->prev_state < sizeof(state_to_char) ?
-			state_to_char[field->prev_state] : 'X';
-		T = field->next_state < sizeof(state_to_char) ?
-			state_to_char[field->next_state] : 'X';
-		if (entry->type == TRACE_WAKE)
-			S = '+';
+		T = task_state_char(field->next_state);
+		S = entry->type == TRACE_WAKE ? '+' :
+			task_state_char(field->prev_state);
 		SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
 		SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
 		SEQ_PUT_HEX_FIELD_RET(s, S);
-- 
cgit v1.2.3


From 6d102bc68f3dd2ae0e305b09170b1751aa67baeb Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 17 Dec 2008 17:48:23 +0800
Subject: tracing/ring-buffer: remove unused ring_buffer size

Impact: remove dead code

struct ring_buffer.size is not set after ring_buffer is initialized
or resized. it is always 0.

we can use "buffer->pages * PAGE_SIZE" to get ring_buffer's size

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index eab81f918f6..bb6922a931b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -258,7 +258,6 @@ struct ring_buffer_per_cpu {
 };
 
 struct ring_buffer {
-	unsigned long			size;
 	unsigned			pages;
 	unsigned			flags;
 	int				cpus;
@@ -2210,8 +2209,7 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 		return -EINVAL;
 
 	/* At least make sure the two buffers are somewhat the same */
-	if (buffer_a->size != buffer_b->size ||
-	    buffer_a->pages != buffer_b->pages)
+	if (buffer_a->pages != buffer_b->pages)
 		return -EINVAL;
 
 	cpu_buffer_a = buffer_a->buffers[cpu];
-- 
cgit v1.2.3


From 9c2c48020ec0dd6ecd27e5a1298f73b40d85a595 Mon Sep 17 00:00:00 2001
From: Ken Chen <kenchen@google.com>
Date: Tue, 16 Dec 2008 23:41:22 -0800
Subject: schedstat: consolidate per-task cpu runtime stats

Impact: simplify code

When we turn on CONFIG_SCHEDSTATS, per-task cpu runtime is accumulated
twice. Once in task->se.sum_exec_runtime and once in sched_info.cpu_time.
These two stats are exactly the same.

Given that task->se.sum_exec_runtime is always accumulated by the core
scheduler, sched_info can reuse that data instead of duplicate the accounting.

Signed-off-by: Ken Chen <kenchen@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/delayacct.c   | 2 +-
 kernel/sched.c       | 2 ++
 kernel/sched_stats.h | 5 ++---
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index b3179dad71b..abb6e17505e 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -127,7 +127,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	 */
 	t1 = tsk->sched_info.pcount;
 	t2 = tsk->sched_info.run_delay;
-	t3 = tsk->sched_info.cpu_time;
+	t3 = tsk->se.sum_exec_runtime;
 
 	d->cpu_count += t1;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index f53e2b8ef52..fd835fc320b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -596,6 +596,8 @@ struct rq {
 #ifdef CONFIG_SCHEDSTATS
 	/* latency stats */
 	struct sched_info rq_sched_info;
+	unsigned long long rq_cpu_time;
+	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
 
 	/* sys_sched_yield() stats */
 	unsigned int yld_exp_empty;
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 7dbf72a2b02..3b01098164c 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -31,7 +31,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 		    rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
 		    rq->sched_switch, rq->sched_count, rq->sched_goidle,
 		    rq->ttwu_count, rq->ttwu_local,
-		    rq->rq_sched_info.cpu_time,
+		    rq->rq_cpu_time,
 		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
 
 		seq_printf(seq, "\n");
@@ -123,7 +123,7 @@ static inline void
 rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 {
 	if (rq)
-		rq->rq_sched_info.cpu_time += delta;
+		rq->rq_cpu_time += delta;
 }
 
 static inline void
@@ -236,7 +236,6 @@ static inline void sched_info_depart(struct task_struct *t)
 	unsigned long long delta = task_rq(t)->clock -
 					t->sched_info.last_arrival;
 
-	t->sched_info.cpu_time += delta;
 	rq_sched_info_depart(task_rq(t), delta);
 
 	if (t->state == TASK_RUNNING)
-- 
cgit v1.2.3


From 64db4cfff99c04cd5f550357edcc8780f96b54a2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 18 Dec 2008 21:55:32 +0100
Subject: "Tree RCU": scalable classic RCU implementation

This patch fixes a long-standing performance bug in classic RCU that
results in massive internal-to-RCU lock contention on systems with
more than a few hundred CPUs.  Although this patch creates a separate
flavor of RCU for ease of review and patch maintenance, it is intended
to replace classic RCU.

This patch still handles stress better than does mainline, so I am still
calling it ready for inclusion.  This patch is against the -tip tree.
Nevertheless, experience on an actual 1000+ CPU machine would still be
most welcome.

Most of the changes noted below were found while creating an rcutiny
(which should permit ejecting the current rcuclassic) and while doing
detailed line-by-line documentation.

Updates from v9 (http://lkml.org/lkml/2008/12/2/334):

o	Fixes from remainder of line-by-line code walkthrough,
	including comment spelling, initialization, undesirable
	narrowing due to type conversion, removing redundant memory
	barriers, removing redundant local-variable initialization,
	and removing redundant local variables.

	I do not believe that any of these fixes address the CPU-hotplug
	issues that Andi Kleen was seeing, but please do give it a whirl
	in case the machine is smarter than I am.

	A writeup from the walkthrough may be found at the following
	URL, in case you are suffering from terminal insomnia or
	masochism:

	http://www.kernel.org/pub/linux/kernel/people/paulmck/tmp/rcutree-walkthrough.2008.12.16a.pdf

o	Made rcutree tracing use seq_file, as suggested some time
	ago by Lai Jiangshan.

o	Added a .csv variant of the rcudata debugfs trace file, to allow
	people having thousands of CPUs to drop the data into
	a spreadsheet.	Tested with oocalc and gnumeric.  Updated
	documentation to suit.

Updates from v8 (http://lkml.org/lkml/2008/11/15/139):

o	Fix a theoretical race between grace-period initialization and
	force_quiescent_state() that could occur if more than three
	jiffies were required to carry out the grace-period
	initialization.  Which it might, if you had enough CPUs.

o	Apply Ingo's printk-standardization patch.

o	Substitute local variables for repeated accesses to global
	variables.

o	Fix comment misspellings and redundant (but harmless) increments
	of ->n_rcu_pending (this latter after having explicitly added it).

o	Apply checkpatch fixes.

Updates from v7 (http://lkml.org/lkml/2008/10/10/291):

o	Fixed a number of problems noted by Gautham Shenoy, including
	the cpu-stall-detection bug that he was having difficulty
	convincing me was real.  ;-)

o	Changed cpu-stall detection to wait for ten seconds rather than
	three in order to reduce false positive, as suggested by Ingo
	Molnar.

o	Produced a design document (http://lwn.net/Articles/305782/).
	The act of writing this document uncovered a number of both
	theoretical and "here and now" bugs as noted below.

o	Fix dynticks_nesting accounting confusion, simplify WARN_ON()
	condition, fix kerneldoc comments, and add memory barriers
	in dynticks interface functions.

o	Add more data to tracing.

o	Remove unused "rcu_barrier" field from rcu_data structure.

o	Count calls to rcu_pending() from scheduling-clock interrupt
	to use as a surrogate timebase should jiffies stop counting.

o	Fix a theoretical race between force_quiescent_state() and
	grace-period initialization.  Yes, initialization does have to
	go on for some jiffies for this race to occur, but given enough
	CPUs...

Updates from v6 (http://lkml.org/lkml/2008/9/23/448):

o	Fix a number of checkpatch.pl complaints.

o	Apply review comments from Ingo Molnar and Lai Jiangshan
	on the stall-detection code.

o	Fix several bugs in !CONFIG_SMP builds.

o	Fix a misspelled config-parameter name so that RCU now announces
	at boot time if stall detection is configured.

o	Run tests on numerous combinations of configurations parameters,
	which after the fixes above, now build and run correctly.

Updates from v5 (http://lkml.org/lkml/2008/9/15/92, bad subject line):

o	Fix a compiler error in the !CONFIG_FANOUT_EXACT case (blew a
	changeset some time ago, and finally got around to retesting
	this option).

o	Fix some tracing bugs in rcupreempt that caused incorrect
	totals to be printed.

o	I now test with a more brutal random-selection online/offline
	script (attached).  Probably more brutal than it needs to be
	on the people reading it as well, but so it goes.

o	A number of optimizations and usability improvements:

	o	Make rcu_pending() ignore the grace-period timeout when
		there is no grace period in progress.

	o	Make force_quiescent_state() avoid going for a global
		lock in the case where there is no grace period in
		progress.

	o	Rearrange struct fields to improve struct layout.

	o	Make call_rcu() initiate a grace period if RCU was
		idle, rather than waiting for the next scheduling
		clock interrupt.

	o	Invoke rcu_irq_enter() and rcu_irq_exit() only when
		idle, as suggested by Andi Kleen.  I still don't
		completely trust this change, and might back it out.

	o	Make CONFIG_RCU_TRACE be the single config variable
		manipulated for all forms of RCU, instead of the prior
		confusion.

	o	Document tracing files and formats for both rcupreempt
		and rcutree.

Updates from v4 for those missing v5 given its bad subject line:

o	Separated dynticks interface so that NMIs and irqs call separate
	functions, greatly simplifying it.  In particular, this code
	no longer requires a proof of correctness.  ;-)

o	Separated dynticks state out into its own per-CPU structure,
	avoiding the duplicated accounting.

o	The case where a dynticks-idle CPU runs an irq handler that
	invokes call_rcu() is now correctly handled, forcing that CPU
	out of dynticks-idle mode.

o	Review comments have been applied (thank you all!!!).
	For but one example, fixed the dynticks-ordering issue that
	Manfred pointed out, saving me much debugging.  ;-)

o	Adjusted rcuclassic and rcupreempt to handle dynticks changes.

Attached is an updated patch to Classic RCU that applies a hierarchy,
greatly reducing the contention on the top-level lock for large machines.
This passes 10-hour concurrent rcutorture and online-offline testing on
128-CPU ppc64 without dynticks enabled, and exposes some timekeeping
bugs in presence of dynticks (exciting working on a system where
"sleep 1" hangs until interrupted...), which were fixed in the
2.6.27 kernel.  It is getting more reliable than mainline by some
measures, so the next version will be against -tip for inclusion.
See also Manfred Spraul's recent patches (or his earlier work from
2004 at http://marc.info/?l=linux-kernel&m=108546384711797&w=2).
We will converge onto a common patch in the fullness of time, but are
currently exploring different regions of the design space.  That said,
I have already gratefully stolen quite a few of Manfred's ideas.

This patch provides CONFIG_RCU_FANOUT, which controls the bushiness
of the RCU hierarchy.  Defaults to 32 on 32-bit machines and 64 on
64-bit machines.  If CONFIG_NR_CPUS is less than CONFIG_RCU_FANOUT,
there is no hierarchy.  By default, the RCU initialization code will
adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA
architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable
this balancing, allowing the hierarchy to be exactly aligned to the
underlying hardware.  Up to two levels of hierarchy are permitted
(in addition to the root node), allowing up to 16,384 CPUs on 32-bit
systems and up to 262,144 CPUs on 64-bit systems.  I just know that I
am going to regret saying this, but this seems more than sufficient
for the foreseeable future.  (Some architectures might wish to set
CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs.
If this becomes a real problem, additional levels can be added, but I
doubt that it will make a significant difference on real hardware.)

In the common case, a given CPU will manipulate its private rcu_data
structure and the rcu_node structure that it shares with its immediate
neighbors.  This can reduce both lock and memory contention by multiple
orders of magnitude, which should eliminate the need for the strange
manipulations that are reported to be required when running Linux on
very large systems.

Some shortcomings:

o	More bugs will probably surface as a result of an ongoing
	line-by-line code inspection.

	Patches will be provided as required.

o	There are probably hangs, rcutorture failures, &c.  Seems
	quite stable on a 128-CPU machine, but that is kind of small
	compared to 4096 CPUs.  However, seems to do better than
	mainline.

	Patches will be provided as required.

o	The memory footprint of this version is several KB larger
	than rcuclassic.

	A separate UP-only rcutiny patch will be provided, which will
	reduce the memory footprint significantly, even compared
	to the old rcuclassic.  One such patch passes light testing,
	and has a memory footprint smaller even than rcuclassic.
	Initial reaction from various embedded guys was "it is not
	worth it", so am putting it aside.

Credits:

o	Manfred Spraul for ideas, review comments, and bugs spotted,
	as well as some good friendly competition.  ;-)

o	Josh Triplett, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers,
	Lai Jiangshan, Andi Kleen, Andy Whitcroft, and Andrew Morton
	for reviews and comments.

o	Thomas Gleixner for much-needed help with some timer issues
	(see patches below).

o	Jon M. Tollefson, Tim Pepper, Andrew Theurer, Jose R. Santos,
	Andy Whitcroft, Darrick Wong, Nishanth Aravamudan, Anton
	Blanchard, Dave Kleikamp, and Nathan Lynch for keeping machines
	alive despite my heavy abuse^Wtesting.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Kconfig.preempt    |   62 +-
 kernel/Makefile           |    6 +-
 kernel/rcupreempt.c       |   10 +
 kernel/rcupreempt_trace.c |   10 +-
 kernel/rcutree.c          | 1535 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/rcutree_trace.c    |  271 ++++++++
 kernel/softirq.c          |    5 +-
 7 files changed, 1883 insertions(+), 16 deletions(-)
 create mode 100644 kernel/rcutree.c
 create mode 100644 kernel/rcutree_trace.c

(limited to 'kernel')

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 9fdba03dc1f..463f29743ea 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,10 +52,29 @@ config PREEMPT
 
 endchoice
 
+choice
+	prompt "RCU Implementation"
+	default CLASSIC_RCU
+
+config CLASSIC_RCU
+	bool "Classic RCU"
+	help
+	  This option selects the classic RCU implementation that is
+	  designed for best read-side performance on non-realtime
+	  systems.
+	  
+	  Select this option if you are unsure.
+
+config TREE_RCU
+	bool "Tree-based hierarchical RCU"
+	help
+	  This option selects the RCU implementation that is
+	  designed for very large SMP system with hundreds or
+	  thousands of CPUs.
+
 config PREEMPT_RCU
 	bool "Preemptible RCU"
 	depends on PREEMPT
-	default n
 	help
 	  This option reduces the latency of the kernel by making certain
 	  RCU sections preemptible. Normally RCU code is non-preemptible, if
@@ -64,16 +83,47 @@ config PREEMPT_RCU
 	  now-naive assumptions about each RCU read-side critical section
 	  remaining on a given CPU through its execution.
 
-	  Say N if you are unsure.
+endchoice
 
 config RCU_TRACE
-	bool "Enable tracing for RCU - currently stats in debugfs"
-	depends on PREEMPT_RCU
-	select DEBUG_FS
-	default y
+	bool "Enable tracing for RCU"
+	depends on TREE_RCU || PREEMPT_RCU
 	help
 	  This option provides tracing in RCU which presents stats
 	  in debugfs for debugging RCU implementation.
 
 	  Say Y here if you want to enable RCU tracing
 	  Say N if you are unsure.
+
+config RCU_FANOUT
+	int "Tree-based hierarchical RCU fanout value"
+	range 2 64 if 64BIT
+	range 2 32 if !64BIT
+	depends on TREE_RCU
+	default 64 if 64BIT
+	default 32 if !64BIT
+	help
+	  This option controls the fanout of hierarchical implementations
+	  of RCU, allowing RCU to work efficiently on machines with
+	  large numbers of CPUs.  This value must be at least the cube
+	  root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
+	  systems and up to 262,144 for 64-bit systems.
+
+	  Select a specific number if testing RCU itself.
+	  Take the default if unsure.
+
+config RCU_FANOUT_EXACT
+	bool "Disable tree-based hierarchical RCU auto-balancing"
+	depends on TREE_RCU
+	default n
+	help
+	  This option forces use of the exact RCU_FANOUT value specified,
+	  regardless of imbalances in the hierarchy.  This is useful for
+	  testing RCU itself, and might one day be useful on systems with
+	  strong NUMA behavior.
+
+	  Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
+
+	  Say n if unsure.
+
+	
diff --git a/kernel/Makefile b/kernel/Makefile
index 19fad003b19..b4fdbbff5ec 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -74,10 +74,10 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
+obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
-ifeq ($(CONFIG_PREEMPT_RCU),y)
-obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
-endif
+obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
+obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 59236e8b9da..04982659875 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -551,6 +551,16 @@ void rcu_irq_exit(void)
 	}
 }
 
+void rcu_nmi_enter(void)
+{
+	rcu_irq_enter();
+}
+
+void rcu_nmi_exit(void)
+{
+	rcu_irq_exit();
+}
+
 static void dyntick_save_progress_counter(int cpu)
 {
 	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 35c2d3360ec..7c2665cac17 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -149,12 +149,12 @@ static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
 		sp->done_length += cp->done_length;
 		sp->done_add += cp->done_add;
 		sp->done_remove += cp->done_remove;
-		atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked));
+		atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
 		sp->rcu_check_callbacks += cp->rcu_check_callbacks;
-		atomic_set(&sp->rcu_try_flip_1,
-			   atomic_read(&cp->rcu_try_flip_1));
-		atomic_set(&sp->rcu_try_flip_e1,
-			   atomic_read(&cp->rcu_try_flip_e1));
+		atomic_add(atomic_read(&cp->rcu_try_flip_1),
+			   &sp->rcu_try_flip_1);
+		atomic_add(atomic_read(&cp->rcu_try_flip_e1),
+			   &sp->rcu_try_flip_e1);
 		sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
 		sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
 		sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
new file mode 100644
index 00000000000..a342b032112
--- /dev/null
+++ b/kernel/rcutree.c
@@ -0,0 +1,1535 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Authors: Dipankar Sarma <dipankar@in.ibm.com>
+ *	    Manfred Spraul <manfred@colorfullife.com>
+ *	    Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 	Documentation/RCU
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/time.h>
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key rcu_lock_key;
+struct lockdep_map rcu_lock_map =
+	STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
+EXPORT_SYMBOL_GPL(rcu_lock_map);
+#endif
+
+/* Data structures. */
+
+#define RCU_STATE_INITIALIZER(name) { \
+	.level = { &name.node[0] }, \
+	.levelcnt = { \
+		NUM_RCU_LVL_0,  /* root of hierarchy. */ \
+		NUM_RCU_LVL_1, \
+		NUM_RCU_LVL_2, \
+		NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
+	}, \
+	.signaled = RCU_SIGNAL_INIT, \
+	.gpnum = -300, \
+	.completed = -300, \
+	.onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
+	.fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
+	.n_force_qs = 0, \
+	.n_force_qs_ngp = 0, \
+}
+
+struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_data);
+
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+
+#ifdef CONFIG_NO_HZ
+DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks);
+#endif /* #ifdef CONFIG_NO_HZ */
+
+static int blimit = 10;		/* Maximum callbacks per softirq. */
+static int qhimark = 10000;	/* If this many pending, ignore blimit. */
+static int qlowmark = 100;	/* Once only this many pending, use blimit. */
+
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
+
+/*
+ * Return the number of RCU batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed(void)
+{
+	return rcu_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Return the number of RCU BH batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed_bh(void)
+{
+	return rcu_bh_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
+
+/*
+ * Does the CPU have callbacks ready to be invoked?
+ */
+static int
+cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
+{
+	return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
+}
+
+/*
+ * Does the current CPU require a yet-as-unscheduled grace period?
+ */
+static int
+cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* ACCESS_ONCE() because we are accessing outside of lock. */
+	return *rdp->nxttail[RCU_DONE_TAIL] &&
+	       ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
+}
+
+/*
+ * Return the root node of the specified rcu_state structure.
+ */
+static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
+{
+	return &rsp->node[0];
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * If the specified CPU is offline, tell the caller that it is in
+ * a quiescent state.  Otherwise, whack it with a reschedule IPI.
+ * Grace periods can end up waiting on an offline CPU when that
+ * CPU is in the process of coming online -- it will be added to the
+ * rcu_node bitmasks before it actually makes it online.  The same thing
+ * can happen while a CPU is in the process of coming online.  Because this
+ * race is quite rare, we check for it after detecting that the grace
+ * period has been delayed rather than checking each and every CPU
+ * each and every time we start a new grace period.
+ */
+static int rcu_implicit_offline_qs(struct rcu_data *rdp)
+{
+	/*
+	 * If the CPU is offline, it is in a quiescent state.  We can
+	 * trust its state not to change because interrupts are disabled.
+	 */
+	if (cpu_is_offline(rdp->cpu)) {
+		rdp->offline_fqs++;
+		return 1;
+	}
+
+	/* The CPU is online, so send it a reschedule IPI. */
+	if (rdp->cpu != smp_processor_id())
+		smp_send_reschedule(rdp->cpu);
+	else
+		set_need_resched();
+	rdp->resched_ipi++;
+	return 0;
+}
+
+#endif /* #ifdef CONFIG_SMP */
+
+#ifdef CONFIG_NO_HZ
+static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
+
+/**
+ * rcu_enter_nohz - inform RCU that current CPU is entering nohz
+ *
+ * Enter nohz mode, in other words, -leave- the mode in which RCU
+ * read-side critical sections can occur.  (Though RCU read-side
+ * critical sections can occur in irq handlers in nohz mode, a possibility
+ * handled by rcu_irq_enter() and rcu_irq_exit()).
+ */
+void rcu_enter_nohz(void)
+{
+	unsigned long flags;
+	struct rcu_dynticks *rdtp;
+
+	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
+	local_irq_save(flags);
+	rdtp = &__get_cpu_var(rcu_dynticks);
+	rdtp->dynticks++;
+	rdtp->dynticks_nesting--;
+	WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
+	local_irq_restore(flags);
+}
+
+/*
+ * rcu_exit_nohz - inform RCU that current CPU is leaving nohz
+ *
+ * Exit nohz mode, in other words, -enter- the mode in which RCU
+ * read-side critical sections normally occur.
+ */
+void rcu_exit_nohz(void)
+{
+	unsigned long flags;
+	struct rcu_dynticks *rdtp;
+
+	local_irq_save(flags);
+	rdtp = &__get_cpu_var(rcu_dynticks);
+	rdtp->dynticks++;
+	rdtp->dynticks_nesting++;
+	WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
+	local_irq_restore(flags);
+	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+}
+
+/**
+ * rcu_nmi_enter - inform RCU of entry to NMI context
+ *
+ * If the CPU was idle with dynamic ticks active, and there is no
+ * irq handler running, this updates rdtp->dynticks_nmi to let the
+ * RCU grace-period handling know that the CPU is active.
+ */
+void rcu_nmi_enter(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (rdtp->dynticks & 0x1)
+		return;
+	rdtp->dynticks_nmi++;
+	WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs);
+	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+}
+
+/**
+ * rcu_nmi_exit - inform RCU of exit from NMI context
+ *
+ * If the CPU was idle with dynamic ticks active, and there is no
+ * irq handler running, this updates rdtp->dynticks_nmi to let the
+ * RCU grace-period handling know that the CPU is no longer active.
+ */
+void rcu_nmi_exit(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (rdtp->dynticks & 0x1)
+		return;
+	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
+	rdtp->dynticks_nmi++;
+	WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs);
+}
+
+/**
+ * rcu_irq_enter - inform RCU of entry to hard irq context
+ *
+ * If the CPU was idle with dynamic ticks active, this updates the
+ * rdtp->dynticks to let the RCU handling know that the CPU is active.
+ */
+void rcu_irq_enter(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (rdtp->dynticks_nesting++)
+		return;
+	rdtp->dynticks++;
+	WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
+	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+}
+
+/**
+ * rcu_irq_exit - inform RCU of exit from hard irq context
+ *
+ * If the CPU was idle with dynamic ticks active, update the rdp->dynticks
+ * to put let the RCU handling be aware that the CPU is going back to idle
+ * with no ticks.
+ */
+void rcu_irq_exit(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (--rdtp->dynticks_nesting)
+		return;
+	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
+	rdtp->dynticks++;
+	WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
+
+	/* If the interrupt queued a callback, get out of dyntick mode. */
+	if (__get_cpu_var(rcu_data).nxtlist ||
+	    __get_cpu_var(rcu_bh_data).nxtlist)
+		set_need_resched();
+}
+
+/*
+ * Record the specified "completed" value, which is later used to validate
+ * dynticks counter manipulations.  Specify "rsp->completed - 1" to
+ * unconditionally invalidate any future dynticks manipulations (which is
+ * useful at the beginning of a grace period).
+ */
+static void dyntick_record_completed(struct rcu_state *rsp, long comp)
+{
+	rsp->dynticks_completed = comp;
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * Recall the previously recorded value of the completion for dynticks.
+ */
+static long dyntick_recall_completed(struct rcu_state *rsp)
+{
+	return rsp->dynticks_completed;
+}
+
+/*
+ * Snapshot the specified CPU's dynticks counter so that we can later
+ * credit them with an implicit quiescent state.  Return 1 if this CPU
+ * is already in a quiescent state courtesy of dynticks idle mode.
+ */
+static int dyntick_save_progress_counter(struct rcu_data *rdp)
+{
+	int ret;
+	int snap;
+	int snap_nmi;
+
+	snap = rdp->dynticks->dynticks;
+	snap_nmi = rdp->dynticks->dynticks_nmi;
+	smp_mb();	/* Order sampling of snap with end of grace period. */
+	rdp->dynticks_snap = snap;
+	rdp->dynticks_nmi_snap = snap_nmi;
+	ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
+	if (ret)
+		rdp->dynticks_fqs++;
+	return ret;
+}
+
+/*
+ * Return true if the specified CPU has passed through a quiescent
+ * state by virtue of being in or having passed through an dynticks
+ * idle state since the last call to dyntick_save_progress_counter()
+ * for this same CPU.
+ */
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
+{
+	long curr;
+	long curr_nmi;
+	long snap;
+	long snap_nmi;
+
+	curr = rdp->dynticks->dynticks;
+	snap = rdp->dynticks_snap;
+	curr_nmi = rdp->dynticks->dynticks_nmi;
+	snap_nmi = rdp->dynticks_nmi_snap;
+	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+	/*
+	 * If the CPU passed through or entered a dynticks idle phase with
+	 * no active irq/NMI handlers, then we can safely pretend that the CPU
+	 * already acknowledged the request to pass through a quiescent
+	 * state.  Either way, that CPU cannot possibly be in an RCU
+	 * read-side critical section that started before the beginning
+	 * of the current RCU grace period.
+	 */
+	if ((curr != snap || (curr & 0x1) == 0) &&
+	    (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
+		rdp->dynticks_fqs++;
+		return 1;
+	}
+
+	/* Go check for the CPU being offline. */
+	return rcu_implicit_offline_qs(rdp);
+}
+
+#endif /* #ifdef CONFIG_SMP */
+
+#else /* #ifdef CONFIG_NO_HZ */
+
+static void dyntick_record_completed(struct rcu_state *rsp, long comp)
+{
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * If there are no dynticks, then the only way that a CPU can passively
+ * be in a quiescent state is to be offline.  Unlike dynticks idle, which
+ * is a point in time during the prior (already finished) grace period,
+ * an offline CPU is always in a quiescent state, and thus can be
+ * unconditionally applied.  So just return the current value of completed.
+ */
+static long dyntick_recall_completed(struct rcu_state *rsp)
+{
+	return rsp->completed;
+}
+
+static int dyntick_save_progress_counter(struct rcu_data *rdp)
+{
+	return 0;
+}
+
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
+{
+	return rcu_implicit_offline_qs(rdp);
+}
+
+#endif /* #ifdef CONFIG_SMP */
+
+#endif /* #else #ifdef CONFIG_NO_HZ */
+
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+
+static void record_gp_stall_check_time(struct rcu_state *rsp)
+{
+	rsp->gp_start = jiffies;
+	rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
+}
+
+static void print_other_cpu_stall(struct rcu_state *rsp)
+{
+	int cpu;
+	long delta;
+	unsigned long flags;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
+
+	/* Only let one CPU complain about others per time interval. */
+
+	spin_lock_irqsave(&rnp->lock, flags);
+	delta = jiffies - rsp->jiffies_stall;
+	if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+	rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rnp->lock, flags);
+
+	/* OK, time to rat on our buddy... */
+
+	printk(KERN_ERR "INFO: RCU detected CPU stalls:");
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		if (rnp_cur->qsmask == 0)
+			continue;
+		for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
+			if (rnp_cur->qsmask & (1UL << cpu))
+				printk(" %d", rnp_cur->grplo + cpu);
+	}
+	printk(" (detected by %d, t=%ld jiffies)\n",
+	       smp_processor_id(), (long)(jiffies - rsp->gp_start));
+	force_quiescent_state(rsp, 0);  /* Kick them all. */
+}
+
+static void print_cpu_stall(struct rcu_state *rsp)
+{
+	unsigned long flags;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+
+	printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
+			smp_processor_id(), jiffies - rsp->gp_start);
+	dump_stack();
+	spin_lock_irqsave(&rnp->lock, flags);
+	if ((long)(jiffies - rsp->jiffies_stall) >= 0)
+		rsp->jiffies_stall =
+			jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rnp->lock, flags);
+	set_need_resched();  /* kick ourselves to get things going. */
+}
+
+static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	long delta;
+	struct rcu_node *rnp;
+
+	delta = jiffies - rsp->jiffies_stall;
+	rnp = rdp->mynode;
+	if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
+
+		/* We haven't checked in, so go dump stack. */
+		print_cpu_stall(rsp);
+
+	} else if (rsp->gpnum != rsp->completed &&
+		   delta >= RCU_STALL_RAT_DELAY) {
+
+		/* They had two time units to dump stack, so complain. */
+		print_other_cpu_stall(rsp);
+	}
+}
+
+#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+static void record_gp_stall_check_time(struct rcu_state *rsp)
+{
+}
+
+static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+/*
+ * Update CPU-local rcu_data state to record the newly noticed grace period.
+ * This is used both when we started the grace period and when we notice
+ * that someone else started the grace period.
+ */
+static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	rdp->qs_pending = 1;
+	rdp->passed_quiesc = 0;
+	rdp->gpnum = rsp->gpnum;
+	rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
+				      RCU_JIFFIES_TILL_FORCE_QS;
+}
+
+/*
+ * Did someone else start a new RCU grace period start since we last
+ * checked?  Update local state appropriately if so.  Must be called
+ * on the CPU corresponding to rdp.
+ */
+static int
+check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	local_irq_save(flags);
+	if (rdp->gpnum != rsp->gpnum) {
+		note_new_gpnum(rsp, rdp);
+		ret = 1;
+	}
+	local_irq_restore(flags);
+	return ret;
+}
+
+/*
+ * Start a new RCU grace period if warranted, re-initializing the hierarchy
+ * in preparation for detecting the next grace period.  The caller must hold
+ * the root node's ->lock, which is released before return.  Hard irqs must
+ * be disabled.
+ */
+static void
+rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
+	__releases(rcu_get_root(rsp)->lock)
+{
+	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_cur;
+	struct rcu_node *rnp_end;
+
+	if (!cpu_needs_another_gp(rsp, rdp)) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+
+	/* Advance to a new grace period and initialize state. */
+	rsp->gpnum++;
+	rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
+	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
+	rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
+				      RCU_JIFFIES_TILL_FORCE_QS;
+	record_gp_stall_check_time(rsp);
+	dyntick_record_completed(rsp, rsp->completed - 1);
+	note_new_gpnum(rsp, rdp);
+
+	/*
+	 * Because we are first, we know that all our callbacks will
+	 * be covered by this upcoming grace period, even the ones
+	 * that were registered arbitrarily recently.
+	 */
+	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+	/* Special-case the common single-level case. */
+	if (NUM_RCU_NODES == 1) {
+		rnp->qsmask = rnp->qsmaskinit;
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+
+	spin_unlock(&rnp->lock);  /* leave irqs disabled. */
+
+
+	/* Exclude any concurrent CPU-hotplug operations. */
+	spin_lock(&rsp->onofflock);  /* irqs already disabled. */
+
+	/*
+	 * Set the quiescent-state-needed bits in all the non-leaf RCU
+	 * nodes for all currently online CPUs.  This operation relies
+	 * on the layout of the hierarchy within the rsp->node[] array.
+	 * Note that other CPUs will access only the leaves of the
+	 * hierarchy, which still indicate that no grace period is in
+	 * progress.  In addition, we have excluded CPU-hotplug operations.
+	 *
+	 * We therefore do not need to hold any locks.  Any required
+	 * memory barriers will be supplied by the locks guarding the
+	 * leaf rcu_nodes in the hierarchy.
+	 */
+
+	rnp_end = rsp->level[NUM_RCU_LVLS - 1];
+	for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
+		rnp_cur->qsmask = rnp_cur->qsmaskinit;
+
+	/*
+	 * Now set up the leaf nodes.  Here we must be careful.  First,
+	 * we need to hold the lock in order to exclude other CPUs, which
+	 * might be contending for the leaf nodes' locks.  Second, as
+	 * soon as we initialize a given leaf node, its CPUs might run
+	 * up the rest of the hierarchy.  We must therefore acquire locks
+	 * for each node that we touch during this stage.  (But we still
+	 * are excluding CPU-hotplug operations.)
+	 *
+	 * Note that the grace period cannot complete until we finish
+	 * the initialization process, as there will be at least one
+	 * qsmask bit set in the root node until that time, namely the
+	 * one corresponding to this CPU.
+	 */
+	rnp_end = &rsp->node[NUM_RCU_NODES];
+	rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		spin_lock(&rnp_cur->lock);	/* irqs already disabled. */
+		rnp_cur->qsmask = rnp_cur->qsmaskinit;
+		spin_unlock(&rnp_cur->lock);	/* irqs already disabled. */
+	}
+
+	rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
+	spin_unlock_irqrestore(&rsp->onofflock, flags);
+}
+
+/*
+ * Advance this CPU's callbacks, but only if the current grace period
+ * has ended.  This may be called only from the CPU to whom the rdp
+ * belongs.
+ */
+static void
+rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	long completed_snap;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	completed_snap = ACCESS_ONCE(rsp->completed);  /* outside of lock. */
+
+	/* Did another grace period end? */
+	if (rdp->completed != completed_snap) {
+
+		/* Advance callbacks.  No harm if list empty. */
+		rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
+		rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
+		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+		/* Remember that we saw this grace-period completion. */
+		rdp->completed = completed_snap;
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * Similar to cpu_quiet(), for which it is a helper function.  Allows
+ * a group of CPUs to be quieted at one go, though all the CPUs in the
+ * group must be represented by the same leaf rcu_node structure.
+ * That structure's lock must be held upon entry, and it is released
+ * before return.
+ */
+static void
+cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
+	      unsigned long flags)
+	__releases(rnp->lock)
+{
+	/* Walk up the rcu_node hierarchy. */
+	for (;;) {
+		if (!(rnp->qsmask & mask)) {
+
+			/* Our bit has already been cleared, so done. */
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			return;
+		}
+		rnp->qsmask &= ~mask;
+		if (rnp->qsmask != 0) {
+
+			/* Other bits still set at this level, so done. */
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			return;
+		}
+		mask = rnp->grpmask;
+		if (rnp->parent == NULL) {
+
+			/* No more levels.  Exit loop holding root lock. */
+
+			break;
+		}
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		rnp = rnp->parent;
+		spin_lock_irqsave(&rnp->lock, flags);
+	}
+
+	/*
+	 * Get here if we are the last CPU to pass through a quiescent
+	 * state for this grace period.  Clean up and let rcu_start_gp()
+	 * start up the next grace period if one is needed.  Note that
+	 * we still hold rnp->lock, as required by rcu_start_gp(), which
+	 * will release it.
+	 */
+	rsp->completed = rsp->gpnum;
+	rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
+	rcu_start_gp(rsp, flags);  /* releases rnp->lock. */
+}
+
+/*
+ * Record a quiescent state for the specified CPU, which must either be
+ * the current CPU or an offline CPU.  The lastcomp argument is used to
+ * make sure we are still in the grace period of interest.  We don't want
+ * to end the current grace period based on quiescent states detected in
+ * an earlier grace period!
+ */
+static void
+cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
+{
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_node *rnp;
+
+	rnp = rdp->mynode;
+	spin_lock_irqsave(&rnp->lock, flags);
+	if (lastcomp != ACCESS_ONCE(rsp->completed)) {
+
+		/*
+		 * Someone beat us to it for this grace period, so leave.
+		 * The race with GP start is resolved by the fact that we
+		 * hold the leaf rcu_node lock, so that the per-CPU bits
+		 * cannot yet be initialized -- so we would simply find our
+		 * CPU's bit already cleared in cpu_quiet_msk() if this race
+		 * occurred.
+		 */
+		rdp->passed_quiesc = 0;	/* try again later! */
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+	mask = rdp->grpmask;
+	if ((rnp->qsmask & mask) == 0) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
+	} else {
+		rdp->qs_pending = 0;
+
+		/*
+		 * This GP can't end until cpu checks in, so all of our
+		 * callbacks can be processed during the next GP.
+		 */
+		rdp = rsp->rda[smp_processor_id()];
+		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+		cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
+	}
+}
+
+/*
+ * Check to see if there is a new grace period of which this CPU
+ * is not yet aware, and if so, set up local rcu_data state for it.
+ * Otherwise, see if this CPU has just passed through its first
+ * quiescent state for this grace period, and record that fact if so.
+ */
+static void
+rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* If there is now a new grace period, record and return. */
+	if (check_for_new_grace_period(rsp, rdp))
+		return;
+
+	/*
+	 * Does this CPU still need to do its part for current grace period?
+	 * If no, return and let the other CPUs do their part as well.
+	 */
+	if (!rdp->qs_pending)
+		return;
+
+	/*
+	 * Was there a quiescent state since the beginning of the grace
+	 * period? If no, then exit and wait for the next call.
+	 */
+	if (!rdp->passed_quiesc)
+		return;
+
+	/* Tell RCU we are done (but cpu_quiet() will be the judge of that). */
+	cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
+ * and move all callbacks from the outgoing CPU to the current one.
+ */
+static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
+{
+	int i;
+	unsigned long flags;
+	long lastcomp;
+	unsigned long mask;
+	struct rcu_data *rdp = rsp->rda[cpu];
+	struct rcu_data *rdp_me;
+	struct rcu_node *rnp;
+
+	/* Exclude any attempts to start a new grace period. */
+	spin_lock_irqsave(&rsp->onofflock, flags);
+
+	/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
+	rnp = rdp->mynode;
+	mask = rdp->grpmask;	/* rnp->grplo is constant. */
+	do {
+		spin_lock(&rnp->lock);		/* irqs already disabled. */
+		rnp->qsmaskinit &= ~mask;
+		if (rnp->qsmaskinit != 0) {
+			spin_unlock(&rnp->lock); /* irqs already disabled. */
+			break;
+		}
+		mask = rnp->grpmask;
+		spin_unlock(&rnp->lock);	/* irqs already disabled. */
+		rnp = rnp->parent;
+	} while (rnp != NULL);
+	lastcomp = rsp->completed;
+
+	spin_unlock(&rsp->onofflock);		/* irqs remain disabled. */
+
+	/* Being offline is a quiescent state, so go record it. */
+	cpu_quiet(cpu, rsp, rdp, lastcomp);
+
+	/*
+	 * Move callbacks from the outgoing CPU to the running CPU.
+	 * Note that the outgoing CPU is now quiscent, so it is now
+	 * (uncharacteristically) safe to access it rcu_data structure.
+	 * Note also that we must carefully retain the order of the
+	 * outgoing CPU's callbacks in order for rcu_barrier() to work
+	 * correctly.  Finally, note that we start all the callbacks
+	 * afresh, even those that have passed through a grace period
+	 * and are therefore ready to invoke.  The theory is that hotplug
+	 * events are rare, and that if they are frequent enough to
+	 * indefinitely delay callbacks, you have far worse things to
+	 * be worrying about.
+	 */
+	rdp_me = rsp->rda[smp_processor_id()];
+	if (rdp->nxtlist != NULL) {
+		*rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
+		rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+		rdp->nxtlist = NULL;
+		for (i = 0; i < RCU_NEXT_SIZE; i++)
+			rdp->nxttail[i] = &rdp->nxtlist;
+		rdp_me->qlen += rdp->qlen;
+		rdp->qlen = 0;
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * Remove the specified CPU from the RCU hierarchy and move any pending
+ * callbacks that it might have to the current CPU.  This code assumes
+ * that at least one CPU in the system will remain running at all times.
+ * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
+ */
+static void rcu_offline_cpu(int cpu)
+{
+	__rcu_offline_cpu(cpu, &rcu_state);
+	__rcu_offline_cpu(cpu, &rcu_bh_state);
+}
+
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+
+static void rcu_offline_cpu(int cpu)
+{
+}
+
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * Invoke any RCU callbacks that have made it to the end of their grace
+ * period.  Thottle as specified by rdp->blimit.
+ */
+static void rcu_do_batch(struct rcu_data *rdp)
+{
+	unsigned long flags;
+	struct rcu_head *next, *list, **tail;
+	int count;
+
+	/* If no callbacks are ready, just return.*/
+	if (!cpu_has_callbacks_ready_to_invoke(rdp))
+		return;
+
+	/*
+	 * Extract the list of ready callbacks, disabling to prevent
+	 * races with call_rcu() from interrupt handlers.
+	 */
+	local_irq_save(flags);
+	list = rdp->nxtlist;
+	rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
+	*rdp->nxttail[RCU_DONE_TAIL] = NULL;
+	tail = rdp->nxttail[RCU_DONE_TAIL];
+	for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
+		if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
+			rdp->nxttail[count] = &rdp->nxtlist;
+	local_irq_restore(flags);
+
+	/* Invoke callbacks. */
+	count = 0;
+	while (list) {
+		next = list->next;
+		prefetch(next);
+		list->func(list);
+		list = next;
+		if (++count >= rdp->blimit)
+			break;
+	}
+
+	local_irq_save(flags);
+
+	/* Update count, and requeue any remaining callbacks. */
+	rdp->qlen -= count;
+	if (list != NULL) {
+		*tail = rdp->nxtlist;
+		rdp->nxtlist = list;
+		for (count = 0; count < RCU_NEXT_SIZE; count++)
+			if (&rdp->nxtlist == rdp->nxttail[count])
+				rdp->nxttail[count] = tail;
+			else
+				break;
+	}
+
+	/* Reinstate batch limit if we have worked down the excess. */
+	if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
+		rdp->blimit = blimit;
+
+	local_irq_restore(flags);
+
+	/* Re-raise the RCU softirq if there are callbacks remaining. */
+	if (cpu_has_callbacks_ready_to_invoke(rdp))
+		raise_softirq(RCU_SOFTIRQ);
+}
+
+/*
+ * Check to see if this CPU is in a non-context-switch quiescent state
+ * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
+ * Also schedule the RCU softirq handler.
+ *
+ * This function must be called with hardirqs disabled.  It is normally
+ * invoked from the scheduling-clock interrupt.  If rcu_pending returns
+ * false, there is no point in invoking rcu_check_callbacks().
+ */
+void rcu_check_callbacks(int cpu, int user)
+{
+	if (user ||
+	    (idle_cpu(cpu) && !in_softirq() &&
+				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+
+		/*
+		 * Get here if this CPU took its interrupt from user
+		 * mode or from the idle loop, and if this is not a
+		 * nested interrupt.  In this case, the CPU is in
+		 * a quiescent state, so count it.
+		 *
+		 * No memory barrier is required here because both
+		 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference
+		 * only CPU-local variables that other CPUs neither
+		 * access nor modify, at least not while the corresponding
+		 * CPU is online.
+		 */
+
+		rcu_qsctr_inc(cpu);
+		rcu_bh_qsctr_inc(cpu);
+
+	} else if (!in_softirq()) {
+
+		/*
+		 * Get here if this CPU did not take its interrupt from
+		 * softirq, in other words, if it is not interrupting
+		 * a rcu_bh read-side critical section.  This is an _bh
+		 * critical section, so count it.
+		 */
+
+		rcu_bh_qsctr_inc(cpu);
+	}
+	raise_softirq(RCU_SOFTIRQ);
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * Scan the leaf rcu_node structures, processing dyntick state for any that
+ * have not yet encountered a quiescent state, using the function specified.
+ * Returns 1 if the current grace period ends while scanning (possibly
+ * because we made it end).
+ */
+static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
+			       int (*f)(struct rcu_data *))
+{
+	unsigned long bit;
+	int cpu;
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
+
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		mask = 0;
+		spin_lock_irqsave(&rnp_cur->lock, flags);
+		if (rsp->completed != lastcomp) {
+			spin_unlock_irqrestore(&rnp_cur->lock, flags);
+			return 1;
+		}
+		if (rnp_cur->qsmask == 0) {
+			spin_unlock_irqrestore(&rnp_cur->lock, flags);
+			continue;
+		}
+		cpu = rnp_cur->grplo;
+		bit = 1;
+		for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) {
+			if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu]))
+				mask |= bit;
+		}
+		if (mask != 0 && rsp->completed == lastcomp) {
+
+			/* cpu_quiet_msk() releases rnp_cur->lock. */
+			cpu_quiet_msk(mask, rsp, rnp_cur, flags);
+			continue;
+		}
+		spin_unlock_irqrestore(&rnp_cur->lock, flags);
+	}
+	return 0;
+}
+
+/*
+ * Force quiescent states on reluctant CPUs, and also detect which
+ * CPUs are in dyntick-idle mode.
+ */
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
+{
+	unsigned long flags;
+	long lastcomp;
+	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	u8 signaled;
+
+	if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum))
+		return;  /* No grace period in progress, nothing to force. */
+	if (!spin_trylock_irqsave(&rsp->fqslock, flags)) {
+		rsp->n_force_qs_lh++; /* Inexact, can lose counts.  Tough! */
+		return;	/* Someone else is already on the job. */
+	}
+	if (relaxed &&
+	    (long)(rsp->jiffies_force_qs - jiffies) >= 0 &&
+	    (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) >= 0)
+		goto unlock_ret; /* no emergency and done recently. */
+	rsp->n_force_qs++;
+	spin_lock(&rnp->lock);
+	lastcomp = rsp->completed;
+	signaled = rsp->signaled;
+	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
+	rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
+				      RCU_JIFFIES_TILL_FORCE_QS;
+	if (lastcomp == rsp->gpnum) {
+		rsp->n_force_qs_ngp++;
+		spin_unlock(&rnp->lock);
+		goto unlock_ret;  /* no GP in progress, time updated. */
+	}
+	spin_unlock(&rnp->lock);
+	switch (signaled) {
+	case RCU_GP_INIT:
+
+		break; /* grace period still initializing, ignore. */
+
+	case RCU_SAVE_DYNTICK:
+
+		if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
+			break; /* So gcc recognizes the dead code. */
+
+		/* Record dyntick-idle state. */
+		if (rcu_process_dyntick(rsp, lastcomp,
+					dyntick_save_progress_counter))
+			goto unlock_ret;
+
+		/* Update state, record completion counter. */
+		spin_lock(&rnp->lock);
+		if (lastcomp == rsp->completed) {
+			rsp->signaled = RCU_FORCE_QS;
+			dyntick_record_completed(rsp, lastcomp);
+		}
+		spin_unlock(&rnp->lock);
+		break;
+
+	case RCU_FORCE_QS:
+
+		/* Check dyntick-idle state, send IPI to laggarts. */
+		if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp),
+					rcu_implicit_dynticks_qs))
+			goto unlock_ret;
+
+		/* Leave state in case more forcing is required. */
+
+		break;
+	}
+unlock_ret:
+	spin_unlock_irqrestore(&rsp->fqslock, flags);
+}
+
+#else /* #ifdef CONFIG_SMP */
+
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
+{
+	set_need_resched();
+}
+
+#endif /* #else #ifdef CONFIG_SMP */
+
+/*
+ * This does the RCU processing work from softirq context for the
+ * specified rcu_state and rcu_data structures.  This may be called
+ * only from the CPU to whom the rdp belongs.
+ */
+static void
+__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long flags;
+
+	/*
+	 * If an RCU GP has gone long enough, go check for dyntick
+	 * idle CPUs and, if needed, send resched IPIs.
+	 */
+	if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
+	    (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
+		force_quiescent_state(rsp, 1);
+
+	/*
+	 * Advance callbacks in response to end of earlier grace
+	 * period that some other CPU ended.
+	 */
+	rcu_process_gp_end(rsp, rdp);
+
+	/* Update RCU state based on any recent quiescent states. */
+	rcu_check_quiescent_state(rsp, rdp);
+
+	/* Does this CPU require a not-yet-started grace period? */
+	if (cpu_needs_another_gp(rsp, rdp)) {
+		spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
+		rcu_start_gp(rsp, flags);  /* releases above lock */
+	}
+
+	/* If there are callbacks ready, invoke them. */
+	rcu_do_batch(rdp);
+}
+
+/*
+ * Do softirq processing for the current CPU.
+ */
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+	/*
+	 * Memory references from any prior RCU read-side critical sections
+	 * executed by the interrupted code must be seen before any RCU
+	 * grace-period manipulations below.
+	 */
+	smp_mb(); /* See above block comment. */
+
+	__rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
+	__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
+
+	/*
+	 * Memory references from any later RCU read-side critical sections
+	 * executed by the interrupted code must be seen after any RCU
+	 * grace-period manipulations above.
+	 */
+	smp_mb(); /* See above block comment. */
+}
+
+static void
+__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
+	   struct rcu_state *rsp)
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+
+	head->func = func;
+	head->next = NULL;
+
+	smp_mb(); /* Ensure RCU update seen before callback registry. */
+
+	/*
+	 * Opportunistically note grace-period endings and beginnings.
+	 * Note that we might see a beginning right after we see an
+	 * end, but never vice versa, since this CPU has to pass through
+	 * a quiescent state betweentimes.
+	 */
+	local_irq_save(flags);
+	rdp = rsp->rda[smp_processor_id()];
+	rcu_process_gp_end(rsp, rdp);
+	check_for_new_grace_period(rsp, rdp);
+
+	/* Add the callback to our list. */
+	*rdp->nxttail[RCU_NEXT_TAIL] = head;
+	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
+
+	/* Start a new grace period if one not already started. */
+	if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) {
+		unsigned long nestflag;
+		struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+		spin_lock_irqsave(&rnp_root->lock, nestflag);
+		rcu_start_gp(rsp, nestflag);  /* releases rnp_root->lock. */
+	}
+
+	/* Force the grace period if too many callbacks or too long waiting. */
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = LONG_MAX;
+		force_quiescent_state(rsp, 0);
+	} else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
+		   (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
+		force_quiescent_state(rsp, 1);
+	local_irq_restore(flags);
+}
+
+/*
+ * Queue an RCU callback for invocation after a grace period.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	__call_rcu(head, func, &rcu_state);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/*
+ * Queue an RCU for invocation after a quicker grace period.
+ */
+void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	__call_rcu(head, func, &rcu_bh_state);
+}
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, for the specified type of RCU, returning 1 if so.
+ * The checks are in order of increasing expense: checks that can be
+ * carried out against CPU-local state are performed first.  However,
+ * we must check for CPU stalls first, else we might not get a chance.
+ */
+static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	rdp->n_rcu_pending++;
+
+	/* Check for CPU stalls, if enabled. */
+	check_cpu_stall(rsp, rdp);
+
+	/* Is the RCU core waiting for a quiescent state from this CPU? */
+	if (rdp->qs_pending)
+		return 1;
+
+	/* Does this CPU have callbacks ready to invoke? */
+	if (cpu_has_callbacks_ready_to_invoke(rdp))
+		return 1;
+
+	/* Has RCU gone idle with this CPU needing another grace period? */
+	if (cpu_needs_another_gp(rsp, rdp))
+		return 1;
+
+	/* Has another RCU grace period completed?  */
+	if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
+		return 1;
+
+	/* Has a new RCU grace period started? */
+	if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
+		return 1;
+
+	/* Has an RCU GP gone long enough to send resched IPIs &c? */
+	if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
+	    ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
+	     (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0))
+		return 1;
+
+	/* nothing to do */
+	return 0;
+}
+
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, returning 1 if so.  This function is part of the
+ * RCU implementation; it is -not- an exported member of the RCU API.
+ */
+int rcu_pending(int cpu)
+{
+	return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) ||
+	       __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
+}
+
+/*
+ * Check to see if any future RCU-related work will need to be done
+ * by the current CPU, even if none need be done immediately, returning
+ * 1 if so.  This function is part of the RCU implementation; it is -not-
+ * an exported member of the RCU API.
+ */
+int rcu_needs_cpu(int cpu)
+{
+	/* RCU callbacks either ready or pending? */
+	return per_cpu(rcu_data, cpu).nxtlist ||
+	       per_cpu(rcu_bh_data, cpu).nxtlist;
+}
+
+/*
+ * Initialize a CPU's per-CPU RCU data.  We take this "scorched earth"
+ * approach so that we don't have to worry about how long the CPU has
+ * been gone, or whether it ever was online previously.  We do trust the
+ * ->mynode field, as it is constant for a given struct rcu_data and
+ * initialized during early boot.
+ *
+ * Note that only one online or offline event can be happening at a given
+ * time.  Note also that we can accept some slop in the rsp->completed
+ * access due to the fact that this CPU cannot possibly have any RCU
+ * callbacks in flight yet.
+ */
+static void
+rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
+{
+	unsigned long flags;
+	int i;
+	long lastcomp;
+	unsigned long mask;
+	struct rcu_data *rdp = rsp->rda[cpu];
+	struct rcu_node *rnp = rcu_get_root(rsp);
+
+	/* Set up local state, ensuring consistent view of global state. */
+	spin_lock_irqsave(&rnp->lock, flags);
+	lastcomp = rsp->completed;
+	rdp->completed = lastcomp;
+	rdp->gpnum = lastcomp;
+	rdp->passed_quiesc = 0;  /* We could be racing with new GP, */
+	rdp->qs_pending = 1;	 /*  so set up to respond to current GP. */
+	rdp->beenonline = 1;	 /* We have now been online. */
+	rdp->passed_quiesc_completed = lastcomp - 1;
+	rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
+	rdp->nxtlist = NULL;
+	for (i = 0; i < RCU_NEXT_SIZE; i++)
+		rdp->nxttail[i] = &rdp->nxtlist;
+	rdp->qlen = 0;
+	rdp->blimit = blimit;
+#ifdef CONFIG_NO_HZ
+	rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
+#endif /* #ifdef CONFIG_NO_HZ */
+	rdp->cpu = cpu;
+	spin_unlock(&rnp->lock);		/* irqs remain disabled. */
+
+	/*
+	 * A new grace period might start here.  If so, we won't be part
+	 * of it, but that is OK, as we are currently in a quiescent state.
+	 */
+
+	/* Exclude any attempts to start a new GP on large systems. */
+	spin_lock(&rsp->onofflock);		/* irqs already disabled. */
+
+	/* Add CPU to rcu_node bitmasks. */
+	rnp = rdp->mynode;
+	mask = rdp->grpmask;
+	do {
+		/* Exclude any attempts to start a new GP on small systems. */
+		spin_lock(&rnp->lock);	/* irqs already disabled. */
+		rnp->qsmaskinit |= mask;
+		mask = rnp->grpmask;
+		spin_unlock(&rnp->lock); /* irqs already disabled. */
+		rnp = rnp->parent;
+	} while (rnp != NULL && !(rnp->qsmaskinit & mask));
+
+	spin_unlock(&rsp->onofflock);		/* irqs remain disabled. */
+
+	/*
+	 * A new grace period might start here.  If so, we will be part of
+	 * it, and its gpnum will be greater than ours, so we will
+	 * participate.  It is also possible for the gpnum to have been
+	 * incremented before this function was called, and the bitmasks
+	 * to not be filled out until now, in which case we will also
+	 * participate due to our gpnum being behind.
+	 */
+
+	/* Since it is coming online, the CPU is in a quiescent state. */
+	cpu_quiet(cpu, rsp, rdp, lastcomp);
+	local_irq_restore(flags);
+}
+
+static void __cpuinit rcu_online_cpu(int cpu)
+{
+#ifdef CONFIG_NO_HZ
+	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+	rdtp->dynticks_nesting = 1;
+	rdtp->dynticks |= 1; 	/* need consecutive #s even for hotplug. */
+	rdtp->dynticks_nmi = (rdtp->dynticks_nmi + 1) & ~0x1;
+#endif /* #ifdef CONFIG_NO_HZ */
+	rcu_init_percpu_data(cpu, &rcu_state);
+	rcu_init_percpu_data(cpu, &rcu_bh_state);
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+}
+
+/*
+ * Handle CPU online/offline notifcation events.
+ */
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+				unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		rcu_online_cpu(cpu);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		rcu_offline_cpu(cpu);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+/*
+ * Compute the per-level fanout, either using the exact fanout specified
+ * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
+ */
+#ifdef CONFIG_RCU_FANOUT_EXACT
+static void __init rcu_init_levelspread(struct rcu_state *rsp)
+{
+	int i;
+
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
+		rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+}
+#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
+static void __init rcu_init_levelspread(struct rcu_state *rsp)
+{
+	int ccur;
+	int cprv;
+	int i;
+
+	cprv = NR_CPUS;
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+		ccur = rsp->levelcnt[i];
+		rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
+		cprv = ccur;
+	}
+}
+#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
+
+/*
+ * Helper function for rcu_init() that initializes one rcu_state structure.
+ */
+static void __init rcu_init_one(struct rcu_state *rsp)
+{
+	int cpustride = 1;
+	int i;
+	int j;
+	struct rcu_node *rnp;
+
+	/* Initialize the level-tracking arrays. */
+
+	for (i = 1; i < NUM_RCU_LVLS; i++)
+		rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
+	rcu_init_levelspread(rsp);
+
+	/* Initialize the elements themselves, starting from the leaves. */
+
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+		cpustride *= rsp->levelspread[i];
+		rnp = rsp->level[i];
+		for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
+			spin_lock_init(&rnp->lock);
+			rnp->qsmask = 0;
+			rnp->qsmaskinit = 0;
+			rnp->grplo = j * cpustride;
+			rnp->grphi = (j + 1) * cpustride - 1;
+			if (rnp->grphi >= NR_CPUS)
+				rnp->grphi = NR_CPUS - 1;
+			if (i == 0) {
+				rnp->grpnum = 0;
+				rnp->grpmask = 0;
+				rnp->parent = NULL;
+			} else {
+				rnp->grpnum = j % rsp->levelspread[i - 1];
+				rnp->grpmask = 1UL << rnp->grpnum;
+				rnp->parent = rsp->level[i - 1] +
+					      j / rsp->levelspread[i - 1];
+			}
+			rnp->level = i;
+		}
+	}
+}
+
+/*
+ * Helper macro for __rcu_init().  To be used nowhere else!
+ * Assigns leaf node pointers into each CPU's rcu_data structure.
+ */
+#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
+do { \
+	rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
+	j = 0; \
+	for_each_possible_cpu(i) { \
+		if (i > rnp[j].grphi) \
+			j++; \
+		per_cpu(rcu_data, i).mynode = &rnp[j]; \
+		(rsp)->rda[i] = &per_cpu(rcu_data, i); \
+	} \
+} while (0)
+
+static struct notifier_block __cpuinitdata rcu_nb = {
+	.notifier_call	= rcu_cpu_notify,
+};
+
+void __init __rcu_init(void)
+{
+	int i;			/* All used by RCU_DATA_PTR_INIT(). */
+	int j;
+	struct rcu_node *rnp;
+
+	printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n");
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+	rcu_init_one(&rcu_state);
+	RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
+	rcu_init_one(&rcu_bh_state);
+	RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
+
+	for_each_online_cpu(i)
+		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
+	/* Register notifier for non-boot CPUs */
+	register_cpu_notifier(&rcu_nb);
+	printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
+}
+
+module_param(blimit, int, 0);
+module_param(qhimark, int, 0);
+module_param(qlowmark, int, 0);
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
new file mode 100644
index 00000000000..d6db3e83782
--- /dev/null
+++ b/kernel/rcutree_trace.c
@@ -0,0 +1,271 @@
+/*
+ * Read-Copy Update tracing for classic implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Papers:  http://www.rdrop.com/users/paulmck/RCU
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
+{
+	if (!rdp->beenonline)
+		return;
+	seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d rpfq=%ld rp=%x",
+		   rdp->cpu,
+		   cpu_is_offline(rdp->cpu) ? '!' : ' ',
+		   rdp->completed, rdp->gpnum,
+		   rdp->passed_quiesc, rdp->passed_quiesc_completed,
+		   rdp->qs_pending,
+		   rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
+		   (int)(rdp->n_rcu_pending & 0xffff));
+#ifdef CONFIG_NO_HZ
+	seq_printf(m, " dt=%d/%d dn=%d df=%lu",
+		   rdp->dynticks->dynticks,
+		   rdp->dynticks->dynticks_nesting,
+		   rdp->dynticks->dynticks_nmi,
+		   rdp->dynticks_fqs);
+#endif /* #ifdef CONFIG_NO_HZ */
+	seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
+	seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit);
+}
+
+#define PRINT_RCU_DATA(name, func, m) \
+	do { \
+		int _p_r_d_i; \
+		\
+		for_each_possible_cpu(_p_r_d_i) \
+			func(m, &per_cpu(name, _p_r_d_i)); \
+	} while (0)
+
+static int show_rcudata(struct seq_file *m, void *unused)
+{
+	seq_puts(m, "rcu:\n");
+	PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m);
+	seq_puts(m, "rcu_bh:\n");
+	PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
+	return 0;
+}
+
+static int rcudata_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_rcudata, NULL);
+}
+
+static struct file_operations rcudata_fops = {
+	.owner = THIS_MODULE,
+	.open = rcudata_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
+{
+	if (!rdp->beenonline)
+		return;
+	seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d,%ld,%ld",
+		   rdp->cpu,
+		   cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"",
+		   rdp->completed, rdp->gpnum,
+		   rdp->passed_quiesc, rdp->passed_quiesc_completed,
+		   rdp->qs_pending,
+		   rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
+		   rdp->n_rcu_pending);
+#ifdef CONFIG_NO_HZ
+	seq_printf(m, ",%d,%d,%d,%lu",
+		   rdp->dynticks->dynticks,
+		   rdp->dynticks->dynticks_nesting,
+		   rdp->dynticks->dynticks_nmi,
+		   rdp->dynticks_fqs);
+#endif /* #ifdef CONFIG_NO_HZ */
+	seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
+	seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit);
+}
+
+static int show_rcudata_csv(struct seq_file *m, void *unused)
+{
+	seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",\"rpfq\",\"rp\",");
+#ifdef CONFIG_NO_HZ
+	seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
+#endif /* #ifdef CONFIG_NO_HZ */
+	seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
+	seq_puts(m, "\"rcu:\"\n");
+	PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m);
+	seq_puts(m, "\"rcu_bh:\"\n");
+	PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
+	return 0;
+}
+
+static int rcudata_csv_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_rcudata_csv, NULL);
+}
+
+static struct file_operations rcudata_csv_fops = {
+	.owner = THIS_MODULE,
+	.open = rcudata_csv_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
+{
+	int level = 0;
+	struct rcu_node *rnp;
+
+	seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
+	              "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
+		   rsp->completed, rsp->gpnum, rsp->signaled,
+		   (long)(rsp->jiffies_force_qs - jiffies),
+		   (int)(jiffies & 0xffff),
+		   rsp->n_force_qs, rsp->n_force_qs_ngp,
+		   rsp->n_force_qs - rsp->n_force_qs_ngp,
+		   rsp->n_force_qs_lh);
+	for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
+		if (rnp->level != level) {
+			seq_puts(m, "\n");
+			level = rnp->level;
+		}
+		seq_printf(m, "%lx/%lx %d:%d ^%d    ",
+			   rnp->qsmask, rnp->qsmaskinit,
+			   rnp->grplo, rnp->grphi, rnp->grpnum);
+	}
+	seq_puts(m, "\n");
+}
+
+static int show_rcuhier(struct seq_file *m, void *unused)
+{
+	seq_puts(m, "rcu:\n");
+	print_one_rcu_state(m, &rcu_state);
+	seq_puts(m, "rcu_bh:\n");
+	print_one_rcu_state(m, &rcu_bh_state);
+	return 0;
+}
+
+static int rcuhier_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_rcuhier, NULL);
+}
+
+static struct file_operations rcuhier_fops = {
+	.owner = THIS_MODULE,
+	.open = rcuhier_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static int show_rcugp(struct seq_file *m, void *unused)
+{
+	seq_printf(m, "rcu: completed=%ld  gpnum=%ld\n",
+		   rcu_state.completed, rcu_state.gpnum);
+	seq_printf(m, "rcu_bh: completed=%ld  gpnum=%ld\n",
+		   rcu_bh_state.completed, rcu_bh_state.gpnum);
+	return 0;
+}
+
+static int rcugp_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_rcugp, NULL);
+}
+
+static struct file_operations rcugp_fops = {
+	.owner = THIS_MODULE,
+	.open = rcugp_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir;
+static int __init rcuclassic_trace_init(void)
+{
+	rcudir = debugfs_create_dir("rcu", NULL);
+	if (!rcudir)
+		goto out;
+
+	datadir = debugfs_create_file("rcudata", 0444, rcudir,
+						NULL, &rcudata_fops);
+	if (!datadir)
+		goto free_out;
+
+	datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir,
+						NULL, &rcudata_csv_fops);
+	if (!datadir_csv)
+		goto free_out;
+
+	gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
+	if (!gpdir)
+		goto free_out;
+
+	hierdir = debugfs_create_file("rcuhier", 0444, rcudir,
+						NULL, &rcuhier_fops);
+	if (!hierdir)
+		goto free_out;
+	return 0;
+free_out:
+	if (datadir)
+		debugfs_remove(datadir);
+	if (datadir_csv)
+		debugfs_remove(datadir_csv);
+	if (gpdir)
+		debugfs_remove(gpdir);
+	debugfs_remove(rcudir);
+out:
+	return 1;
+}
+
+static void __exit rcuclassic_trace_cleanup(void)
+{
+	debugfs_remove(datadir);
+	debugfs_remove(datadir_csv);
+	debugfs_remove(gpdir);
+	debugfs_remove(hierdir);
+	debugfs_remove(rcudir);
+}
+
+
+module_init(rcuclassic_trace_init);
+module_exit(rcuclassic_trace_cleanup);
+
+MODULE_AUTHOR("Paul E. McKenney");
+MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
+MODULE_LICENSE("GPL");
diff --git a/kernel/softirq.c b/kernel/softirq.c
index e7c69a720d6..80d323e6f61 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -269,6 +269,7 @@ void irq_enter(void)
 {
 	int cpu = smp_processor_id();
 
+	rcu_irq_enter();
 	if (idle_cpu(cpu) && !in_interrupt()) {
 		__irq_enter();
 		tick_check_idle(cpu);
@@ -295,9 +296,9 @@ void irq_exit(void)
 
 #ifdef CONFIG_NO_HZ
 	/* Make sure that timer wheel updates are propagated */
-	if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
-		tick_nohz_stop_sched_tick(0);
 	rcu_irq_exit();
+	if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
+		tick_nohz_stop_sched_tick(0);
 #endif
 	preempt_enable_no_resched();
 }
-- 
cgit v1.2.3


From b2e3c0adec918ea22b6c9d7c76193dd3aaba9bd4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 19 Dec 2008 00:48:27 +0100
Subject: hrtimers: fix warning in kernel/hrtimer.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

this warning:

  kernel/hrtimer.c: In function ‘hrtimer_cpu_notify’:
  kernel/hrtimer.c:1574: warning: unused variable ‘dcpu’

is caused because 'dcpu' is only used in the CONFIG_HOTPLUG_CPU case.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hrtimer.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b741f850426..bda9cb92427 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1571,7 +1571,7 @@ static void tickle_timers(void *arg)
 static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
 					unsigned long action, void *hcpu)
 {
-	int dcpu, scpu = (long)hcpu;
+	int scpu = (long)hcpu;
 
 	switch (action) {
 
@@ -1583,10 +1583,14 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
+	{
+		int dcpu;
+
 		clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
 		dcpu = migrate_hrtimers(scpu);
 		smp_call_function_single(dcpu, tickle_timers, NULL, 0);
 		break;
+	}
 #endif
 
 	default:
-- 
cgit v1.2.3


From 3bddb9a3246f6df5cf3b7655cb541ac10203bb71 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 19 Dec 2008 01:03:29 +0100
Subject: tracing: fix warning in kernel/trace/trace.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

this warning:

  kernel/trace/trace.c: In function ‘print_lat_fmt’:
  kernel/trace/trace.c:1826: warning: unused variable ‘state’

Triggers because 'state' has become unused - remove it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1a3d6b32978..49fc7201295 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1816,7 +1816,6 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 	char *comm;
 	int S, T;
 	int i;
-	unsigned state;
 
 	if (entry->type == TRACE_CONT)
 		return TRACE_TYPE_HANDLED;
-- 
cgit v1.2.3


From c71dd42db2c6f1637b92502a214587431c1a6ad2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 19 Dec 2008 01:09:51 +0100
Subject: tracing: fix warnings in kernel/trace/trace_sched_switch.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

these warnings:

  kernel/trace/trace_sched_switch.c: In function ‘tracing_sched_register’:
  kernel/trace/trace_sched_switch.c:96: warning: passing argument 1 of ‘register_trace_sched_wakeup_new’ from incompatible pointer type
  kernel/trace/trace_sched_switch.c:112: warning: passing argument 1 of ‘unregister_trace_sched_wakeup_new’ from incompatible pointer type
  kernel/trace/trace_sched_switch.c: In function ‘tracing_sched_unregister’:
  kernel/trace/trace_sched_switch.c:121: warning: passing argument 1 of ‘unregister_trace_sched_wakeup_new’ from incompatible pointer type

Trigger because sched_wakeup_new tracepoints need the same trace
signature as sched_wakeup - which was changed recently.

Fix it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c                    | 2 +-
 kernel/trace/trace_sched_switch.c | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index d377097572f..ac5a70a87d1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2457,7 +2457,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		p->sched_class->task_new(rq, p);
 		inc_nr_running(rq);
 	}
-	trace_sched_wakeup_new(rq, p);
+	trace_sched_wakeup_new(rq, p, 1);
 	check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 863390557b4..781d72ef873 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -247,3 +247,4 @@ __init static int init_sched_switch_trace(void)
 	return register_tracer(&sched_switch_trace);
 }
 device_initcall(init_sched_switch_trace);
+
-- 
cgit v1.2.3


From b56863630ddbdea6e22df8835f78f0b1da037103 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Thu, 18 Dec 2008 15:06:34 -0800
Subject: futex: clean up futex_(un)lock_pi fault handling

Impact: cleanup

Some apparently left over cruft code was complicating the fault logic:

Testing if uval != -EFAULT doesn't have any meaning, get_user() sets ret
to either 0 or -EFAULT, there's no need to compare uval, especially not
against EFAULT which it will never be.  This patch removes the superfluous
test and clarifies the comment blocks.

Build and boot tested on an 8way x86_64 system.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 99f8acce08b..b4f87bac91c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1565,12 +1565,11 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 
  uaddr_faulted:
 	/*
-	 * We have to r/w  *(int __user *)uaddr, but we can't modify it
-	 * non-atomically.  Therefore, if get_user below is not
-	 * enough, we need to handle the fault ourselves, while
-	 * still holding the mmap_sem.
-	 *
-	 * ... and hb->lock. :-) --ANK
+	 * We have to r/w  *(int __user *)uaddr, and we have to modify it
+	 * atomically.  Therefore, if we continue to fault after get_user()
+	 * below, we need to handle the fault ourselves, while still holding
+	 * the mmap_sem.  This can occur if the uaddr is under contention as
+	 * we have to drop the mmap_sem in order to call get_user().
 	 */
 	queue_unlock(&q, hb);
 
@@ -1582,7 +1581,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 	}
 
 	ret = get_user(uval, uaddr);
-	if (!ret && (uval != -EFAULT))
+	if (!ret)
 		goto retry;
 
 	if (to)
@@ -1676,12 +1675,11 @@ out:
 
 pi_faulted:
 	/*
-	 * We have to r/w  *(int __user *)uaddr, but we can't modify it
-	 * non-atomically.  Therefore, if get_user below is not
-	 * enough, we need to handle the fault ourselves, while
-	 * still holding the mmap_sem.
-	 *
-	 * ... and hb->lock. --ANK
+	 * We have to r/w  *(int __user *)uaddr, and we have to modify it
+	 * atomically.  Therefore, if we continue to fault after get_user()
+	 * below, we need to handle the fault ourselves, while still holding
+	 * the mmap_sem.  This can occur if the uaddr is under contention as
+	 * we have to drop the mmap_sem in order to call get_user().
 	 */
 	spin_unlock(&hb->lock);
 
@@ -1694,7 +1692,7 @@ pi_faulted:
 	}
 
 	ret = get_user(uval, uaddr);
-	if (!ret && (uval != -EFAULT))
+	if (!ret)
 		goto retry;
 
 	return ret;
-- 
cgit v1.2.3


From afb8a9b70b86866a60e08b2956ae4e1406390336 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Thu, 18 Dec 2008 23:26:09 +0530
Subject: sched: framework for sched_mc/smt_power_savings=N

Impact: extend range of /sys/devices/system/cpu/sched_mc_power_savings

Currently the sched_mc/smt_power_savings variable is a boolean,
which either enables or disables topology based power savings.
This patch extends the behaviour of the variable from boolean to
multivalued, such that based on the value, we decide how
aggressively do we want to perform powersavings balance at
appropriate sched domain based on topology.

Variable levels of power saving tunable would benefit end user to
match the required level of power savings vs performance
trade-off depending on the system configuration and workloads.

This version makes the sched_mc_power_savings global variable to
take more values (0,1,2).  Later versions can have a single
tunable called sched_power_savings instead of
sched_{mc,smt}_power_savings.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b309027bf9e..56b285cd535 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7906,14 +7906,25 @@ int arch_reinit_sched_domains(void)
 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 {
 	int ret;
+	unsigned int level = 0;
 
-	if (buf[0] != '0' && buf[0] != '1')
+	if (sscanf(buf, "%u", &level) != 1)
+		return -EINVAL;
+
+	/*
+	 * level is always be positive so don't check for
+	 * level < POWERSAVINGS_BALANCE_NONE which is 0
+	 * What happens on 0 or 1 byte write,
+	 * need to check for count as well?
+	 */
+
+	if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
 		return -EINVAL;
 
 	if (smt)
-		sched_smt_power_savings = (buf[0] == '1');
+		sched_smt_power_savings = level;
 	else
-		sched_mc_power_savings = (buf[0] == '1');
+		sched_mc_power_savings = level;
 
 	ret = arch_reinit_sched_domains();
 
-- 
cgit v1.2.3


From d5679bd11916eba5c8ee9033003e1a5ce56ece9a Mon Sep 17 00:00:00 2001
From: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Date: Thu, 18 Dec 2008 23:26:16 +0530
Subject: sched: favour lower logical cpu number for sched_mc balance

Impact: change load-balancing direction to match that of irqbalanced

Just in case two groups have identical load, prefer to move load to lower
logical cpu number rather than the present logic of moving to higher logical
number.

find_busiest_group() tries to look for a group_leader that has spare capacity
to take more tasks and freeup an appropriate least loaded group.  Just in case
there is a tie and the load is equal, then the group with higher logical number
is favoured.  This conflicts with user space irqbalance daemon that will move
interrupts to lower logical number if the system utilisation is very low.

Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 56b285cd535..94b9d11e331 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3241,7 +3241,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 */
 		if ((sum_nr_running < min_nr_running) ||
 		    (sum_nr_running == min_nr_running &&
-		     cpumask_first(sched_group_cpus(group)) <
+		     cpumask_first(sched_group_cpus(group)) >
 		     cpumask_first(sched_group_cpus(group_min)))) {
 			group_min = group;
 			min_nr_running = sum_nr_running;
@@ -3257,7 +3257,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		if (sum_nr_running <= group_capacity - 1) {
 			if (sum_nr_running > leader_nr_running ||
 			    (sum_nr_running == leader_nr_running &&
-			     cpumask_first(sched_group_cpus(group)) >
+			     cpumask_first(sched_group_cpus(group)) <
 			     cpumask_first(sched_group_cpus(group_leader)))) {
 				group_leader = group;
 				leader_nr_running = sum_nr_running;
-- 
cgit v1.2.3


From 7a09b1a27b1e5a4957e4af9951420fea02c44fba Mon Sep 17 00:00:00 2001
From: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Date: Thu, 18 Dec 2008 23:26:22 +0530
Subject: sched: nominate preferred wakeup cpu

Impact: extend load-balancing code (no change in behavior yet)

When the system utilisation is low and more cpus are idle,
then the process waking up from sleep should prefer to
wakeup an idle cpu from semi-idle cpu package (multi core
package) rather than a completely idle cpu package which
would waste power.

Use the sched_mc balance logic in find_busiest_group() to
nominate a preferred wakeup cpu.

This info can be stored in appropriate sched_domain, but
updating this info in all copies of sched_domain is not
practical.  Hence this information is stored in root_domain
struct which is one copy per partitioned sched domain.
The root_domain can be accessed from each cpu's runqueue
and there is one copy per partitioned sched domain.

Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 94b9d11e331..c1b8b3031eb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -509,6 +509,14 @@ struct root_domain {
 #ifdef CONFIG_SMP
 	struct cpupri cpupri;
 #endif
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+	/*
+	 * Preferred wake up cpu nominated by sched_mc balance that will be
+	 * used when most cpus are idle in the system indicating overall very
+	 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
+	 */
+	unsigned int sched_mc_preferred_wakeup_cpu;
+#endif
 };
 
 /*
@@ -3384,6 +3392,10 @@ out_balanced:
 
 	if (this == group_leader && group_leader != group_min) {
 		*imbalance = min_load_per_task;
+		if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+			cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+					first_cpu(group_leader->cpumask);
+		}
 		return group_min;
 	}
 #endif
-- 
cgit v1.2.3


From 7eb52dfa70dbf5232b5b83ec4357e6bebaa8fde8 Mon Sep 17 00:00:00 2001
From: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Date: Thu, 18 Dec 2008 23:26:29 +0530
Subject: sched: bias task wakeups to preferred semi-idle packages

Impact: tweak task wakeup to save power more agressively

Preferred wakeup cpu (from a semi idle package) has been
nominated in find_busiest_group() in the previous patch.  Use
this information in sched_mc_preferred_wakeup_cpu in function
wake_idle() to bias task wakeups if the following conditions
are satisfied:

        - The present cpu that is trying to wakeup the process is
          idle and waking the target process on this cpu will
          potentially wakeup a completely idle package
        - The previous cpu on which the target process ran is
          also idle and hence selecting the previous cpu may
          wakeup a semi idle cpu package
        - The task being woken up is allowed to run in the
          nominated cpu (cpu affinity and restrictions)

Basically if both the current cpu and the previous cpu on
which the task ran is idle, select the nominated cpu from semi
idle cpu package for running the new task that is waking up.

Cache hotness is considered since the actual biasing happens
in wake_idle() only if the application is cache cold.

This technique will effectively move short running bursty jobs in
a mostly idle system.

Wakeup biasing for power savings gets automatically disabled if
system utilisation increases due to the fact that the probability
of finding both this_cpu and prev_cpu idle decreases.

Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 08ffffd4a41..36b5e34fa99 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1026,6 +1026,24 @@ static int wake_idle(int cpu, struct task_struct *p)
 {
 	struct sched_domain *sd;
 	int i;
+	unsigned int chosen_wakeup_cpu;
+	int this_cpu;
+
+	/*
+	 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
+	 * are idle and this is not a kernel thread and this task's affinity
+	 * allows it to be moved to preferred cpu, then just move!
+	 */
+
+	this_cpu = smp_processor_id();
+	chosen_wakeup_cpu =
+		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
+
+	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
+		idle_cpu(cpu) && idle_cpu(this_cpu) &&
+		p->mm && !(p->flags & PF_KTHREAD) &&
+		cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
+		return chosen_wakeup_cpu;
 
 	/*
 	 * If it is idle, then it is the best cpu to run this task.
-- 
cgit v1.2.3


From ad273b32e482cdef306eac32b28d97f513a022f4 Mon Sep 17 00:00:00 2001
From: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Date: Thu, 18 Dec 2008 23:26:36 +0530
Subject: sched: activate active load balancing in new idle cpus

Impact: tweak task balancing to save power more agressively

Active load balancing is a process by which migration thread
is woken up on the target CPU in order to pull current
running task on another package into this newly idle
package.

This method is already in use with normal load_balance(),
this patch introduces this method to new idle cpus when
sched_mc is set to POWERSAVINGS_BALANCE_WAKEUP.

This logic provides effective consolidation of short running
daemon jobs in a almost idle system

The side effect of this patch may be ping-ponging of tasks
if the system is moderately utilised. May need to adjust the
iterations before triggering.

Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c1b8b3031eb..8fc0d5aa43b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3670,10 +3670,64 @@ redo:
 	}
 
 	if (!ld_moved) {
+		int active_balance;
+
 		schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
 		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 			return -1;
+
+		if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
+			return -1;
+
+		if (sd->nr_balance_failed++ < 2)
+			return -1;
+
+		/*
+		 * The only task running in a non-idle cpu can be moved to this
+		 * cpu in an attempt to completely freeup the other CPU
+		 * package. The same method used to move task in load_balance()
+		 * have been extended for load_balance_newidle() to speedup
+		 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
+		 *
+		 * The package power saving logic comes from
+		 * find_busiest_group().  If there are no imbalance, then
+		 * f_b_g() will return NULL.  However when sched_mc={1,2} then
+		 * f_b_g() will select a group from which a running task may be
+		 * pulled to this cpu in order to make the other package idle.
+		 * If there is no opportunity to make a package idle and if
+		 * there are no imbalance, then f_b_g() will return NULL and no
+		 * action will be taken in load_balance_newidle().
+		 *
+		 * Under normal task pull operation due to imbalance, there
+		 * will be more than one task in the source run queue and
+		 * move_tasks() will succeed.  ld_moved will be true and this
+		 * active balance code will not be triggered.
+		 */
+
+		/* Lock busiest in correct order while this_rq is held */
+		double_lock_balance(this_rq, busiest);
+
+		/*
+		 * don't kick the migration_thread, if the curr
+		 * task on busiest cpu can't be moved to this_cpu
+		 */
+		if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+			double_unlock_balance(this_rq, busiest);
+			all_pinned = 1;
+			return ld_moved;
+		}
+
+		if (!busiest->active_balance) {
+			busiest->active_balance = 1;
+			busiest->push_cpu = this_cpu;
+			active_balance = 1;
+		}
+
+		double_unlock_balance(this_rq, busiest);
+		if (active_balance)
+			wake_up_process(busiest->migration_thread);
+
 	} else
 		sd->nr_balance_failed = 0;
 
-- 
cgit v1.2.3


From 9924da434a13668fceb208d56dbdf86d166862cc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 19 Dec 2008 00:53:40 +0100
Subject: sched: fix warning in kernel/sched.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: fix cpumask conversion bug

this warning:

  kernel/sched.c: In function ‘find_busiest_group’:
  kernel/sched.c:3429: warning: passing argument 1 of ‘__first_cpu’ from incompatible pointer type

shows that we forgot to convert a new patch to the new cpumask APIs.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8fc0d5aa43b..ae5ca3f9e77 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3394,7 +3394,7 @@ out_balanced:
 		*imbalance = min_load_per_task;
 		if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
 			cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-					first_cpu(group_leader->cpumask);
+				cpumask_first(sched_group_cpus(group_leader));
 		}
 		return group_min;
 	}
-- 
cgit v1.2.3


From 3fe0313e6ec572e6bb3f9d247316a834336db4be Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Sun, 26 Oct 2008 20:50:26 +0100
Subject: Hibernate: Call platform_begin before swsusp_shrink_memory

Call platform_begin() before swsusp_shrink_memory() so that we can
always allocate enough memory to save the ACPI NVS region from
platform_begin().

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Acked-by: Nigel Cunningham <nigel@tuxonice.net>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 kernel/power/disk.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index c9d74083746..096fe4899ea 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -259,12 +259,12 @@ int hibernation_snapshot(int platform_mode)
 {
 	int error, ftrace_save;
 
-	/* Free memory before shutting down devices. */
-	error = swsusp_shrink_memory();
+	error = platform_begin(platform_mode);
 	if (error)
 		return error;
 
-	error = platform_begin(platform_mode);
+	/* Free memory before shutting down devices. */
+	error = swsusp_shrink_memory();
 	if (error)
 		goto Close;
 
-- 
cgit v1.2.3


From 3f4b0ef7f2899c91b1d6958779f084b44dd59d32 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 26 Oct 2008 20:52:15 +0100
Subject: ACPI hibernate: Add a mechanism to save/restore ACPI NVS memory

According to the ACPI Specification 3.0b, Section 15.3.2,
"OSPM will call the _PTS control method some time before entering a
sleeping state, to allow the platform's AML code to update this
memory image before entering the sleeping state. After the system
awakes from an S4 state, OSPM will restore this memory area and call
the _WAK control method to enable the BIOS to reclaim its memory
image."  For this reason, implement a mechanism allowing us to save
the NVS memory during hibernation and to restore it during the
subsequent resume.

Based on a patch by Zhang Rui.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Nigel Cunningham <nigel@tuxonice.net>
Cc: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 kernel/power/swsusp.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 023ff2a31d8..a92c9145155 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -262,3 +262,125 @@ int swsusp_shrink_memory(void)
 
 	return 0;
 }
+
+/*
+ * Platforms, like ACPI, may want us to save some memory used by them during
+ * hibernation and to restore the contents of this memory during the subsequent
+ * resume.  The code below implements a mechanism allowing us to do that.
+ */
+
+struct nvs_page {
+	unsigned long phys_start;
+	unsigned int size;
+	void *kaddr;
+	void *data;
+	struct list_head node;
+};
+
+static LIST_HEAD(nvs_list);
+
+/**
+ *	hibernate_nvs_register - register platform NVS memory region to save
+ *	@start - physical address of the region
+ *	@size - size of the region
+ *
+ *	The NVS region need not be page-aligned (both ends) and we arrange
+ *	things so that the data from page-aligned addresses in this region will
+ *	be copied into separate RAM pages.
+ */
+int hibernate_nvs_register(unsigned long start, unsigned long size)
+{
+	struct nvs_page *entry, *next;
+
+	while (size > 0) {
+		unsigned int nr_bytes;
+
+		entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
+		if (!entry)
+			goto Error;
+
+		list_add_tail(&entry->node, &nvs_list);
+		entry->phys_start = start;
+		nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
+		entry->size = (size < nr_bytes) ? size : nr_bytes;
+
+		start += entry->size;
+		size -= entry->size;
+	}
+	return 0;
+
+ Error:
+	list_for_each_entry_safe(entry, next, &nvs_list, node) {
+		list_del(&entry->node);
+		kfree(entry);
+	}
+	return -ENOMEM;
+}
+
+/**
+ *	hibernate_nvs_free - free data pages allocated for saving NVS regions
+ */
+void hibernate_nvs_free(void)
+{
+	struct nvs_page *entry;
+
+	list_for_each_entry(entry, &nvs_list, node)
+		if (entry->data) {
+			free_page((unsigned long)entry->data);
+			entry->data = NULL;
+			if (entry->kaddr) {
+				iounmap(entry->kaddr);
+				entry->kaddr = NULL;
+			}
+		}
+}
+
+/**
+ *	hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
+ */
+int hibernate_nvs_alloc(void)
+{
+	struct nvs_page *entry;
+
+	list_for_each_entry(entry, &nvs_list, node) {
+		entry->data = (void *)__get_free_page(GFP_KERNEL);
+		if (!entry->data) {
+			hibernate_nvs_free();
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
+/**
+ *	hibernate_nvs_save - save NVS memory regions
+ */
+void hibernate_nvs_save(void)
+{
+	struct nvs_page *entry;
+
+	printk(KERN_INFO "PM: Saving platform NVS memory\n");
+
+	list_for_each_entry(entry, &nvs_list, node)
+		if (entry->data) {
+			entry->kaddr = ioremap(entry->phys_start, entry->size);
+			memcpy(entry->data, entry->kaddr, entry->size);
+		}
+}
+
+/**
+ *	hibernate_nvs_restore - restore NVS memory regions
+ *
+ *	This function is going to be called with interrupts disabled, so it
+ *	cannot iounmap the virtual addresses used to access the NVS region.
+ */
+void hibernate_nvs_restore(void)
+{
+	struct nvs_page *entry;
+
+	printk(KERN_INFO "PM: Restoring platform NVS memory\n");
+
+	list_for_each_entry(entry, &nvs_list, node)
+		if (entry->data)
+			memcpy(entry->kaddr, entry->data, entry->size);
+}
-- 
cgit v1.2.3


From 69643279a88dea000ac2f858091d0e365f778245 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 11 Nov 2008 21:32:44 +0100
Subject: Hibernate: Do not oops on resume if image data are incorrect

During resume from hibernation using the userland interface image
data are being passed from the used space process to the kernel.
These data need not be valid, but currently the kernel sometimes
oopses if it gets invalid image data, which is wrong.  Make the
kernel return error codes to the user space in such cases.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 kernel/power/snapshot.c | 43 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 5d2ab836e99..955c8cc9183 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -519,6 +519,14 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
 	return test_bit(bit, addr);
 }
 
+static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
+{
+	void *addr;
+	unsigned int bit;
+
+	return !memory_bm_find_bit(bm, pfn, &addr, &bit);
+}
+
 /**
  *	memory_bm_next_pfn - find the pfn that corresponds to the next set bit
  *	in the bitmap @bm.  If the pfn cannot be found, BM_END_OF_MAP is
@@ -1459,9 +1467,7 @@ load_header(struct swsusp_info *info)
  *	unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
  *	the corresponding bit in the memory bitmap @bm
  */
-
-static inline void
-unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
+static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 {
 	int j;
 
@@ -1469,8 +1475,13 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 		if (unlikely(buf[j] == BM_END_OF_MAP))
 			break;
 
-		memory_bm_set_bit(bm, buf[j]);
+		if (memory_bm_pfn_present(bm, buf[j]))
+			memory_bm_set_bit(bm, buf[j]);
+		else
+			return -EFAULT;
 	}
+
+	return 0;
 }
 
 /* List of "safe" pages that may be used to store data loaded from the suspend
@@ -1608,7 +1619,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
 	pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
 	if (!pbe) {
 		swsusp_free();
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	}
 	pbe->orig_page = page;
 	if (safe_highmem_pages > 0) {
@@ -1677,7 +1688,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
 static inline void *
 get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
 {
-	return NULL;
+	return ERR_PTR(-EINVAL);
 }
 
 static inline void copy_last_highmem_page(void) {}
@@ -1788,8 +1799,13 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
 {
 	struct pbe *pbe;
-	struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
+	struct page *page;
+	unsigned long pfn = memory_bm_next_pfn(bm);
 
+	if (pfn == BM_END_OF_MAP)
+		return ERR_PTR(-EFAULT);
+
+	page = pfn_to_page(pfn);
 	if (PageHighMem(page))
 		return get_highmem_page_buffer(page, ca);
 
@@ -1805,7 +1821,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
 	pbe = chain_alloc(ca, sizeof(struct pbe));
 	if (!pbe) {
 		swsusp_free();
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	}
 	pbe->orig_address = page_address(page);
 	pbe->address = safe_pages_list;
@@ -1868,7 +1884,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 				return error;
 
 		} else if (handle->prev <= nr_meta_pages) {
-			unpack_orig_pfns(buffer, &copy_bm);
+			error = unpack_orig_pfns(buffer, &copy_bm);
+			if (error)
+				return error;
+
 			if (handle->prev == nr_meta_pages) {
 				error = prepare_image(&orig_bm, &copy_bm);
 				if (error)
@@ -1879,12 +1898,14 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 				restore_pblist = NULL;
 				handle->buffer = get_buffer(&orig_bm, &ca);
 				handle->sync_read = 0;
-				if (!handle->buffer)
-					return -ENOMEM;
+				if (IS_ERR(handle->buffer))
+					return PTR_ERR(handle->buffer);
 			}
 		} else {
 			copy_last_highmem_page();
 			handle->buffer = get_buffer(&orig_bm, &ca);
+			if (IS_ERR(handle->buffer))
+				return PTR_ERR(handle->buffer);
 			if (handle->buffer != buffer)
 				handle->sync_read = 0;
 		}
-- 
cgit v1.2.3


From 846705deb059c352cc0e5806d5964f815b8c6d98 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 26 Nov 2008 18:00:24 -0500
Subject: Hibernate: Take overlapping zones into account (rev. 2)

It has been requested to make hibernation work with memory
hotplugging enabled and for this purpose the hibernation code has to
be reworked to take the possible overlapping of zones into account.
Thus, rework the hibernation memory bitmaps code to prevent
duplication of PFNs from occuring and add checks to make sure that
one page frame will not be marked as saveable many times.

Additionally, use list.h lists instead of open-coded lists to
implement the memory bitmaps.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 kernel/power/snapshot.c | 325 +++++++++++++++++++++++++-----------------------
 1 file changed, 166 insertions(+), 159 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 955c8cc9183..ec9f153b2fc 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -25,6 +25,7 @@
 #include <linux/syscalls.h>
 #include <linux/console.h>
 #include <linux/highmem.h>
+#include <linux/list.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -192,12 +193,6 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
 	return ret;
 }
 
-static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
-{
-	free_list_of_pages(ca->chain, clear_page_nosave);
-	memset(ca, 0, sizeof(struct chain_allocator));
-}
-
 /**
  *	Data types related to memory bitmaps.
  *
@@ -233,7 +228,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
 #define BM_BITS_PER_BLOCK	(PAGE_SIZE << 3)
 
 struct bm_block {
-	struct bm_block *next;		/* next element of the list */
+	struct list_head hook;	/* hook into a list of bitmap blocks */
 	unsigned long start_pfn;	/* pfn represented by the first bit */
 	unsigned long end_pfn;	/* pfn represented by the last bit plus 1 */
 	unsigned long *data;	/* bitmap representing pages */
@@ -244,24 +239,15 @@ static inline unsigned long bm_block_bits(struct bm_block *bb)
 	return bb->end_pfn - bb->start_pfn;
 }
 
-struct zone_bitmap {
-	struct zone_bitmap *next;	/* next element of the list */
-	unsigned long start_pfn;	/* minimal pfn in this zone */
-	unsigned long end_pfn;		/* maximal pfn in this zone plus 1 */
-	struct bm_block *bm_blocks;	/* list of bitmap blocks */
-	struct bm_block *cur_block;	/* recently used bitmap block */
-};
-
 /* strcut bm_position is used for browsing memory bitmaps */
 
 struct bm_position {
-	struct zone_bitmap *zone_bm;
 	struct bm_block *block;
 	int bit;
 };
 
 struct memory_bitmap {
-	struct zone_bitmap *zone_bm_list;	/* list of zone bitmaps */
+	struct list_head blocks;	/* list of bitmap blocks */
 	struct linked_page *p_list;	/* list of pages used to store zone
 					 * bitmap objects and bitmap block
 					 * objects
@@ -273,11 +259,7 @@ struct memory_bitmap {
 
 static void memory_bm_position_reset(struct memory_bitmap *bm)
 {
-	struct zone_bitmap *zone_bm;
-
-	zone_bm = bm->zone_bm_list;
-	bm->cur.zone_bm = zone_bm;
-	bm->cur.block = zone_bm->bm_blocks;
+	bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook);
 	bm->cur.bit = 0;
 }
 
@@ -285,151 +267,184 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
 
 /**
  *	create_bm_block_list - create a list of block bitmap objects
+ *	@nr_blocks - number of blocks to allocate
+ *	@list - list to put the allocated blocks into
+ *	@ca - chain allocator to be used for allocating memory
  */
-
-static inline struct bm_block *
-create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca)
+static int create_bm_block_list(unsigned long pages,
+				struct list_head *list,
+				struct chain_allocator *ca)
 {
-	struct bm_block *bblist = NULL;
+	unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
 
 	while (nr_blocks-- > 0) {
 		struct bm_block *bb;
 
 		bb = chain_alloc(ca, sizeof(struct bm_block));
 		if (!bb)
-			return NULL;
-
-		bb->next = bblist;
-		bblist = bb;
+			return -ENOMEM;
+		list_add(&bb->hook, list);
 	}
-	return bblist;
+
+	return 0;
 }
 
+struct mem_extent {
+	struct list_head hook;
+	unsigned long start;
+	unsigned long end;
+};
+
 /**
- *	create_zone_bm_list - create a list of zone bitmap objects
+ *	free_mem_extents - free a list of memory extents
+ *	@list - list of extents to empty
  */
+static void free_mem_extents(struct list_head *list)
+{
+	struct mem_extent *ext, *aux;
+
+	list_for_each_entry_safe(ext, aux, list, hook) {
+		list_del(&ext->hook);
+		kfree(ext);
+	}
+}
 
-static inline struct zone_bitmap *
-create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca)
+/**
+ *	create_mem_extents - create a list of memory extents representing
+ *	                     contiguous ranges of PFNs
+ *	@list - list to put the extents into
+ *	@gfp_mask - mask to use for memory allocations
+ */
+static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
 {
-	struct zone_bitmap *zbmlist = NULL;
+	struct zone *zone;
 
-	while (nr_zones-- > 0) {
-		struct zone_bitmap *zbm;
+	INIT_LIST_HEAD(list);
 
-		zbm = chain_alloc(ca, sizeof(struct zone_bitmap));
-		if (!zbm)
-			return NULL;
+	for_each_zone(zone) {
+		unsigned long zone_start, zone_end;
+		struct mem_extent *ext, *cur, *aux;
+
+		if (!populated_zone(zone))
+			continue;
+
+		zone_start = zone->zone_start_pfn;
+		zone_end = zone->zone_start_pfn + zone->spanned_pages;
 
-		zbm->next = zbmlist;
-		zbmlist = zbm;
+		list_for_each_entry(ext, list, hook)
+			if (zone_start <= ext->end)
+				break;
+
+		if (&ext->hook == list || zone_end < ext->start) {
+			/* New extent is necessary */
+			struct mem_extent *new_ext;
+
+			new_ext = kzalloc(sizeof(struct mem_extent), gfp_mask);
+			if (!new_ext) {
+				free_mem_extents(list);
+				return -ENOMEM;
+			}
+			new_ext->start = zone_start;
+			new_ext->end = zone_end;
+			list_add_tail(&new_ext->hook, &ext->hook);
+			continue;
+		}
+
+		/* Merge this zone's range of PFNs with the existing one */
+		if (zone_start < ext->start)
+			ext->start = zone_start;
+		if (zone_end > ext->end)
+			ext->end = zone_end;
+
+		/* More merging may be possible */
+		cur = ext;
+		list_for_each_entry_safe_continue(cur, aux, list, hook) {
+			if (zone_end < cur->start)
+				break;
+			if (zone_end < cur->end)
+				ext->end = cur->end;
+			list_del(&cur->hook);
+			kfree(cur);
+		}
 	}
-	return zbmlist;
+
+	return 0;
 }
 
 /**
   *	memory_bm_create - allocate memory for a memory bitmap
   */
-
 static int
 memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
 {
 	struct chain_allocator ca;
-	struct zone *zone;
-	struct zone_bitmap *zone_bm;
-	struct bm_block *bb;
-	unsigned int nr;
+	struct list_head mem_extents;
+	struct mem_extent *ext;
+	int error;
 
 	chain_init(&ca, gfp_mask, safe_needed);
+	INIT_LIST_HEAD(&bm->blocks);
 
-	/* Compute the number of zones */
-	nr = 0;
-	for_each_zone(zone)
-		if (populated_zone(zone))
-			nr++;
-
-	/* Allocate the list of zones bitmap objects */
-	zone_bm = create_zone_bm_list(nr, &ca);
-	bm->zone_bm_list = zone_bm;
-	if (!zone_bm) {
-		chain_free(&ca, PG_UNSAFE_CLEAR);
-		return -ENOMEM;
-	}
-
-	/* Initialize the zone bitmap objects */
-	for_each_zone(zone) {
-		unsigned long pfn;
+	error = create_mem_extents(&mem_extents, gfp_mask);
+	if (error)
+		return error;
 
-		if (!populated_zone(zone))
-			continue;
+	list_for_each_entry(ext, &mem_extents, hook) {
+		struct bm_block *bb;
+		unsigned long pfn = ext->start;
+		unsigned long pages = ext->end - ext->start;
 
-		zone_bm->start_pfn = zone->zone_start_pfn;
-		zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages;
-		/* Allocate the list of bitmap block objects */
-		nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
-		bb = create_bm_block_list(nr, &ca);
-		zone_bm->bm_blocks = bb;
-		zone_bm->cur_block = bb;
-		if (!bb)
-			goto Free;
+		bb = list_entry(bm->blocks.prev, struct bm_block, hook);
 
-		nr = zone->spanned_pages;
-		pfn = zone->zone_start_pfn;
-		/* Initialize the bitmap block objects */
-		while (bb) {
-			unsigned long *ptr;
+		error = create_bm_block_list(pages, bm->blocks.prev, &ca);
+		if (error)
+			goto Error;
 
-			ptr = get_image_page(gfp_mask, safe_needed);
-			bb->data = ptr;
-			if (!ptr)
-				goto Free;
+		list_for_each_entry_continue(bb, &bm->blocks, hook) {
+			bb->data = get_image_page(gfp_mask, safe_needed);
+			if (!bb->data) {
+				error = -ENOMEM;
+				goto Error;
+			}
 
 			bb->start_pfn = pfn;
-			if (nr >= BM_BITS_PER_BLOCK) {
+			if (pages >= BM_BITS_PER_BLOCK) {
 				pfn += BM_BITS_PER_BLOCK;
-				nr -= BM_BITS_PER_BLOCK;
+				pages -= BM_BITS_PER_BLOCK;
 			} else {
 				/* This is executed only once in the loop */
-				pfn += nr;
+				pfn += pages;
 			}
 			bb->end_pfn = pfn;
-			bb = bb->next;
 		}
-		zone_bm = zone_bm->next;
 	}
+
 	bm->p_list = ca.chain;
 	memory_bm_position_reset(bm);
-	return 0;
+ Exit:
+	free_mem_extents(&mem_extents);
+	return error;
 
- Free:
+ Error:
 	bm->p_list = ca.chain;
 	memory_bm_free(bm, PG_UNSAFE_CLEAR);
-	return -ENOMEM;
+	goto Exit;
 }
 
 /**
   *	memory_bm_free - free memory occupied by the memory bitmap @bm
   */
-
 static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
 {
-	struct zone_bitmap *zone_bm;
+	struct bm_block *bb;
 
-	/* Free the list of bit blocks for each zone_bitmap object */
-	zone_bm = bm->zone_bm_list;
-	while (zone_bm) {
-		struct bm_block *bb;
+	list_for_each_entry(bb, &bm->blocks, hook)
+		if (bb->data)
+			free_image_page(bb->data, clear_nosave_free);
 
-		bb = zone_bm->bm_blocks;
-		while (bb) {
-			if (bb->data)
-				free_image_page(bb->data, clear_nosave_free);
-			bb = bb->next;
-		}
-		zone_bm = zone_bm->next;
-	}
 	free_list_of_pages(bm->p_list, clear_nosave_free);
-	bm->zone_bm_list = NULL;
+
+	INIT_LIST_HEAD(&bm->blocks);
 }
 
 /**
@@ -437,38 +452,33 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
  *	to given pfn.  The cur_zone_bm member of @bm and the cur_block member
  *	of @bm->cur_zone_bm are updated.
  */
-
 static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
 				void **addr, unsigned int *bit_nr)
 {
-	struct zone_bitmap *zone_bm;
 	struct bm_block *bb;
 
-	/* Check if the pfn is from the current zone */
-	zone_bm = bm->cur.zone_bm;
-	if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
-		zone_bm = bm->zone_bm_list;
-		/* We don't assume that the zones are sorted by pfns */
-		while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
-			zone_bm = zone_bm->next;
-
-			if (!zone_bm)
-				return -EFAULT;
-		}
-		bm->cur.zone_bm = zone_bm;
-	}
-	/* Check if the pfn corresponds to the current bitmap block */
-	bb = zone_bm->cur_block;
+	/*
+	 * Check if the pfn corresponds to the current bitmap block and find
+	 * the block where it fits if this is not the case.
+	 */
+	bb = bm->cur.block;
 	if (pfn < bb->start_pfn)
-		bb = zone_bm->bm_blocks;
+		list_for_each_entry_continue_reverse(bb, &bm->blocks, hook)
+			if (pfn >= bb->start_pfn)
+				break;
 
-	while (pfn >= bb->end_pfn) {
-		bb = bb->next;
+	if (pfn >= bb->end_pfn)
+		list_for_each_entry_continue(bb, &bm->blocks, hook)
+			if (pfn >= bb->start_pfn && pfn < bb->end_pfn)
+				break;
 
-		BUG_ON(!bb);
-	}
-	zone_bm->cur_block = bb;
+	if (&bb->hook == &bm->blocks)
+		return -EFAULT;
+
+	/* The block has been found */
+	bm->cur.block = bb;
 	pfn -= bb->start_pfn;
+	bm->cur.bit = pfn + 1;
 	*bit_nr = pfn;
 	*addr = bb->data;
 	return 0;
@@ -538,29 +548,21 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
 
 static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
 {
-	struct zone_bitmap *zone_bm;
 	struct bm_block *bb;
 	int bit;
 
+	bb = bm->cur.block;
 	do {
-		bb = bm->cur.block;
-		do {
-			bit = bm->cur.bit;
-			bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
-			if (bit < bm_block_bits(bb))
-				goto Return_pfn;
-
-			bb = bb->next;
-			bm->cur.block = bb;
-			bm->cur.bit = 0;
-		} while (bb);
-		zone_bm = bm->cur.zone_bm->next;
-		if (zone_bm) {
-			bm->cur.zone_bm = zone_bm;
-			bm->cur.block = zone_bm->bm_blocks;
-			bm->cur.bit = 0;
-		}
-	} while (zone_bm);
+		bit = bm->cur.bit;
+		bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
+		if (bit < bm_block_bits(bb))
+			goto Return_pfn;
+
+		bb = list_entry(bb->hook.next, struct bm_block, hook);
+		bm->cur.block = bb;
+		bm->cur.bit = 0;
+	} while (&bb->hook != &bm->blocks);
+
 	memory_bm_position_reset(bm);
 	return BM_END_OF_MAP;
 
@@ -816,8 +818,7 @@ static unsigned int count_free_highmem_pages(void)
  *	We should save the page if it isn't Nosave or NosaveFree, or Reserved,
  *	and it isn't a part of a free chunk of pages.
  */
-
-static struct page *saveable_highmem_page(unsigned long pfn)
+static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
 {
 	struct page *page;
 
@@ -825,6 +826,8 @@ static struct page *saveable_highmem_page(unsigned long pfn)
 		return NULL;
 
 	page = pfn_to_page(pfn);
+	if (page_zone(page) != zone)
+		return NULL;
 
 	BUG_ON(!PageHighMem(page));
 
@@ -854,13 +857,16 @@ unsigned int count_highmem_pages(void)
 		mark_free_pages(zone);
 		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
-			if (saveable_highmem_page(pfn))
+			if (saveable_highmem_page(zone, pfn))
 				n++;
 	}
 	return n;
 }
 #else
-static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
+static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
+{
+	return NULL;
+}
 #endif /* CONFIG_HIGHMEM */
 
 /**
@@ -871,8 +877,7 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
  *	of pages statically defined as 'unsaveable', and it isn't a part of
  *	a free chunk of pages.
  */
-
-static struct page *saveable_page(unsigned long pfn)
+static struct page *saveable_page(struct zone *zone, unsigned long pfn)
 {
 	struct page *page;
 
@@ -880,6 +885,8 @@ static struct page *saveable_page(unsigned long pfn)
 		return NULL;
 
 	page = pfn_to_page(pfn);
+	if (page_zone(page) != zone)
+		return NULL;
 
 	BUG_ON(PageHighMem(page));
 
@@ -911,7 +918,7 @@ unsigned int count_data_pages(void)
 		mark_free_pages(zone);
 		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
-			if(saveable_page(pfn))
+			if (saveable_page(zone, pfn))
 				n++;
 	}
 	return n;
@@ -952,7 +959,7 @@ static inline struct page *
 page_is_saveable(struct zone *zone, unsigned long pfn)
 {
 	return is_highmem(zone) ?
-			saveable_highmem_page(pfn) : saveable_page(pfn);
+		saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn);
 }
 
 static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
@@ -983,7 +990,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
 	}
 }
 #else
-#define page_is_saveable(zone, pfn)	saveable_page(pfn)
+#define page_is_saveable(zone, pfn)	saveable_page(zone, pfn)
 
 static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
 {
-- 
cgit v1.2.3


From baa5835df10254762aedb6cb23a9c1508f969736 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 8 Dec 2008 00:52:49 +0100
Subject: Hibernate: Replace unnecessary evaluation of pfn_to_page()

Replace one evaluation of pfn_to_page() in copy_data_pages() with
the value of a local variable containing the right number already.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 kernel/power/snapshot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index ec9f153b2fc..f5fc2d7680f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -981,7 +981,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
 			 * data modified by kmap_atomic()
 			 */
 			safe_copy_page(buffer, s_page);
-			dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0);
+			dst = kmap_atomic(d_page, KM_USER0);
 			memcpy(dst, buffer, PAGE_SIZE);
 			kunmap_atomic(dst, KM_USER0);
 		} else {
-- 
cgit v1.2.3


From 213cc060797378059a28ebc5c539f3e9a80160bd Mon Sep 17 00:00:00 2001
From: Pekka J Enberg <penberg@cs.helsinki.fi>
Date: Fri, 19 Dec 2008 12:08:39 +0200
Subject: ftrace: introduce tracing_reset_online_cpus() helper

Impact: cleanup

This patch factors out common code from multiple tracers into a
tracing_reset_online_cpus() function and converts the tracers to use it.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c              | 10 ++++++++++
 kernel/trace/trace.h              |  1 +
 kernel/trace/trace_boot.c         | 12 +-----------
 kernel/trace/trace_functions.c    | 14 ++------------
 kernel/trace/trace_hw_branches.c  | 14 ++------------
 kernel/trace/trace_mmiotrace.c    |  6 +-----
 kernel/trace/trace_sched_switch.c | 14 ++------------
 kernel/trace/trace_sysprof.c      | 12 +-----------
 8 files changed, 20 insertions(+), 63 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0eb6d48347f..79db26e8216 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -679,6 +679,16 @@ void tracing_reset(struct trace_array *tr, int cpu)
 	ftrace_enable_cpu();
 }
 
+void tracing_reset_online_cpus(struct trace_array *tr)
+{
+	int cpu;
+
+	tr->time_start = ftrace_now(tr->cpu);
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr, cpu);
+}
+
 #define SAVED_CMDLINES 128
 static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
 static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fc75dce7a66..cc7a4f86403 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -374,6 +374,7 @@ struct trace_iterator {
 int tracing_is_enabled(void);
 void trace_wake_up(void);
 void tracing_reset(struct trace_array *tr, int cpu);
+void tracing_reset_online_cpus(struct trace_array *tr);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 struct dentry *tracing_init_dentry(void);
 void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index a4fa2c57e34..3ccebde2848 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -37,16 +37,6 @@ void disable_boot_trace(void)
 		tracing_stop_sched_switch_record();
 }
 
-static void reset_boot_trace(struct trace_array *tr)
-{
-	int cpu;
-
-	tr->time_start = ftrace_now(tr->cpu);
-
-	for_each_online_cpu(cpu)
-		tracing_reset(tr, cpu);
-}
-
 static int boot_trace_init(struct trace_array *tr)
 {
 	int cpu;
@@ -130,7 +120,7 @@ struct tracer boot_tracer __read_mostly =
 {
 	.name		= "initcall",
 	.init		= boot_trace_init,
-	.reset		= reset_boot_trace,
+	.reset		= tracing_reset_online_cpus,
 	.print_line	= initcall_print_line,
 };
 
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index e74f6d0a321..9236d7e25a1 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -16,20 +16,10 @@
 
 #include "trace.h"
 
-static void function_reset(struct trace_array *tr)
-{
-	int cpu;
-
-	tr->time_start = ftrace_now(tr->cpu);
-
-	for_each_online_cpu(cpu)
-		tracing_reset(tr, cpu);
-}
-
 static void start_function_trace(struct trace_array *tr)
 {
 	tr->cpu = get_cpu();
-	function_reset(tr);
+	tracing_reset_online_cpus(tr);
 	put_cpu();
 
 	tracing_start_cmdline_record();
@@ -55,7 +45,7 @@ static void function_trace_reset(struct trace_array *tr)
 
 static void function_trace_start(struct trace_array *tr)
 {
-	function_reset(tr);
+	tracing_reset_online_cpus(tr);
 }
 
 static struct tracer function_trace __read_mostly =
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index ee29e012aa9..b6a3e20a49a 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -25,16 +25,6 @@ static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
 #define this_buffer per_cpu(buffer, smp_processor_id())
 
 
-static void bts_trace_reset(struct trace_array *tr)
-{
-	int cpu;
-
-	tr->time_start = ftrace_now(tr->cpu);
-
-	for_each_online_cpu(cpu)
-		tracing_reset(tr, cpu);
-}
-
 static void bts_trace_start_cpu(void *arg)
 {
 	if (this_tracer)
@@ -54,7 +44,7 @@ static void bts_trace_start(struct trace_array *tr)
 {
 	int cpu;
 
-	bts_trace_reset(tr);
+	tracing_reset_online_cpus(tr);
 
 	for_each_cpu_mask(cpu, cpu_possible_map)
 		smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
@@ -78,7 +68,7 @@ static void bts_trace_stop(struct trace_array *tr)
 
 static int bts_trace_init(struct trace_array *tr)
 {
-	bts_trace_reset(tr);
+	tracing_reset_online_cpus(tr);
 	bts_trace_start(tr);
 
 	return 0;
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 2fb6da6523b..fffcb069f1d 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -22,14 +22,10 @@ static unsigned long prev_overruns;
 
 static void mmio_reset_data(struct trace_array *tr)
 {
-	int cpu;
-
 	overrun_detected = false;
 	prev_overruns = 0;
-	tr->time_start = ftrace_now(tr->cpu);
 
-	for_each_online_cpu(cpu)
-		tracing_reset(tr, cpu);
+	tracing_reset_online_cpus(tr);
 }
 
 static int mmio_trace_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 781d72ef873..add2c1fdae9 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -72,16 +72,6 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee)
 	local_irq_restore(flags);
 }
 
-static void sched_switch_reset(struct trace_array *tr)
-{
-	int cpu;
-
-	tr->time_start = ftrace_now(tr->cpu);
-
-	for_each_online_cpu(cpu)
-		tracing_reset(tr, cpu);
-}
-
 static int tracing_sched_register(void)
 {
 	int ret;
@@ -197,7 +187,7 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
 
 static void start_sched_trace(struct trace_array *tr)
 {
-	sched_switch_reset(tr);
+	tracing_reset_online_cpus(tr);
 	tracing_start_sched_switch_record();
 }
 
@@ -221,7 +211,7 @@ static void sched_switch_trace_reset(struct trace_array *tr)
 
 static void sched_switch_trace_start(struct trace_array *tr)
 {
-	sched_switch_reset(tr);
+	tracing_reset_online_cpus(tr);
 	tracing_start_sched_switch();
 }
 
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 54960edb96d..01becf1f19f 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -234,20 +234,10 @@ static void stop_stack_timers(void)
 		stop_stack_timer(cpu);
 }
 
-static void stack_reset(struct trace_array *tr)
-{
-	int cpu;
-
-	tr->time_start = ftrace_now(tr->cpu);
-
-	for_each_online_cpu(cpu)
-		tracing_reset(tr, cpu);
-}
-
 static void start_stack_trace(struct trace_array *tr)
 {
 	mutex_lock(&sample_timer_lock);
-	stack_reset(tr);
+	tracing_reset_online_cpus(tr);
 	start_stack_timers();
 	tracer_enabled = 1;
 	mutex_unlock(&sample_timer_lock);
-- 
cgit v1.2.3


From 9bb482476c6c9d1ae033306440c51ceac93ea80c Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Tue, 16 Dec 2008 11:30:08 +0000
Subject: allow stripping of generated symbols under CONFIG_KALLSYMS_ALL

Building upon parts of the module stripping patch, this patch
introduces similar stripping for vmlinux when CONFIG_KALLSYMS_ALL=y.
Using CONFIG_KALLSYMS_STRIP_GENERATED reduces the overhead of
CONFIG_KALLSYMS_ALL from 245k/310k to 65k/80k for the (i386/x86-64)
kernels I tested with.

The patch also does away with the need to special case the kallsyms-
internal symbols by making them available even in the first linking
stage.

While it is a generated file, the patch includes the changes to
scripts/genksyms/keywords.c_shipped, as I'm unsure what the procedure
here is.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
---
 kernel/kallsyms.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 7b8b0f21a5b..e694afa0eb8 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,20 +30,19 @@
 #define all_var 0
 #endif
 
-/* These will be re-linked against their real values during the second link stage */
-extern const unsigned long kallsyms_addresses[] __attribute__((weak));
-extern const u8 kallsyms_names[] __attribute__((weak));
+extern const unsigned long kallsyms_addresses[];
+extern const u8 kallsyms_names[];
 
 /* tell the compiler that the count isn't in the small data section if the arch
  * has one (eg: FRV)
  */
 extern const unsigned long kallsyms_num_syms
-__attribute__((weak, section(".rodata")));
+	__attribute__((__section__(".rodata")));
 
-extern const u8 kallsyms_token_table[] __attribute__((weak));
-extern const u16 kallsyms_token_index[] __attribute__((weak));
+extern const u8 kallsyms_token_table[];
+extern const u16 kallsyms_token_index[];
 
-extern const unsigned long kallsyms_markers[] __attribute__((weak));
+extern const unsigned long kallsyms_markers[];
 
 static inline int is_kernel_inittext(unsigned long addr)
 {
@@ -168,9 +167,6 @@ static unsigned long get_symbol_pos(unsigned long addr,
 	unsigned long symbol_start = 0, symbol_end = 0;
 	unsigned long i, low, high, mid;
 
-	/* This kernel should never had been booted. */
-	BUG_ON(!kallsyms_addresses);
-
 	/* do a binary search on the sorted kallsyms_addresses array */
 	low = 0;
 	high = kallsyms_num_syms;
-- 
cgit v1.2.3


From 26cc271db798cf211d35967cbfbb53e997126b84 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Fri, 19 Dec 2008 10:23:03 -0800
Subject: printk: fix discarding message when recursion_bug

Impact: fix truncated recursion bug message printout

When recursion_bug is true, kernel discards original message because printk_buf
contains recursion_bug_msg with NULL terminator. The sizeof(recursion_bug_msg)
makes this, use strlen() to get correct length without NULL terminator.

Reported-by: Toshikazu Nakayama <nakayama.ts@ncos.nec.co.jp>
Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index f492f1583d7..e651ab05655 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -662,7 +662,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 	if (recursion_bug) {
 		recursion_bug = 0;
 		strcpy(printk_buf, recursion_bug_msg);
-		printed_len = sizeof(recursion_bug_msg);
+		printed_len = strlen(recursion_bug_msg);
 	}
 	/* Emit the output into the temporary buffer */
 	printed_len += vscnprintf(printk_buf + printed_len,
-- 
cgit v1.2.3


From bf53de907dfdaac178c92d774aae7370d7b97d20 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 19 Dec 2008 15:10:24 +0100
Subject: x86, bts: add fork and exit handling

Impact: introduce new ptrace facility

Add arch_ptrace_untrace() function that is called when the tracer
detaches (either voluntarily or when the tracing task dies);
ptrace_disable() is only called on a voluntary detach.

Add ptrace_fork() and arch_ptrace_fork(). They are called when a
traced task is forked.

Clear DS and BTS related fields on fork.

Release DS resources and reclaim memory in ptrace_untrace(). This
releases resources already when the tracing task dies. We used to do
that when the traced task dies.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/fork.c   |  2 ++
 kernel/ptrace.c | 12 ++++++++++++
 2 files changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 7b93da72d4a..65ce60adc8e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1096,6 +1096,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_DEBUG_MUTEXES
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
+	if (unlikely(ptrace_reparented(current)))
+		ptrace_fork(p, clone_flags);
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 4c8bcd7dd8e..100a71cfdab 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -25,6 +25,17 @@
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
 
+
+/*
+ * Initialize a new task whose father had been ptraced.
+ *
+ * Called from copy_process().
+ */
+void ptrace_fork(struct task_struct *child, unsigned long clone_flags)
+{
+	arch_ptrace_fork(child, clone_flags);
+}
+
 /*
  * ptrace a task: make the debugger its new parent and
  * move it to the ptrace list.
@@ -72,6 +83,7 @@ void __ptrace_unlink(struct task_struct *child)
 	child->parent = child->real_parent;
 	list_del_init(&child->ptrace_entry);
 
+	arch_ptrace_untrace(child);
 	if (task_is_traced(child))
 		ptrace_untrace(child);
 }
-- 
cgit v1.2.3


From a8ccf1d6f60e3e6ae63122e02378cd4d40dd4aac Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 23 Dec 2008 11:32:24 -0500
Subject: ring-buffer: fix dangling commit race

Impact: fix stuck trace-buffers

If an interrupt comes in during the rb_set_commit_to_write and
pushes the tail page forward just at the right time, the commit
updates will miss the adding of the interrupt data. This will
cause the commit pointer to cease from moving forward.

Thanks to Jiaying Zhang for finding this race.

Reported-by: Jiaying Zhang <jiayingz@google.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bb6922a931b..d03f4f44a82 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -838,6 +838,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
 	 * back to us). This allows us to do a simple loop to
 	 * assign the commit to the tail.
 	 */
+ again:
 	while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
 		cpu_buffer->commit_page->page->commit =
 			cpu_buffer->commit_page->write;
@@ -853,6 +854,17 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
 			cpu_buffer->commit_page->write;
 		barrier();
 	}
+
+	/* again, keep gcc from optimizing */
+	barrier();
+
+	/*
+	 * If an interrupt came in just after the first while loop
+	 * and pushed the tail page forward, we will be left with
+	 * a dangling commit that will never go forward.
+	 */
+	if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
+		goto again;
 }
 
 static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
-- 
cgit v1.2.3


From 98db8df777438e16ad0f44a0fba05ebbdb73db8d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 23 Dec 2008 11:32:25 -0500
Subject: ring-buffer: prevent false positive warning

Impact: eliminate false WARN_ON message

If an interrupt goes off after the setting of the local variable
tail_page and before incrementing the write index of that page,
the interrupt could push the commit forward to the next page.

Later a check is made to see if interrupts pushed the buffer around
the entire ring buffer by comparing the next page to the last commited
page. This can produce a false positive if the interrupt had pushed
the commit page forward as stated above.

Thanks to Jiaying Zhang for finding this race.

Reported-by: Jiaying Zhang <jiayingz@google.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d03f4f44a82..76f34c0ef29 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -962,12 +962,15 @@ static struct ring_buffer_event *
 __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		  unsigned type, unsigned long length, u64 *ts)
 {
-	struct buffer_page *tail_page, *head_page, *reader_page;
+	struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
 	unsigned long tail, write;
 	struct ring_buffer *buffer = cpu_buffer->buffer;
 	struct ring_buffer_event *event;
 	unsigned long flags;
 
+	commit_page = cpu_buffer->commit_page;
+	/* we just need to protect against interrupts */
+	barrier();
 	tail_page = cpu_buffer->tail_page;
 	write = local_add_return(length, &tail_page->write);
 	tail = write - length;
@@ -993,7 +996,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		 * it all the way around the buffer, bail, and warn
 		 * about it.
 		 */
-		if (unlikely(next_page == cpu_buffer->commit_page)) {
+		if (unlikely(next_page == commit_page)) {
 			WARN_ON_ONCE(1);
 			goto out_unlock;
 		}
-- 
cgit v1.2.3


From 36dffab679c7eeb91c2507400cf4da6e9e01164e Mon Sep 17 00:00:00 2001
From: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Date: Sat, 20 Dec 2008 10:06:38 +0530
Subject: sched: nominate preferred wakeup cpu, fix

Andrew Morton reported:

> kernel/sched.c: In function 'schedule':
> kernel/sched.c:3679: warning: 'active_balance' may be used uninitialized in this function
>
> This warning is correct - the code is buggy.

In sched.c load_balance_newidle, there's real potential use of
uninitialised variable - fix it.

Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ae5ca3f9e77..756d981d91a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3670,7 +3670,7 @@ redo:
 	}
 
 	if (!ld_moved) {
-		int active_balance;
+		int active_balance = 0;
 
 		schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
 		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-- 
cgit v1.2.3


From 12d79bafb75639f406a9f71aab94808c414c836e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 25 Dec 2008 09:31:28 +0100
Subject: rcu: provide RCU options on non-preempt architectures too

Impact: build fix

Some old architectures still do not use kernel/Kconfig.preempt, so the
moving of the RCU options there broke their build:

 In file included from /home/mingo/tip/include/linux/sem.h:81,
                 from /home/mingo/tip/include/linux/sched.h:69,
                 from /home/mingo/tip/arch/alpha/kernel/asm-offsets.c:9:
 /home/mingo/tip/include/linux/rcupdate.h:62:2: error: #error "Unknown RCU implementation specified to kernel configuration"

Move these options back to init/Kconfig, which every architecture
includes.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Kconfig.preempt | 75 --------------------------------------------------
 1 file changed, 75 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 463f29743ea..bf987b95b35 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,78 +52,3 @@ config PREEMPT
 
 endchoice
 
-choice
-	prompt "RCU Implementation"
-	default CLASSIC_RCU
-
-config CLASSIC_RCU
-	bool "Classic RCU"
-	help
-	  This option selects the classic RCU implementation that is
-	  designed for best read-side performance on non-realtime
-	  systems.
-	  
-	  Select this option if you are unsure.
-
-config TREE_RCU
-	bool "Tree-based hierarchical RCU"
-	help
-	  This option selects the RCU implementation that is
-	  designed for very large SMP system with hundreds or
-	  thousands of CPUs.
-
-config PREEMPT_RCU
-	bool "Preemptible RCU"
-	depends on PREEMPT
-	help
-	  This option reduces the latency of the kernel by making certain
-	  RCU sections preemptible. Normally RCU code is non-preemptible, if
-	  this option is selected then read-only RCU sections become
-	  preemptible. This helps latency, but may expose bugs due to
-	  now-naive assumptions about each RCU read-side critical section
-	  remaining on a given CPU through its execution.
-
-endchoice
-
-config RCU_TRACE
-	bool "Enable tracing for RCU"
-	depends on TREE_RCU || PREEMPT_RCU
-	help
-	  This option provides tracing in RCU which presents stats
-	  in debugfs for debugging RCU implementation.
-
-	  Say Y here if you want to enable RCU tracing
-	  Say N if you are unsure.
-
-config RCU_FANOUT
-	int "Tree-based hierarchical RCU fanout value"
-	range 2 64 if 64BIT
-	range 2 32 if !64BIT
-	depends on TREE_RCU
-	default 64 if 64BIT
-	default 32 if !64BIT
-	help
-	  This option controls the fanout of hierarchical implementations
-	  of RCU, allowing RCU to work efficiently on machines with
-	  large numbers of CPUs.  This value must be at least the cube
-	  root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
-	  systems and up to 262,144 for 64-bit systems.
-
-	  Select a specific number if testing RCU itself.
-	  Take the default if unsure.
-
-config RCU_FANOUT_EXACT
-	bool "Disable tree-based hierarchical RCU auto-balancing"
-	depends on TREE_RCU
-	default n
-	help
-	  This option forces use of the exact RCU_FANOUT value specified,
-	  regardless of imbalances in the hierarchy.  This is useful for
-	  testing RCU itself, and might one day be useful on systems with
-	  strong NUMA behavior.
-
-	  Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
-
-	  Say n if unsure.
-
-	
-- 
cgit v1.2.3


From 9212ddb5eada64fec5a08b28207401f3cc3d0876 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 25 Dec 2008 11:21:20 +0100
Subject: stacktrace: provide save_stack_trace_tsk() weak alias

Impact: build fix

Some architectures have not implemented save_stack_trace_tsk() yet:

  fs/built-in.o: In function `proc_pid_stack':
  base.c:(.text+0x3f140): undefined reference to `save_stack_trace_tsk'

So warn about that if the facility is used.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/stacktrace.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 94b527ef1d1..eb212f8f8bc 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -6,6 +6,7 @@
  *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  */
 #include <linux/sched.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/stacktrace.h>
@@ -24,3 +25,13 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
 }
 EXPORT_SYMBOL_GPL(print_stack_trace);
 
+/*
+ * Architectures that do not implement save_stack_trace_tsk get this
+ * weak alias and a once-per-bootup warning (whenever this facility
+ * is utilized - for example by procfs):
+ */
+__weak void
+save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
+{
+	WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n");
+}
-- 
cgit v1.2.3


From 468a15bb4cc61694495cc5ed7ffca29e87c79b69 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 16 Dec 2008 08:07:03 +0100
Subject: sched, trace: update trace_sched_wakeup()

Impact: extend the wakeup tracepoint with the info whether the wakeup was real

Add the information needed to distinguish 'real' wakeups from 'false'
wakeups.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c                    | 2 +-
 kernel/trace/trace_sched_switch.c | 2 +-
 kernel/trace/trace_sched_wakeup.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ceda5799466..dcb39bc88f6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2324,7 +2324,7 @@ out_activate:
 	success = 1;
 
 out_running:
-	trace_sched_wakeup(rq, p);
+	trace_sched_wakeup(rq, p, success);
 	check_preempt_curr(rq, p, sync);
 
 	p->state = TASK_RUNNING;
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index add2c1fdae9..df175cb4564 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -49,7 +49,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
 }
 
 static void
-probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee)
+probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
 {
 	struct trace_array_cpu *data;
 	unsigned long flags;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0067b49746c..43586b689e3 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -211,7 +211,7 @@ static void wakeup_reset(struct trace_array *tr)
 }
 
 static void
-probe_wakeup(struct rq *rq, struct task_struct *p)
+probe_wakeup(struct rq *rq, struct task_struct *p, int success)
 {
 	int cpu = smp_processor_id();
 	unsigned long flags;
-- 
cgit v1.2.3


From 51bc39f4ba35bae153b32145077fb1109bcae14c Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Fri, 26 Dec 2008 12:23:00 +0900
Subject: hrtimer: remove #include <linux/irq.h>

Impact: cleanup

<linux/irq.h> can be removed and should be, because:

  - hrtimer doesn't use any irq feature.
  - <linux/irq.h> shouldn't be include from generic code.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hrtimer.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 47e63349d1b..0ad3f3d6d10 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -32,7 +32,6 @@
  */
 
 #include <linux/cpu.h>
-#include <linux/irq.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/hrtimer.h>
-- 
cgit v1.2.3


From f9af0e70911e9d6cc9a68f784dca86415486084d Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Fri, 26 Dec 2008 12:24:24 +0900
Subject: irq: for_each_irq_desc() move to irqnr.h

Impact: cleanup

before CONFIG_SPARSE_IRQ age, for_each_irq_desc() sat in irqnr.h and
could be called from generic code.

CONFIG_SPARSE_IRQ breaks this assumption, but SPARSE_IRQ version
for_each_irq_desc() also can move into irqnr.h easily.

Also, this patch unifies CONFIG_SPARSE_IRQ and !CONFIG_SPARSE_IRQ
for_each_irq_desc().

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/handle.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 6492400cb50..4db7d2df86b 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -203,7 +203,7 @@ out_unlock:
 	return desc;
 }
 
-#else
+#else /* !CONFIG_SPARSE_IRQ */
 
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	[0 ... NR_IRQS-1] = {
@@ -218,7 +218,16 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	}
 };
 
-#endif
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	return (irq < NR_IRQS) ? irq_desc + irq : NULL;
+}
+
+struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+	return irq_to_desc(irq);
+}
+#endif /* !CONFIG_SPARSE_IRQ */
 
 /*
  * What should we do if we get a hw irq event on an illegal vector?
-- 
cgit v1.2.3


From 26ddd8d5cac8a563953d5febe8c6e40909f7bce1 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Fri, 26 Dec 2008 14:24:10 +0900
Subject: proc: remove ifdef CONFIG_SPARSE_IRQ from stat.c

Impact: cleanup

irq_desc can be NULL when CONFIG_SPARSE_IRQ=y only.
therefore, NULL checking can move into kstat_irqs_cpu() of SPARSE_IRQ version.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: "Yinghai Lu" <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/handle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 4db7d2df86b..03479dfdebb 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -448,7 +448,7 @@ void early_init_irq_lock_class(void)
 unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
-	return desc->kstat_irqs[cpu];
+	return desc ? desc->kstat_irqs[cpu] : 0;
 }
 #endif
 EXPORT_SYMBOL(kstat_irqs_cpu);
-- 
cgit v1.2.3


From 18eefedfe8ad33e8fc7614c13359e29a9fab4644 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Fri, 26 Dec 2008 12:29:48 +0900
Subject: irq: simplify for_each_irq_desc() usage

Impact: cleanup

all for_each_irq_desc() usage point have !desc check.
then its check can move into for_each_irq_desc() macro.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/autoprobe.c | 15 ---------------
 kernel/irq/handle.c    |  3 ---
 kernel/irq/spurious.c  |  5 -----
 3 files changed, 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 650ce4102a6..cc0f7321b8c 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -40,9 +40,6 @@ unsigned long probe_irq_on(void)
 	 * flush such a longstanding irq before considering it as spurious.
 	 */
 	for_each_irq_desc_reverse(i, desc) {
-		if (!desc)
-			continue;
-
 		spin_lock_irq(&desc->lock);
 		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
 			/*
@@ -71,9 +68,6 @@ unsigned long probe_irq_on(void)
 	 * happened in the previous stage, it may have masked itself)
 	 */
 	for_each_irq_desc_reverse(i, desc) {
-		if (!desc)
-			continue;
-
 		spin_lock_irq(&desc->lock);
 		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
 			desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
@@ -92,9 +86,6 @@ unsigned long probe_irq_on(void)
 	 * Now filter out any obviously spurious interrupts
 	 */
 	for_each_irq_desc(i, desc) {
-		if (!desc)
-			continue;
-
 		spin_lock_irq(&desc->lock);
 		status = desc->status;
 
@@ -133,9 +124,6 @@ unsigned int probe_irq_mask(unsigned long val)
 	int i;
 
 	for_each_irq_desc(i, desc) {
-		if (!desc)
-			continue;
-
 		spin_lock_irq(&desc->lock);
 		status = desc->status;
 
@@ -178,9 +166,6 @@ int probe_irq_off(unsigned long val)
 	unsigned int status;
 
 	for_each_irq_desc(i, desc) {
-		if (!desc)
-			continue;
-
 		spin_lock_irq(&desc->lock);
 		status = desc->status;
 
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 03479dfdebb..7dbdfe52469 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -437,9 +437,6 @@ void early_init_irq_lock_class(void)
 	int i;
 
 	for_each_irq_desc(i, desc) {
-		if (!desc)
-			continue;
-
 		lockdep_set_class(&desc->lock, &irq_desc_lock_class);
 	}
 }
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 3738107531f..dd364c11e56 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -91,9 +91,6 @@ static int misrouted_irq(int irq)
 	int i, ok = 0;
 
 	for_each_irq_desc(i, desc) {
-		if (!desc)
-			continue;
-
 		if (!i)
 			 continue;
 
@@ -115,8 +112,6 @@ static void poll_spurious_irqs(unsigned long dummy)
 	for_each_irq_desc(i, desc) {
 		unsigned int status;
 
-		if (!desc)
-			continue;
 		if (!i)
 			 continue;
 
-- 
cgit v1.2.3


From 00c23634879062d1c38d60128bf150c394a359e8 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Tue, 23 Dec 2008 17:29:00 -0800
Subject: sparseirq: remove duplicated arch_early_irq_init()

Impact: clean up

We already have a weak copy of this function in init/main.c

Signed-off-by: Yinghai <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/handle.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 7dbdfe52469..06b05a4d300 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -56,10 +56,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
 int nr_irqs = NR_IRQS;
 EXPORT_SYMBOL_GPL(nr_irqs);
 
-void __init __attribute__((weak)) arch_early_irq_init(void)
-{
-}
-
 #ifdef CONFIG_SPARSE_IRQ
 static struct irq_desc irq_desc_init = {
 	.irq	    = -1,
-- 
cgit v1.2.3


From be4d638c1597580ed2294d899d9f1a2cd10e462c Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 26 Dec 2008 22:23:43 +1030
Subject: cpumask: Replace cpu_coregroup_map with cpu_coregroup_mask

cpu_coregroup_map returned a cpumask_t: it's going away.

(Note, the sched part of this patch won't apply meaningfully to the
sched tree, but I'm posting it to show the goal).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Ingo Molnar <mingo@redhat.com>
---
 kernel/sched.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index d2d16d1273b..42929239830 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7119,7 +7119,7 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
 {
 	int group;
 #ifdef CONFIG_SCHED_MC
-	*mask = cpu_coregroup_map(cpu);
+	*mask = *cpu_coregroup_mask(cpu);
 	cpus_and(*mask, *mask, *cpu_map);
 	group = first_cpu(*mask);
 #elif defined(CONFIG_SCHED_SMT)
@@ -7485,7 +7485,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		sd = &per_cpu(core_domains, i);
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
-		sd->span = cpu_coregroup_map(i);
+		sd->span = *cpu_coregroup_mask(i);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7528,7 +7528,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SCHED_CPUMASK_VAR(this_core_map, allmasks);
 		SCHED_CPUMASK_VAR(send_covered, allmasks);
 
-		*this_core_map = cpu_coregroup_map(i);
+		*this_core_map = *cpu_coregroup_mask(i);
 		cpus_and(*this_core_map, *this_core_map, *cpu_map);
 		if (i != first_cpu(*this_core_map))
 			continue;
-- 
cgit v1.2.3


From 8b07cd44511f3aa78dd912cca6493275a6787dc5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 26 Dec 2008 19:10:04 +0100
Subject: sparseirq: do not printk when migrating IRQ descriptors

Impact: reduce printk noise

There were a couple of leftover KERN_DEBUG debugging printks, remove
them. Also clarify an error message.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/numa_migrate.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 089c3746358..a565ce3a4fb 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -74,10 +74,8 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
 
 	node = cpu_to_node(cpu);
 	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
-	printk(KERN_DEBUG "  move irq_desc for %d to cpu %d node %d\n",
-		 irq, cpu, node);
 	if (!desc) {
-		printk(KERN_ERR "can not get new irq_desc for moving\n");
+		printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq);
 		/* still use old one */
 		desc = old_desc;
 		goto out_unlock;
@@ -106,8 +104,6 @@ struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
 		return desc;
 
 	old_cpu = desc->cpu;
-	printk(KERN_DEBUG
-		 "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu);
 	if (old_cpu != cpu) {
 		node = cpu_to_node(cpu);
 		old_node = cpu_to_node(old_cpu);
-- 
cgit v1.2.3


From 793f7b12a0c95e7bfec1badf9628043fb78fd440 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 26 Dec 2008 19:02:20 +0100
Subject: sparseirq: fix desc->lock init

Impact: cleanup

init_one_irq_desc() does not initialize the desc->lock properly -
you cannot init a lock by memcpying some other lock on it.

This happens to work right now (because irq_desc_init is never in use),
but it's a dangerous construct nevertheless, so fix it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/handle.c       | 2 ++
 kernel/irq/numa_migrate.c | 1 +
 2 files changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 06b05a4d300..893da67b778 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -93,6 +93,8 @@ void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
 static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
 {
 	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
+
+	spin_lock_init(&desc->lock);
 	desc->irq = irq;
 #ifdef CONFIG_SMP
 	desc->cpu = cpu;
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index a565ce3a4fb..ecf765c6a77 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -42,6 +42,7 @@ static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
 		 struct irq_desc *desc, int cpu)
 {
 	memcpy(desc, old_desc, sizeof(struct irq_desc));
+	spin_lock_init(&desc->lock);
 	desc->cpu = cpu;
 	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
 	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
-- 
cgit v1.2.3


From 13a0c3c269b223f60abfac8a9811d77111a8b4ba Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 26 Dec 2008 02:05:47 -0800
Subject: sparseirq: work around compiler optimizing away __weak functions

Impact: fix panic on null pointer with sparseirq

Some GCC versions seem to inline the weak global function,
when that function is empty.

Work it around, by making the functions return a (dummy) integer.

Signed-off-by: Yinghai <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/handle.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 893da67b778..0bef3ecb7a0 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -86,8 +86,9 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
 		desc->kstat_irqs = (unsigned int *)ptr;
 }
 
-void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
+int __weak arch_init_chip_data(struct irq_desc *desc, int cpu)
 {
+	return 0;
 }
 
 static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
@@ -132,7 +133,7 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
 /* FIXME: use bootmem alloc ...*/
 static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
 
-void __init early_irq_init(void)
+int __init early_irq_init(void)
 {
 	struct irq_desc *desc;
 	int legacy_count;
@@ -151,7 +152,7 @@ void __init early_irq_init(void)
 	for (i = legacy_count; i < NR_IRQS; i++)
 		irq_desc_ptrs[i] = NULL;
 
-	arch_early_irq_init();
+	return arch_early_irq_init();
 }
 
 struct irq_desc *irq_to_desc(unsigned int irq)
-- 
cgit v1.2.3


From fa6beb37b0d9bc00f90f11154eeed9502d8b0a37 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 22 Dec 2008 20:24:09 -0800
Subject: sparseirq: set lock_class for legacy irq when sparse_irq is selected

Impact: add lockdep annotation to legacy IRQ descs

Warnings resulting out of this were not seen in practice, but it's prudent
to initialize the legacy descriptors to the lock class as well, symmetric
to how we do it with other descriptors.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/handle.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 0bef3ecb7a0..e1cf4e391ca 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -145,6 +145,7 @@ int __init early_irq_init(void)
 	for (i = 0; i < legacy_count; i++) {
 		desc[i].irq = i;
 		desc[i].kstat_irqs = kstat_irqs_legacy[i];
+		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
 
 		irq_desc_ptrs[i] = desc + i;
 	}
-- 
cgit v1.2.3


From 12026ea16a618b289fcf457661aed24f57323a20 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 26 Dec 2008 22:38:15 -0800
Subject: sparseirq: fix hang with !SPARSE_IRQ

Impact: fix hang

Suresh report his two sockets system only works with SPARSE_IRQ enable
it turns out we miss the setting desc->irq

so provide early_irq_init() even !SPARSE_IRQ to set desc->irq

Reported-by: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/handle.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e1cf4e391ca..157c04c3b15 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -218,6 +218,21 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	}
 };
 
+int __init early_irq_init(void)
+{
+	struct irq_desc *desc;
+	int count;
+	int i;
+
+	desc = irq_desc;
+	count = ARRAY_SIZE(irq_desc);
+
+	for (i = 0; i < count; i++)
+		desc[i].irq = i;
+
+	return arch_early_irq_init();
+}
+
 struct irq_desc *irq_to_desc(unsigned int irq)
 {
 	return (irq < NR_IRQS) ? irq_desc + irq : NULL;
-- 
cgit v1.2.3


From b2e2fe99628c4f944c3075258e536197b5a4f3f8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 29 Dec 2008 00:16:45 +0100
Subject: sparseirq: work around __weak alias bug

Impact: fix boot crash if the kernel is built with certain GCC versions

GCC has a bug with __weak alias functions: if the functions are in
the same compilation unit as their call site, GCC can decide to
inline them - and thus rob the linker of the opportunity to override
the weak alias with the real thing.

This can lead to the boot crash reported by Kamalesh Babulal:

 ACPI: Core revision 20080926
 Setting APIC routing to flat
 BUG: unable to handle kernel NULL pointer dereference at
 0000000000000000
 IP: [<ffffffff8021f9a8>] add_pin_to_irq_cpu+0x14/0x74
 PGD 0
 Oops: 0000 [#1] SMP
 [...]

So move the arch_init_chip_data() function from handle.c to manage.c.

Reported-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/handle.c | 5 -----
 kernel/irq/manage.c | 9 +++++++++
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 157c04c3b15..c20db0be917 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -86,11 +86,6 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
 		desc->kstat_irqs = (unsigned int *)ptr;
 }
 
-int __weak arch_init_chip_data(struct irq_desc *desc, int cpu)
-{
-	return 0;
-}
-
 static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
 {
 	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 46953a06f4a..c2741b02ad3 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -261,6 +261,15 @@ void enable_irq(unsigned int irq)
 }
 EXPORT_SYMBOL(enable_irq);
 
+/*
+ * [ Not in kernel/irq/handle.c, so that GCC does not
+ *   inline the __weak alias: ]
+ */
+int __weak arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+	return 0;
+}
+
 static int set_irq_wake_real(unsigned int irq, unsigned int on)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
-- 
cgit v1.2.3


From 7c0990c7ee988aa193abbb7da3faeb9279146dbf Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@suse.de>
Date: Wed, 19 Nov 2008 10:20:23 +0100
Subject: Do not free io context when taking recursive faults in do_exit

When taking recursive faults in do_exit, if the io_context is not null,
exit_io_context() is being called. But it might decrement the refcount
more than once. It is better to leave this task alone.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 kernel/exit.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index c7422ca9203..9a213474f54 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1037,8 +1037,6 @@ NORET_TYPE void do_exit(long code)
 		 * task into the wait for ever nirwana as well.
 		 */
 		tsk->flags |= PF_EXITPIDONE;
-		if (tsk->io_context)
-			exit_io_context();
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule();
 	}
-- 
cgit v1.2.3


From abf137dd7712132ee56d5b3143c2ff61a72a5faa Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 9 Dec 2008 08:11:22 +0100
Subject: aio: make the lookup_ioctx() lockless

The mm->ioctx_list is currently protected by a reader-writer lock,
so we always grab that lock on the read side for doing ioctx
lookups. As the workload is extremely reader biased, turn this into
an rcu hlist so we can make lookup_ioctx() lockless. Get rid of
the rwlock and use a spinlock for providing update side exclusion.

There's usually only 1 entry on this list, so it doesn't make sense
to look into fancier data structures.

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 kernel/fork.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 6144b36cd89..43cbf30669e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -415,8 +415,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 	set_mm_counter(mm, file_rss, 0);
 	set_mm_counter(mm, anon_rss, 0);
 	spin_lock_init(&mm->page_table_lock);
-	rwlock_init(&mm->ioctx_list_lock);
-	mm->ioctx_list = NULL;
+	spin_lock_init(&mm->ioctx_lock);
+	INIT_HLIST_HEAD(&mm->ioctx_list);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
 	mm_init_owner(mm, p);
-- 
cgit v1.2.3


From 43a256322ac1fc105c181b3cade3b9bfc0b63ca1 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 28 Dec 2008 16:01:13 -0800
Subject: sparseirq: move __weak symbols into separate compilation unit

GCC has a bug with __weak alias functions: if the functions are in
the same compilation unit as their call site, GCC can decide to
inline them - and thus rob the linker of the opportunity to override
the weak alias with the real thing.

So move all the IRQ handling related __weak symbols to kernel/irq/chip.c.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/manage.c |  9 ---------
 kernel/softirq.c    | 20 ++++++++++++++++++++
 2 files changed, 20 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c2741b02ad3..46953a06f4a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -261,15 +261,6 @@ void enable_irq(unsigned int irq)
 }
 EXPORT_SYMBOL(enable_irq);
 
-/*
- * [ Not in kernel/irq/handle.c, so that GCC does not
- *   inline the __weak alias: ]
- */
-int __weak arch_init_chip_data(struct irq_desc *desc, int cpu)
-{
-	return 0;
-}
-
 static int set_irq_wake_real(unsigned int irq, unsigned int on)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index e7c69a720d6..daf46358d2d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -797,3 +797,23 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait)
 }
 EXPORT_SYMBOL(on_each_cpu);
 #endif
+
+/*
+ * [ These __weak aliases are kept in a separate compilation unit, so that
+ *   GCC does not inline them incorrectly. ]
+ */
+
+int __init __weak early_irq_init(void)
+{
+	return 0;
+}
+
+int __init __weak arch_early_irq_init(void)
+{
+	return 0;
+}
+
+int __weak arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+	return 0;
+}
-- 
cgit v1.2.3


From b3199c025d1646e25e7d1d640dd605db251dccf8 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 30 Dec 2008 09:05:14 +1030
Subject: cpumask: switch over to cpu_online/possible/active/present_mask: core

Impact: cleanup

This implements the obsolescent cpu_online_map in terms of
cpu_online_mask, rather than the other way around.  Same for the other
maps.

The documentation comments are also updated to refer to _mask rather
than _map.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 kernel/cpu.c | 49 +++++++++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index bae131a1211..3ddc509b19c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,30 +15,8 @@
 #include <linux/stop_machine.h>
 #include <linux/mutex.h>
 
-/*
- * Represents all cpu's present in the system
- * In systems capable of hotplug, this map could dynamically grow
- * as new cpu's are detected in the system via any platform specific
- * method, such as ACPI for e.g.
- */
-cpumask_t cpu_present_map __read_mostly;
-EXPORT_SYMBOL(cpu_present_map);
-
-/*
- * Represents all cpu's that are currently online.
- */
-cpumask_t cpu_online_map __read_mostly;
-EXPORT_SYMBOL(cpu_online_map);
-
-#ifdef CONFIG_INIT_ALL_POSSIBLE
-cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
-#else
-cpumask_t cpu_possible_map __read_mostly;
-#endif
-EXPORT_SYMBOL(cpu_possible_map);
-
 #ifdef CONFIG_SMP
-/* Serializes the updates to cpu_online_map, cpu_present_map */
+/* Serializes the updates to cpu_online_mask, cpu_present_mask */
 static DEFINE_MUTEX(cpu_add_remove_lock);
 
 static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
@@ -65,8 +43,6 @@ void __init cpu_hotplug_init(void)
 	cpu_hotplug.refcount = 0;
 }
 
-cpumask_t cpu_active_map;
-
 #ifdef CONFIG_HOTPLUG_CPU
 
 void get_online_cpus(void)
@@ -97,7 +73,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
 
 /*
  * The following two API's must be used when attempting
- * to serialize the updates to cpu_online_map, cpu_present_map.
+ * to serialize the updates to cpu_online_mask, cpu_present_mask.
  */
 void cpu_maps_update_begin(void)
 {
@@ -503,3 +479,24 @@ EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
 
 const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
 EXPORT_SYMBOL(cpu_all_bits);
+
+#ifdef CONFIG_INIT_ALL_POSSIBLE
+static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
+	= CPU_BITS_ALL;
+#else
+static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
+#endif
+const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
+EXPORT_SYMBOL(cpu_possible_mask);
+
+static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
+const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
+EXPORT_SYMBOL(cpu_online_mask);
+
+static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
+const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
+EXPORT_SYMBOL(cpu_present_mask);
+
+static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
+const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
+EXPORT_SYMBOL(cpu_active_mask);
-- 
cgit v1.2.3


From 3fa41520696fec2815e2d88fbcccdda77ba4d693 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 30 Dec 2008 09:05:16 +1030
Subject: cpumask: make set_cpu_*/init_cpu_* out-of-line

They're only for use in boot/cpu hotplug code anyway, and this avoids
the use of deprecated cpu_*_map.

Stephen Rothwell points out that gcc 4.2.4 (on powerpc at least)
didn't like the cast away of const anyway:

  include/linux/cpumask.h: In function 'set_cpu_possible':
  include/linux/cpumask.h:1052: warning: passing argument 2 of 'cpumask_set_cpu' discards qualifiers from pointer target type

So this kills two birds with one stone.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/cpu.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3ddc509b19c..2c9f78f3a2f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -500,3 +500,50 @@ EXPORT_SYMBOL(cpu_present_mask);
 static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
 const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
 EXPORT_SYMBOL(cpu_active_mask);
+
+void set_cpu_possible(unsigned int cpu, bool possible)
+{
+	if (possible)
+		cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));
+	else
+		cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits));
+}
+
+void set_cpu_present(unsigned int cpu, bool present)
+{
+	if (present)
+		cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits));
+	else
+		cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits));
+}
+
+void set_cpu_online(unsigned int cpu, bool online)
+{
+	if (online)
+		cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
+	else
+		cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
+}
+
+void set_cpu_active(unsigned int cpu, bool active)
+{
+	if (active)
+		cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
+	else
+		cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
+}
+
+void init_cpu_present(const struct cpumask *src)
+{
+	cpumask_copy(to_cpumask(cpu_present_bits), src);
+}
+
+void init_cpu_possible(const struct cpumask *src)
+{
+	cpumask_copy(to_cpumask(cpu_possible_bits), src);
+}
+
+void init_cpu_online(const struct cpumask *src)
+{
+	cpumask_copy(to_cpumask(cpu_online_bits), src);
+}
-- 
cgit v1.2.3


From 54b11e6d57a10aa9d0009efd93873e17bffd5d30 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 30 Dec 2008 09:05:16 +1030
Subject: cpumask: smp_call_function_many()

Impact: Implementation change to remove cpumask_t from stack.

Actually change smp_call_function_mask() to smp_call_function_many().
We avoid cpumasks on the stack in this version.

(S390 has its own version, but that's going away apparently).

We have to do some dancing to figure out if 0 or 1 other cpus are in
the mask supplied and the online mask without allocating a tmp
cpumask.  It's still fairly cheap.

We allocate the cpumask at the end of the call_function_data
structure: if allocation fails we fallback to smp_call_function_single
rather than using the baroque quiescing code (which needs a cpumask on
stack).

(Thanks to Hiroshi Shimamoto for spotting several bugs in previous versions!)

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Cc: npiggin@suse.de
Cc: axboe@kernel.dk
---
 kernel/smp.c | 139 +++++++++++++++++++++--------------------------------------
 1 file changed, 49 insertions(+), 90 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index 75c8dde58c5..9f0eafed139 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -24,8 +24,8 @@ struct call_function_data {
 	struct call_single_data csd;
 	spinlock_t lock;
 	unsigned int refs;
-	cpumask_t cpumask;
 	struct rcu_head rcu_head;
+	unsigned long cpumask_bits[];
 };
 
 struct call_single_queue {
@@ -110,13 +110,13 @@ void generic_smp_call_function_interrupt(void)
 	list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
 		int refs;
 
-		if (!cpu_isset(cpu, data->cpumask))
+		if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits)))
 			continue;
 
 		data->csd.func(data->csd.info);
 
 		spin_lock(&data->lock);
-		cpu_clear(cpu, data->cpumask);
+		cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits));
 		WARN_ON(data->refs == 0);
 		data->refs--;
 		refs = data->refs;
@@ -266,51 +266,13 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
 	generic_exec_single(cpu, data);
 }
 
-/* Dummy function */
-static void quiesce_dummy(void *unused)
-{
-}
-
-/*
- * Ensure stack based data used in call function mask is safe to free.
- *
- * This is needed by smp_call_function_mask when using on-stack data, because
- * a single call function queue is shared by all CPUs, and any CPU may pick up
- * the data item on the queue at any time before it is deleted. So we need to
- * ensure that all CPUs have transitioned through a quiescent state after
- * this call.
- *
- * This is a very slow function, implemented by sending synchronous IPIs to
- * all possible CPUs. For this reason, we have to alloc data rather than use
- * stack based data even in the case of synchronous calls. The stack based
- * data is then just used for deadlock/oom fallback which will be very rare.
- *
- * If a faster scheme can be made, we could go back to preferring stack based
- * data -- the data allocation/free is non-zero cost.
- */
-static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
-{
-	struct call_single_data data;
-	int cpu;
-
-	data.func = quiesce_dummy;
-	data.info = NULL;
-
-	for_each_cpu_mask(cpu, mask) {
-		data.flags = CSD_FLAG_WAIT;
-		generic_exec_single(cpu, &data);
-	}
-}
-
 /**
- * smp_call_function_mask(): Run a function on a set of other CPUs.
- * @mask: The set of cpus to run on.
+ * smp_call_function_many(): Run a function on a set of other CPUs.
+ * @mask: The set of cpus to run on (only runs on online subset).
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
  * @wait: If true, wait (atomically) until function has completed on other CPUs.
  *
- * Returns 0 on success, else a negative status code.
- *
  * If @wait is true, then returns once @func has returned. Note that @wait
  * will be implicitly turned on in case of allocation failures, since
  * we fall back to on-stack allocation.
@@ -319,53 +281,57 @@ static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
  * hardware interrupt handler or from a bottom half handler. Preemption
  * must be disabled when calling this function.
  */
-int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
-			   int wait)
+void smp_call_function_many(const struct cpumask *mask,
+			    void (*func)(void *), void *info,
+			    bool wait)
 {
-	struct call_function_data d;
-	struct call_function_data *data = NULL;
-	cpumask_t allbutself;
+	struct call_function_data *data;
 	unsigned long flags;
-	int cpu, num_cpus;
-	int slowpath = 0;
+	int cpu, next_cpu;
 
 	/* Can deadlock when called with interrupts disabled */
 	WARN_ON(irqs_disabled());
 
-	cpu = smp_processor_id();
-	allbutself = cpu_online_map;
-	cpu_clear(cpu, allbutself);
-	cpus_and(mask, mask, allbutself);
-	num_cpus = cpus_weight(mask);
-
-	/*
-	 * If zero CPUs, return. If just a single CPU, turn this request
-	 * into a targetted single call instead since it's faster.
-	 */
-	if (!num_cpus)
-		return 0;
-	else if (num_cpus == 1) {
-		cpu = first_cpu(mask);
-		return smp_call_function_single(cpu, func, info, wait);
+	/* So, what's a CPU they want?  Ignoring this one. */
+	cpu = cpumask_first_and(mask, cpu_online_mask);
+	if (cpu == smp_processor_id())
+		cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
+	/* No online cpus?  We're done. */
+	if (cpu >= nr_cpu_ids)
+		return;
+
+	/* Do we have another CPU which isn't us? */
+	next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
+	if (next_cpu == smp_processor_id())
+		next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
+
+	/* Fastpath: do that cpu by itself. */
+	if (next_cpu >= nr_cpu_ids) {
+		smp_call_function_single(cpu, func, info, wait);
+		return;
 	}
 
-	data = kmalloc(sizeof(*data), GFP_ATOMIC);
-	if (data) {
-		data->csd.flags = CSD_FLAG_ALLOC;
-		if (wait)
-			data->csd.flags |= CSD_FLAG_WAIT;
-	} else {
-		data = &d;
-		data->csd.flags = CSD_FLAG_WAIT;
-		wait = 1;
-		slowpath = 1;
+	data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC);
+	if (unlikely(!data)) {
+		/* Slow path. */
+		for_each_online_cpu(cpu) {
+			if (cpu == smp_processor_id())
+				continue;
+			if (cpumask_test_cpu(cpu, mask))
+				smp_call_function_single(cpu, func, info, wait);
+		}
+		return;
 	}
 
 	spin_lock_init(&data->lock);
+	data->csd.flags = CSD_FLAG_ALLOC;
+	if (wait)
+		data->csd.flags |= CSD_FLAG_WAIT;
 	data->csd.func = func;
 	data->csd.info = info;
-	data->refs = num_cpus;
-	data->cpumask = mask;
+	cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask);
+	cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits));
+	data->refs = cpumask_weight(to_cpumask(data->cpumask_bits));
 
 	spin_lock_irqsave(&call_function_lock, flags);
 	list_add_tail_rcu(&data->csd.list, &call_function_queue);
@@ -377,18 +343,13 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
 	smp_mb();
 
 	/* Send a message to all CPUs in the map */
-	arch_send_call_function_ipi(mask);
+	arch_send_call_function_ipi(*to_cpumask(data->cpumask_bits));
 
 	/* optionally wait for the CPUs to complete */
-	if (wait) {
+	if (wait)
 		csd_flag_wait(&data->csd);
-		if (unlikely(slowpath))
-			smp_call_function_mask_quiesce_stack(mask);
-	}
-
-	return 0;
 }
-EXPORT_SYMBOL(smp_call_function_mask);
+EXPORT_SYMBOL(smp_call_function_many);
 
 /**
  * smp_call_function(): Run a function on all other CPUs.
@@ -396,7 +357,7 @@ EXPORT_SYMBOL(smp_call_function_mask);
  * @info: An arbitrary pointer to pass to the function.
  * @wait: If true, wait (atomically) until function has completed on other CPUs.
  *
- * Returns 0 on success, else a negative status code.
+ * Returns 0.
  *
  * If @wait is true, then returns once @func has returned; otherwise
  * it returns just before the target cpu calls @func. In case of allocation
@@ -407,12 +368,10 @@ EXPORT_SYMBOL(smp_call_function_mask);
  */
 int smp_call_function(void (*func)(void *), void *info, int wait)
 {
-	int ret;
-
 	preempt_disable();
-	ret = smp_call_function_mask(cpu_online_map, func, info, wait);
+	smp_call_function_many(cpu_online_mask, func, info, wait);
 	preempt_enable();
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(smp_call_function);
 
-- 
cgit v1.2.3


From ce47d974f71af26d00832e83a43ac79bec272d99 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 30 Dec 2008 09:05:17 +1030
Subject: cpumask: arch_send_call_function_ipi_mask: core

Impact: new API to reduce stack usage

We're weaning the core code off handing cpumask's around on-stack.
This introduces arch_send_call_function_ipi_mask().

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/smp.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index 9f0eafed139..172b1826890 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -266,6 +266,12 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
 	generic_exec_single(cpu, data);
 }
 
+/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */
+#ifndef arch_send_call_function_ipi_mask
+#define arch_send_call_function_ipi_mask(maskp) \
+	arch_send_call_function_ipi(*(maskp))
+#endif
+
 /**
  * smp_call_function_many(): Run a function on a set of other CPUs.
  * @mask: The set of cpus to run on (only runs on online subset).
@@ -343,7 +349,7 @@ void smp_call_function_many(const struct cpumask *mask,
 	smp_mb();
 
 	/* Send a message to all CPUs in the map */
-	arch_send_call_function_ipi(*to_cpumask(data->cpumask_bits));
+	arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits));
 
 	/* optionally wait for the CPUs to complete */
 	if (wait)
-- 
cgit v1.2.3


From 42d35d48ce7cefb9429880af19d1c329d1554e7a Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Mon, 29 Dec 2008 15:49:53 -0800
Subject: futex: make futex_(get|put)_key() calls symmetric

Impact: cleanup

This patch makes the calls to futex_get_key_refs() and futex_drop_key_refs()
explicitly symmetric by only "putting" keys we successfully "got".  Also
cleanup a couple return points that didn't "put" after a successful "get".

Build and boot tested on an x86_64 system.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 67 +++++++++++++++++++++++++++++++---------------------------
 1 file changed, 36 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index b4f87bac91c..c5ac55cc0c1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -723,8 +723,8 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
 	}
 
 	spin_unlock(&hb->lock);
-out:
 	put_futex_key(fshared, &key);
+out:
 	return ret;
 }
 
@@ -748,7 +748,7 @@ retryfull:
 		goto out;
 	ret = get_futex_key(uaddr2, fshared, &key2);
 	if (unlikely(ret != 0))
-		goto out;
+		goto out_put_key1;
 
 	hb1 = hash_futex(&key1);
 	hb2 = hash_futex(&key2);
@@ -770,12 +770,12 @@ retry:
 		 * but we might get them from range checking
 		 */
 		ret = op_ret;
-		goto out;
+		goto out_put_keys;
 #endif
 
 		if (unlikely(op_ret != -EFAULT)) {
 			ret = op_ret;
-			goto out;
+			goto out_put_keys;
 		}
 
 		/*
@@ -789,7 +789,7 @@ retry:
 			ret = futex_handle_fault((unsigned long)uaddr2,
 						 attempt);
 			if (ret)
-				goto out;
+				goto out_put_keys;
 			goto retry;
 		}
 
@@ -827,10 +827,11 @@ retry:
 	spin_unlock(&hb1->lock);
 	if (hb1 != hb2)
 		spin_unlock(&hb2->lock);
-out:
+out_put_keys:
 	put_futex_key(fshared, &key2);
+out_put_key1:
 	put_futex_key(fshared, &key1);
-
+out:
 	return ret;
 }
 
@@ -847,13 +848,13 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
 	struct futex_q *this, *next;
 	int ret, drop_count = 0;
 
- retry:
+retry:
 	ret = get_futex_key(uaddr1, fshared, &key1);
 	if (unlikely(ret != 0))
 		goto out;
 	ret = get_futex_key(uaddr2, fshared, &key2);
 	if (unlikely(ret != 0))
-		goto out;
+		goto out_put_key1;
 
 	hb1 = hash_futex(&key1);
 	hb2 = hash_futex(&key2);
@@ -875,7 +876,7 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
 			if (!ret)
 				goto retry;
 
-			return ret;
+			goto out_put_keys;
 		}
 		if (curval != *cmpval) {
 			ret = -EAGAIN;
@@ -920,9 +921,11 @@ out_unlock:
 	while (--drop_count >= 0)
 		drop_futex_key_refs(&key1);
 
-out:
+out_put_keys:
 	put_futex_key(fshared, &key2);
+out_put_key1:
 	put_futex_key(fshared, &key1);
+out:
 	return ret;
 }
 
@@ -983,7 +986,7 @@ static int unqueue_me(struct futex_q *q)
 	int ret = 0;
 
 	/* In the common case we don't take the spinlock, which is nice. */
- retry:
+retry:
 	lock_ptr = q->lock_ptr;
 	barrier();
 	if (lock_ptr != NULL) {
@@ -1165,11 +1168,11 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 
 	q.pi_state = NULL;
 	q.bitset = bitset;
- retry:
+retry:
 	q.key = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr, fshared, &q.key);
 	if (unlikely(ret != 0))
-		goto out_release_sem;
+		goto out;
 
 	hb = queue_lock(&q);
 
@@ -1197,6 +1200,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 
 	if (unlikely(ret)) {
 		queue_unlock(&q, hb);
+		put_futex_key(fshared, &q.key);
 
 		ret = get_user(uval, uaddr);
 
@@ -1206,7 +1210,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 	}
 	ret = -EWOULDBLOCK;
 	if (uval != val)
-		goto out_unlock_release_sem;
+		goto out_unlock_put_key;
 
 	/* Only actually queue if *uaddr contained val.  */
 	queue_me(&q, hb);
@@ -1298,11 +1302,11 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 		return -ERESTART_RESTARTBLOCK;
 	}
 
- out_unlock_release_sem:
+out_unlock_put_key:
 	queue_unlock(&q, hb);
-
- out_release_sem:
 	put_futex_key(fshared, &q.key);
+
+out:
 	return ret;
 }
 
@@ -1351,16 +1355,16 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 	}
 
 	q.pi_state = NULL;
- retry:
+retry:
 	q.key = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr, fshared, &q.key);
 	if (unlikely(ret != 0))
-		goto out_release_sem;
+		goto out;
 
- retry_unlocked:
+retry_unlocked:
 	hb = queue_lock(&q);
 
- retry_locked:
+retry_locked:
 	ret = lock_taken = 0;
 
 	/*
@@ -1381,14 +1385,14 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 	 */
 	if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
 		ret = -EDEADLK;
-		goto out_unlock_release_sem;
+		goto out_unlock_put_key;
 	}
 
 	/*
 	 * Surprise - we got the lock. Just return to userspace:
 	 */
 	if (unlikely(!curval))
-		goto out_unlock_release_sem;
+		goto out_unlock_put_key;
 
 	uval = curval;
 
@@ -1424,7 +1428,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 	 * We took the lock due to owner died take over.
 	 */
 	if (unlikely(lock_taken))
-		goto out_unlock_release_sem;
+		goto out_unlock_put_key;
 
 	/*
 	 * We dont have the lock. Look up the PI state (or create it if
@@ -1463,7 +1467,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 				goto retry_locked;
 			}
 		default:
-			goto out_unlock_release_sem;
+			goto out_unlock_put_key;
 		}
 	}
 
@@ -1554,16 +1558,17 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 		destroy_hrtimer_on_stack(&to->timer);
 	return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
- out_unlock_release_sem:
+out_unlock_put_key:
 	queue_unlock(&q, hb);
 
- out_release_sem:
+out_put_key:
 	put_futex_key(fshared, &q.key);
+out:
 	if (to)
 		destroy_hrtimer_on_stack(&to->timer);
 	return ret;
 
- uaddr_faulted:
+uaddr_faulted:
 	/*
 	 * We have to r/w  *(int __user *)uaddr, and we have to modify it
 	 * atomically.  Therefore, if we continue to fault after get_user()
@@ -1576,7 +1581,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 	if (attempt++) {
 		ret = futex_handle_fault((unsigned long)uaddr, attempt);
 		if (ret)
-			goto out_release_sem;
+			goto out_put_key;
 		goto retry_unlocked;
 	}
 
@@ -1668,9 +1673,9 @@ retry_unlocked:
 
 out_unlock:
 	spin_unlock(&hb->lock);
-out:
 	put_futex_key(fshared, &key);
 
+out:
 	return ret;
 
 pi_faulted:
-- 
cgit v1.2.3


From 1af237a099a3b8ff56aa384f605c6a68af7bf288 Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Tue, 30 Dec 2008 06:41:44 +0800
Subject: tracing: removed duplicated #include

Removed duplicated #include in kernel/trace/trace.c.

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/trace/trace.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3608f6cb2f7..4185d522163 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -30,7 +30,6 @@
 #include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/kprobes.h>
-#include <linux/seq_file.h>
 #include <linux/writeback.h>
 
 #include <linux/stacktrace.h>
-- 
cgit v1.2.3


From 1c5745aa380efb6417b5681104b007c8612fb496 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 22 Dec 2008 23:05:28 +0100
Subject: sched_clock: prevent scd->clock from moving backwards, take #2

Redo:

  5b7dba4: sched_clock: prevent scd->clock from moving backwards

which had to be reverted due to s2ram hangs:

  ca7e716: Revert "sched_clock: prevent scd->clock from moving backwards"

... this time with resume restoring GTOD later in the sequence
taken into account as well.

The "timekeeping_suspended" flag is not very nice but we cannot call into
GTOD before it has been properly resumed and the scheduler will run very
early in the resume sequence.

Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_clock.c      | 5 ++++-
 kernel/time/timekeeping.c | 7 +++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e8ab096ddfe..a0b0852414c 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -124,7 +124,7 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
 
 	clock = scd->tick_gtod + delta;
 	min_clock = wrap_max(scd->tick_gtod, scd->clock);
-	max_clock = scd->tick_gtod + TICK_NSEC;
+	max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
 
 	clock = wrap_max(clock, min_clock);
 	clock = wrap_min(clock, max_clock);
@@ -227,6 +227,9 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
  */
 void sched_clock_idle_wakeup_event(u64 delta_ns)
 {
+	if (timekeeping_suspended)
+		return;
+
 	sched_clock_tick();
 	touch_softlockup_watchdog();
 }
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fa05e88aa76..900f1b6598d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -46,6 +46,9 @@ struct timespec xtime __attribute__ ((aligned (16)));
 struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
 static unsigned long total_sleep_time;		/* seconds */
 
+/* flag for if timekeeping is suspended */
+int __read_mostly timekeeping_suspended;
+
 static struct timespec xtime_cache __attribute__ ((aligned (16)));
 void update_xtime_cache(u64 nsec)
 {
@@ -92,6 +95,8 @@ void getnstimeofday(struct timespec *ts)
 	unsigned long seq;
 	s64 nsecs;
 
+	WARN_ON(timekeeping_suspended);
+
 	do {
 		seq = read_seqbegin(&xtime_lock);
 
@@ -299,8 +304,6 @@ void __init timekeeping_init(void)
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 }
 
-/* flag for if timekeeping is suspended */
-static int timekeeping_suspended;
 /* time in seconds when suspend began */
 static unsigned long timekeeping_suspend_time;
 
-- 
cgit v1.2.3


From 457533a7d3402d1d91fbc125c8bd1bd16dcd3cd4 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Wed, 31 Dec 2008 15:11:37 +0100
Subject: [PATCH] fix scaled & unscaled cputime accounting

The utimescaled / stimescaled fields in the task structure and the
global cpustat should be set on all architectures. On s390 the calls
to account_user_time_scaled and account_system_time_scaled never have
been added. In addition system time that is accounted as guest time
to the user time of a process is accounted to the scaled system time
instead of the scaled user time.
To fix the bugs and to prevent future forgetfulness this patch merges
account_system_time_scaled into account_system_time and
account_user_time_scaled into account_user_time.

Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Michael Neuling <mikey@neuling.org>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 kernel/sched.c           | 41 ++++++++++++++++-------------------------
 kernel/time/tick-sched.c |  5 +++--
 kernel/timer.c           | 12 +++++-------
 3 files changed, 24 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index fff1c4a20b6..5b03679ff71 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4080,13 +4080,17 @@ unsigned long long task_delta_exec(struct task_struct *p)
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @cputime: the cpu time spent in user space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
  */
-void account_user_time(struct task_struct *p, cputime_t cputime)
+void account_user_time(struct task_struct *p, cputime_t cputime,
+		       cputime_t cputime_scaled)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 	cputime64_t tmp;
 
+	/* Add user time to process. */
 	p->utime = cputime_add(p->utime, cputime);
+	p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
 	account_group_user_time(p, cputime);
 
 	/* Add user time to cpustat. */
@@ -4103,51 +4107,49 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
  * Account guest cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @cputime: the cpu time spent in virtual machine since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
  */
-static void account_guest_time(struct task_struct *p, cputime_t cputime)
+static void account_guest_time(struct task_struct *p, cputime_t cputime,
+			       cputime_t cputime_scaled)
 {
 	cputime64_t tmp;
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 
 	tmp = cputime_to_cputime64(cputime);
 
+	/* Add guest time to process. */
 	p->utime = cputime_add(p->utime, cputime);
+	p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
 	account_group_user_time(p, cputime);
 	p->gtime = cputime_add(p->gtime, cputime);
 
+	/* Add guest time to cpustat. */
 	cpustat->user = cputime64_add(cpustat->user, tmp);
 	cpustat->guest = cputime64_add(cpustat->guest, tmp);
 }
 
-/*
- * Account scaled user cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user space since the last update
- */
-void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
-{
-	p->utimescaled = cputime_add(p->utimescaled, cputime);
-}
-
 /*
  * Account system cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @hardirq_offset: the offset to subtract from hardirq_count()
  * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
  */
 void account_system_time(struct task_struct *p, int hardirq_offset,
-			 cputime_t cputime)
+			 cputime_t cputime, cputime_t cputime_scaled)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 	struct rq *rq = this_rq();
 	cputime64_t tmp;
 
 	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
-		account_guest_time(p, cputime);
+		account_guest_time(p, cputime, cputime_scaled);
 		return;
 	}
 
+	/* Add system time to process. */
 	p->stime = cputime_add(p->stime, cputime);
+	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
 	account_group_system_time(p, cputime);
 
 	/* Add system time to cpustat. */
@@ -4166,17 +4168,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	acct_update_integrals(p);
 }
 
-/*
- * Account scaled system cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @hardirq_offset: the offset to subtract from hardirq_count()
- * @cputime: the cpu time spent in kernel space since the last update
- */
-void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
-{
-	p->stimescaled = cputime_add(p->stimescaled, cputime);
-}
-
 /*
  * Account for involuntary wait time.
  * @p: the process from which the cpu time has been stolen
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 8f3fc2582d3..1f2fce2479f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -420,6 +420,7 @@ void tick_nohz_restart_sched_tick(void)
 	int cpu = smp_processor_id();
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	unsigned long ticks;
+	cputime_t cputime;
 	ktime_t now;
 
 	local_irq_disable();
@@ -452,8 +453,8 @@ void tick_nohz_restart_sched_tick(void)
 	 */
 	if (ticks && ticks < LONG_MAX) {
 		add_preempt_count(HARDIRQ_OFFSET);
-		account_system_time(current, HARDIRQ_OFFSET,
-				    jiffies_to_cputime(ticks));
+		cputime = jiffies_to_cputime(ticks);
+		account_system_time(current, HARDIRQ_OFFSET, cputime, cputime);
 		sub_preempt_count(HARDIRQ_OFFSET);
 	}
 
diff --git a/kernel/timer.c b/kernel/timer.c
index 566257d1dc1..b5efb528aa1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1023,13 +1023,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
 {
 	cputime_t one_jiffy = jiffies_to_cputime(1);
 
-	if (user_tick) {
-		account_user_time(p, one_jiffy);
-		account_user_time_scaled(p, cputime_to_scaled(one_jiffy));
-	} else {
-		account_system_time(p, HARDIRQ_OFFSET, one_jiffy);
-		account_system_time_scaled(p, cputime_to_scaled(one_jiffy));
-	}
+	if (user_tick)
+		account_user_time(p, one_jiffy, cputime_to_scaled(one_jiffy));
+	else
+		account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
+				    cputime_to_scaled(one_jiffy));
 }
 #endif
 
-- 
cgit v1.2.3


From 79741dd35713ff4f6fd0eafd59fa94e8a4ba922d Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Wed, 31 Dec 2008 15:11:38 +0100
Subject: [PATCH] idle cputime accounting

The cpu time spent by the idle process actually doing something is
currently accounted as idle time. This is plain wrong, the architectures
that support VIRT_CPU_ACCOUNTING=y can do better: distinguish between the
time spent doing nothing and the time spent by idle doing work. The first
is accounted with account_idle_time and the second with account_system_time.
The architectures that use the account_xxx_time interface directly and not
the account_xxx_ticks interface now need to do the check for the idle
process in their arch code. In particular to improve the system vs true
idle time accounting the arch code needs to measure the true idle time
instead of just testing for the idle process.
To improve the tick based accounting as well we would need an architecture
primitive that can tell us if the pt_regs of the interrupted context
points to the magic instruction that halts the cpu.

In addition idle time is no more added to the stime of the idle process.
This field now contains the system time of the idle process as it should
be. On systems without VIRT_CPU_ACCOUNTING this will always be zero as
every tick that occurs while idle is running will be accounted as idle
time.

This patch contains the necessary common code changes to be able to
distinguish idle system time and true idle time. The architectures with
support for VIRT_CPU_ACCOUNTING need some changes to exploit this.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 kernel/sched.c           | 80 ++++++++++++++++++++++++++++++++++++++----------
 kernel/time/tick-sched.c | 13 ++++----
 kernel/timer.c           | 13 --------
 3 files changed, 69 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5b03679ff71..635eaffe1e4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4139,7 +4139,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 			 cputime_t cputime, cputime_t cputime_scaled)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-	struct rq *rq = this_rq();
 	cputime64_t tmp;
 
 	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
@@ -4158,37 +4157,84 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 		cpustat->irq = cputime64_add(cpustat->irq, tmp);
 	else if (softirq_count())
 		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
-	else if (p != rq->idle)
-		cpustat->system = cputime64_add(cpustat->system, tmp);
-	else if (atomic_read(&rq->nr_iowait) > 0)
-		cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
 	else
-		cpustat->idle = cputime64_add(cpustat->idle, tmp);
+		cpustat->system = cputime64_add(cpustat->system, tmp);
+
 	/* Account for system time used */
 	acct_update_integrals(p);
 }
 
 /*
  * Account for involuntary wait time.
- * @p: the process from which the cpu time has been stolen
  * @steal: the cpu time spent in involuntary wait
  */
-void account_steal_time(struct task_struct *p, cputime_t steal)
+void account_steal_time(cputime_t cputime)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	cputime64_t cputime64 = cputime_to_cputime64(cputime);
+
+	cpustat->steal = cputime64_add(cpustat->steal, cputime64);
+}
+
+/*
+ * Account for idle time.
+ * @cputime: the cpu time spent in idle wait
+ */
+void account_idle_time(cputime_t cputime)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-	cputime64_t tmp = cputime_to_cputime64(steal);
+	cputime64_t cputime64 = cputime_to_cputime64(cputime);
 	struct rq *rq = this_rq();
 
-	if (p == rq->idle) {
-		p->stime = cputime_add(p->stime, steal);
-		if (atomic_read(&rq->nr_iowait) > 0)
-			cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
-		else
-			cpustat->idle = cputime64_add(cpustat->idle, tmp);
-	} else
-		cpustat->steal = cputime64_add(cpustat->steal, tmp);
+	if (atomic_read(&rq->nr_iowait) > 0)
+		cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
+	else
+		cpustat->idle = cputime64_add(cpustat->idle, cputime64);
+}
+
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+
+/*
+ * Account a single tick of cpu time.
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: indicates if the tick is a user or a system tick
+ */
+void account_process_tick(struct task_struct *p, int user_tick)
+{
+	cputime_t one_jiffy = jiffies_to_cputime(1);
+	cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
+	struct rq *rq = this_rq();
+
+	if (user_tick)
+		account_user_time(p, one_jiffy, one_jiffy_scaled);
+	else if (p != rq->idle)
+		account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
+				    one_jiffy_scaled);
+	else
+		account_idle_time(one_jiffy);
+}
+
+/*
+ * Account multiple ticks of steal time.
+ * @p: the process from which the cpu time has been stolen
+ * @ticks: number of stolen ticks
+ */
+void account_steal_ticks(unsigned long ticks)
+{
+	account_steal_time(jiffies_to_cputime(ticks));
+}
+
+/*
+ * Account multiple ticks of idle time.
+ * @ticks: number of stolen ticks
+ */
+void account_idle_ticks(unsigned long ticks)
+{
+	account_idle_time(jiffies_to_cputime(ticks));
 }
 
+#endif
+
 /*
  * Use precise platform statistics if available:
  */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1f2fce2479f..611fa4c0baa 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -419,8 +419,9 @@ void tick_nohz_restart_sched_tick(void)
 {
 	int cpu = smp_processor_id();
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
 	unsigned long ticks;
-	cputime_t cputime;
+#endif
 	ktime_t now;
 
 	local_irq_disable();
@@ -442,6 +443,7 @@ void tick_nohz_restart_sched_tick(void)
 	tick_do_update_jiffies64(now);
 	cpu_clear(cpu, nohz_cpu_mask);
 
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
 	/*
 	 * We stopped the tick in idle. Update process times would miss the
 	 * time we slept as update_process_times does only a 1 tick
@@ -451,12 +453,9 @@ void tick_nohz_restart_sched_tick(void)
 	/*
 	 * We might be one off. Do not randomly account a huge number of ticks!
 	 */
-	if (ticks && ticks < LONG_MAX) {
-		add_preempt_count(HARDIRQ_OFFSET);
-		cputime = jiffies_to_cputime(ticks);
-		account_system_time(current, HARDIRQ_OFFSET, cputime, cputime);
-		sub_preempt_count(HARDIRQ_OFFSET);
-	}
+	if (ticks && ticks < LONG_MAX)
+		account_idle_ticks(ticks);
+#endif
 
 	touch_softlockup_watchdog();
 	/*
diff --git a/kernel/timer.c b/kernel/timer.c
index b5efb528aa1..dee3f641a7a 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1018,19 +1018,6 @@ unsigned long get_next_timer_interrupt(unsigned long now)
 }
 #endif
 
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-void account_process_tick(struct task_struct *p, int user_tick)
-{
-	cputime_t one_jiffy = jiffies_to_cputime(1);
-
-	if (user_tick)
-		account_user_time(p, one_jiffy, cputime_to_scaled(one_jiffy));
-	else
-		account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
-				    cputime_to_scaled(one_jiffy));
-}
-#endif
-
 /*
  * Called from the timer interrupt handler to charge one tick to the current
  * process.  user_tick is 1 if the tick is user time, 0 for system.
-- 
cgit v1.2.3


From 4f4b6c1a94a8735bbdc030a2911cf395495645b6 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:15 +1030
Subject: cpumask: prepare for iterators to only go to
 nr_cpu_ids/nr_cpumask_bits.: core

Impact: cleanup

In future, all cpumask ops will only be valid (in general) for bit
numbers < nr_cpu_ids.  So use that instead of NR_CPUS in iterators
and other comparisons.

This is always safe: no cpu number can be >= nr_cpu_ids, and
nr_cpu_ids is initialized to NR_CPUS at boot.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: James Morris <jmorris@namei.org>
Cc: Eric Biederman <ebiederm@xmission.com>
---
 kernel/kexec.c | 2 +-
 kernel/smp.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index ac0fde7b54d..3fb855ad6aa 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1116,7 +1116,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
 	struct elf_prstatus prstatus;
 	u32 *buf;
 
-	if ((cpu < 0) || (cpu >= NR_CPUS))
+	if ((cpu < 0) || (cpu >= nr_cpu_ids))
 		return;
 
 	/* Using ELF notes here is opportunistic.
diff --git a/kernel/smp.c b/kernel/smp.c
index 172b1826890..5cfa0e5e3e8 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -223,7 +223,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 		local_irq_save(flags);
 		func(info);
 		local_irq_restore(flags);
-	} else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) {
+	} else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
 		struct call_single_data *data = NULL;
 
 		if (!wait) {
-- 
cgit v1.2.3


From 9e01c1b74c9531e301c900edaa92a99fcb7738f2 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:22 +1030
Subject: cpumask: convert kernel trace functions

Impact: Reduce future memory usage, use new cpumask API.

(Eventually, cpumask_var_t will be allocated based on nr_cpu_ids, not NR_CPUS).

Convert kernel trace functions to use struct cpumask API:
1) Use cpumask_copy/cpumask_test_cpu/for_each_cpu.
2) Use cpumask_var_t and alloc_cpumask_var/free_cpumask_var everywhere.
3) Use on_each_cpu instead of playing with current->cpus_allowed.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c   | 42 ++++++++++++++++++-------------
 kernel/trace/trace.c         | 60 ++++++++++++++++++++++++++------------------
 kernel/trace/trace_sysprof.c | 13 +++-------
 3 files changed, 64 insertions(+), 51 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1d601a7c458..a9d9760dc7b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -195,7 +195,7 @@ void *ring_buffer_event_data(struct ring_buffer_event *event)
 EXPORT_SYMBOL_GPL(ring_buffer_event_data);
 
 #define for_each_buffer_cpu(buffer, cpu)		\
-	for_each_cpu_mask(cpu, buffer->cpumask)
+	for_each_cpu(cpu, buffer->cpumask)
 
 #define TS_SHIFT	27
 #define TS_MASK		((1ULL << TS_SHIFT) - 1)
@@ -267,7 +267,7 @@ struct ring_buffer {
 	unsigned			pages;
 	unsigned			flags;
 	int				cpus;
-	cpumask_t			cpumask;
+	cpumask_var_t			cpumask;
 	atomic_t			record_disabled;
 
 	struct mutex			mutex;
@@ -458,6 +458,9 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 	if (!buffer)
 		return NULL;
 
+	if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
+		goto fail_free_buffer;
+
 	buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
 	buffer->flags = flags;
 
@@ -465,14 +468,14 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 	if (buffer->pages == 1)
 		buffer->pages++;
 
-	buffer->cpumask = cpu_possible_map;
+	cpumask_copy(buffer->cpumask, cpu_possible_mask);
 	buffer->cpus = nr_cpu_ids;
 
 	bsize = sizeof(void *) * nr_cpu_ids;
 	buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
 				  GFP_KERNEL);
 	if (!buffer->buffers)
-		goto fail_free_buffer;
+		goto fail_free_cpumask;
 
 	for_each_buffer_cpu(buffer, cpu) {
 		buffer->buffers[cpu] =
@@ -492,6 +495,9 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 	}
 	kfree(buffer->buffers);
 
+ fail_free_cpumask:
+	free_cpumask_var(buffer->cpumask);
+
  fail_free_buffer:
 	kfree(buffer);
 	return NULL;
@@ -510,6 +516,8 @@ ring_buffer_free(struct ring_buffer *buffer)
 	for_each_buffer_cpu(buffer, cpu)
 		rb_free_cpu_buffer(buffer->buffers[cpu]);
 
+	free_cpumask_var(buffer->cpumask);
+
 	kfree(buffer);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_free);
@@ -1283,7 +1291,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
 
 	cpu = raw_smp_processor_id();
 
-	if (!cpu_isset(cpu, buffer->cpumask))
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		goto out;
 
 	cpu_buffer = buffer->buffers[cpu];
@@ -1396,7 +1404,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
 
 	cpu = raw_smp_processor_id();
 
-	if (!cpu_isset(cpu, buffer->cpumask))
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		goto out;
 
 	cpu_buffer = buffer->buffers[cpu];
@@ -1478,7 +1486,7 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 
-	if (!cpu_isset(cpu, buffer->cpumask))
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return;
 
 	cpu_buffer = buffer->buffers[cpu];
@@ -1498,7 +1506,7 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 
-	if (!cpu_isset(cpu, buffer->cpumask))
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return;
 
 	cpu_buffer = buffer->buffers[cpu];
@@ -1515,7 +1523,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 
-	if (!cpu_isset(cpu, buffer->cpumask))
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return 0;
 
 	cpu_buffer = buffer->buffers[cpu];
@@ -1532,7 +1540,7 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 
-	if (!cpu_isset(cpu, buffer->cpumask))
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return 0;
 
 	cpu_buffer = buffer->buffers[cpu];
@@ -1850,7 +1858,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	struct buffer_page *reader;
 	int nr_loops = 0;
 
-	if (!cpu_isset(cpu, buffer->cpumask))
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return NULL;
 
 	cpu_buffer = buffer->buffers[cpu];
@@ -2025,7 +2033,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
 	struct ring_buffer_event *event;
 	unsigned long flags;
 
-	if (!cpu_isset(cpu, buffer->cpumask))
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return NULL;
 
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
@@ -2062,7 +2070,7 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
 	struct ring_buffer_iter *iter;
 	unsigned long flags;
 
-	if (!cpu_isset(cpu, buffer->cpumask))
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return NULL;
 
 	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
@@ -2172,7 +2180,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 	unsigned long flags;
 
-	if (!cpu_isset(cpu, buffer->cpumask))
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return;
 
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
@@ -2228,7 +2236,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 
-	if (!cpu_isset(cpu, buffer->cpumask))
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return 1;
 
 	cpu_buffer = buffer->buffers[cpu];
@@ -2252,8 +2260,8 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 	struct ring_buffer_per_cpu *cpu_buffer_a;
 	struct ring_buffer_per_cpu *cpu_buffer_b;
 
-	if (!cpu_isset(cpu, buffer_a->cpumask) ||
-	    !cpu_isset(cpu, buffer_b->cpumask))
+	if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
+	    !cpumask_test_cpu(cpu, buffer_b->cpumask))
 		return -EINVAL;
 
 	/* At least make sure the two buffers are somewhat the same */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0e91f43b6ba..5d04e27f3b4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -89,10 +89,10 @@ static inline void ftrace_enable_cpu(void)
 	preempt_enable();
 }
 
-static cpumask_t __read_mostly		tracing_buffer_mask;
+static cpumask_var_t __read_mostly	tracing_buffer_mask;
 
 #define for_each_tracing_cpu(cpu)	\
-	for_each_cpu_mask(cpu, tracing_buffer_mask)
+	for_each_cpu(cpu, tracing_buffer_mask)
 
 /*
  * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
@@ -2646,13 +2646,7 @@ static struct file_operations show_traces_fops = {
 /*
  * Only trace on a CPU if the bitmask is set:
  */
-static cpumask_t tracing_cpumask = CPU_MASK_ALL;
-
-/*
- * When tracing/tracing_cpu_mask is modified then this holds
- * the new bitmask we are about to install:
- */
-static cpumask_t tracing_cpumask_new;
+static cpumask_var_t tracing_cpumask;
 
 /*
  * The tracer itself will not take this lock, but still we want
@@ -2674,7 +2668,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf,
 
 	mutex_lock(&tracing_cpumask_update_lock);
 
-	len = cpumask_scnprintf(mask_str, count, &tracing_cpumask);
+	len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
 	if (count - len < 2) {
 		count = -EINVAL;
 		goto out_err;
@@ -2693,9 +2687,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
 		      size_t count, loff_t *ppos)
 {
 	int err, cpu;
+	cpumask_var_t tracing_cpumask_new;
+
+	if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
+		return -ENOMEM;
 
 	mutex_lock(&tracing_cpumask_update_lock);
-	err = cpumask_parse_user(ubuf, count, &tracing_cpumask_new);
+	err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
 	if (err)
 		goto err_unlock;
 
@@ -2706,26 +2704,28 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
 		 * Increase/decrease the disabled counter if we are
 		 * about to flip a bit in the cpumask:
 		 */
-		if (cpu_isset(cpu, tracing_cpumask) &&
-				!cpu_isset(cpu, tracing_cpumask_new)) {
+		if (cpumask_test_cpu(cpu, tracing_cpumask) &&
+				!cpumask_test_cpu(cpu, tracing_cpumask_new)) {
 			atomic_inc(&global_trace.data[cpu]->disabled);
 		}
-		if (!cpu_isset(cpu, tracing_cpumask) &&
-				cpu_isset(cpu, tracing_cpumask_new)) {
+		if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
+				cpumask_test_cpu(cpu, tracing_cpumask_new)) {
 			atomic_dec(&global_trace.data[cpu]->disabled);
 		}
 	}
 	__raw_spin_unlock(&ftrace_max_lock);
 	local_irq_enable();
 
-	tracing_cpumask = tracing_cpumask_new;
+	cpumask_copy(tracing_cpumask, tracing_cpumask_new);
 
 	mutex_unlock(&tracing_cpumask_update_lock);
+	free_cpumask_var(tracing_cpumask_new);
 
 	return count;
 
 err_unlock:
 	mutex_unlock(&tracing_cpumask_update_lock);
+	free_cpumask_var(tracing_cpumask);
 
 	return err;
 }
@@ -3752,7 +3752,6 @@ void ftrace_dump(void)
 	static DEFINE_SPINLOCK(ftrace_dump_lock);
 	/* use static because iter can be a bit big for the stack */
 	static struct trace_iterator iter;
-	static cpumask_t mask;
 	static int dump_ran;
 	unsigned long flags;
 	int cnt = 0, cpu;
@@ -3786,8 +3785,6 @@ void ftrace_dump(void)
 	 * and then release the locks again.
 	 */
 
-	cpus_clear(mask);
-
 	while (!trace_empty(&iter)) {
 
 		if (!cnt)
@@ -3823,19 +3820,28 @@ __init static int tracer_alloc_buffers(void)
 {
 	struct trace_array_cpu *data;
 	int i;
+	int ret = -ENOMEM;
 
-	/* TODO: make the number of buffers hot pluggable with CPUS */
-	tracing_buffer_mask = cpu_possible_map;
+	if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
+		goto out;
+
+	if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
+		goto out_free_buffer_mask;
 
+	cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
+	cpumask_copy(tracing_cpumask, cpu_all_mask);
+
+	/* TODO: make the number of buffers hot pluggable with CPUS */
 	global_trace.buffer = ring_buffer_alloc(trace_buf_size,
 						   TRACE_BUFFER_FLAGS);
 	if (!global_trace.buffer) {
 		printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
 		WARN_ON(1);
-		return 0;
+		goto out_free_cpumask;
 	}
 	global_trace.entries = ring_buffer_size(global_trace.buffer);
 
+
 #ifdef CONFIG_TRACER_MAX_TRACE
 	max_tr.buffer = ring_buffer_alloc(trace_buf_size,
 					     TRACE_BUFFER_FLAGS);
@@ -3843,7 +3849,7 @@ __init static int tracer_alloc_buffers(void)
 		printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
 		WARN_ON(1);
 		ring_buffer_free(global_trace.buffer);
-		return 0;
+		goto out_free_cpumask;
 	}
 	max_tr.entries = ring_buffer_size(max_tr.buffer);
 	WARN_ON(max_tr.entries != global_trace.entries);
@@ -3873,8 +3879,14 @@ __init static int tracer_alloc_buffers(void)
 				       &trace_panic_notifier);
 
 	register_die_notifier(&trace_die_notifier);
+	ret = 0;
 
-	return 0;
+out_free_cpumask:
+	free_cpumask_var(tracing_cpumask);
+out_free_buffer_mask:
+	free_cpumask_var(tracing_buffer_mask);
+out:
+	return ret;
 }
 early_initcall(tracer_alloc_buffers);
 fs_initcall(tracer_init_debugfs);
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index a5779bd975d..eaca5ad803f 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -196,9 +196,9 @@ static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
 	return HRTIMER_RESTART;
 }
 
-static void start_stack_timer(int cpu)
+static void start_stack_timer(void *unused)
 {
-	struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
+	struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer);
 
 	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hrtimer->function = stack_trace_timer_fn;
@@ -208,14 +208,7 @@ static void start_stack_timer(int cpu)
 
 static void start_stack_timers(void)
 {
-	cpumask_t saved_mask = current->cpus_allowed;
-	int cpu;
-
-	for_each_online_cpu(cpu) {
-		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-		start_stack_timer(cpu);
-	}
-	set_cpus_allowed_ptr(current, &saved_mask);
+	on_each_cpu(start_stack_timer, NULL, 1);
 }
 
 static void stop_stack_timer(int cpu)
-- 
cgit v1.2.3


From 4462344ee9ea9224d026801b877887f2f39774a3 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:23 +1030
Subject: cpumask: convert kernel trace functions further

Impact: Reduce future memory usage, use new cpumask API.

Since the last patch was created and acked, more old cpumask users
slipped into kernel/trace.

Mostly trivial conversions, except struct trace_iterator's "started"
member becomes a cpumask_var_t.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/trace/trace.c                 | 12 +++++++++---
 kernel/trace/trace.h                 |  2 +-
 kernel/trace/trace_boot.c            |  2 +-
 kernel/trace/trace_functions_graph.c |  2 +-
 kernel/trace/trace_hw_branches.c     |  6 +++---
 kernel/trace/trace_power.c           |  2 +-
 6 files changed, 16 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5d04e27f3b4..c580233add9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1811,10 +1811,10 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
 	if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
 		return;
 
-	if (cpu_isset(iter->cpu, iter->started))
+	if (cpumask_test_cpu(iter->cpu, iter->started))
 		return;
 
-	cpu_set(iter->cpu, iter->started);
+	cpumask_set_cpu(iter->cpu, iter->started);
 	trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
 }
 
@@ -3114,10 +3114,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 	if (!iter)
 		return -ENOMEM;
 
+	if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
+		kfree(iter);
+		return -ENOMEM;
+	}
+
 	mutex_lock(&trace_types_lock);
 
 	/* trace pipe does not show start of buffer */
-	cpus_setall(iter->started);
+	cpumask_setall(iter->started);
 
 	iter->tr = &global_trace;
 	iter->trace = current_trace;
@@ -3134,6 +3139,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
 {
 	struct trace_iterator *iter = file->private_data;
 
+	free_cpumask_var(iter->started);
 	kfree(iter);
 	atomic_dec(&tracing_reader);
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cc7a4f86403..4d3d381bfd9 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -368,7 +368,7 @@ struct trace_iterator {
 	loff_t			pos;
 	long			idx;
 
-	cpumask_t		started;
+	cpumask_var_t		started;
 };
 
 int tracing_is_enabled(void);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 3ccebde2848..366c8c333e1 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -42,7 +42,7 @@ static int boot_trace_init(struct trace_array *tr)
 	int cpu;
 	boot_trace = tr;
 
-	for_each_cpu_mask(cpu, cpu_possible_map)
+	for_each_cpu(cpu, cpu_possible_mask)
 		tracing_reset(tr, cpu);
 
 	tracing_sched_switch_assign_trace(tr);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4bf39fcae97..930c08e5b38 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -79,7 +79,7 @@ print_graph_cpu(struct trace_seq *s, int cpu)
 	int i;
 	int ret;
 	int log10_this = log10_cpu(cpu);
-	int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map));
+	int log10_all = log10_cpu(cpumask_weight(cpu_online_mask));
 
 
 	/*
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index b6a3e20a49a..649df22d435 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -46,7 +46,7 @@ static void bts_trace_start(struct trace_array *tr)
 
 	tracing_reset_online_cpus(tr);
 
-	for_each_cpu_mask(cpu, cpu_possible_map)
+	for_each_cpu(cpu, cpu_possible_mask)
 		smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
 }
 
@@ -62,7 +62,7 @@ static void bts_trace_stop(struct trace_array *tr)
 {
 	int cpu;
 
-	for_each_cpu_mask(cpu, cpu_possible_map)
+	for_each_cpu(cpu, cpu_possible_mask)
 		smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
 }
 
@@ -172,7 +172,7 @@ static void trace_bts_prepare(struct trace_iterator *iter)
 {
 	int cpu;
 
-	for_each_cpu_mask(cpu, cpu_possible_map)
+	for_each_cpu(cpu, cpu_possible_mask)
 		smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
 }
 
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index a7172a352f6..7bda248daf5 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -39,7 +39,7 @@ static int power_trace_init(struct trace_array *tr)
 
 	trace_power_enabled = 1;
 
-	for_each_cpu_mask(cpu, cpu_possible_map)
+	for_each_cpu(cpu, cpu_possible_mask)
 		tracing_reset(tr, cpu);
 	return 0;
 }
-- 
cgit v1.2.3


From f1fc057c79cb2d27602fb3ad08a031f13459ef27 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:23 +1030
Subject: cpumask: remove any_online_cpu() users: kernel/

Impact: Remove obsolete API usage

any_online_cpu() is a good name, but it takes a cpumask_t, not a
pointer.

There are several places where any_online_cpu() doesn't really want a
mask arg at all.  Replace all callers with cpumask_any() and
cpumask_any_and().

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 kernel/softirq.c    | 2 +-
 kernel/softlockup.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 466e75ce271..b7568d7def2 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -733,7 +733,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
 			break;
 		/* Unbind so it can run.  Fall thru. */
 		kthread_bind(per_cpu(ksoftirqd, hotcpu),
-			     any_online_cpu(cpu_online_map));
+			     cpumask_any(cpu_online_mask));
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN: {
 		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 1ab790c67b1..492f0c72fec 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -303,7 +303,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		break;
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		check_cpu = any_online_cpu(cpu_online_map);
+		check_cpu = cpumask_any(cpu_online_mask);
 		wake_up_process(per_cpu(watchdog_task, hotcpu));
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
@@ -313,7 +313,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 			cpumask_t temp_cpu_online_map = cpu_online_map;
 
 			cpu_clear(hotcpu, temp_cpu_online_map);
-			check_cpu = any_online_cpu(temp_cpu_online_map);
+			check_cpu = cpumask_any(&temp_cpu_online_map);
 		}
 		break;
 
@@ -323,7 +323,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 			break;
 		/* Unbind so it can run.  Fall thru. */
 		kthread_bind(per_cpu(watchdog_task, hotcpu),
-			     any_online_cpu(cpu_online_map));
+			     cpumask_any(cpu_online_mask));
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 		p = per_cpu(watchdog_task, hotcpu);
-- 
cgit v1.2.3


From a45185d2d7108b01b90b9e0293377be4d6346dde Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:24 +1030
Subject: cpumask: convert kernel/compat.c

Impact: Reduce stack usage, use new cpumask API.

Straightforward conversion; cpumasks' size is given by cpumask_size() (now
a variable rather than fixed) and on-stack cpu masks use cpumask_var_t.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/compat.c | 49 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index 8eafe3eb50d..d52e2ec1deb 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -454,16 +454,16 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
 }
 
 static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
-				    unsigned len, cpumask_t *new_mask)
+				    unsigned len, struct cpumask *new_mask)
 {
 	unsigned long *k;
 
-	if (len < sizeof(cpumask_t))
-		memset(new_mask, 0, sizeof(cpumask_t));
-	else if (len > sizeof(cpumask_t))
-		len = sizeof(cpumask_t);
+	if (len < cpumask_size())
+		memset(new_mask, 0, cpumask_size());
+	else if (len > cpumask_size())
+		len = cpumask_size();
 
-	k = cpus_addr(*new_mask);
+	k = cpumask_bits(new_mask);
 	return compat_get_bitmap(k, user_mask_ptr, len * 8);
 }
 
@@ -471,40 +471,51 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
 					     unsigned int len,
 					     compat_ulong_t __user *user_mask_ptr)
 {
-	cpumask_t new_mask;
+	cpumask_var_t new_mask;
 	int retval;
 
-	retval = compat_get_user_cpu_mask(user_mask_ptr, len, &new_mask);
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	retval = compat_get_user_cpu_mask(user_mask_ptr, len, new_mask);
 	if (retval)
-		return retval;
+		goto out;
 
-	return sched_setaffinity(pid, &new_mask);
+	retval = sched_setaffinity(pid, new_mask);
+out:
+	free_cpumask_var(new_mask);
+	return retval;
 }
 
 asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
 					     compat_ulong_t __user *user_mask_ptr)
 {
 	int ret;
-	cpumask_t mask;
+	cpumask_var_t mask;
 	unsigned long *k;
-	unsigned int min_length = sizeof(cpumask_t);
+	unsigned int min_length = cpumask_size();
 
-	if (NR_CPUS <= BITS_PER_COMPAT_LONG)
+	if (nr_cpu_ids <= BITS_PER_COMPAT_LONG)
 		min_length = sizeof(compat_ulong_t);
 
 	if (len < min_length)
 		return -EINVAL;
 
-	ret = sched_getaffinity(pid, &mask);
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = sched_getaffinity(pid, mask);
 	if (ret < 0)
-		return ret;
+		goto out;
 
-	k = cpus_addr(mask);
+	k = cpumask_bits(mask);
 	ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8);
-	if (ret)
-		return ret;
+	if (ret == 0)
+		ret = min_length;
 
-	return min_length;
+out:
+	free_cpumask_var(mask);
+	return ret;
 }
 
 int get_compat_itimerspec(struct itimerspec *dst,
-- 
cgit v1.2.3


From e7577c50f2fb2d1c167e2c04a4b4c2cc042acb82 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:25 +1030
Subject: cpumask: convert kernel/workqueue.c

Impact: Reduce memory usage, use new cpumask API.

cpu_populated_map becomes a cpumask_var_t, and cpu_singlethread_map is
simply a cpumask pointer: it's simply the cpumask containing the first
possible CPU anyway.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/workqueue.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4952322cba4..2f445833ae3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(workqueue_lock);
 static LIST_HEAD(workqueues);
 
 static int singlethread_cpu __read_mostly;
-static cpumask_t cpu_singlethread_map __read_mostly;
+static const struct cpumask *cpu_singlethread_map __read_mostly;
 /*
  * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD
  * flushes cwq->worklist. This means that flush_workqueue/wait_on_work
@@ -81,7 +81,7 @@ static cpumask_t cpu_singlethread_map __read_mostly;
  * use cpu_possible_map, the cpumask below is more a documentation
  * than optimization.
  */
-static cpumask_t cpu_populated_map __read_mostly;
+static cpumask_var_t cpu_populated_map __read_mostly;
 
 /* If it's single threaded, it isn't in the list of workqueues. */
 static inline int is_wq_single_threaded(struct workqueue_struct *wq)
@@ -89,10 +89,10 @@ static inline int is_wq_single_threaded(struct workqueue_struct *wq)
 	return wq->singlethread;
 }
 
-static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq)
+static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq)
 {
 	return is_wq_single_threaded(wq)
-		? &cpu_singlethread_map : &cpu_populated_map;
+		? cpu_singlethread_map : cpu_populated_map;
 }
 
 static
@@ -410,7 +410,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
  */
 void flush_workqueue(struct workqueue_struct *wq)
 {
-	const cpumask_t *cpu_map = wq_cpu_map(wq);
+	const struct cpumask *cpu_map = wq_cpu_map(wq);
 	int cpu;
 
 	might_sleep();
@@ -532,7 +532,7 @@ static void wait_on_work(struct work_struct *work)
 {
 	struct cpu_workqueue_struct *cwq;
 	struct workqueue_struct *wq;
-	const cpumask_t *cpu_map;
+	const struct cpumask *cpu_map;
 	int cpu;
 
 	might_sleep();
@@ -903,7 +903,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
  */
 void destroy_workqueue(struct workqueue_struct *wq)
 {
-	const cpumask_t *cpu_map = wq_cpu_map(wq);
+	const struct cpumask *cpu_map = wq_cpu_map(wq);
 	int cpu;
 
 	cpu_maps_update_begin();
@@ -933,7 +933,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_UP_PREPARE:
-		cpu_set(cpu, cpu_populated_map);
+		cpumask_set_cpu(cpu, cpu_populated_map);
 	}
 undo:
 	list_for_each_entry(wq, &workqueues, list) {
@@ -964,7 +964,7 @@ undo:
 	switch (action) {
 	case CPU_UP_CANCELED:
 	case CPU_POST_DEAD:
-		cpu_clear(cpu, cpu_populated_map);
+		cpumask_clear_cpu(cpu, cpu_populated_map);
 	}
 
 	return ret;
@@ -1017,9 +1017,11 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
 
 void __init init_workqueues(void)
 {
-	cpu_populated_map = cpu_online_map;
-	singlethread_cpu = first_cpu(cpu_possible_map);
-	cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu);
+	alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL);
+
+	cpumask_copy(cpu_populated_map, cpu_online_mask);
+	singlethread_cpu = cpumask_first(cpu_possible_mask);
+	cpu_singlethread_map = cpumask_of(singlethread_cpu);
 	hotcpu_notifier(workqueue_cpu_callback, 0);
 	keventd_wq = create_workqueue("events");
 	BUG_ON(!keventd_wq);
-- 
cgit v1.2.3


From 6b954823c24f04ed026a8517f6bab5abda279db8 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:25 +1030
Subject: cpumask: convert kernel time functions

Impact: Use new APIs

Convert kernel/time functions to use struct cpumask *.

Note the ugly bitmap declarations in tick-broadcast.c.  These should
be cpumask_var_t, but there was no obvious initialization function to
put the alloc_cpumask_var() calls in.  This was safe.

(Eventually 'struct cpumask' will be undefined for CONFIG_CPUMASK_OFFSTACK,
so we use a bitmap here to show we really mean it).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 kernel/time/clocksource.c    |   6 +--
 kernel/time/tick-broadcast.c | 113 ++++++++++++++++++++++---------------------
 kernel/time/tick-common.c    |   6 +--
 3 files changed, 64 insertions(+), 61 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 9ed2eec9752..32141b15d63 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -148,7 +148,7 @@ static void clocksource_watchdog(unsigned long data)
 		int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map);
 
 		if (next_cpu >= nr_cpu_ids)
-			next_cpu = first_cpu(cpu_online_map);
+			next_cpu = cpumask_first(cpu_online_mask);
 		watchdog_timer.expires += WATCHDOG_INTERVAL;
 		add_timer_on(&watchdog_timer, next_cpu);
 	}
@@ -173,7 +173,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 			watchdog_last = watchdog->read();
 			watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
 			add_timer_on(&watchdog_timer,
-				     first_cpu(cpu_online_map));
+				     cpumask_first(cpu_online_mask));
 		}
 	} else {
 		if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -195,7 +195,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 				watchdog_timer.expires =
 					jiffies + WATCHDOG_INTERVAL;
 				add_timer_on(&watchdog_timer,
-					     first_cpu(cpu_online_map));
+					     cpumask_first(cpu_online_mask));
 			}
 		}
 	}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 9590af2327b..356fac57a18 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -28,7 +28,9 @@
  */
 
 struct tick_device tick_broadcast_device;
-static cpumask_t tick_broadcast_mask;
+/* FIXME: Use cpumask_var_t. */
+static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
+static DECLARE_BITMAP(tmpmask, NR_CPUS);
 static DEFINE_SPINLOCK(tick_broadcast_lock);
 static int tick_broadcast_force;
 
@@ -46,9 +48,9 @@ struct tick_device *tick_get_broadcast_device(void)
 	return &tick_broadcast_device;
 }
 
-cpumask_t *tick_get_broadcast_mask(void)
+struct cpumask *tick_get_broadcast_mask(void)
 {
-	return &tick_broadcast_mask;
+	return to_cpumask(tick_broadcast_mask);
 }
 
 /*
@@ -72,7 +74,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
 
 	clockevents_exchange_device(NULL, dev);
 	tick_broadcast_device.evtdev = dev;
-	if (!cpus_empty(tick_broadcast_mask))
+	if (!cpumask_empty(tick_get_broadcast_mask()))
 		tick_broadcast_start_periodic(dev);
 	return 1;
 }
@@ -104,7 +106,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
 	 */
 	if (!tick_device_is_functional(dev)) {
 		dev->event_handler = tick_handle_periodic;
-		cpu_set(cpu, tick_broadcast_mask);
+		cpumask_set_cpu(cpu, tick_get_broadcast_mask());
 		tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
 		ret = 1;
 	} else {
@@ -116,7 +118,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
 		if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
 			int cpu = smp_processor_id();
 
-			cpu_clear(cpu, tick_broadcast_mask);
+			cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
 			tick_broadcast_clear_oneshot(cpu);
 		}
 	}
@@ -125,9 +127,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
 }
 
 /*
- * Broadcast the event to the cpus, which are set in the mask
+ * Broadcast the event to the cpus, which are set in the mask (mangled).
  */
-static void tick_do_broadcast(cpumask_t mask)
+static void tick_do_broadcast(struct cpumask *mask)
 {
 	int cpu = smp_processor_id();
 	struct tick_device *td;
@@ -135,22 +137,21 @@ static void tick_do_broadcast(cpumask_t mask)
 	/*
 	 * Check, if the current cpu is in the mask
 	 */
-	if (cpu_isset(cpu, mask)) {
-		cpu_clear(cpu, mask);
+	if (cpumask_test_cpu(cpu, mask)) {
+		cpumask_clear_cpu(cpu, mask);
 		td = &per_cpu(tick_cpu_device, cpu);
 		td->evtdev->event_handler(td->evtdev);
 	}
 
-	if (!cpus_empty(mask)) {
+	if (!cpumask_empty(mask)) {
 		/*
 		 * It might be necessary to actually check whether the devices
 		 * have different broadcast functions. For now, just use the
 		 * one of the first device. This works as long as we have this
 		 * misfeature only on x86 (lapic)
 		 */
-		cpu = first_cpu(mask);
-		td = &per_cpu(tick_cpu_device, cpu);
-		td->evtdev->broadcast(&mask);
+		td = &per_cpu(tick_cpu_device, cpumask_first(mask));
+		td->evtdev->broadcast(mask);
 	}
 }
 
@@ -160,12 +161,11 @@ static void tick_do_broadcast(cpumask_t mask)
  */
 static void tick_do_periodic_broadcast(void)
 {
-	cpumask_t mask;
-
 	spin_lock(&tick_broadcast_lock);
 
-	cpus_and(mask, cpu_online_map, tick_broadcast_mask);
-	tick_do_broadcast(mask);
+	cpumask_and(to_cpumask(tmpmask),
+		    cpu_online_mask, tick_get_broadcast_mask());
+	tick_do_broadcast(to_cpumask(tmpmask));
 
 	spin_unlock(&tick_broadcast_lock);
 }
@@ -228,13 +228,13 @@ static void tick_do_broadcast_on_off(void *why)
 	if (!tick_device_is_functional(dev))
 		goto out;
 
-	bc_stopped = cpus_empty(tick_broadcast_mask);
+	bc_stopped = cpumask_empty(tick_get_broadcast_mask());
 
 	switch (*reason) {
 	case CLOCK_EVT_NOTIFY_BROADCAST_ON:
 	case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
-		if (!cpu_isset(cpu, tick_broadcast_mask)) {
-			cpu_set(cpu, tick_broadcast_mask);
+		if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
+			cpumask_set_cpu(cpu, tick_get_broadcast_mask());
 			if (tick_broadcast_device.mode ==
 			    TICKDEV_MODE_PERIODIC)
 				clockevents_shutdown(dev);
@@ -244,8 +244,8 @@ static void tick_do_broadcast_on_off(void *why)
 		break;
 	case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
 		if (!tick_broadcast_force &&
-		    cpu_isset(cpu, tick_broadcast_mask)) {
-			cpu_clear(cpu, tick_broadcast_mask);
+		    cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
+			cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
 			if (tick_broadcast_device.mode ==
 			    TICKDEV_MODE_PERIODIC)
 				tick_setup_periodic(dev, 0);
@@ -253,7 +253,7 @@ static void tick_do_broadcast_on_off(void *why)
 		break;
 	}
 
-	if (cpus_empty(tick_broadcast_mask)) {
+	if (cpumask_empty(tick_get_broadcast_mask())) {
 		if (!bc_stopped)
 			clockevents_shutdown(bc);
 	} else if (bc_stopped) {
@@ -272,7 +272,7 @@ out:
  */
 void tick_broadcast_on_off(unsigned long reason, int *oncpu)
 {
-	if (!cpu_isset(*oncpu, cpu_online_map))
+	if (!cpumask_test_cpu(*oncpu, cpu_online_mask))
 		printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
 		       "offline CPU #%d\n", *oncpu);
 	else
@@ -303,10 +303,10 @@ void tick_shutdown_broadcast(unsigned int *cpup)
 	spin_lock_irqsave(&tick_broadcast_lock, flags);
 
 	bc = tick_broadcast_device.evtdev;
-	cpu_clear(cpu, tick_broadcast_mask);
+	cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
 
 	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
-		if (bc && cpus_empty(tick_broadcast_mask))
+		if (bc && cpumask_empty(tick_get_broadcast_mask()))
 			clockevents_shutdown(bc);
 	}
 
@@ -342,10 +342,10 @@ int tick_resume_broadcast(void)
 
 		switch (tick_broadcast_device.mode) {
 		case TICKDEV_MODE_PERIODIC:
-			if(!cpus_empty(tick_broadcast_mask))
+			if (!cpumask_empty(tick_get_broadcast_mask()))
 				tick_broadcast_start_periodic(bc);
-			broadcast = cpu_isset(smp_processor_id(),
-					      tick_broadcast_mask);
+			broadcast = cpumask_test_cpu(smp_processor_id(),
+						     tick_get_broadcast_mask());
 			break;
 		case TICKDEV_MODE_ONESHOT:
 			broadcast = tick_resume_broadcast_oneshot(bc);
@@ -360,14 +360,15 @@ int tick_resume_broadcast(void)
 
 #ifdef CONFIG_TICK_ONESHOT
 
-static cpumask_t tick_broadcast_oneshot_mask;
+/* FIXME: use cpumask_var_t. */
+static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS);
 
 /*
- * Debugging: see timer_list.c
+ * Exposed for debugging: see timer_list.c
  */
-cpumask_t *tick_get_broadcast_oneshot_mask(void)
+struct cpumask *tick_get_broadcast_oneshot_mask(void)
 {
-	return &tick_broadcast_oneshot_mask;
+	return to_cpumask(tick_broadcast_oneshot_mask);
 }
 
 static int tick_broadcast_set_event(ktime_t expires, int force)
@@ -389,7 +390,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
  */
 void tick_check_oneshot_broadcast(int cpu)
 {
-	if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
+	if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) {
 		struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
 
 		clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
@@ -402,7 +403,6 @@ void tick_check_oneshot_broadcast(int cpu)
 static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
 {
 	struct tick_device *td;
-	cpumask_t mask;
 	ktime_t now, next_event;
 	int cpu;
 
@@ -410,13 +410,13 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
 again:
 	dev->next_event.tv64 = KTIME_MAX;
 	next_event.tv64 = KTIME_MAX;
-	mask = CPU_MASK_NONE;
+	cpumask_clear(to_cpumask(tmpmask));
 	now = ktime_get();
 	/* Find all expired events */
-	for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) {
+	for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) {
 		td = &per_cpu(tick_cpu_device, cpu);
 		if (td->evtdev->next_event.tv64 <= now.tv64)
-			cpu_set(cpu, mask);
+			cpumask_set_cpu(cpu, to_cpumask(tmpmask));
 		else if (td->evtdev->next_event.tv64 < next_event.tv64)
 			next_event.tv64 = td->evtdev->next_event.tv64;
 	}
@@ -424,7 +424,7 @@ again:
 	/*
 	 * Wakeup the cpus which have an expired event.
 	 */
-	tick_do_broadcast(mask);
+	tick_do_broadcast(to_cpumask(tmpmask));
 
 	/*
 	 * Two reasons for reprogram:
@@ -476,15 +476,16 @@ void tick_broadcast_oneshot_control(unsigned long reason)
 		goto out;
 
 	if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
-		if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
-			cpu_set(cpu, tick_broadcast_oneshot_mask);
+		if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
+			cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
 			clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
 			if (dev->next_event.tv64 < bc->next_event.tv64)
 				tick_broadcast_set_event(dev->next_event, 1);
 		}
 	} else {
-		if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
-			cpu_clear(cpu, tick_broadcast_oneshot_mask);
+		if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
+			cpumask_clear_cpu(cpu,
+					  tick_get_broadcast_oneshot_mask());
 			clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
 			if (dev->next_event.tv64 != KTIME_MAX)
 				tick_program_event(dev->next_event, 1);
@@ -502,10 +503,11 @@ out:
  */
 static void tick_broadcast_clear_oneshot(int cpu)
 {
-	cpu_clear(cpu, tick_broadcast_oneshot_mask);
+	cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
 }
 
-static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires)
+static void tick_broadcast_init_next_event(struct cpumask *mask,
+					   ktime_t expires)
 {
 	struct tick_device *td;
 	int cpu;
@@ -526,7 +528,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 	if (bc->event_handler != tick_handle_oneshot_broadcast) {
 		int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
 		int cpu = smp_processor_id();
-		cpumask_t mask;
 
 		bc->event_handler = tick_handle_oneshot_broadcast;
 		clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
@@ -540,13 +541,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 		 * oneshot_mask bits for those and program the
 		 * broadcast device to fire.
 		 */
-		mask = tick_broadcast_mask;
-		cpu_clear(cpu, mask);
-		cpus_or(tick_broadcast_oneshot_mask,
-			tick_broadcast_oneshot_mask, mask);
-
-		if (was_periodic && !cpus_empty(mask)) {
-			tick_broadcast_init_next_event(&mask, tick_next_period);
+		cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask());
+		cpumask_clear_cpu(cpu, to_cpumask(tmpmask));
+		cpumask_or(tick_get_broadcast_oneshot_mask(),
+			   tick_get_broadcast_oneshot_mask(),
+			   to_cpumask(tmpmask));
+
+		if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) {
+			tick_broadcast_init_next_event(to_cpumask(tmpmask),
+						       tick_next_period);
 			tick_broadcast_set_event(tick_next_period, 1);
 		} else
 			bc->next_event.tv64 = KTIME_MAX;
@@ -585,7 +588,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
 	 * Clear the broadcast mask flag for the dead cpu, but do not
 	 * stop the broadcast device!
 	 */
-	cpu_clear(cpu, tick_broadcast_oneshot_mask);
+	cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
 
 	spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index f8372be7412..63e05d423a0 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -254,7 +254,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
 		curdev = NULL;
 	}
 	clockevents_exchange_device(curdev, newdev);
-	tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu));
+	tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
 	if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
 		tick_oneshot_notify();
 
@@ -299,9 +299,9 @@ static void tick_shutdown(unsigned int *cpup)
 	}
 	/* Transfer the do_timer job away from this cpu */
 	if (*cpup == tick_do_timer_cpu) {
-		int cpu = first_cpu(cpu_online_map);
+		int cpu = cpumask_first(cpu_online_mask);
 
-		tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu :
+		tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
 			TICK_DO_TIMER_NONE;
 	}
 	spin_unlock_irqrestore(&tick_device_lock, flags);
-- 
cgit v1.2.3


From d036e67b40f52bdd95392390108defbac7e53837 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:26 +1030
Subject: cpumask: convert kernel/irq

Impact: Reduce stack usage, use new cpumask API.  ALPHA mod!

Main change is that irq_default_affinity becomes a cpumask_var_t, so
treat it as a pointer (this effects alpha).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/irq/manage.c | 11 +++++++++--
 kernel/irq/proc.c   | 32 +++++++++++++++++++++-----------
 2 files changed, 30 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 61c4a9b6216..cd0cd8dcb34 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -16,8 +16,15 @@
 #include "internals.h"
 
 #ifdef CONFIG_SMP
+cpumask_var_t irq_default_affinity;
 
-cpumask_t irq_default_affinity = CPU_MASK_ALL;
+static int init_irq_default_affinity(void)
+{
+	alloc_cpumask_var(&irq_default_affinity, GFP_KERNEL);
+	cpumask_setall(irq_default_affinity);
+	return 0;
+}
+core_initcall(init_irq_default_affinity);
 
 /**
  *	synchronize_irq - wait for pending IRQ handlers (on other CPUs)
@@ -127,7 +134,7 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
 			desc->status &= ~IRQ_AFFINITY_SET;
 	}
 
-	cpumask_and(&desc->affinity, cpu_online_mask, &irq_default_affinity);
+	cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity);
 set_affinity:
 	desc->chip->set_affinity(irq, &desc->affinity);
 
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index d2c0e5ee53c..2abd3a7716e 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -20,7 +20,7 @@ static struct proc_dir_entry *root_irq_dir;
 static int irq_affinity_proc_show(struct seq_file *m, void *v)
 {
 	struct irq_desc *desc = irq_to_desc((long)m->private);
-	cpumask_t *mask = &desc->affinity;
+	const struct cpumask *mask = &desc->affinity;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (desc->status & IRQ_MOVE_PENDING)
@@ -93,7 +93,7 @@ static const struct file_operations irq_affinity_proc_fops = {
 
 static int default_affinity_show(struct seq_file *m, void *v)
 {
-	seq_cpumask(m, &irq_default_affinity);
+	seq_cpumask(m, irq_default_affinity);
 	seq_putc(m, '\n');
 	return 0;
 }
@@ -101,27 +101,37 @@ static int default_affinity_show(struct seq_file *m, void *v)
 static ssize_t default_affinity_write(struct file *file,
 		const char __user *buffer, size_t count, loff_t *ppos)
 {
-	cpumask_t new_value;
+	cpumask_var_t new_value;
 	int err;
 
-	err = cpumask_parse_user(buffer, count, &new_value);
+	if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+		return -ENOMEM;
+
+	err = cpumask_parse_user(buffer, count, new_value);
 	if (err)
-		return err;
+		goto out;
 
-	if (!is_affinity_mask_valid(new_value))
-		return -EINVAL;
+	if (!is_affinity_mask_valid(new_value)) {
+		err = -EINVAL;
+		goto out;
+	}
 
 	/*
 	 * Do not allow disabling IRQs completely - it's a too easy
 	 * way to make the system unusable accidentally :-) At least
 	 * one online CPU still has to be targeted.
 	 */
-	if (!cpus_intersects(new_value, cpu_online_map))
-		return -EINVAL;
+	if (!cpumask_intersects(new_value, cpu_online_mask)) {
+		err = -EINVAL;
+		goto out;
+	}
 
-	irq_default_affinity = new_value;
+	cpumask_copy(irq_default_affinity, new_value);
+	err = count;
 
-	return count;
+out:
+	free_cpumask_var(new_value);
+	return err;
 }
 
 static int default_affinity_open(struct inode *inode, struct file *file)
-- 
cgit v1.2.3


From bd232f97b30f6bb630efa136a777647545db3039 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:26 +1030
Subject: cpumask: convert RCU implementations

Impact: use new cpumask API.

rcu_ctrlblk contains a cpumask, and it's highly optimized so I don't want
a cpumask_var_t (ie. a pointer) for the CONFIG_CPUMASK_OFFSTACK case.  It
could use a dangling bitmap, and be allocated in __rcu_init to save memory,
but for the moment we use a bitmap.

(Eventually 'struct cpumask' will be undefined for CONFIG_CPUMASK_OFFSTACK,
so we use a bitmap here to show we really mean it).

We remove on-stack cpumasks, using cpumask_var_t for
rcu_torture_shuffle_tasks() and for_each_cpu_and in force_quiescent_state().

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/rcuclassic.c | 32 +++++++++++++++++---------------
 kernel/rcupreempt.c | 19 ++++++++++---------
 kernel/rcutorture.c | 27 +++++++++++++++------------
 3 files changed, 42 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index e503a002f33..0ff9b05706a 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -63,14 +63,14 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
 	.completed = -300,
 	.pending = -300,
 	.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
-	.cpumask = CPU_MASK_NONE,
+	.cpumask = CPU_BITS_NONE,
 };
 static struct rcu_ctrlblk rcu_bh_ctrlblk = {
 	.cur = -300,
 	.completed = -300,
 	.pending = -300,
 	.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
-	.cpumask = CPU_MASK_NONE,
+	.cpumask = CPU_BITS_NONE,
 };
 
 DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
@@ -85,7 +85,6 @@ static void force_quiescent_state(struct rcu_data *rdp,
 			struct rcu_ctrlblk *rcp)
 {
 	int cpu;
-	cpumask_t cpumask;
 	unsigned long flags;
 
 	set_need_resched();
@@ -96,10 +95,10 @@ static void force_quiescent_state(struct rcu_data *rdp,
 		 * Don't send IPI to itself. With irqs disabled,
 		 * rdp->cpu is the current cpu.
 		 *
-		 * cpu_online_map is updated by the _cpu_down()
+		 * cpu_online_mask is updated by the _cpu_down()
 		 * using __stop_machine(). Since we're in irqs disabled
 		 * section, __stop_machine() is not exectuting, hence
-		 * the cpu_online_map is stable.
+		 * the cpu_online_mask is stable.
 		 *
 		 * However,  a cpu might have been offlined _just_ before
 		 * we disabled irqs while entering here.
@@ -107,13 +106,14 @@ static void force_quiescent_state(struct rcu_data *rdp,
 		 * notification, leading to the offlined cpu's bit
 		 * being set in the rcp->cpumask.
 		 *
-		 * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent
+		 * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
 		 * sending smp_reschedule() to an offlined CPU.
 		 */
-		cpus_and(cpumask, rcp->cpumask, cpu_online_map);
-		cpu_clear(rdp->cpu, cpumask);
-		for_each_cpu_mask_nr(cpu, cpumask)
-			smp_send_reschedule(cpu);
+		for_each_cpu_and(cpu,
+				  to_cpumask(rcp->cpumask), cpu_online_mask) {
+			if (cpu != rdp->cpu)
+				smp_send_reschedule(cpu);
+		}
 	}
 	spin_unlock_irqrestore(&rcp->lock, flags);
 }
@@ -193,7 +193,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
 
 	printk(KERN_ERR "INFO: RCU detected CPU stalls:");
 	for_each_possible_cpu(cpu) {
-		if (cpu_isset(cpu, rcp->cpumask))
+		if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
 			printk(" %d", cpu);
 	}
 	printk(" (detected by %d, t=%ld jiffies)\n",
@@ -221,7 +221,8 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
 	long delta;
 
 	delta = jiffies - rcp->jiffies_stall;
-	if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
+	if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
+		delta >= 0) {
 
 		/* We haven't checked in, so go dump stack. */
 		print_cpu_stall(rcp);
@@ -393,7 +394,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
 		 * unnecessarily.
 		 */
 		smp_mb();
-		cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
+		cpumask_andnot(to_cpumask(rcp->cpumask),
+			       cpu_online_mask, &nohz_cpu_mask);
 
 		rcp->signaled = 0;
 	}
@@ -406,8 +408,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
  */
 static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
 {
-	cpu_clear(cpu, rcp->cpumask);
-	if (cpus_empty(rcp->cpumask)) {
+	cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
+	if (cpumask_empty(to_cpumask(rcp->cpumask))) {
 		/* batch completed ! */
 		rcp->completed = rcp->cur;
 		rcu_start_batch(rcp);
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 04982659875..f9dc8f3720f 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -164,7 +164,8 @@ static char *rcu_try_flip_state_names[] =
 	{ "idle", "waitack", "waitzero", "waitmb" };
 #endif /* #ifdef CONFIG_RCU_TRACE */
 
-static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE;
+static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
+	= CPU_BITS_NONE;
 
 /*
  * Enum and per-CPU flag to determine when each CPU has seen
@@ -758,7 +759,7 @@ rcu_try_flip_idle(void)
 
 	/* Now ask each CPU for acknowledgement of the flip. */
 
-	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
+	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
 		per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
 		dyntick_save_progress_counter(cpu);
 	}
@@ -776,7 +777,7 @@ rcu_try_flip_waitack(void)
 	int cpu;
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
-	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
+	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
 		if (rcu_try_flip_waitack_needed(cpu) &&
 		    per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
 			RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
@@ -808,7 +809,7 @@ rcu_try_flip_waitzero(void)
 	/* Check to see if the sum of the "last" counters is zero. */
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
-	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
+	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
 		sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
 	if (sum != 0) {
 		RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
@@ -823,7 +824,7 @@ rcu_try_flip_waitzero(void)
 	smp_mb();  /*  ^^^^^^^^^^^^ */
 
 	/* Call for a memory barrier from each CPU. */
-	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
+	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
 		per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
 		dyntick_save_progress_counter(cpu);
 	}
@@ -843,7 +844,7 @@ rcu_try_flip_waitmb(void)
 	int cpu;
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
-	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
+	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
 		if (rcu_try_flip_waitmb_needed(cpu) &&
 		    per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
 			RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
@@ -1032,7 +1033,7 @@ void rcu_offline_cpu(int cpu)
 	RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
 	RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
 
-	cpu_clear(cpu, rcu_cpu_online_map);
+	cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
 
 	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
 
@@ -1072,7 +1073,7 @@ void __cpuinit rcu_online_cpu(int cpu)
 	struct rcu_data *rdp;
 
 	spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
-	cpu_set(cpu, rcu_cpu_online_map);
+	cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
 	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
 
 	/*
@@ -1430,7 +1431,7 @@ void __init __rcu_init(void)
 	 * We don't need protection against CPU-Hotplug here
 	 * since
 	 * a) If a CPU comes online while we are iterating over the
-	 *    cpu_online_map below, we would only end up making a
+	 *    cpu_online_mask below, we would only end up making a
 	 *    duplicate call to rcu_online_cpu() which sets the corresponding
 	 *    CPU's mask in the rcu_cpu_online_map.
 	 *
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index b3106552210..3245b40952c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -868,49 +868,52 @@ static int rcu_idle_cpu;	/* Force all torture tasks off this CPU */
  */
 static void rcu_torture_shuffle_tasks(void)
 {
-	cpumask_t tmp_mask;
+	cpumask_var_t tmp_mask;
 	int i;
 
-	cpus_setall(tmp_mask);
+	if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL))
+		BUG();
+
+	cpumask_setall(tmp_mask);
 	get_online_cpus();
 
 	/* No point in shuffling if there is only one online CPU (ex: UP) */
-	if (num_online_cpus() == 1) {
-		put_online_cpus();
-		return;
-	}
+	if (num_online_cpus() == 1)
+		goto out;
 
 	if (rcu_idle_cpu != -1)
-		cpu_clear(rcu_idle_cpu, tmp_mask);
+		cpumask_clear_cpu(rcu_idle_cpu, tmp_mask);
 
-	set_cpus_allowed_ptr(current, &tmp_mask);
+	set_cpus_allowed_ptr(current, tmp_mask);
 
 	if (reader_tasks) {
 		for (i = 0; i < nrealreaders; i++)
 			if (reader_tasks[i])
 				set_cpus_allowed_ptr(reader_tasks[i],
-						     &tmp_mask);
+						     tmp_mask);
 	}
 
 	if (fakewriter_tasks) {
 		for (i = 0; i < nfakewriters; i++)
 			if (fakewriter_tasks[i])
 				set_cpus_allowed_ptr(fakewriter_tasks[i],
-						     &tmp_mask);
+						     tmp_mask);
 	}
 
 	if (writer_task)
-		set_cpus_allowed_ptr(writer_task, &tmp_mask);
+		set_cpus_allowed_ptr(writer_task, tmp_mask);
 
 	if (stats_task)
-		set_cpus_allowed_ptr(stats_task, &tmp_mask);
+		set_cpus_allowed_ptr(stats_task, tmp_mask);
 
 	if (rcu_idle_cpu == -1)
 		rcu_idle_cpu = num_online_cpus() - 1;
 	else
 		rcu_idle_cpu--;
 
+out:
 	put_online_cpus();
+	free_cpumask_var(tmp_mask);
 }
 
 /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
-- 
cgit v1.2.3


From c309b917cab55799ea489d7b5f1b77025d9f8462 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:27 +1030
Subject: cpumask: convert kernel/profile.c

Impact: Reduce kernel memory usage, use new cpumask API.

Avoid a static cpumask_t for prof_cpu_mask, and an on-stack cpumask_t
in prof_cpu_mask_write_proc.  Both become cpumask_var_t.

prof_cpu_mask is only allocated when profiling is on, but the NULL
checks are optimized out by gcc for the !CPUMASK_OFFSTACK case.

Also removed some strange and unnecessary casts.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/profile.c | 38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/profile.c b/kernel/profile.c
index 4cb7d68fed8..d18e2d2654f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -45,7 +45,7 @@ static unsigned long prof_len, prof_shift;
 int prof_on __read_mostly;
 EXPORT_SYMBOL_GPL(prof_on);
 
-static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
+static cpumask_var_t prof_cpu_mask;
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
 static DEFINE_PER_CPU(int, cpu_profile_flip);
@@ -113,9 +113,13 @@ int __ref profile_init(void)
 	buffer_bytes = prof_len*sizeof(atomic_t);
 	if (!slab_is_available()) {
 		prof_buffer = alloc_bootmem(buffer_bytes);
+		alloc_bootmem_cpumask_var(&prof_cpu_mask);
 		return 0;
 	}
 
+	if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
+		return -ENOMEM;
+
 	prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL);
 	if (prof_buffer)
 		return 0;
@@ -128,6 +132,7 @@ int __ref profile_init(void)
 	if (prof_buffer)
 		return 0;
 
+	free_cpumask_var(prof_cpu_mask);
 	return -ENOMEM;
 }
 
@@ -386,13 +391,15 @@ out_free:
 		return NOTIFY_BAD;
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		cpu_set(cpu, prof_cpu_mask);
+		if (prof_cpu_mask != NULL)
+			cpumask_set_cpu(cpu, prof_cpu_mask);
 		break;
 	case CPU_UP_CANCELED:
 	case CPU_UP_CANCELED_FROZEN:
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		cpu_clear(cpu, prof_cpu_mask);
+		if (prof_cpu_mask != NULL)
+			cpumask_clear_cpu(cpu, prof_cpu_mask);
 		if (per_cpu(cpu_profile_hits, cpu)[0]) {
 			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
 			per_cpu(cpu_profile_hits, cpu)[0] = NULL;
@@ -430,7 +437,8 @@ void profile_tick(int type)
 
 	if (type == CPU_PROFILING && timer_hook)
 		timer_hook(regs);
-	if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask))
+	if (!user_mode(regs) && prof_cpu_mask != NULL &&
+	    cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
 		profile_hit(type, (void *)profile_pc(regs));
 }
 
@@ -442,7 +450,7 @@ void profile_tick(int type)
 static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
 			int count, int *eof, void *data)
 {
-	int len = cpumask_scnprintf(page, count, (cpumask_t *)data);
+	int len = cpumask_scnprintf(page, count, data);
 	if (count - len < 2)
 		return -EINVAL;
 	len += sprintf(page + len, "\n");
@@ -452,16 +460,20 @@ static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
 static int prof_cpu_mask_write_proc(struct file *file,
 	const char __user *buffer,  unsigned long count, void *data)
 {
-	cpumask_t *mask = (cpumask_t *)data;
+	struct cpumask *mask = data;
 	unsigned long full_count = count, err;
-	cpumask_t new_value;
+	cpumask_var_t new_value;
 
-	err = cpumask_parse_user(buffer, count, &new_value);
-	if (err)
-		return err;
+	if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+		return -ENOMEM;
 
-	*mask = new_value;
-	return full_count;
+	err = cpumask_parse_user(buffer, count, new_value);
+	if (!err) {
+		cpumask_copy(mask, new_value);
+		err = full_count;
+	}
+	free_cpumask_var(new_value);
+	return err;
 }
 
 void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
@@ -472,7 +484,7 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
 	entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
 	if (!entry)
 		return;
-	entry->data = (void *)&prof_cpu_mask;
+	entry->data = prof_cpu_mask;
 	entry->read_proc = prof_cpu_mask_read_proc;
 	entry->write_proc = prof_cpu_mask_write_proc;
 }
-- 
cgit v1.2.3


From e0b582ec56f1a1d8b30ebf340a7b91fb09f26c8c Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:28 +1030
Subject: cpumask: convert kernel/cpu.c

Impact: Reduce kernel stack and memory usage, use new cpumask API.

Use cpumask_var_t for take_cpu_down() stack var, and frozen_cpus.

Note that notify_cpu_starting() can be called before core_initcall
allocates frozen_cpus, but the NULL check is optimized out by gcc for
the CONFIG_CPUMASK_OFFSTACK=n case.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/cpu.c | 48 +++++++++++++++++++++++++++++-------------------
 1 file changed, 29 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2c9f78f3a2f..47fff3b63cb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -194,7 +194,7 @@ static int __ref take_cpu_down(void *_param)
 static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 {
 	int err, nr_calls = 0;
-	cpumask_t old_allowed, tmp;
+	cpumask_var_t old_allowed;
 	void *hcpu = (void *)(long)cpu;
 	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
 	struct take_cpu_down_param tcd_param = {
@@ -208,6 +208,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	if (!cpu_online(cpu))
 		return -EINVAL;
 
+	if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
+		return -ENOMEM;
+
 	cpu_hotplug_begin();
 	err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
 					hcpu, -1, &nr_calls);
@@ -222,13 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	}
 
 	/* Ensure that we are not runnable on dying cpu */
-	old_allowed = current->cpus_allowed;
-	cpus_setall(tmp);
-	cpu_clear(cpu, tmp);
-	set_cpus_allowed_ptr(current, &tmp);
-	tmp = cpumask_of_cpu(cpu);
+	cpumask_copy(old_allowed, &current->cpus_allowed);
+	set_cpus_allowed_ptr(current,
+			     cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
 
-	err = __stop_machine(take_cpu_down, &tcd_param, &tmp);
+	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
@@ -254,7 +255,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	check_for_tasks(cpu);
 
 out_allowed:
-	set_cpus_allowed_ptr(current, &old_allowed);
+	set_cpus_allowed_ptr(current, old_allowed);
 out_release:
 	cpu_hotplug_done();
 	if (!err) {
@@ -262,6 +263,7 @@ out_release:
 					    hcpu) == NOTIFY_BAD)
 			BUG();
 	}
+	free_cpumask_var(old_allowed);
 	return err;
 }
 
@@ -280,7 +282,7 @@ int __ref cpu_down(unsigned int cpu)
 
 	/*
 	 * Make sure the all cpus did the reschedule and are not
-	 * using stale version of the cpu_active_map.
+	 * using stale version of the cpu_active_mask.
 	 * This is not strictly necessary becuase stop_machine()
 	 * that we run down the line already provides the required
 	 * synchronization. But it's really a side effect and we do not
@@ -344,7 +346,7 @@ out_notify:
 int __cpuinit cpu_up(unsigned int cpu)
 {
 	int err = 0;
-	if (!cpu_isset(cpu, cpu_possible_map)) {
+	if (!cpu_possible(cpu)) {
 		printk(KERN_ERR "can't online cpu %d because it is not "
 			"configured as may-hotadd at boot time\n", cpu);
 #if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
@@ -369,25 +371,25 @@ out:
 }
 
 #ifdef CONFIG_PM_SLEEP_SMP
-static cpumask_t frozen_cpus;
+static cpumask_var_t frozen_cpus;
 
 int disable_nonboot_cpus(void)
 {
 	int cpu, first_cpu, error = 0;
 
 	cpu_maps_update_begin();
-	first_cpu = first_cpu(cpu_online_map);
+	first_cpu = cpumask_first(cpu_online_mask);
 	/* We take down all of the non-boot CPUs in one shot to avoid races
 	 * with the userspace trying to use the CPU hotplug at the same time
 	 */
-	cpus_clear(frozen_cpus);
+	cpumask_clear(frozen_cpus);
 	printk("Disabling non-boot CPUs ...\n");
 	for_each_online_cpu(cpu) {
 		if (cpu == first_cpu)
 			continue;
 		error = _cpu_down(cpu, 1);
 		if (!error) {
-			cpu_set(cpu, frozen_cpus);
+			cpumask_set_cpu(cpu, frozen_cpus);
 			printk("CPU%d is down\n", cpu);
 		} else {
 			printk(KERN_ERR "Error taking CPU%d down: %d\n",
@@ -413,11 +415,11 @@ void __ref enable_nonboot_cpus(void)
 	/* Allow everyone to use the CPU hotplug again */
 	cpu_maps_update_begin();
 	cpu_hotplug_disabled = 0;
-	if (cpus_empty(frozen_cpus))
+	if (cpumask_empty(frozen_cpus))
 		goto out;
 
 	printk("Enabling non-boot CPUs ...\n");
-	for_each_cpu_mask_nr(cpu, frozen_cpus) {
+	for_each_cpu(cpu, frozen_cpus) {
 		error = _cpu_up(cpu, 1);
 		if (!error) {
 			printk("CPU%d is up\n", cpu);
@@ -425,10 +427,18 @@ void __ref enable_nonboot_cpus(void)
 		}
 		printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
 	}
-	cpus_clear(frozen_cpus);
+	cpumask_clear(frozen_cpus);
 out:
 	cpu_maps_update_done();
 }
+
+static int alloc_frozen_cpus(void)
+{
+	if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
+		return -ENOMEM;
+	return 0;
+}
+core_initcall(alloc_frozen_cpus);
 #endif /* CONFIG_PM_SLEEP_SMP */
 
 /**
@@ -444,7 +454,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
 	unsigned long val = CPU_STARTING;
 
 #ifdef CONFIG_PM_SLEEP_SMP
-	if (cpu_isset(cpu, frozen_cpus))
+	if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
 		val = CPU_STARTING_FROZEN;
 #endif /* CONFIG_PM_SLEEP_SMP */
 	raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
@@ -456,7 +466,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
  * cpu_bit_bitmap[] is a special, "compressed" data structure that
  * represents all NR_CPUS bits binary values of 1<<nr.
  *
- * It is used by cpumask_of_cpu() to get a constant address to a CPU
+ * It is used by cpumask_of() to get a constant address to a CPU
  * mask value that has a single bit set only.
  */
 
-- 
cgit v1.2.3


From 41c7bb9588904eb060a95bcad47bd3804a1ece25 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:28 +1030
Subject: cpumask: convert rest of files in kernel/

Impact: Reduce stack usage, use new cpumask API.

Mainly changing cpumask_t to 'struct cpumask' and similar simple API
conversion.  Two conversions worth mentioning:

1) we use cpumask_any_but to avoid a temporary in kernel/softlockup.c,
2) Use cpumask_var_t in taskstats_user_cmd().

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
---
 kernel/power/poweroff.c |  2 +-
 kernel/softlockup.c     |  6 ++----
 kernel/stop_machine.c   |  8 ++++----
 kernel/taskstats.c      | 39 ++++++++++++++++++++++++---------------
 4 files changed, 31 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 72016f05147..97890831e1b 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -27,7 +27,7 @@ static DECLARE_WORK(poweroff_work, do_poweroff);
 static void handle_poweroff(int key, struct tty_struct *tty)
 {
 	/* run sysrq poweroff on boot cpu */
-	schedule_work_on(first_cpu(cpu_online_map), &poweroff_work);
+	schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
 }
 
 static struct sysrq_key_op	sysrq_poweroff_op = {
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 492f0c72fec..d9188c66278 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -310,10 +310,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		if (hotcpu == check_cpu) {
-			cpumask_t temp_cpu_online_map = cpu_online_map;
-
-			cpu_clear(hotcpu, temp_cpu_online_map);
-			check_cpu = cpumask_any(&temp_cpu_online_map);
+			/* Pick any other online cpu. */
+			check_cpu = cpumask_any_but(cpu_online_mask, hotcpu);
 		}
 		break;
 
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 24e8ceacc38..286c41722e8 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -69,10 +69,10 @@ static void stop_cpu(struct work_struct *unused)
 	int err;
 
 	if (!active_cpus) {
-		if (cpu == first_cpu(cpu_online_map))
+		if (cpu == cpumask_first(cpu_online_mask))
 			smdata = &active;
 	} else {
-		if (cpu_isset(cpu, *active_cpus))
+		if (cpumask_test_cpu(cpu, active_cpus))
 			smdata = &active;
 	}
 	/* Simple state machine */
@@ -109,7 +109,7 @@ static int chill(void *unused)
 	return 0;
 }
 
-int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
+int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 {
 	struct work_struct *sm_work;
 	int i, ret;
@@ -142,7 +142,7 @@ int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
 	return ret;
 }
 
-int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
+int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 {
 	int ret;
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 6d7dc4ec4aa..888adbcca30 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -290,18 +290,17 @@ ret:
 	return;
 }
 
-static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
+static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
 {
 	struct listener_list *listeners;
 	struct listener *s, *tmp;
 	unsigned int cpu;
-	cpumask_t mask = *maskp;
 
-	if (!cpus_subset(mask, cpu_possible_map))
+	if (!cpumask_subset(mask, cpu_possible_mask))
 		return -EINVAL;
 
 	if (isadd == REGISTER) {
-		for_each_cpu_mask_nr(cpu, mask) {
+		for_each_cpu(cpu, mask) {
 			s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
 					 cpu_to_node(cpu));
 			if (!s)
@@ -320,7 +319,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
 
 	/* Deregister or cleanup */
 cleanup:
-	for_each_cpu_mask_nr(cpu, mask) {
+	for_each_cpu(cpu, mask) {
 		listeners = &per_cpu(listener_array, cpu);
 		down_write(&listeners->sem);
 		list_for_each_entry_safe(s, tmp, &listeners->list, list) {
@@ -335,7 +334,7 @@ cleanup:
 	return 0;
 }
 
-static int parse(struct nlattr *na, cpumask_t *mask)
+static int parse(struct nlattr *na, struct cpumask *mask)
 {
 	char *data;
 	int len;
@@ -428,23 +427,33 @@ err:
 
 static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 {
-	int rc = 0;
+	int rc;
 	struct sk_buff *rep_skb;
 	struct taskstats *stats;
 	size_t size;
-	cpumask_t mask;
+	cpumask_var_t mask;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
 
-	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
+	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
 	if (rc < 0)
-		return rc;
-	if (rc == 0)
-		return add_del_listener(info->snd_pid, &mask, REGISTER);
+		goto free_return_rc;
+	if (rc == 0) {
+		rc = add_del_listener(info->snd_pid, mask, REGISTER);
+		goto free_return_rc;
+	}
 
-	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
+	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
 	if (rc < 0)
+		goto free_return_rc;
+	if (rc == 0) {
+		rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
+free_return_rc:
+		free_cpumask_var(mask);
 		return rc;
-	if (rc == 0)
-		return add_del_listener(info->snd_pid, &mask, DEREGISTER);
+	}
+	free_cpumask_var(mask);
 
 	/*
 	 * Size includes space for nested attributes
-- 
cgit v1.2.3


From 5db0e1e9e0f30f160b832a0b5cd1131954bf4f6e Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:29 +1030
Subject: cpumask: replace for_each_cpu_mask_nr with for_each_cpu in
 kernel/time/

Impact: cleanup

Simple replacement, now the _nr is redundant.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Ingo Molnar <mingo@redhat.com>
---
 kernel/time/clocksource.c    | 3 ++-
 kernel/time/tick-broadcast.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 32141b15d63..ca89e1593f0 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -145,7 +145,8 @@ static void clocksource_watchdog(unsigned long data)
 		 * Cycle through CPUs to check if the CPUs stay
 		 * synchronized to each other.
 		 */
-		int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map);
+		int next_cpu = cpumask_next(raw_smp_processor_id(),
+					    cpu_online_mask);
 
 		if (next_cpu >= nr_cpu_ids)
 			next_cpu = cpumask_first(cpu_online_mask);
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 356fac57a18..118a3b3b3f9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -512,7 +512,7 @@ static void tick_broadcast_init_next_event(struct cpumask *mask,
 	struct tick_device *td;
 	int cpu;
 
-	for_each_cpu_mask_nr(cpu, *mask) {
+	for_each_cpu(cpu, mask) {
 		td = &per_cpu(tick_cpu_device, cpu);
 		if (td->evtdev)
 			td->evtdev->next_event = expires;
-- 
cgit v1.2.3


From 0a582440ff546e2c6610d1acec325e91b4efd313 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 2 Jan 2009 12:16:42 +0100
Subject: sched: fix sched_slice()

Impact: fix bad-interactivity buglet

Fix sched_slice() to emit a sane result whether a task is currently
enqueued or not.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Tested-by: Jayson King <dev@jaysonking.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

 kernel/sched_fair.c |   30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)
---
 kernel/sched_fair.c | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5ad4440f0fc..b808563f4f1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -385,20 +385,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 }
 #endif
 
-/*
- * delta *= P[w / rw]
- */
-static inline unsigned long
-calc_delta_weight(unsigned long delta, struct sched_entity *se)
-{
-	for_each_sched_entity(se) {
-		delta = calc_delta_mine(delta,
-				se->load.weight, &cfs_rq_of(se)->load);
-	}
-
-	return delta;
-}
-
 /*
  * delta /= w
  */
@@ -440,12 +426,20 @@ static u64 __sched_period(unsigned long nr_running)
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	unsigned long nr_running = cfs_rq->nr_running;
+	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
 
-	if (unlikely(!se->on_rq))
-		nr_running++;
+	for_each_sched_entity(se) {
+		struct load_weight *load = &cfs_rq->load;
 
-	return calc_delta_weight(__sched_period(nr_running), se);
+		if (unlikely(!se->on_rq)) {
+			struct load_weight lw = cfs_rq->load;
+
+			update_load_add(&lw, se->load.weight);
+			load = &lw;
+		}
+		slice = calc_delta_mine(slice, se->load.weight, load);
+	}
+	return slice;
 }
 
 /*
-- 
cgit v1.2.3


From 90621c40cc4ab7b0a414311ce37e7cc7173403b6 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Mon, 29 Dec 2008 19:43:21 -0800
Subject: futex: catch certain assymetric (get|put)_futex_key calls

Impact: add debug check

Following up on my previous key reference accounting patches, this patch
will catch puts on keys that haven't been "got".  This won't catch nested
get/put mismatches though.

Build and boot tested, with minimal desktop activity and a run of the
open_posix_testsuite in LTP for testing.  No warnings logged.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index c5ac55cc0c1..206d4c90688 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -170,8 +170,11 @@ static void get_futex_key_refs(union futex_key *key)
  */
 static void drop_futex_key_refs(union futex_key *key)
 {
-	if (!key->both.ptr)
+	if (!key->both.ptr) {
+		/* If we're here then we tried to put a key we failed to get */
+		WARN_ON_ONCE(1);
 		return;
+	}
 
 	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 	case FUT_OFF_INODE:
-- 
cgit v1.2.3


From 263ec6457bb23d57b575ede18ff6c3d11e0b4e96 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 3 Jan 2009 13:16:09 +0100
Subject: cpumask: convert RCU implementations, fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: cleanup

This warning:

 kernel/rcuclassic.c: In function ‘rcu_start_batch’:
 kernel/rcuclassic.c:397: warning: passing argument 1 of ‘cpumask_andnot’ from incompatible pointer type

triggers because one usage site of rcp->cpumask was not converted
to to_cpumask(rcp->cpumask). There's no ill effects of this bug.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 6ec495f60ea..490934fc7ac 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -394,7 +394,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
 		 * unnecessarily.
 		 */
 		smp_mb();
-		cpumask_andnot(&rcp->cpumask, cpu_online_mask, nohz_cpu_mask);
+		cpumask_andnot(to_cpumask(rcp->cpumask),
+			       cpu_online_mask, nohz_cpu_mask);
 
 		rcp->signaled = 0;
 	}
-- 
cgit v1.2.3


From 6bdf197b04b3ae7c85785bc5a9576f1bcb0ac7c0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 3 Jan 2009 12:50:46 +0100
Subject: ia64: cpumask fix for is_affinity_mask_valid()

Impact: build fix on ia64

ia64's default_affinity_write() still had old cpumask_t usage:

 /home/mingo/tip/kernel/irq/proc.c: In function `default_affinity_write':
 /home/mingo/tip/kernel/irq/proc.c:114: error: incompatible type for argument 1 of `is_affinity_mask_valid'
 make[3]: *** [kernel/irq/proc.o] Error 1
 make[3]: *** Waiting for unfinished jobs....

update it to cpumask_var_t.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 2abd3a7716e..aae3f742bce 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -54,7 +54,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
 	if (err)
 		goto free_cpumask;
 
-	if (!is_affinity_mask_valid(*new_value)) {
+	if (!is_affinity_mask_valid(new_value)) {
 		err = -EINVAL;
 		goto free_cpumask;
 	}
-- 
cgit v1.2.3


From 6ca09dfc9f180d038dcef93c167a833f43a8246f Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Wed, 31 Dec 2008 18:08:45 -0800
Subject: sched: put back some stack hog changes that were undone in
 kernel/sched.c

Impact: prevents panic from stack overflow on numa-capable machines.

Some of the "removal of stack hogs" changes in kernel/sched.c by using
node_to_cpumask_ptr were undone by the early cpumask API updates, and
causes a panic due to stack overflow.  This patch undoes those changes
by using cpumask_of_node() which returns a 'const struct cpumask *'.

In addition, cpu_coregoup_map is replaced with cpu_coregroup_mask further
reducing stack usage.  (Both of these updates removed 9 FIXME's!)

Also:
   Pick up some remaining changes from the old 'cpumask_t' functions to
   the new 'struct cpumask *' functions.

   Optimize memory traffic by allocating each percpu local_cpu_mask on the
   same node as the referring cpu.

Signed-off-by: Mike Travis <travis@sgi.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    | 53 +++++++++++++++--------------------------------------
 kernel/sched_rt.c |  3 ++-
 2 files changed, 17 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 27ba1d642f0..dd862d70e71 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3715,7 +3715,7 @@ redo:
 		 * don't kick the migration_thread, if the curr
 		 * task on busiest cpu can't be moved to this_cpu
 		 */
-		if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+		if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
 			double_unlock_balance(this_rq, busiest);
 			all_pinned = 1;
 			return ld_moved;
@@ -6220,9 +6220,7 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
 	int dest_cpu;
-	/* FIXME: Use cpumask_of_node here. */
-	cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu));
-	const struct cpumask *nodemask = &_nodemask;
+	const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
 
 again:
 	/* Look for allowed, online CPU in same node. */
@@ -7133,21 +7131,18 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
 static void sched_domain_node_span(int node, struct cpumask *span)
 {
 	nodemask_t used_nodes;
-	/* FIXME: use cpumask_of_node() */
-	node_to_cpumask_ptr(nodemask, node);
 	int i;
 
-	cpus_clear(*span);
+	cpumask_clear(span);
 	nodes_clear(used_nodes);
 
-	cpus_or(*span, *span, *nodemask);
+	cpumask_or(span, span, cpumask_of_node(node));
 	node_set(node, used_nodes);
 
 	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
 		int next_node = find_next_best_node(node, &used_nodes);
 
-		node_to_cpumask_ptr_next(nodemask, next_node);
-		cpus_or(*span, *span, *nodemask);
+		cpumask_or(span, span, cpumask_of_node(next_node));
 	}
 }
 #endif /* CONFIG_NUMA */
@@ -7227,9 +7222,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
 {
 	int group;
 #ifdef CONFIG_SCHED_MC
-	/* FIXME: Use cpu_coregroup_mask. */
-	*mask = cpu_coregroup_map(cpu);
-	cpus_and(*mask, *mask, *cpu_map);
+	cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
 	group = cpumask_first(mask);
 #elif defined(CONFIG_SCHED_SMT)
 	cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
@@ -7259,10 +7252,8 @@ static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
 				 struct cpumask *nodemask)
 {
 	int group;
-	/* FIXME: use cpumask_of_node */
-	node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));
 
-	cpumask_and(nodemask, pnodemask, cpu_map);
+	cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
 	group = cpumask_first(nodemask);
 
 	if (sg)
@@ -7313,10 +7304,8 @@ static void free_sched_groups(const struct cpumask *cpu_map,
 
 		for (i = 0; i < nr_node_ids; i++) {
 			struct sched_group *oldsg, *sg = sched_group_nodes[i];
-			/* FIXME: Use cpumask_of_node */
-			node_to_cpumask_ptr(pnodemask, i);
 
-			cpus_and(*nodemask, *pnodemask, *cpu_map);
+			cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
 			if (cpumask_empty(nodemask))
 				continue;
 
@@ -7525,9 +7514,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 	for_each_cpu(i, cpu_map) {
 		struct sched_domain *sd = NULL, *p;
 
-		/* FIXME: use cpumask_of_node */
-		*nodemask = node_to_cpumask(cpu_to_node(i));
-		cpus_and(*nodemask, *nodemask, *cpu_map);
+		cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
 
 #ifdef CONFIG_NUMA
 		if (cpumask_weight(cpu_map) >
@@ -7568,9 +7555,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		sd = &per_cpu(core_domains, i).sd;
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
-		*sched_domain_span(sd) = cpu_coregroup_map(i);
-		cpumask_and(sched_domain_span(sd),
-			    sched_domain_span(sd), cpu_map);
+		cpumask_and(sched_domain_span(sd), cpu_map,
+						   cpu_coregroup_mask(i));
 		sd->parent = p;
 		p->child = sd;
 		cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7606,9 +7592,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 #ifdef CONFIG_SCHED_MC
 	/* Set up multi-core groups */
 	for_each_cpu(i, cpu_map) {
-		/* FIXME: Use cpu_coregroup_mask */
-		*this_core_map = cpu_coregroup_map(i);
-		cpus_and(*this_core_map, *this_core_map, *cpu_map);
+		cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
 		if (i != cpumask_first(this_core_map))
 			continue;
 
@@ -7620,9 +7604,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 
 	/* Set up physical groups */
 	for (i = 0; i < nr_node_ids; i++) {
-		/* FIXME: Use cpumask_of_node */
-		*nodemask = node_to_cpumask(i);
-		cpus_and(*nodemask, *nodemask, *cpu_map);
+		cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
 		if (cpumask_empty(nodemask))
 			continue;
 
@@ -7644,11 +7626,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		struct sched_group *sg, *prev;
 		int j;
 
-		/* FIXME: Use cpumask_of_node */
-		*nodemask = node_to_cpumask(i);
 		cpumask_clear(covered);
-
-		cpus_and(*nodemask, *nodemask, *cpu_map);
+		cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
 		if (cpumask_empty(nodemask)) {
 			sched_group_nodes[i] = NULL;
 			continue;
@@ -7679,8 +7658,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 
 		for (j = 0; j < nr_node_ids; j++) {
 			int n = (i + j) % nr_node_ids;
-			/* FIXME: Use cpumask_of_node */
-			node_to_cpumask_ptr(pnodemask, n);
 
 			cpumask_complement(notcovered, covered);
 			cpumask_and(tmpmask, notcovered, cpu_map);
@@ -7688,7 +7665,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 			if (cpumask_empty(tmpmask))
 				break;
 
-			cpumask_and(tmpmask, tmpmask, pnodemask);
+			cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
 			if (cpumask_empty(tmpmask))
 				continue;
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 833b6d44483..954e1a81b79 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1383,7 +1383,8 @@ static inline void init_sched_rt_class(void)
 	unsigned int i;
 
 	for_each_possible_cpu(i)
-		alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL);
+		alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
+					GFP_KERNEL, cpu_to_node(i));
 }
 #endif /* CONFIG_SMP */
 
-- 
cgit v1.2.3


From 8916edef5888c5d8fe283714416a9ca95b4c3431 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Sun, 4 Jan 2009 05:40:37 +0900
Subject: getrusage: RUSAGE_THREAD should return ru_utime and ru_stime

Impact: task stats regression fix

Original getrusage(RUSAGE_THREAD) implementation can return ru_utime and
ru_stime. But commit "f06febc: timers: fix itimer/many thread hang" broke it.

this patch restores it.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sys.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index d356d79e84a..61dbfd4a54d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1627,6 +1627,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 	utime = stime = cputime_zero;
 
 	if (who == RUSAGE_THREAD) {
+		utime = task_utime(current);
+		stime = task_stime(current);
 		accumulate_thread_rusage(p, r);
 		goto out;
 	}
-- 
cgit v1.2.3


From 4f6b434fee2402b3decdeae9d16eb648725ae426 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 9 Dec 2008 19:50:34 -0500
Subject: don't reallocate buffer in every audit_sockaddr()

No need to do that more than once per process lifetime; allocating/freeing
on each sendto/accept/etc. is bloody pointless.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 46 ++++++++++++++++++++++------------------------
 1 file changed, 22 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4819f371197..c2e43ebb1b6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -174,12 +174,6 @@ struct audit_aux_data_socketcall {
 	unsigned long		args[0];
 };
 
-struct audit_aux_data_sockaddr {
-	struct audit_aux_data	d;
-	int			len;
-	char			a[0];
-};
-
 struct audit_aux_data_fd_pair {
 	struct	audit_aux_data d;
 	int	fd[2];
@@ -234,7 +228,8 @@ struct audit_context {
 	struct audit_context *previous; /* For nested syscalls */
 	struct audit_aux_data *aux;
 	struct audit_aux_data *aux_pids;
-
+	struct sockaddr_storage *sockaddr;
+	size_t sockaddr_len;
 				/* Save things to print about task_struct */
 	pid_t		    pid, ppid;
 	uid_t		    uid, euid, suid, fsuid;
@@ -921,6 +916,7 @@ static inline void audit_free_context(struct audit_context *context)
 		free_tree_refs(context);
 		audit_free_aux(context);
 		kfree(context->filterkey);
+		kfree(context->sockaddr);
 		kfree(context);
 		context  = previous;
 	} while (context);
@@ -1383,13 +1379,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 				audit_log_format(ab, " a%d=%lx", i, axs->args[i]);
 			break; }
 
-		case AUDIT_SOCKADDR: {
-			struct audit_aux_data_sockaddr *axs = (void *)aux;
-
-			audit_log_format(ab, "saddr=");
-			audit_log_n_hex(ab, axs->a, axs->len);
-			break; }
-
 		case AUDIT_FD_PAIR: {
 			struct audit_aux_data_fd_pair *axs = (void *)aux;
 			audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
@@ -1421,6 +1410,16 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		audit_log_end(ab);
 	}
 
+	if (context->sockaddr_len) {
+		ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR);
+		if (ab) {
+			audit_log_format(ab, "saddr=");
+			audit_log_n_hex(ab, (void *)context->sockaddr,
+					context->sockaddr_len);
+			audit_log_end(ab);
+		}
+	}
+
 	for (aux = context->aux_pids; aux; aux = aux->next) {
 		struct audit_aux_data_pids *axs = (void *)aux;
 
@@ -1689,6 +1688,7 @@ void audit_syscall_exit(int valid, long return_code)
 		context->aux_pids = NULL;
 		context->target_pid = 0;
 		context->target_sid = 0;
+		context->sockaddr_len = 0;
 		kfree(context->filterkey);
 		context->filterkey = NULL;
 		tsk->audit_context = context;
@@ -2468,22 +2468,20 @@ int __audit_fd_pair(int fd1, int fd2)
  */
 int audit_sockaddr(int len, void *a)
 {
-	struct audit_aux_data_sockaddr *ax;
 	struct audit_context *context = current->audit_context;
 
 	if (likely(!context || context->dummy))
 		return 0;
 
-	ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL);
-	if (!ax)
-		return -ENOMEM;
-
-	ax->len = len;
-	memcpy(ax->a, a, len);
+	if (!context->sockaddr) {
+		void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
+		if (!p)
+			return -ENOMEM;
+		context->sockaddr = p;
+	}
 
-	ax->d.type = AUDIT_SOCKADDR;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
+	context->sockaddr_len = len;
+	memcpy(context->sockaddr, a, len);
 	return 0;
 }
 
-- 
cgit v1.2.3


From f3298dc4f2277874d40cb4fc3a6e277317d6603b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 10 Dec 2008 03:16:51 -0500
Subject: sanitize audit_socketcall

* don't bother with allocations
* now that it can't fail, make it return void

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 66 ++++++++++++++++++++++++++++++++------------------------
 1 file changed, 38 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index c2e43ebb1b6..5cda66466e1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -168,12 +168,6 @@ struct audit_aux_data_execve {
 	struct mm_struct *mm;
 };
 
-struct audit_aux_data_socketcall {
-	struct audit_aux_data	d;
-	int			nargs;
-	unsigned long		args[0];
-};
-
 struct audit_aux_data_fd_pair {
 	struct	audit_aux_data d;
 	int	fd[2];
@@ -247,6 +241,14 @@ struct audit_context {
 	struct audit_tree_refs *trees, *first_trees;
 	int tree_count;
 
+	int type;
+	union {
+		struct {
+			int nargs;
+			long args[6];
+		} socketcall;
+	};
+
 #if AUDIT_DEBUG
 	int		    put_count;
 	int		    ino_count;
@@ -1226,6 +1228,27 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
 		audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver);
 }
 
+static void show_special(struct audit_context *context)
+{
+	struct audit_buffer *ab;
+	int i;
+
+	ab = audit_log_start(context, GFP_KERNEL, context->type);
+	if (!ab)
+		return;
+
+	switch (context->type) {
+	case AUDIT_SOCKETCALL: {
+		int nargs = context->socketcall.nargs;
+		audit_log_format(ab, "nargs=%d", nargs);
+		for (i = 0; i < nargs; i++)
+			audit_log_format(ab, " a%d=%lx", i,
+				context->socketcall.args[i]);
+		break; }
+	}
+	audit_log_end(ab);
+}
+
 static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
 {
 	const struct cred *cred;
@@ -1372,13 +1395,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			audit_log_execve_info(context, &ab, axi);
 			break; }
 
-		case AUDIT_SOCKETCALL: {
-			struct audit_aux_data_socketcall *axs = (void *)aux;
-			audit_log_format(ab, "nargs=%d", axs->nargs);
-			for (i=0; i<axs->nargs; i++)
-				audit_log_format(ab, " a%d=%lx", i, axs->args[i]);
-			break; }
-
 		case AUDIT_FD_PAIR: {
 			struct audit_aux_data_fd_pair *axs = (void *)aux;
 			audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
@@ -1410,6 +1426,9 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		audit_log_end(ab);
 	}
 
+	if (context->type)
+		show_special(context);
+
 	if (context->sockaddr_len) {
 		ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR);
 		if (ab) {
@@ -1689,6 +1708,7 @@ void audit_syscall_exit(int valid, long return_code)
 		context->target_pid = 0;
 		context->target_sid = 0;
 		context->sockaddr_len = 0;
+		context->type = 0;
 		kfree(context->filterkey);
 		context->filterkey = NULL;
 		tsk->audit_context = context;
@@ -2406,27 +2426,17 @@ int audit_bprm(struct linux_binprm *bprm)
  * @nargs: number of args
  * @args: args array
  *
- * Returns 0 for success or NULL context or < 0 on error.
  */
-int audit_socketcall(int nargs, unsigned long *args)
+void audit_socketcall(int nargs, unsigned long *args)
 {
-	struct audit_aux_data_socketcall *ax;
 	struct audit_context *context = current->audit_context;
 
 	if (likely(!context || context->dummy))
-		return 0;
-
-	ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL);
-	if (!ax)
-		return -ENOMEM;
-
-	ax->nargs = nargs;
-	memcpy(ax->args, args, nargs * sizeof(unsigned long));
+		return;
 
-	ax->d.type = AUDIT_SOCKETCALL;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
+	context->type = AUDIT_SOCKETCALL;
+	context->socketcall.nargs = nargs;
+	memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
 }
 
 /**
-- 
cgit v1.2.3


From a33e6751003c5ade603737d828b1519d980ce392 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 10 Dec 2008 03:40:06 -0500
Subject: sanitize audit_ipc_obj()

* get rid of allocations
* make it return void
* simplify callers

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 88 ++++++++++++++++++++++++--------------------------------
 1 file changed, 37 insertions(+), 51 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 5cda66466e1..73504313264 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -247,6 +247,12 @@ struct audit_context {
 			int nargs;
 			long args[6];
 		} socketcall;
+		struct {
+			uid_t			uid;
+			gid_t			gid;
+			mode_t			mode;
+			u32			osid;
+		} ipc;
 	};
 
 #if AUDIT_DEBUG
@@ -605,19 +611,12 @@ static int audit_filter_rules(struct task_struct *tsk,
 					}
 				}
 				/* Find ipc objects that match */
-				if (ctx) {
-					struct audit_aux_data *aux;
-					for (aux = ctx->aux; aux;
-					     aux = aux->next) {
-						if (aux->type == AUDIT_IPC) {
-							struct audit_aux_data_ipcctl *axi = (void *)aux;
-							if (security_audit_rule_match(axi->osid, f->type, f->op, f->lsm_rule, ctx)) {
-								++result;
-								break;
-							}
-						}
-					}
-				}
+				if (!ctx || ctx->type != AUDIT_IPC)
+					break;
+				if (security_audit_rule_match(ctx->ipc.osid,
+							      f->type, f->op,
+							      f->lsm_rule, ctx))
+					++result;
 			}
 			break;
 		case AUDIT_ARG0:
@@ -1228,7 +1227,7 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
 		audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver);
 }
 
-static void show_special(struct audit_context *context)
+static void show_special(struct audit_context *context, int *call_panic)
 {
 	struct audit_buffer *ab;
 	int i;
@@ -1245,6 +1244,23 @@ static void show_special(struct audit_context *context)
 			audit_log_format(ab, " a%d=%lx", i,
 				context->socketcall.args[i]);
 		break; }
+	case AUDIT_IPC: {
+		u32 osid = context->ipc.osid;
+
+		audit_log_format(ab, "ouid=%u ogid=%u mode=%#o",
+			 context->ipc.uid, context->ipc.gid, context->ipc.mode);
+		if (osid) {
+			char *ctx = NULL;
+			u32 len;
+			if (security_secid_to_secctx(osid, &ctx, &len)) {
+				audit_log_format(ab, " osid=%u", osid);
+				*call_panic = 1;
+			} else {
+				audit_log_format(ab, " obj=%s", ctx);
+				security_release_secctx(ctx, len);
+			}
+		}
+		break; }
 	}
 	audit_log_end(ab);
 }
@@ -1363,26 +1379,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 				axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs);
 			break; }
 
-		case AUDIT_IPC: {
-			struct audit_aux_data_ipcctl *axi = (void *)aux;
-			audit_log_format(ab, 
-				 "ouid=%u ogid=%u mode=%#o",
-				 axi->uid, axi->gid, axi->mode);
-			if (axi->osid != 0) {
-				char *ctx = NULL;
-				u32 len;
-				if (security_secid_to_secctx(
-						axi->osid, &ctx, &len)) {
-					audit_log_format(ab, " osid=%u",
-							axi->osid);
-					call_panic = 1;
-				} else {
-					audit_log_format(ab, " obj=%s", ctx);
-					security_release_secctx(ctx, len);
-				}
-			}
-			break; }
-
 		case AUDIT_IPC_SET_PERM: {
 			struct audit_aux_data_ipcctl *axi = (void *)aux;
 			audit_log_format(ab,
@@ -1427,7 +1423,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	}
 
 	if (context->type)
-		show_special(context);
+		show_special(context, &call_panic);
 
 	if (context->sockaddr_len) {
 		ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR);
@@ -2349,25 +2345,15 @@ int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
  * audit_ipc_obj - record audit data for ipc object
  * @ipcp: ipc permissions
  *
- * Returns 0 for success or NULL context or < 0 on error.
  */
-int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
+void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
 {
-	struct audit_aux_data_ipcctl *ax;
 	struct audit_context *context = current->audit_context;
-
-	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
-	if (!ax)
-		return -ENOMEM;
-
-	ax->uid = ipcp->uid;
-	ax->gid = ipcp->gid;
-	ax->mode = ipcp->mode;
-	security_ipc_getsecid(ipcp, &ax->osid);
-	ax->d.type = AUDIT_IPC;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
+	context->ipc.uid = ipcp->uid;
+	context->ipc.gid = ipcp->gid;
+	context->ipc.mode = ipcp->mode;
+	security_ipc_getsecid(ipcp, &context->ipc.osid);
+	context->type = AUDIT_IPC;
 }
 
 /**
-- 
cgit v1.2.3


From e816f370cbadd2afea9f1a42f232d0636137d563 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 10 Dec 2008 03:47:15 -0500
Subject: sanitize audit_ipc_set_perm()

* get rid of allocations
* make it return void
* simplify callers

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 59 +++++++++++++++++++++++++-------------------------------
 1 file changed, 26 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 73504313264..fbed62e05bc 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -151,16 +151,6 @@ struct audit_aux_data_mq_getsetattr {
 	struct mq_attr 		mqstat;
 };
 
-struct audit_aux_data_ipcctl {
-	struct audit_aux_data	d;
-	struct ipc_perm		p;
-	unsigned long		qbytes;
-	uid_t			uid;
-	gid_t			gid;
-	mode_t			mode;
-	u32			osid;
-};
-
 struct audit_aux_data_execve {
 	struct audit_aux_data	d;
 	int argc;
@@ -252,6 +242,11 @@ struct audit_context {
 			gid_t			gid;
 			mode_t			mode;
 			u32			osid;
+			int			has_perm;
+			uid_t			perm_uid;
+			gid_t			perm_gid;
+			mode_t			perm_mode;
+			unsigned long		qbytes;
 		} ipc;
 	};
 
@@ -1260,6 +1255,19 @@ static void show_special(struct audit_context *context, int *call_panic)
 				security_release_secctx(ctx, len);
 			}
 		}
+		if (context->ipc.has_perm) {
+			audit_log_end(ab);
+			ab = audit_log_start(context, GFP_KERNEL,
+					     AUDIT_IPC_SET_PERM);
+			audit_log_format(ab,
+				"qbytes=%lx ouid=%u ogid=%u mode=%#o",
+				context->ipc.qbytes,
+				context->ipc.perm_uid,
+				context->ipc.perm_gid,
+				context->ipc.perm_mode);
+			if (!ab)
+				return;
+		}
 		break; }
 	}
 	audit_log_end(ab);
@@ -1379,13 +1387,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 				axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs);
 			break; }
 
-		case AUDIT_IPC_SET_PERM: {
-			struct audit_aux_data_ipcctl *axi = (void *)aux;
-			audit_log_format(ab,
-				"qbytes=%lx ouid=%u ogid=%u mode=%#o",
-				axi->qbytes, axi->uid, axi->gid, axi->mode);
-			break; }
-
 		case AUDIT_EXECVE: {
 			struct audit_aux_data_execve *axi = (void *)aux;
 			audit_log_execve_info(context, &ab, axi);
@@ -2352,6 +2353,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
 	context->ipc.uid = ipcp->uid;
 	context->ipc.gid = ipcp->gid;
 	context->ipc.mode = ipcp->mode;
+	context->ipc.has_perm = 0;
 	security_ipc_getsecid(ipcp, &context->ipc.osid);
 	context->type = AUDIT_IPC;
 }
@@ -2363,26 +2365,17 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
  * @gid: msgq group id
  * @mode: msgq mode (permissions)
  *
- * Returns 0 for success or NULL context or < 0 on error.
+ * Called only after audit_ipc_obj().
  */
-int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
+void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
 {
-	struct audit_aux_data_ipcctl *ax;
 	struct audit_context *context = current->audit_context;
 
-	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
-	if (!ax)
-		return -ENOMEM;
-
-	ax->qbytes = qbytes;
-	ax->uid = uid;
-	ax->gid = gid;
-	ax->mode = mode;
-
-	ax->d.type = AUDIT_IPC_SET_PERM;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
+	context->ipc.qbytes = qbytes;
+	context->ipc.perm_uid = uid;
+	context->ipc.perm_gid = gid;
+	context->ipc.perm_mode = mode;
+	context->ipc.has_perm = 1;
 }
 
 int audit_bprm(struct linux_binprm *bprm)
-- 
cgit v1.2.3


From 7392906ea915b9a2c14dea32b3604b4e178f82f7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 10 Dec 2008 06:58:59 -0500
Subject: sanitize audit_mq_getsetattr()

* get rid of allocations
* make it return void
* don't duplicate parts of audit_dummy_context()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 54 +++++++++++++++++-------------------------------------
 1 file changed, 17 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fbed62e05bc..c50178c7e24 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -145,12 +145,6 @@ struct audit_aux_data_mq_notify {
 	struct sigevent 	notification;
 };
 
-struct audit_aux_data_mq_getsetattr {
-	struct audit_aux_data	d;
-	mqd_t			mqdes;
-	struct mq_attr 		mqstat;
-};
-
 struct audit_aux_data_execve {
 	struct audit_aux_data	d;
 	int argc;
@@ -248,6 +242,10 @@ struct audit_context {
 			mode_t			perm_mode;
 			unsigned long		qbytes;
 		} ipc;
+		struct {
+			mqd_t			mqdes;
+			struct mq_attr 		mqstat;
+		} mq_getsetattr;
 	};
 
 #if AUDIT_DEBUG
@@ -1269,6 +1267,15 @@ static void show_special(struct audit_context *context, int *call_panic)
 				return;
 		}
 		break; }
+	case AUDIT_MQ_GETSETATTR: {
+		struct mq_attr *attr = &context->mq_getsetattr.mqstat;
+		audit_log_format(ab,
+			"mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
+			"mq_curmsgs=%ld ",
+			context->mq_getsetattr.mqdes,
+			attr->mq_flags, attr->mq_maxmsg,
+			attr->mq_msgsize, attr->mq_curmsgs);
+		break; }
 	}
 	audit_log_end(ab);
 }
@@ -1377,16 +1384,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 				axi->notification.sigev_signo);
 			break; }
 
-		case AUDIT_MQ_GETSETATTR: {
-			struct audit_aux_data_mq_getsetattr *axi = (void *)aux;
-			audit_log_format(ab,
-				"mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
-				"mq_curmsgs=%ld ",
-				axi->mqdes,
-				axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg,
-				axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs);
-			break; }
-
 		case AUDIT_EXECVE: {
 			struct audit_aux_data_execve *axi = (void *)aux;
 			audit_log_execve_info(context, &ab, axi);
@@ -2316,30 +2313,13 @@ int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification)
  * @mqdes: MQ descriptor
  * @mqstat: MQ flags
  *
- * Returns 0 for success or NULL context or < 0 on error.
  */
-int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
+void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
 {
-	struct audit_aux_data_mq_getsetattr *ax;
 	struct audit_context *context = current->audit_context;
-
-	if (!audit_enabled)
-		return 0;
-
-	if (likely(!context))
-		return 0;
-
-	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
-	if (!ax)
-		return -ENOMEM;
-
-	ax->mqdes = mqdes;
-	ax->mqstat = *mqstat;
-
-	ax->d.type = AUDIT_MQ_GETSETATTR;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
+	context->mq_getsetattr.mqdes = mqdes;
+	context->mq_getsetattr.mqstat = *mqstat;
+	context->type = AUDIT_MQ_GETSETATTR;
 }
 
 /**
-- 
cgit v1.2.3


From 20114f71b27cafeb7c7e41d2b0f0b68c3fbb022b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 10 Dec 2008 07:16:12 -0500
Subject: sanitize audit_mq_notify()

* don't copy_from_user() twice
* don't bother with allocations
* don't duplicate parts of audit_dummy_context()
* make it return void

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 56 ++++++++++++++++----------------------------------------
 1 file changed, 16 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index c50178c7e24..3ece960de89 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -139,12 +139,6 @@ struct audit_aux_data_mq_sendrecv {
 	struct timespec		abs_timeout;
 };
 
-struct audit_aux_data_mq_notify {
-	struct audit_aux_data	d;
-	mqd_t			mqdes;
-	struct sigevent 	notification;
-};
-
 struct audit_aux_data_execve {
 	struct audit_aux_data	d;
 	int argc;
@@ -246,6 +240,10 @@ struct audit_context {
 			mqd_t			mqdes;
 			struct mq_attr 		mqstat;
 		} mq_getsetattr;
+		struct {
+			mqd_t			mqdes;
+			int			sigev_signo;
+		} mq_notify;
 	};
 
 #if AUDIT_DEBUG
@@ -1267,6 +1265,11 @@ static void show_special(struct audit_context *context, int *call_panic)
 				return;
 		}
 		break; }
+	case AUDIT_MQ_NOTIFY: {
+		audit_log_format(ab, "mqdes=%d sigev_signo=%d",
+				context->mq_notify.mqdes,
+				context->mq_notify.sigev_signo);
+		break; }
 	case AUDIT_MQ_GETSETATTR: {
 		struct mq_attr *attr = &context->mq_getsetattr.mqstat;
 		audit_log_format(ab,
@@ -1376,14 +1379,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 				axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec);
 			break; }
 
-		case AUDIT_MQ_NOTIFY: {
-			struct audit_aux_data_mq_notify *axi = (void *)aux;
-			audit_log_format(ab,
-				"mqdes=%d sigev_signo=%d",
-				axi->mqdes,
-				axi->notification.sigev_signo);
-			break; }
-
 		case AUDIT_EXECVE: {
 			struct audit_aux_data_execve *axi = (void *)aux;
 			audit_log_execve_info(context, &ab, axi);
@@ -2274,38 +2269,19 @@ int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
  * @mqdes: MQ descriptor
  * @u_notification: Notification event
  *
- * Returns 0 for success or NULL context or < 0 on error.
  */
 
-int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification)
+void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
 {
-	struct audit_aux_data_mq_notify *ax;
 	struct audit_context *context = current->audit_context;
 
-	if (!audit_enabled)
-		return 0;
-
-	if (likely(!context))
-		return 0;
-
-	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
-	if (!ax)
-		return -ENOMEM;
-
-	if (u_notification != NULL) {
-		if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) {
-			kfree(ax);
-			return -EFAULT;
-		}
-	} else
-		memset(&ax->notification, 0, sizeof(ax->notification));
-
-	ax->mqdes = mqdes;
+	if (notification)
+		context->mq_notify.sigev_signo = notification->sigev_signo;
+	else
+		context->mq_notify.sigev_signo = 0;
 
-	ax->d.type = AUDIT_MQ_NOTIFY;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
+	context->mq_notify.mqdes = mqdes;
+	context->type = AUDIT_MQ_NOTIFY;
 }
 
 /**
-- 
cgit v1.2.3


From c32c8af43b9adde8d6f938d8e6328c13b8de79ac Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 Dec 2008 03:46:48 -0500
Subject: sanitize AUDIT_MQ_SENDRECV

* logging the original value of *msg_prio in mq_timedreceive(2)
  is insane - the argument is write-only (i.e. syscall always
  ignores the original value and only overwrites it).
* merge __audit_mq_timed{send,receive}
* don't do copy_from_user() twice
* don't mess with allocations in auditsc part
* ... and don't bother checking !audit_enabled and !context in there -
  we'd already checked for audit_dummy_context().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 127 +++++++++++++------------------------------------------
 1 file changed, 29 insertions(+), 98 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3ece960de89..140c4745347 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -131,14 +131,6 @@ struct audit_aux_data_mq_open {
 	struct mq_attr		attr;
 };
 
-struct audit_aux_data_mq_sendrecv {
-	struct audit_aux_data	d;
-	mqd_t			mqdes;
-	size_t			msg_len;
-	unsigned int		msg_prio;
-	struct timespec		abs_timeout;
-};
-
 struct audit_aux_data_execve {
 	struct audit_aux_data	d;
 	int argc;
@@ -244,6 +236,12 @@ struct audit_context {
 			mqd_t			mqdes;
 			int			sigev_signo;
 		} mq_notify;
+		struct {
+			mqd_t			mqdes;
+			size_t			msg_len;
+			unsigned int		msg_prio;
+			struct timespec		abs_timeout;
+		} mq_sendrecv;
 	};
 
 #if AUDIT_DEBUG
@@ -1265,6 +1263,16 @@ static void show_special(struct audit_context *context, int *call_panic)
 				return;
 		}
 		break; }
+	case AUDIT_MQ_SENDRECV: {
+		audit_log_format(ab,
+			"mqdes=%d msg_len=%zd msg_prio=%u "
+			"abs_timeout_sec=%ld abs_timeout_nsec=%ld",
+			context->mq_sendrecv.mqdes,
+			context->mq_sendrecv.msg_len,
+			context->mq_sendrecv.msg_prio,
+			context->mq_sendrecv.abs_timeout.tv_sec,
+			context->mq_sendrecv.abs_timeout.tv_nsec);
+		break; }
 	case AUDIT_MQ_NOTIFY: {
 		audit_log_format(ab, "mqdes=%d sigev_signo=%d",
 				context->mq_notify.mqdes,
@@ -1370,15 +1378,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 				axi->attr.mq_curmsgs);
 			break; }
 
-		case AUDIT_MQ_SENDRECV: {
-			struct audit_aux_data_mq_sendrecv *axi = (void *)aux;
-			audit_log_format(ab,
-				"mqdes=%d msg_len=%zd msg_prio=%u "
-				"abs_timeout_sec=%ld abs_timeout_nsec=%ld",
-				axi->mqdes, axi->msg_len, axi->msg_prio,
-				axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec);
-			break; }
-
 		case AUDIT_EXECVE: {
 			struct audit_aux_data_execve *axi = (void *)aux;
 			audit_log_execve_info(context, &ab, axi);
@@ -2171,97 +2170,29 @@ int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
 }
 
 /**
- * __audit_mq_timedsend - record audit data for a POSIX MQ timed send
+ * __audit_mq_sendrecv - record audit data for a POSIX MQ timed send/receive
  * @mqdes: MQ descriptor
  * @msg_len: Message length
  * @msg_prio: Message priority
- * @u_abs_timeout: Message timeout in absolute time
- *
- * Returns 0 for success or NULL context or < 0 on error.
- */
-int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
-			const struct timespec __user *u_abs_timeout)
-{
-	struct audit_aux_data_mq_sendrecv *ax;
-	struct audit_context *context = current->audit_context;
-
-	if (!audit_enabled)
-		return 0;
-
-	if (likely(!context))
-		return 0;
-
-	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
-	if (!ax)
-		return -ENOMEM;
-
-	if (u_abs_timeout != NULL) {
-		if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
-			kfree(ax);
-			return -EFAULT;
-		}
-	} else
-		memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
-
-	ax->mqdes = mqdes;
-	ax->msg_len = msg_len;
-	ax->msg_prio = msg_prio;
-
-	ax->d.type = AUDIT_MQ_SENDRECV;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
-}
-
-/**
- * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
- * @mqdes: MQ descriptor
- * @msg_len: Message length
- * @u_msg_prio: Message priority
- * @u_abs_timeout: Message timeout in absolute time
+ * @abs_timeout: Message timeout in absolute time
  *
- * Returns 0 for success or NULL context or < 0 on error.
  */
-int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
-				unsigned int __user *u_msg_prio,
-				const struct timespec __user *u_abs_timeout)
+void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
+			const struct timespec *abs_timeout)
 {
-	struct audit_aux_data_mq_sendrecv *ax;
 	struct audit_context *context = current->audit_context;
+	struct timespec *p = &context->mq_sendrecv.abs_timeout;
 
-	if (!audit_enabled)
-		return 0;
-
-	if (likely(!context))
-		return 0;
-
-	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
-	if (!ax)
-		return -ENOMEM;
-
-	if (u_msg_prio != NULL) {
-		if (get_user(ax->msg_prio, u_msg_prio)) {
-			kfree(ax);
-			return -EFAULT;
-		}
-	} else
-		ax->msg_prio = 0;
-
-	if (u_abs_timeout != NULL) {
-		if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
-			kfree(ax);
-			return -EFAULT;
-		}
-	} else
-		memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
+	if (abs_timeout)
+		memcpy(p, abs_timeout, sizeof(struct timespec));
+	else
+		memset(p, 0, sizeof(struct timespec));
 
-	ax->mqdes = mqdes;
-	ax->msg_len = msg_len;
+	context->mq_sendrecv.mqdes = mqdes;
+	context->mq_sendrecv.msg_len = msg_len;
+	context->mq_sendrecv.msg_prio = msg_prio;
 
-	ax->d.type = AUDIT_MQ_SENDRECV;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
+	context->type = AUDIT_MQ_SENDRECV;
 }
 
 /**
-- 
cgit v1.2.3


From 564f6993ffef656aebaf46cf2f1f6cb4f5c97207 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 Dec 2008 04:02:26 -0500
Subject: sanitize audit_mq_open()

* don't bother with allocations
* don't do double copy_from_user()
* don't duplicate parts of check for audit_dummy_context()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 65 ++++++++++++++++++++------------------------------------
 1 file changed, 23 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 140c4745347..83e946f1cdd 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -124,13 +124,6 @@ struct audit_aux_data {
 /* Number of target pids per aux struct. */
 #define AUDIT_AUX_PIDS	16
 
-struct audit_aux_data_mq_open {
-	struct audit_aux_data	d;
-	int			oflag;
-	mode_t			mode;
-	struct mq_attr		attr;
-};
-
 struct audit_aux_data_execve {
 	struct audit_aux_data	d;
 	int argc;
@@ -242,6 +235,11 @@ struct audit_context {
 			unsigned int		msg_prio;
 			struct timespec		abs_timeout;
 		} mq_sendrecv;
+		struct {
+			int			oflag;
+			mode_t			mode;
+			struct mq_attr		attr;
+		} mq_open;
 	};
 
 #if AUDIT_DEBUG
@@ -1263,6 +1261,16 @@ static void show_special(struct audit_context *context, int *call_panic)
 				return;
 		}
 		break; }
+	case AUDIT_MQ_OPEN: {
+		audit_log_format(ab,
+			"oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
+			"mq_msgsize=%ld mq_curmsgs=%ld",
+			context->mq_open.oflag, context->mq_open.mode,
+			context->mq_open.attr.mq_flags,
+			context->mq_open.attr.mq_maxmsg,
+			context->mq_open.attr.mq_msgsize,
+			context->mq_open.attr.mq_curmsgs);
+		break; }
 	case AUDIT_MQ_SENDRECV: {
 		audit_log_format(ab,
 			"mqdes=%d msg_len=%zd msg_prio=%u "
@@ -1368,15 +1376,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			continue; /* audit_panic has been called */
 
 		switch (aux->type) {
-		case AUDIT_MQ_OPEN: {
-			struct audit_aux_data_mq_open *axi = (void *)aux;
-			audit_log_format(ab,
-				"oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
-				"mq_msgsize=%ld mq_curmsgs=%ld",
-				axi->oflag, axi->mode, axi->attr.mq_flags,
-				axi->attr.mq_maxmsg, axi->attr.mq_msgsize,
-				axi->attr.mq_curmsgs);
-			break; }
 
 		case AUDIT_EXECVE: {
 			struct audit_aux_data_execve *axi = (void *)aux;
@@ -2135,38 +2134,20 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
  * @mode: mode bits
  * @u_attr: queue attributes
  *
- * Returns 0 for success or NULL context or < 0 on error.
  */
-int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
+void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
 {
-	struct audit_aux_data_mq_open *ax;
 	struct audit_context *context = current->audit_context;
 
-	if (!audit_enabled)
-		return 0;
-
-	if (likely(!context))
-		return 0;
-
-	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
-	if (!ax)
-		return -ENOMEM;
-
-	if (u_attr != NULL) {
-		if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) {
-			kfree(ax);
-			return -EFAULT;
-		}
-	} else
-		memset(&ax->attr, 0, sizeof(ax->attr));
+	if (attr)
+		memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr));
+	else
+		memset(&context->mq_open.attr, 0, sizeof(struct mq_attr));
 
-	ax->oflag = oflag;
-	ax->mode = mode;
+	context->mq_open.oflag = oflag;
+	context->mq_open.mode = mode;
 
-	ax->d.type = AUDIT_MQ_OPEN;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
+	context->type = AUDIT_MQ_OPEN;
 }
 
 /**
-- 
cgit v1.2.3


From 157cf649a735a2f7e8dba0ed08e6e38b6c30d886 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 Dec 2008 04:57:47 -0500
Subject: sanitize audit_fd_pair()

* no allocations
* return void

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 44 ++++++++++++++------------------------------
 1 file changed, 14 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 83e946f1cdd..327e65d5067 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -131,11 +131,6 @@ struct audit_aux_data_execve {
 	struct mm_struct *mm;
 };
 
-struct audit_aux_data_fd_pair {
-	struct	audit_aux_data d;
-	int	fd[2];
-};
-
 struct audit_aux_data_pids {
 	struct audit_aux_data	d;
 	pid_t			target_pid[AUDIT_AUX_PIDS];
@@ -241,6 +236,7 @@ struct audit_context {
 			struct mq_attr		attr;
 		} mq_open;
 	};
+	int fds[2];
 
 #if AUDIT_DEBUG
 	int		    put_count;
@@ -1382,11 +1378,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			audit_log_execve_info(context, &ab, axi);
 			break; }
 
-		case AUDIT_FD_PAIR: {
-			struct audit_aux_data_fd_pair *axs = (void *)aux;
-			audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
-			break; }
-
 		case AUDIT_BPRM_FCAPS: {
 			struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
 			audit_log_format(ab, "fver=%x", axs->fcap_ver);
@@ -1416,6 +1407,15 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	if (context->type)
 		show_special(context, &call_panic);
 
+	if (context->fds[0] >= 0) {
+		ab = audit_log_start(context, GFP_KERNEL, AUDIT_FD_PAIR);
+		if (ab) {
+			audit_log_format(ab, "fd0=%d fd1=%d",
+					context->fds[0], context->fds[1]);
+			audit_log_end(ab);
+		}
+	}
+
 	if (context->sockaddr_len) {
 		ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR);
 		if (ab) {
@@ -1696,6 +1696,7 @@ void audit_syscall_exit(int valid, long return_code)
 		context->target_sid = 0;
 		context->sockaddr_len = 0;
 		context->type = 0;
+		context->fds[0] = -1;
 		kfree(context->filterkey);
 		context->filterkey = NULL;
 		tsk->audit_context = context;
@@ -2291,29 +2292,12 @@ void audit_socketcall(int nargs, unsigned long *args)
  * @fd1: the first file descriptor
  * @fd2: the second file descriptor
  *
- * Returns 0 for success or NULL context or < 0 on error.
  */
-int __audit_fd_pair(int fd1, int fd2)
+void __audit_fd_pair(int fd1, int fd2)
 {
 	struct audit_context *context = current->audit_context;
-	struct audit_aux_data_fd_pair *ax;
-
-	if (likely(!context)) {
-		return 0;
-	}
-
-	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
-	if (!ax) {
-		return -ENOMEM;
-	}
-
-	ax->fd[0] = fd1;
-	ax->fd[1] = fd2;
-
-	ax->d.type = AUDIT_FD_PAIR;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
+	context->fds[0] = fd1;
+	context->fds[1] = fd2;
 }
 
 /**
-- 
cgit v1.2.3


From 57f71a0af4244d9ba3c0bce74b1d2e66e8d520bd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 4 Jan 2009 14:52:57 -0500
Subject: sanitize audit_log_capset()

* no allocations
* return void
* don't duplicate checked for dummy context

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c    | 44 ++++++++++++++++----------------------------
 kernel/capability.c |  4 +---
 2 files changed, 17 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 327e65d5067..c76a58215f5 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -235,6 +235,10 @@ struct audit_context {
 			mode_t			mode;
 			struct mq_attr		attr;
 		} mq_open;
+		struct {
+			pid_t			pid;
+			struct audit_cap_data	cap;
+		} capset;
 	};
 	int fds[2];
 
@@ -1291,6 +1295,12 @@ static void show_special(struct audit_context *context, int *call_panic)
 			attr->mq_flags, attr->mq_maxmsg,
 			attr->mq_msgsize, attr->mq_curmsgs);
 		break; }
+	case AUDIT_CAPSET: {
+		audit_log_format(ab, "pid=%d", context->capset.pid);
+		audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable);
+		audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
+		audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
+		break; }
 	}
 	audit_log_end(ab);
 }
@@ -1392,14 +1402,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			audit_log_cap(ab, "new_pe", &axs->new_pcap.effective);
 			break; }
 
-		case AUDIT_CAPSET: {
-			struct audit_aux_data_capset *axs = (void *)aux;
-			audit_log_format(ab, "pid=%d", axs->pid);
-			audit_log_cap(ab, "cap_pi", &axs->cap.inheritable);
-			audit_log_cap(ab, "cap_pp", &axs->cap.permitted);
-			audit_log_cap(ab, "cap_pe", &axs->cap.effective);
-			break; }
-
 		}
 		audit_log_end(ab);
 	}
@@ -2456,29 +2458,15 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
  * Record the aguments userspace sent to sys_capset for later printing by the
  * audit system if applicable
  */
-int __audit_log_capset(pid_t pid,
+void __audit_log_capset(pid_t pid,
 		       const struct cred *new, const struct cred *old)
 {
-	struct audit_aux_data_capset *ax;
 	struct audit_context *context = current->audit_context;
-
-	if (likely(!audit_enabled || !context || context->dummy))
-		return 0;
-
-	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
-	if (!ax)
-		return -ENOMEM;
-
-	ax->d.type = AUDIT_CAPSET;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-
-	ax->pid = pid;
-	ax->cap.effective   = new->cap_effective;
-	ax->cap.inheritable = new->cap_effective;
-	ax->cap.permitted   = new->cap_permitted;
-
-	return 0;
+	context->capset.pid = pid;
+	context->capset.cap.effective   = new->cap_effective;
+	context->capset.cap.inheritable = new->cap_effective;
+	context->capset.cap.permitted   = new->cap_permitted;
+	context->type = AUDIT_CAPSET;
 }
 
 /**
diff --git a/kernel/capability.c b/kernel/capability.c
index 36b4b4daebe..c598d9d5be4 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -280,9 +280,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	if (ret < 0)
 		goto error;
 
-	ret = audit_log_capset(pid, new, current_cred());
-	if (ret < 0)
-		return ret;
+	audit_log_capset(pid, new, current_cred());
 
 	return commit_creds(new);
 
-- 
cgit v1.2.3


From 1a9d0797b8977d413435277bf9661efbbd584693 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 Dec 2008 12:04:02 -0500
Subject: audit_update_lsm_rules() misses the audit_inode_hash[] ones

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditfilter.c | 77 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 47 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 9fd85a4640a..0febaa0f784 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1778,6 +1778,41 @@ unlock_and_return:
 	return result;
 }
 
+static int update_lsm_rule(struct audit_entry *entry)
+{
+	struct audit_entry *nentry;
+	struct audit_watch *watch;
+	struct audit_tree *tree;
+	int err = 0;
+
+	if (!security_audit_rule_known(&entry->rule))
+		return 0;
+
+	watch = entry->rule.watch;
+	tree = entry->rule.tree;
+	nentry = audit_dupe_rule(&entry->rule, watch);
+	if (IS_ERR(nentry)) {
+		/* save the first error encountered for the
+		 * return value */
+		err = PTR_ERR(nentry);
+		audit_panic("error updating LSM filters");
+		if (watch)
+			list_del(&entry->rule.rlist);
+		list_del_rcu(&entry->list);
+	} else {
+		if (watch) {
+			list_add(&nentry->rule.rlist, &watch->rules);
+			list_del(&entry->rule.rlist);
+		} else if (tree)
+			list_replace_init(&entry->rule.rlist,
+				     &nentry->rule.rlist);
+		list_replace_rcu(&entry->list, &nentry->list);
+	}
+	call_rcu(&entry->rcu, audit_free_rule_rcu);
+
+	return err;
+}
+
 /* This function will re-initialize the lsm_rule field of all applicable rules.
  * It will traverse the filter lists serarching for rules that contain LSM
  * specific filter fields.  When such a rule is found, it is copied, the
@@ -1785,42 +1820,24 @@ unlock_and_return:
  * updated rule. */
 int audit_update_lsm_rules(void)
 {
-	struct audit_entry *entry, *n, *nentry;
-	struct audit_watch *watch;
-	struct audit_tree *tree;
+	struct audit_entry *e, *n;
 	int i, err = 0;
 
 	/* audit_filter_mutex synchronizes the writers */
 	mutex_lock(&audit_filter_mutex);
 
 	for (i = 0; i < AUDIT_NR_FILTERS; i++) {
-		list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) {
-			if (!security_audit_rule_known(&entry->rule))
-				continue;
-
-			watch = entry->rule.watch;
-			tree = entry->rule.tree;
-			nentry = audit_dupe_rule(&entry->rule, watch);
-			if (IS_ERR(nentry)) {
-				/* save the first error encountered for the
-				 * return value */
-				if (!err)
-					err = PTR_ERR(nentry);
-				audit_panic("error updating LSM filters");
-				if (watch)
-					list_del(&entry->rule.rlist);
-				list_del_rcu(&entry->list);
-			} else {
-				if (watch) {
-					list_add(&nentry->rule.rlist,
-						 &watch->rules);
-					list_del(&entry->rule.rlist);
-				} else if (tree)
-					list_replace_init(&entry->rule.rlist,
-						     &nentry->rule.rlist);
-				list_replace_rcu(&entry->list, &nentry->list);
-			}
-			call_rcu(&entry->rcu, audit_free_rule_rcu);
+		list_for_each_entry_safe(e, n, &audit_filter_list[i], list) {
+			int res = update_lsm_rule(e);
+			if (!err)
+				err = res;
+		}
+	}
+	for (i=0; i< AUDIT_INODE_BUCKETS; i++) {
+		list_for_each_entry_safe(e, n, &audit_inode_hash[i], list) {
+			int res = update_lsm_rule(e);
+			if (!err)
+				err = res;
 		}
 	}
 
-- 
cgit v1.2.3


From 0590b9335a1c72a3f0defcc6231287f7817e07c8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 Dec 2008 23:45:27 -0500
Subject: fixing audit rule ordering mess, part 1

Problem: ordering between the rules on exit chain is currently lost;
all watch and inode rules are listed after everything else _and_
exit,never on one kind doesn't stop exit,always on another from
being matched.

Solution: assign priorities to rules, keep track of the current
highest-priority matching rule and its result (always/never).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.h       |  5 +---
 kernel/auditfilter.c | 17 +++++++++--
 kernel/auditsc.c     | 79 ++++++++++++++++++++++++++++------------------------
 3 files changed, 58 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.h b/kernel/audit.h
index 9d6717412fe..16f18cac661 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -159,11 +159,8 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
 		return __audit_signal_info(sig, t);
 	return 0;
 }
-extern enum audit_state audit_filter_inodes(struct task_struct *,
-					    struct audit_context *);
-extern void audit_set_auditable(struct audit_context *);
+extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
 #else
 #define audit_signal_info(s,t) AUDIT_DISABLED
 #define audit_filter_inodes(t,c) AUDIT_DISABLED
-#define audit_set_auditable(c)
 #endif
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 0febaa0f784..995a2e86808 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -919,6 +919,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
 	new->action = old->action;
 	for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
 		new->mask[i] = old->mask[i];
+	new->prio = old->prio;
 	new->buflen = old->buflen;
 	new->inode_f = old->inode_f;
 	new->watch = NULL;
@@ -987,9 +988,8 @@ static void audit_update_watch(struct audit_parent *parent,
 
 		/* If the update involves invalidating rules, do the inode-based
 		 * filtering now, so we don't omit records. */
-		if (invalidating && current->audit_context &&
-		    audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT)
-			audit_set_auditable(current->audit_context);
+		if (invalidating && current->audit_context)
+			audit_filter_inodes(current, current->audit_context);
 
 		nwatch = audit_dupe_watch(owatch);
 		if (IS_ERR(nwatch)) {
@@ -1258,6 +1258,9 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
 	return ret;
 }
 
+static u64 prio_low = ~0ULL/2;
+static u64 prio_high = ~0ULL/2 - 1;
+
 /* Add rule to given filterlist if not a duplicate. */
 static inline int audit_add_rule(struct audit_entry *entry,
 				 struct list_head *list)
@@ -1319,6 +1322,14 @@ static inline int audit_add_rule(struct audit_entry *entry,
 		}
 	}
 
+	entry->rule.prio = ~0ULL;
+	if (entry->rule.listnr == AUDIT_FILTER_EXIT) {
+		if (entry->rule.flags & AUDIT_FILTER_PREPEND)
+			entry->rule.prio = ++prio_high;
+		else
+			entry->rule.prio = --prio_low;
+	}
+
 	if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
 		list_add_rcu(&entry->list, list);
 		entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index c76a58215f5..19d2c2747c8 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -165,14 +165,14 @@ struct audit_tree_refs {
 struct audit_context {
 	int		    dummy;	/* must be the first element */
 	int		    in_syscall;	/* 1 if task is in a syscall */
-	enum audit_state    state;
+	enum audit_state    state, current_state;
 	unsigned int	    serial;     /* serial number for record */
 	struct timespec	    ctime;      /* time of syscall entry */
 	int		    major;      /* syscall number */
 	unsigned long	    argv[4];    /* syscall arguments */
 	int		    return_valid; /* return code is valid */
 	long		    return_code;/* syscall return code */
-	int		    auditable;  /* 1 if record should be written */
+	u64		    prio;
 	int		    name_count;
 	struct audit_names  names[AUDIT_NAMES];
 	char *		    filterkey;	/* key for rule that triggered record */
@@ -630,8 +630,16 @@ static int audit_filter_rules(struct task_struct *tsk,
 			return 0;
 		}
 	}
-	if (rule->filterkey && ctx)
-		ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
+
+	if (ctx) {
+		if (rule->prio <= ctx->prio)
+			return 0;
+		if (rule->filterkey) {
+			kfree(ctx->filterkey);
+			ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
+		}
+		ctx->prio = rule->prio;
+	}
 	switch (rule->action) {
 	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
 	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break;
@@ -685,6 +693,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
 			    audit_filter_rules(tsk, &e->rule, ctx, NULL,
 					       &state)) {
 				rcu_read_unlock();
+				ctx->current_state = state;
 				return state;
 			}
 		}
@@ -698,15 +707,14 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
  * buckets applicable to the inode numbers in audit_names[].
  * Regarding audit_state, same rules apply as for audit_filter_syscall().
  */
-enum audit_state audit_filter_inodes(struct task_struct *tsk,
-				     struct audit_context *ctx)
+void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
 {
 	int i;
 	struct audit_entry *e;
 	enum audit_state state;
 
 	if (audit_pid && tsk->tgid == audit_pid)
-		return AUDIT_DISABLED;
+		return;
 
 	rcu_read_lock();
 	for (i = 0; i < ctx->name_count; i++) {
@@ -723,17 +731,20 @@ enum audit_state audit_filter_inodes(struct task_struct *tsk,
 			if ((e->rule.mask[word] & bit) == bit &&
 			    audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
 				rcu_read_unlock();
-				return state;
+				ctx->current_state = state;
+				return;
 			}
 		}
 	}
 	rcu_read_unlock();
-	return AUDIT_BUILD_CONTEXT;
 }
 
-void audit_set_auditable(struct audit_context *ctx)
+static void audit_set_auditable(struct audit_context *ctx)
 {
-	ctx->auditable = 1;
+	if (!ctx->prio) {
+		ctx->prio = 1;
+		ctx->current_state = AUDIT_RECORD_CONTEXT;
+	}
 }
 
 static inline struct audit_context *audit_get_context(struct task_struct *tsk,
@@ -764,23 +775,11 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 	else
 		context->return_code  = return_code;
 
-	if (context->in_syscall && !context->dummy && !context->auditable) {
-		enum audit_state state;
-
-		state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
-		if (state == AUDIT_RECORD_CONTEXT) {
-			context->auditable = 1;
-			goto get_context;
-		}
-
-		state = audit_filter_inodes(tsk, context);
-		if (state == AUDIT_RECORD_CONTEXT)
-			context->auditable = 1;
-
+	if (context->in_syscall && !context->dummy) {
+		audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
+		audit_filter_inodes(tsk, context);
 	}
 
-get_context:
-
 	tsk->audit_context = NULL;
 	return context;
 }
@@ -790,8 +789,7 @@ static inline void audit_free_names(struct audit_context *context)
 	int i;
 
 #if AUDIT_DEBUG == 2
-	if (context->auditable
-	    ||context->put_count + context->ino_count != context->name_count) {
+	if (context->put_count + context->ino_count != context->name_count) {
 		printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
 		       " name_count=%d put_count=%d"
 		       " ino_count=%d [NOT freeing]\n",
@@ -842,6 +840,7 @@ static inline void audit_zero_context(struct audit_context *context,
 {
 	memset(context, 0, sizeof(*context));
 	context->state      = state;
+	context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
 }
 
 static inline struct audit_context *audit_alloc_context(enum audit_state state)
@@ -1543,7 +1542,7 @@ void audit_free(struct task_struct *tsk)
 	 * We use GFP_ATOMIC here because we might be doing this
 	 * in the context of the idle thread */
 	/* that can happen only if we are called from do_exit() */
-	if (context->in_syscall && context->auditable)
+	if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
 		audit_log_exit(context, tsk);
 
 	audit_free_context(context);
@@ -1627,15 +1626,17 @@ void audit_syscall_entry(int arch, int major,
 
 	state = context->state;
 	context->dummy = !audit_n_rules;
-	if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT))
+	if (!context->dummy && state == AUDIT_BUILD_CONTEXT) {
+		context->prio = 0;
 		state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
+	}
 	if (likely(state == AUDIT_DISABLED))
 		return;
 
 	context->serial     = 0;
 	context->ctime      = CURRENT_TIME;
 	context->in_syscall = 1;
-	context->auditable  = !!(state == AUDIT_RECORD_CONTEXT);
+	context->current_state  = state;
 	context->ppid       = 0;
 }
 
@@ -1643,17 +1644,20 @@ void audit_finish_fork(struct task_struct *child)
 {
 	struct audit_context *ctx = current->audit_context;
 	struct audit_context *p = child->audit_context;
-	if (!p || !ctx || !ctx->auditable)
+	if (!p || !ctx)
+		return;
+	if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT)
 		return;
 	p->arch = ctx->arch;
 	p->major = ctx->major;
 	memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
 	p->ctime = ctx->ctime;
 	p->dummy = ctx->dummy;
-	p->auditable = ctx->auditable;
 	p->in_syscall = ctx->in_syscall;
 	p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
 	p->ppid = current->pid;
+	p->prio = ctx->prio;
+	p->current_state = ctx->current_state;
 }
 
 /**
@@ -1677,11 +1681,11 @@ void audit_syscall_exit(int valid, long return_code)
 	if (likely(!context))
 		return;
 
-	if (context->in_syscall && context->auditable)
+	if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
 		audit_log_exit(context, tsk);
 
 	context->in_syscall = 0;
-	context->auditable  = 0;
+	context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
 
 	if (context->previous) {
 		struct audit_context *new_context = context->previous;
@@ -2091,7 +2095,10 @@ int auditsc_get_stamp(struct audit_context *ctx,
 	t->tv_sec  = ctx->ctime.tv_sec;
 	t->tv_nsec = ctx->ctime.tv_nsec;
 	*serial    = ctx->serial;
-	ctx->auditable = 1;
+	if (!ctx->prio) {
+		ctx->prio = 1;
+		ctx->current_state = AUDIT_RECORD_CONTEXT;
+	}
 	return 1;
 }
 
-- 
cgit v1.2.3


From e45aa212ea81d39b38ba158df344dc3a500153e5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 15 Dec 2008 01:17:50 -0500
Subject: audit rules ordering, part 2

Fix the actual rule listing; add per-type lists _not_ used for matching,
with all exit,... sitting on one such list.  Simplifies "do something
for all rules" logics, while we are at it...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit_tree.c  |  1 +
 kernel/auditfilter.c | 95 +++++++++++++++++++++-------------------------------
 2 files changed, 40 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 8b509441f49..48bddad2a3d 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -450,6 +450,7 @@ static void kill_rules(struct audit_tree *tree)
 			audit_log_end(ab);
 			rule->tree = NULL;
 			list_del_rcu(&entry->list);
+			list_del(&entry->rule.list);
 			call_rcu(&entry->rcu, audit_free_rule_rcu);
 		}
 	}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 995a2e86808..5d4edc6f7a3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -86,6 +86,14 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 #error Fix audit_filter_list initialiser
 #endif
 };
+static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
+	LIST_HEAD_INIT(audit_rules_list[0]),
+	LIST_HEAD_INIT(audit_rules_list[1]),
+	LIST_HEAD_INIT(audit_rules_list[2]),
+	LIST_HEAD_INIT(audit_rules_list[3]),
+	LIST_HEAD_INIT(audit_rules_list[4]),
+	LIST_HEAD_INIT(audit_rules_list[5]),
+};
 
 DEFINE_MUTEX(audit_filter_mutex);
 
@@ -1007,12 +1015,15 @@ static void audit_update_watch(struct audit_parent *parent,
 			list_del_rcu(&oentry->list);
 
 			nentry = audit_dupe_rule(&oentry->rule, nwatch);
-			if (IS_ERR(nentry))
+			if (IS_ERR(nentry)) {
+				list_del(&oentry->rule.list);
 				audit_panic("error updating watch, removing");
-			else {
+			} else {
 				int h = audit_hash_ino((u32)ino);
 				list_add(&nentry->rule.rlist, &nwatch->rules);
 				list_add_rcu(&nentry->list, &audit_inode_hash[h]);
+				list_replace(&oentry->rule.list,
+					     &nentry->rule.list);
 			}
 
 			call_rcu(&oentry->rcu, audit_free_rule_rcu);
@@ -1077,6 +1088,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 				audit_log_end(ab);
 			}
 			list_del(&r->rlist);
+			list_del(&r->list);
 			list_del_rcu(&e->list);
 			call_rcu(&e->rcu, audit_free_rule_rcu);
 		}
@@ -1331,9 +1343,13 @@ static inline int audit_add_rule(struct audit_entry *entry,
 	}
 
 	if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
+		list_add(&entry->rule.list,
+			 &audit_rules_list[entry->rule.listnr]);
 		list_add_rcu(&entry->list, list);
 		entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
 	} else {
+		list_add_tail(&entry->rule.list,
+			      &audit_rules_list[entry->rule.listnr]);
 		list_add_tail_rcu(&entry->list, list);
 	}
 #ifdef CONFIG_AUDITSYSCALL
@@ -1415,6 +1431,7 @@ static inline int audit_del_rule(struct audit_entry *entry,
 		audit_remove_tree_rule(&e->rule);
 
 	list_del_rcu(&e->list);
+	list_del(&e->rule.list);
 	call_rcu(&e->rcu, audit_free_rule_rcu);
 
 #ifdef CONFIG_AUDITSYSCALL
@@ -1443,30 +1460,16 @@ out:
 static void audit_list(int pid, int seq, struct sk_buff_head *q)
 {
 	struct sk_buff *skb;
-	struct audit_entry *entry;
+	struct audit_krule *r;
 	int i;
 
 	/* This is a blocking read, so use audit_filter_mutex instead of rcu
 	 * iterator to sync with list writers. */
 	for (i=0; i<AUDIT_NR_FILTERS; i++) {
-		list_for_each_entry(entry, &audit_filter_list[i], list) {
-			struct audit_rule *rule;
-
-			rule = audit_krule_to_rule(&entry->rule);
-			if (unlikely(!rule))
-				break;
-			skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
-					 rule, sizeof(*rule));
-			if (skb)
-				skb_queue_tail(q, skb);
-			kfree(rule);
-		}
-	}
-	for (i = 0; i < AUDIT_INODE_BUCKETS; i++) {
-		list_for_each_entry(entry, &audit_inode_hash[i], list) {
+		list_for_each_entry(r, &audit_rules_list[i], list) {
 			struct audit_rule *rule;
 
-			rule = audit_krule_to_rule(&entry->rule);
+			rule = audit_krule_to_rule(r);
 			if (unlikely(!rule))
 				break;
 			skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
@@ -1485,30 +1488,16 @@ static void audit_list(int pid, int seq, struct sk_buff_head *q)
 static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
 {
 	struct sk_buff *skb;
-	struct audit_entry *e;
+	struct audit_krule *r;
 	int i;
 
 	/* This is a blocking read, so use audit_filter_mutex instead of rcu
 	 * iterator to sync with list writers. */
 	for (i=0; i<AUDIT_NR_FILTERS; i++) {
-		list_for_each_entry(e, &audit_filter_list[i], list) {
-			struct audit_rule_data *data;
-
-			data = audit_krule_to_data(&e->rule);
-			if (unlikely(!data))
-				break;
-			skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
-					 data, sizeof(*data) + data->buflen);
-			if (skb)
-				skb_queue_tail(q, skb);
-			kfree(data);
-		}
-	}
-	for (i=0; i< AUDIT_INODE_BUCKETS; i++) {
-		list_for_each_entry(e, &audit_inode_hash[i], list) {
+		list_for_each_entry(r, &audit_rules_list[i], list) {
 			struct audit_rule_data *data;
 
-			data = audit_krule_to_data(&e->rule);
+			data = audit_krule_to_data(r);
 			if (unlikely(!data))
 				break;
 			skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
@@ -1789,35 +1778,37 @@ unlock_and_return:
 	return result;
 }
 
-static int update_lsm_rule(struct audit_entry *entry)
+static int update_lsm_rule(struct audit_krule *r)
 {
+	struct audit_entry *entry = container_of(r, struct audit_entry, rule);
 	struct audit_entry *nentry;
 	struct audit_watch *watch;
 	struct audit_tree *tree;
 	int err = 0;
 
-	if (!security_audit_rule_known(&entry->rule))
+	if (!security_audit_rule_known(r))
 		return 0;
 
-	watch = entry->rule.watch;
-	tree = entry->rule.tree;
-	nentry = audit_dupe_rule(&entry->rule, watch);
+	watch = r->watch;
+	tree = r->tree;
+	nentry = audit_dupe_rule(r, watch);
 	if (IS_ERR(nentry)) {
 		/* save the first error encountered for the
 		 * return value */
 		err = PTR_ERR(nentry);
 		audit_panic("error updating LSM filters");
 		if (watch)
-			list_del(&entry->rule.rlist);
+			list_del(&r->rlist);
 		list_del_rcu(&entry->list);
+		list_del(&r->list);
 	} else {
 		if (watch) {
 			list_add(&nentry->rule.rlist, &watch->rules);
-			list_del(&entry->rule.rlist);
+			list_del(&r->rlist);
 		} else if (tree)
-			list_replace_init(&entry->rule.rlist,
-				     &nentry->rule.rlist);
+			list_replace_init(&r->rlist, &nentry->rule.rlist);
 		list_replace_rcu(&entry->list, &nentry->list);
+		list_replace(&r->list, &nentry->rule.list);
 	}
 	call_rcu(&entry->rcu, audit_free_rule_rcu);
 
@@ -1831,27 +1822,19 @@ static int update_lsm_rule(struct audit_entry *entry)
  * updated rule. */
 int audit_update_lsm_rules(void)
 {
-	struct audit_entry *e, *n;
+	struct audit_krule *r, *n;
 	int i, err = 0;
 
 	/* audit_filter_mutex synchronizes the writers */
 	mutex_lock(&audit_filter_mutex);
 
 	for (i = 0; i < AUDIT_NR_FILTERS; i++) {
-		list_for_each_entry_safe(e, n, &audit_filter_list[i], list) {
-			int res = update_lsm_rule(e);
-			if (!err)
-				err = res;
-		}
-	}
-	for (i=0; i< AUDIT_INODE_BUCKETS; i++) {
-		list_for_each_entry_safe(e, n, &audit_inode_hash[i], list) {
-			int res = update_lsm_rule(e);
+		list_for_each_entry_safe(r, n, &audit_rules_list[i], list) {
+			int res = update_lsm_rule(r);
 			if (!err)
 				err = res;
 		}
 	}
-
 	mutex_unlock(&audit_filter_mutex);
 
 	return err;
-- 
cgit v1.2.3


From e048e02c89db7bd49d1a5fac77a11c8fb3603087 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 16 Dec 2008 03:51:22 -0500
Subject: make sure that filterkey of task,always rules is reported

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 19d2c2747c8..8cbddff6c28 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -652,7 +652,7 @@ static int audit_filter_rules(struct task_struct *tsk,
  * completely disabled for this task.  Since we only have the task
  * structure at this point, we can only check uid and gid.
  */
-static enum audit_state audit_filter_task(struct task_struct *tsk)
+static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
 {
 	struct audit_entry *e;
 	enum audit_state   state;
@@ -660,6 +660,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
 	rcu_read_lock();
 	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
 		if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
+			if (state == AUDIT_RECORD_CONTEXT)
+				*key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
 			rcu_read_unlock();
 			return state;
 		}
@@ -866,18 +868,21 @@ int audit_alloc(struct task_struct *tsk)
 {
 	struct audit_context *context;
 	enum audit_state     state;
+	char *key = NULL;
 
 	if (likely(!audit_ever_enabled))
 		return 0; /* Return if not auditing. */
 
-	state = audit_filter_task(tsk);
+	state = audit_filter_task(tsk, &key);
 	if (likely(state == AUDIT_DISABLED))
 		return 0;
 
 	if (!(context = audit_alloc_context(state))) {
+		kfree(key);
 		audit_log_lost("out of memory in audit_alloc");
 		return -ENOMEM;
 	}
+	context->filterkey = key;
 
 	tsk->audit_context  = context;
 	set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
@@ -1703,8 +1708,10 @@ void audit_syscall_exit(int valid, long return_code)
 		context->sockaddr_len = 0;
 		context->type = 0;
 		context->fds[0] = -1;
-		kfree(context->filterkey);
-		context->filterkey = NULL;
+		if (context->state != AUDIT_RECORD_CONTEXT) {
+			kfree(context->filterkey);
+			context->filterkey = NULL;
+		}
 		tsk->audit_context = context;
 	}
 }
-- 
cgit v1.2.3


From 36c4f1b18c8a7d0adb4085e7f531860b837bb6b0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 15 Dec 2008 01:50:28 -0500
Subject: clean up audit_rule_{add,del} a bit

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditfilter.c | 42 +++++++++++++++++-------------------------
 1 file changed, 17 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 5d4edc6f7a3..e6e3829cadd 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1114,12 +1114,16 @@ static void audit_inotify_unregister(struct list_head *in_list)
 /* Find an existing audit rule.
  * Caller must hold audit_filter_mutex to prevent stale rule data. */
 static struct audit_entry *audit_find_rule(struct audit_entry *entry,
-					   struct list_head *list)
+					   struct list_head **p)
 {
 	struct audit_entry *e, *found = NULL;
+	struct list_head *list;
 	int h;
 
-	if (entry->rule.watch) {
+	if (entry->rule.inode_f) {
+		h = audit_hash_ino(entry->rule.inode_f->val);
+		*p = list = &audit_inode_hash[h];
+	} else if (entry->rule.watch) {
 		/* we don't know the inode number, so must walk entire hash */
 		for (h = 0; h < AUDIT_INODE_BUCKETS; h++) {
 			list = &audit_inode_hash[h];
@@ -1130,6 +1134,8 @@ static struct audit_entry *audit_find_rule(struct audit_entry *entry,
 				}
 		}
 		goto out;
+	} else {
+		*p = list = &audit_filter_list[entry->rule.listnr];
 	}
 
 	list_for_each_entry(e, list, list)
@@ -1274,14 +1280,13 @@ static u64 prio_low = ~0ULL/2;
 static u64 prio_high = ~0ULL/2 - 1;
 
 /* Add rule to given filterlist if not a duplicate. */
-static inline int audit_add_rule(struct audit_entry *entry,
-				 struct list_head *list)
+static inline int audit_add_rule(struct audit_entry *entry)
 {
 	struct audit_entry *e;
-	struct audit_field *inode_f = entry->rule.inode_f;
 	struct audit_watch *watch = entry->rule.watch;
 	struct audit_tree *tree = entry->rule.tree;
 	struct nameidata *ndp = NULL, *ndw = NULL;
+	struct list_head *list;
 	int h, err;
 #ifdef CONFIG_AUDITSYSCALL
 	int dont_count = 0;
@@ -1292,13 +1297,8 @@ static inline int audit_add_rule(struct audit_entry *entry,
 		dont_count = 1;
 #endif
 
-	if (inode_f) {
-		h = audit_hash_ino(inode_f->val);
-		list = &audit_inode_hash[h];
-	}
-
 	mutex_lock(&audit_filter_mutex);
-	e = audit_find_rule(entry, list);
+	e = audit_find_rule(entry, &list);
 	mutex_unlock(&audit_filter_mutex);
 	if (e) {
 		err = -EEXIST;
@@ -1372,15 +1372,14 @@ error:
 }
 
 /* Remove an existing rule from filterlist. */
-static inline int audit_del_rule(struct audit_entry *entry,
-				 struct list_head *list)
+static inline int audit_del_rule(struct audit_entry *entry)
 {
 	struct audit_entry  *e;
-	struct audit_field *inode_f = entry->rule.inode_f;
 	struct audit_watch *watch, *tmp_watch = entry->rule.watch;
 	struct audit_tree *tree = entry->rule.tree;
+	struct list_head *list;
 	LIST_HEAD(inotify_list);
-	int h, ret = 0;
+	int ret = 0;
 #ifdef CONFIG_AUDITSYSCALL
 	int dont_count = 0;
 
@@ -1390,13 +1389,8 @@ static inline int audit_del_rule(struct audit_entry *entry,
 		dont_count = 1;
 #endif
 
-	if (inode_f) {
-		h = audit_hash_ino(inode_f->val);
-		list = &audit_inode_hash[h];
-	}
-
 	mutex_lock(&audit_filter_mutex);
-	e = audit_find_rule(entry, list);
+	e = audit_find_rule(entry, &list);
 	if (!e) {
 		mutex_unlock(&audit_filter_mutex);
 		ret = -ENOENT;
@@ -1603,8 +1597,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 		if (IS_ERR(entry))
 			return PTR_ERR(entry);
 
-		err = audit_add_rule(entry,
-				     &audit_filter_list[entry->rule.listnr]);
+		err = audit_add_rule(entry);
 		audit_log_rule_change(loginuid, sessionid, sid, "add",
 				      &entry->rule, !err);
 
@@ -1620,8 +1613,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 		if (IS_ERR(entry))
 			return PTR_ERR(entry);
 
-		err = audit_del_rule(entry,
-				     &audit_filter_list[entry->rule.listnr]);
+		err = audit_del_rule(entry);
 		audit_log_rule_change(loginuid, sessionid, sid, "remove",
 				      &entry->rule, !err);
 
-- 
cgit v1.2.3


From 5af75d8d58d0f9f7b7c0515b35786b22892d5f12 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 16 Dec 2008 05:59:26 -0500
Subject: audit: validate comparison operations, store them in sane form

Don't store the field->op in the messy (and very inconvenient for e.g.
audit_comparator()) form; translate to dense set of values and do full
validation of userland-submitted value while we are at it.

->audit_init_rule() and ->audit_match_rule() get new values now; in-tree
instances updated.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit_tree.c  |   2 +-
 kernel/auditfilter.c | 132 +++++++++++++++++++++++++--------------------------
 2 files changed, 66 insertions(+), 68 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 48bddad2a3d..8ad9545b8db 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -618,7 +618,7 @@ int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
 
 	if (pathname[0] != '/' ||
 	    rule->listnr != AUDIT_FILTER_EXIT ||
-	    op & ~AUDIT_EQUAL ||
+	    op != Audit_equal ||
 	    rule->inode_f || rule->watch || rule->tree)
 		return -EINVAL;
 	rule->tree = alloc_tree(pathname);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index e6e3829cadd..fbf24d121d9 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -252,7 +252,8 @@ static inline int audit_to_inode(struct audit_krule *krule,
 				 struct audit_field *f)
 {
 	if (krule->listnr != AUDIT_FILTER_EXIT ||
-	    krule->watch || krule->inode_f || krule->tree)
+	    krule->watch || krule->inode_f || krule->tree ||
+	    (f->op != Audit_equal && f->op != Audit_not_equal))
 		return -EINVAL;
 
 	krule->inode_f = f;
@@ -270,7 +271,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len,
 
 	if (path[0] != '/' || path[len-1] == '/' ||
 	    krule->listnr != AUDIT_FILTER_EXIT ||
-	    op & ~AUDIT_EQUAL ||
+	    op != Audit_equal ||
 	    krule->inode_f || krule->watch || krule->tree)
 		return -EINVAL;
 
@@ -420,12 +421,32 @@ exit_err:
 	return ERR_PTR(err);
 }
 
+static u32 audit_ops[] =
+{
+	[Audit_equal] = AUDIT_EQUAL,
+	[Audit_not_equal] = AUDIT_NOT_EQUAL,
+	[Audit_bitmask] = AUDIT_BIT_MASK,
+	[Audit_bittest] = AUDIT_BIT_TEST,
+	[Audit_lt] = AUDIT_LESS_THAN,
+	[Audit_gt] = AUDIT_GREATER_THAN,
+	[Audit_le] = AUDIT_LESS_THAN_OR_EQUAL,
+	[Audit_ge] = AUDIT_GREATER_THAN_OR_EQUAL,
+};
+
+static u32 audit_to_op(u32 op)
+{
+	u32 n;
+	for (n = Audit_equal; n < Audit_bad && audit_ops[n] != op; n++)
+		;
+	return n;
+}
+
+
 /* Translate struct audit_rule to kernel's rule respresentation.
  * Exists for backward compatibility with userspace. */
 static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 {
 	struct audit_entry *entry;
-	struct audit_field *ino_f;
 	int err = 0;
 	int i;
 
@@ -435,12 +456,28 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 
 	for (i = 0; i < rule->field_count; i++) {
 		struct audit_field *f = &entry->rule.fields[i];
+		u32 n;
+
+		n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
+
+		/* Support for legacy operators where
+		 * AUDIT_NEGATE bit signifies != and otherwise assumes == */
+		if (n & AUDIT_NEGATE)
+			f->op = Audit_not_equal;
+		else if (!n)
+			f->op = Audit_equal;
+		else
+			f->op = audit_to_op(n);
+
+		entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 2 : 1;
 
-		f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
 		f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
 		f->val = rule->values[i];
 
 		err = -EINVAL;
+		if (f->op == Audit_bad)
+			goto exit_free;
+
 		switch(f->type) {
 		default:
 			goto exit_free;
@@ -462,11 +499,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 		case AUDIT_EXIT:
 		case AUDIT_SUCCESS:
 			/* bit ops are only useful on syscall args */
-			if (f->op == AUDIT_BIT_MASK ||
-						f->op == AUDIT_BIT_TEST) {
-				err = -EINVAL;
+			if (f->op == Audit_bitmask || f->op == Audit_bittest)
 				goto exit_free;
-			}
 			break;
 		case AUDIT_ARG0:
 		case AUDIT_ARG1:
@@ -475,11 +509,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 			break;
 		/* arch is only allowed to be = or != */
 		case AUDIT_ARCH:
-			if ((f->op != AUDIT_NOT_EQUAL) && (f->op != AUDIT_EQUAL)
-					&& (f->op != AUDIT_NEGATE) && (f->op)) {
-				err = -EINVAL;
+			if (f->op != Audit_not_equal && f->op != Audit_equal)
 				goto exit_free;
-			}
 			entry->rule.arch_f = f;
 			break;
 		case AUDIT_PERM:
@@ -496,33 +527,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 				goto exit_free;
 			break;
 		}
-
-		entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
-
-		/* Support for legacy operators where
-		 * AUDIT_NEGATE bit signifies != and otherwise assumes == */
-		if (f->op & AUDIT_NEGATE)
-			f->op = AUDIT_NOT_EQUAL;
-		else if (!f->op)
-			f->op = AUDIT_EQUAL;
-		else if (f->op == AUDIT_OPERATORS) {
-			err = -EINVAL;
-			goto exit_free;
-		}
 	}
 
-	ino_f = entry->rule.inode_f;
-	if (ino_f) {
-		switch(ino_f->op) {
-		case AUDIT_NOT_EQUAL:
-			entry->rule.inode_f = NULL;
-		case AUDIT_EQUAL:
-			break;
-		default:
-			err = -EINVAL;
-			goto exit_free;
-		}
-	}
+	if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
+		entry->rule.inode_f = NULL;
 
 exit_nofree:
 	return entry;
@@ -538,7 +546,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 {
 	int err = 0;
 	struct audit_entry *entry;
-	struct audit_field *ino_f;
 	void *bufp;
 	size_t remain = datasz - sizeof(struct audit_rule_data);
 	int i;
@@ -554,11 +561,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		struct audit_field *f = &entry->rule.fields[i];
 
 		err = -EINVAL;
-		if (!(data->fieldflags[i] & AUDIT_OPERATORS) ||
-		    data->fieldflags[i] & ~AUDIT_OPERATORS)
+
+		f->op = audit_to_op(data->fieldflags[i]);
+		if (f->op == Audit_bad)
 			goto exit_free;
 
-		f->op = data->fieldflags[i] & AUDIT_OPERATORS;
 		f->type = data->fields[i];
 		f->val = data->values[i];
 		f->lsm_str = NULL;
@@ -670,18 +677,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		}
 	}
 
-	ino_f = entry->rule.inode_f;
-	if (ino_f) {
-		switch(ino_f->op) {
-		case AUDIT_NOT_EQUAL:
-			entry->rule.inode_f = NULL;
-		case AUDIT_EQUAL:
-			break;
-		default:
-			err = -EINVAL;
-			goto exit_free;
-		}
-	}
+	if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
+		entry->rule.inode_f = NULL;
 
 exit_nofree:
 	return entry;
@@ -721,10 +718,10 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
 		rule->fields[i] = krule->fields[i].type;
 
 		if (krule->vers_ops == 1) {
-			if (krule->fields[i].op & AUDIT_NOT_EQUAL)
+			if (krule->fields[i].op == Audit_not_equal)
 				rule->fields[i] |= AUDIT_NEGATE;
 		} else {
-			rule->fields[i] |= krule->fields[i].op;
+			rule->fields[i] |= audit_ops[krule->fields[i].op];
 		}
 	}
 	for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i];
@@ -752,7 +749,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
 		struct audit_field *f = &krule->fields[i];
 
 		data->fields[i] = f->type;
-		data->fieldflags[i] = f->op;
+		data->fieldflags[i] = audit_ops[f->op];
 		switch(f->type) {
 		case AUDIT_SUBJ_USER:
 		case AUDIT_SUBJ_ROLE:
@@ -1626,28 +1623,29 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 	return err;
 }
 
-int audit_comparator(const u32 left, const u32 op, const u32 right)
+int audit_comparator(u32 left, u32 op, u32 right)
 {
 	switch (op) {
-	case AUDIT_EQUAL:
+	case Audit_equal:
 		return (left == right);
-	case AUDIT_NOT_EQUAL:
+	case Audit_not_equal:
 		return (left != right);
-	case AUDIT_LESS_THAN:
+	case Audit_lt:
 		return (left < right);
-	case AUDIT_LESS_THAN_OR_EQUAL:
+	case Audit_le:
 		return (left <= right);
-	case AUDIT_GREATER_THAN:
+	case Audit_gt:
 		return (left > right);
-	case AUDIT_GREATER_THAN_OR_EQUAL:
+	case Audit_ge:
 		return (left >= right);
-	case AUDIT_BIT_MASK:
+	case Audit_bitmask:
 		return (left & right);
-	case AUDIT_BIT_TEST:
+	case Audit_bittest:
 		return ((left & right) == right);
+	default:
+		BUG();
+		return 0;
 	}
-	BUG();
-	return 0;
 }
 
 /* Compare given dentry name with last component in given path,
-- 
cgit v1.2.3


From 7b574b7b0124ed344911f5d581e9bc2d83bbeb19 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Sun, 4 Jan 2009 12:00:45 -0800
Subject: cgroups: fix a race between cgroup_clone and umount

The race is calling cgroup_clone() while umounting the ns cgroup subsys,
and thus cgroup_clone() might access invalid cgroup_fs, or kill_sb() is
called after cgroup_clone() created a new dir in it.

The BUG I triggered is BUG_ON(root->number_of_cgroups != 1);

  ------------[ cut here ]------------
  kernel BUG at kernel/cgroup.c:1093!
  invalid opcode: 0000 [#1] SMP
  ...
  Process umount (pid: 5177, ti=e411e000 task=e40c4670 task.ti=e411e000)
  ...
  Call Trace:
   [<c0493df7>] ? deactivate_super+0x3f/0x51
   [<c04a3600>] ? mntput_no_expire+0xb3/0xdd
   [<c04a3ab2>] ? sys_umount+0x265/0x2ac
   [<c04a3b06>] ? sys_oldumount+0xd/0xf
   [<c0403911>] ? sysenter_do_call+0x12/0x31
  ...
  EIP: [<c0456e76>] cgroup_kill_sb+0x23/0xe0 SS:ESP 0068:e411ef2c
  ---[ end trace c766c1be3bf944ac ]---

Cc: Serge E. Hallyn <serue@us.ibm.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: "Serge E. Hallyn" <serue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 48348dde6d8..891a84eb9d3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2945,7 +2945,11 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
 	parent = task_cgroup(tsk, subsys->subsys_id);
 
 	/* Pin the hierarchy */
-	atomic_inc(&parent->root->sb->s_active);
+	if (!atomic_inc_not_zero(&parent->root->sb->s_active)) {
+		/* We race with the final deactivate_super() */
+		mutex_unlock(&cgroup_mutex);
+		return 0;
+	}
 
 	/* Keep the cgroup alive */
 	get_css_set(cg);
-- 
cgit v1.2.3


From ca4787b779dd698a2a33a328aa5fa90a3e954077 Mon Sep 17 00:00:00 2001
From: Tim Abbott <tabbott@MIT.EDU>
Date: Mon, 5 Jan 2009 08:40:10 -0600
Subject: kernel/module.c: compare symbol values when marking symbols as
 exported in /proc/kallsyms.

When there are two symbols in a module with the same name, one of which is
exported, both will be marked as exported in /proc/kallsyms.  There aren't
any instances of this in the current kernel, but it is easy to construct a
simple module with two compilation units that exhibits the problem.

$ objdump -j .text -t testmod.ko | grep foo
00000000 l     F .text	00000032 foo
00000080 g     F .text	00000001 foo
$ sudo insmod testmod.ko
$ grep "T foo" /proc/kallsyms
c28e8000 T foo	[testmod]
c28e8080 T foo	[testmod]

Fix this by comparing the symbol values once we've found the exported
symbol table entry matching the symbol name.  Tested using Ksplice:

$ ksplice-create --patch=this_commit.patch --id=bar .
$ sudo ksplice-apply ksplice-bar.tar.gz
Done!
$ grep "T foo" /proc/kallsyms
c28e8080 T foo	[testmod]

Signed-off-by: Tim Abbott <tabbott@mit.edu>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index dd2a54155b5..895c5675edb 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1725,15 +1725,15 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
 	return NULL;
 }
 
-static int is_exported(const char *name, const struct module *mod)
+static int is_exported(const char *name, unsigned long value,
+		       const struct module *mod)
 {
-	if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
-		return 1;
+	const struct kernel_symbol *ks;
+	if (!mod)
+		ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
 	else
-		if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms))
-			return 1;
-		else
-			return 0;
+		ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
+	return ks != NULL && ks->value == value;
 }
 
 /* As per nm */
@@ -2504,7 +2504,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 			strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
 				KSYM_NAME_LEN);
 			strlcpy(module_name, mod->name, MODULE_NAME_LEN);
-			*exported = is_exported(name, mod);
+			*exported = is_exported(name, *value, mod);
 			preempt_enable();
 			return 0;
 		}
-- 
cgit v1.2.3


From d1e99d7ae4e6bbd1ebb5e81ecd3af2b8793efee0 Mon Sep 17 00:00:00 2001
From: Jianjun Kong <jianjun@zeuux.org>
Date: Mon, 8 Dec 2008 14:26:29 +0800
Subject: module: fix warning of unused function when !CONFIG_PROC_FS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix this warning:
kernel/module.c:824: warning: ‘print_unload_info’ defined but not used
print_unload_info() just was used when CONFIG_PROC_FS was defined.
This patch mark print_unload_info() inline to solve the problem.

Signed-off-by: Jianjun Kong <jianjun@zeuux.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
CC: Ingo Molnar <mingo@elte.hu>
CC: Américo Wang <xiyou.wangcong@gmail.com>
---
 kernel/module.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 895c5675edb..d3d254571bd 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -820,7 +820,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
 	return ret;
 }
 
-static void print_unload_info(struct seq_file *m, struct module *mod)
+static inline void print_unload_info(struct seq_file *m, struct module *mod)
 {
 	struct module_use *use;
 	int printed_something = 0;
@@ -893,7 +893,7 @@ void module_put(struct module *module)
 EXPORT_SYMBOL(module_put);
 
 #else /* !CONFIG_MODULE_UNLOAD */
-static void print_unload_info(struct seq_file *m, struct module *mod)
+static inline void print_unload_info(struct seq_file *m, struct module *mod)
 {
 	/* We don't know the usage count, or what modules are using. */
 	seq_printf(m, " - -");
-- 
cgit v1.2.3


From 088af9a6e05d51e7c3dc85d45d8b7a52c3ee08d7 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Wed, 31 Dec 2008 12:31:18 +0100
Subject: module: fix module loading failure of large kernel modules for parisc

When creating the final layout of a kernel module in memory, allow the
module loader to reserve some additional memory in front of a given section.
This is currently only needed for the parisc port which needs to put the
stub entries there to fulfill the 17/22bit PCREL relocations with large
kernel modules like xfs.

Signed-off-by: Helge Deller <deller@gmx.de>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (renamed fn)
---
 kernel/module.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index d3d254571bd..4299aefc20b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1578,11 +1578,21 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
 	return ret;
 }
 
+/* Additional bytes needed by arch in front of individual sections */
+unsigned int __weak arch_mod_section_prepend(struct module *mod,
+					     unsigned int section)
+{
+	/* default implementation just returns zero */
+	return 0;
+}
+
 /* Update size with this section: return offset. */
-static long get_offset(unsigned int *size, Elf_Shdr *sechdr)
+static long get_offset(struct module *mod, unsigned int *size,
+		       Elf_Shdr *sechdr, unsigned int section)
 {
 	long ret;
 
+	*size += arch_mod_section_prepend(mod, section);
 	ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
 	*size = ret + sechdr->sh_size;
 	return ret;
@@ -1622,7 +1632,7 @@ static void layout_sections(struct module *mod,
 			    || strncmp(secstrings + s->sh_name,
 				       ".init", 5) == 0)
 				continue;
-			s->sh_entsize = get_offset(&mod->core_size, s);
+			s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
 			DEBUGP("\t%s\n", secstrings + s->sh_name);
 		}
 		if (m == 0)
@@ -1640,7 +1650,7 @@ static void layout_sections(struct module *mod,
 			    || strncmp(secstrings + s->sh_name,
 				       ".init", 5) != 0)
 				continue;
-			s->sh_entsize = (get_offset(&mod->init_size, s)
+			s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
 					 | INIT_OFFSET_MASK);
 			DEBUGP("\t%s\n", secstrings + s->sh_name);
 		}
-- 
cgit v1.2.3


From 9ea09af3bd3090e8349ca2899ca2011bd94cda85 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 22 Dec 2008 12:36:30 +0100
Subject: stop_machine: introduce stop_machine_create/destroy.

Introduce stop_machine_create/destroy. With this interface subsystems
that need a non-failing stop_machine environment can create the
stop_machine machine threads before actually calling stop_machine.
When the threads aren't needed anymore they can be killed with
stop_machine_destroy again.

When stop_machine gets called and the threads aren't present they
will be created and destroyed automatically. This restores the old
behaviour of stop_machine.

This patch also converts cpu hotplug to the new interface since it
is special: cpu_down calls __stop_machine instead of stop_machine.
However the kstop threads will only be created when stop_machine
gets called.

Changing the code so that the threads would be created automatically
on __stop_machine is currently not possible: when __stop_machine gets
called we hold cpu_add_remove_lock, which is the same lock that
create_rt_workqueue would take. So the workqueue needs to be created
before the cpu hotplug code locks cpu_add_remove_lock.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/cpu.c          |  6 +++++-
 kernel/stop_machine.c | 55 +++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 50 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 47fff3b63cb..30e74dd6d01 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -269,8 +269,11 @@ out_release:
 
 int __ref cpu_down(unsigned int cpu)
 {
-	int err = 0;
+	int err;
 
+	err = stop_machine_create();
+	if (err)
+		return err;
 	cpu_maps_update_begin();
 
 	if (cpu_hotplug_disabled) {
@@ -297,6 +300,7 @@ int __ref cpu_down(unsigned int cpu)
 
 out:
 	cpu_maps_update_done();
+	stop_machine_destroy();
 	return err;
 }
 EXPORT_SYMBOL(cpu_down);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 286c41722e8..0cd415ee62a 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -38,7 +38,10 @@ struct stop_machine_data {
 static unsigned int num_threads;
 static atomic_t thread_ack;
 static DEFINE_MUTEX(lock);
-
+/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
+static DEFINE_MUTEX(setup_lock);
+/* Users of stop_machine. */
+static int refcount;
 static struct workqueue_struct *stop_machine_wq;
 static struct stop_machine_data active, idle;
 static const cpumask_t *active_cpus;
@@ -109,6 +112,43 @@ static int chill(void *unused)
 	return 0;
 }
 
+int stop_machine_create(void)
+{
+	mutex_lock(&setup_lock);
+	if (refcount)
+		goto done;
+	stop_machine_wq = create_rt_workqueue("kstop");
+	if (!stop_machine_wq)
+		goto err_out;
+	stop_machine_work = alloc_percpu(struct work_struct);
+	if (!stop_machine_work)
+		goto err_out;
+done:
+	refcount++;
+	mutex_unlock(&setup_lock);
+	return 0;
+
+err_out:
+	if (stop_machine_wq)
+		destroy_workqueue(stop_machine_wq);
+	mutex_unlock(&setup_lock);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(stop_machine_create);
+
+void stop_machine_destroy(void)
+{
+	mutex_lock(&setup_lock);
+	refcount--;
+	if (refcount)
+		goto done;
+	destroy_workqueue(stop_machine_wq);
+	free_percpu(stop_machine_work);
+done:
+	mutex_unlock(&setup_lock);
+}
+EXPORT_SYMBOL_GPL(stop_machine_destroy);
+
 int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 {
 	struct work_struct *sm_work;
@@ -146,19 +186,14 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 {
 	int ret;
 
+	ret = stop_machine_create();
+	if (ret)
+		return ret;
 	/* No CPUs can come up or down during this. */
 	get_online_cpus();
 	ret = __stop_machine(fn, data, cpus);
 	put_online_cpus();
-
+	stop_machine_destroy();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(stop_machine);
-
-static int __init stop_machine_init(void)
-{
-	stop_machine_wq = create_rt_workqueue("kstop");
-	stop_machine_work = alloc_percpu(struct work_struct);
-	return 0;
-}
-core_initcall(stop_machine_init);
-- 
cgit v1.2.3


From 9e01892c4234070bbcf3a9f582514c8b91464375 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 22 Dec 2008 12:36:31 +0100
Subject: module: convert to stop_machine_create/destroy.

The module code relies on a non-failing stop_machine call. So we create
the kstop threads in advance and with that make sure the call won't fail.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 4299aefc20b..f47cce910f2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -757,8 +757,16 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
 		return -EFAULT;
 	name[MODULE_NAME_LEN-1] = '\0';
 
-	if (mutex_lock_interruptible(&module_mutex) != 0)
-		return -EINTR;
+	/* Create stop_machine threads since free_module relies on
+	 * a non-failing stop_machine call. */
+	ret = stop_machine_create();
+	if (ret)
+		return ret;
+
+	if (mutex_lock_interruptible(&module_mutex) != 0) {
+		ret = -EINTR;
+		goto out_stop;
+	}
 
 	mod = find_module(name);
 	if (!mod) {
@@ -817,6 +825,8 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
 
  out:
 	mutex_unlock(&module_mutex);
+out_stop:
+	stop_machine_destroy();
 	return ret;
 }
 
@@ -1875,6 +1885,13 @@ static noinline struct module *load_module(void __user *umod,
 	/* vmalloc barfs on "unusual" numbers.  Check here */
 	if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
 		return ERR_PTR(-ENOMEM);
+
+	/* Create stop_machine threads since the error path relies on
+	 * a non-failing stop_machine call. */
+	err = stop_machine_create();
+	if (err)
+		goto free_hdr;
+
 	if (copy_from_user(hdr, umod, len) != 0) {
 		err = -EFAULT;
 		goto free_hdr;
@@ -2258,6 +2275,7 @@ static noinline struct module *load_module(void __user *umod,
 	/* Get rid of temporary copy */
 	vfree(hdr);
 
+	stop_machine_destroy();
 	/* Done! */
 	return mod;
 
@@ -2280,6 +2298,7 @@ static noinline struct module *load_module(void __user *umod,
 	kfree(args);
  free_hdr:
 	vfree(hdr);
+	stop_machine_destroy();
 	return ERR_PTR(err);
 
  truncated:
-- 
cgit v1.2.3


From 14eaddc967b16017d4a1a24d2be6c28ecbe06ed8 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 31 Dec 2008 15:15:42 +0000
Subject: CRED: Fix regression in cap_capable() as shown up by sys_faccessat()
 [ver #2]

Fix a regression in cap_capable() due to:

	commit 5ff7711e635b32f0a1e558227d030c7e45b4a465
	Author: David Howells <dhowells@redhat.com>
	Date:   Wed Dec 31 02:52:28 2008 +0000

	    CRED: Differentiate objective and effective subjective credentials on a task

The problem is that the above patch allows a process to have two sets of
credentials, and for the most part uses the subjective credentials when
accessing current's creds.

There is, however, one exception: cap_capable(), and thus capable(), uses the
real/objective credentials of the target task, whether or not it is the current
task.

Ordinarily this doesn't matter, since usually the two cred pointers in current
point to the same set of creds.  However, sys_faccessat() makes use of this
facility to override the credentials of the calling process to make its test,
without affecting the creds as seen from other processes.

One of the things sys_faccessat() does is to make an adjustment to the
effective capabilities mask, which cap_capable(), as it stands, then ignores.

The affected capability check is in generic_permission():

	if (!(mask & MAY_EXEC) || execute_ok(inode))
		if (capable(CAP_DAC_OVERRIDE))
			return 0;

This change splits capable() from has_capability() down into the commoncap and
SELinux code.  The capable() security op now only deals with the current
process, and uses the current process's subjective creds.  A new security op -
task_capable() - is introduced that can check any task's objective creds.

strictly the capable() security op is superfluous with the presence of the
task_capable() op, however it should be faster to call the capable() op since
two fewer arguments need be passed down through the various layers.

This can be tested by compiling the following program from the XFS testsuite:

/*
 *  t_access_root.c - trivial test program to show permission bug.
 *
 *  Written by Michael Kerrisk - copyright ownership not pursued.
 *  Sourced from: http://linux.derkeiler.com/Mailing-Lists/Kernel/2003-10/6030.html
 */
#include <limits.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/stat.h>

#define UID 500
#define GID 100
#define PERM 0
#define TESTPATH "/tmp/t_access"

static void
errExit(char *msg)
{
    perror(msg);
    exit(EXIT_FAILURE);
} /* errExit */

static void
accessTest(char *file, int mask, char *mstr)
{
    printf("access(%s, %s) returns %d\n", file, mstr, access(file, mask));
} /* accessTest */

int
main(int argc, char *argv[])
{
    int fd, perm, uid, gid;
    char *testpath;
    char cmd[PATH_MAX + 20];

    testpath = (argc > 1) ? argv[1] : TESTPATH;
    perm = (argc > 2) ? strtoul(argv[2], NULL, 8) : PERM;
    uid = (argc > 3) ? atoi(argv[3]) : UID;
    gid = (argc > 4) ? atoi(argv[4]) : GID;

    unlink(testpath);

    fd = open(testpath, O_RDWR | O_CREAT, 0);
    if (fd == -1) errExit("open");

    if (fchown(fd, uid, gid) == -1) errExit("fchown");
    if (fchmod(fd, perm) == -1) errExit("fchmod");
    close(fd);

    snprintf(cmd, sizeof(cmd), "ls -l %s", testpath);
    system(cmd);

    if (seteuid(uid) == -1) errExit("seteuid");

    accessTest(testpath, 0, "0");
    accessTest(testpath, R_OK, "R_OK");
    accessTest(testpath, W_OK, "W_OK");
    accessTest(testpath, X_OK, "X_OK");
    accessTest(testpath, R_OK | W_OK, "R_OK | W_OK");
    accessTest(testpath, R_OK | X_OK, "R_OK | X_OK");
    accessTest(testpath, W_OK | X_OK, "W_OK | X_OK");
    accessTest(testpath, R_OK | W_OK | X_OK, "R_OK | W_OK | X_OK");

    exit(EXIT_SUCCESS);
} /* main */

This can be run against an Ext3 filesystem as well as against an XFS
filesystem.  If successful, it will show:

	[root@andromeda src]# ./t_access_root /tmp/xxx 0 4043 4043
	---------- 1 dhowells dhowells 0 2008-12-31 03:00 /tmp/xxx
	access(/tmp/xxx, 0) returns 0
	access(/tmp/xxx, R_OK) returns 0
	access(/tmp/xxx, W_OK) returns 0
	access(/tmp/xxx, X_OK) returns -1
	access(/tmp/xxx, R_OK | W_OK) returns 0
	access(/tmp/xxx, R_OK | X_OK) returns -1
	access(/tmp/xxx, W_OK | X_OK) returns -1
	access(/tmp/xxx, R_OK | W_OK | X_OK) returns -1

If unsuccessful, it will show:

	[root@andromeda src]# ./t_access_root /tmp/xxx 0 4043 4043
	---------- 1 dhowells dhowells 0 2008-12-31 02:56 /tmp/xxx
	access(/tmp/xxx, 0) returns 0
	access(/tmp/xxx, R_OK) returns -1
	access(/tmp/xxx, W_OK) returns -1
	access(/tmp/xxx, X_OK) returns -1
	access(/tmp/xxx, R_OK | W_OK) returns -1
	access(/tmp/xxx, R_OK | X_OK) returns -1
	access(/tmp/xxx, W_OK | X_OK) returns -1
	access(/tmp/xxx, R_OK | W_OK | X_OK) returns -1

I've also tested the fix with the SELinux and syscalls LTP testsuites.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/capability.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index 36b4b4daebe..df62f53f84a 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -308,7 +308,7 @@ int capable(int cap)
 		BUG();
 	}
 
-	if (has_capability(current, cap)) {
+	if (security_capable(cap) == 0) {
 		current->flags |= PF_SUPERPRIV;
 		return 1;
 	}
-- 
cgit v1.2.3


From c12172c0251761c54260376eb29a5f6547495580 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 4 Jan 2009 20:30:06 -0800
Subject: rcu: fix rcutree grace-period-latency bug on small systems

Impact: fix delays during bootup

Kudos to Andi Kleen for finding a grace-period-latency problem!  The
problem was that the special-case code for small machines never updated
the ->signaled field to indicate that grace-period initialization had
completed, which prevented force_quiescent_state() from ever expediting
grace periods.  This problem resulted in grace periods extending for more
than 20 seconds.  Not subtle.  I introduced this bug during my inspection
process when I fixed a race between grace-period initialization and
force_quiescent_state() execution.

The following patch properly updates the ->signaled field for the
"small"-system case (no more than 32 CPUs for 32-bit kernels and no more
than 64 CPUs for 64-bit kernels).

Reported-by: Andi Kleen <andi@firstfloor.org>
Tested-by: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index a342b032112..88d921c5c44 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -572,6 +572,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 	/* Special-case the common single-level case. */
 	if (NUM_RCU_NODES == 1) {
 		rnp->qsmask = rnp->qsmaskinit;
+		rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
 		spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
 	}
-- 
cgit v1.2.3


From 90a4d2c0106bb690f0b6af3d506febc35c658aa7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 4 Jan 2009 11:41:11 -0800
Subject: rcu: make treercu safe for suspend and resume

Impact: fix kernel warnings [and potential crash] during suspend+resume

Kudos to both Dhaval Giani and Jens Axboe for finding a bug in treercu
that causes warnings after suspend-resume cycles in Dhaval's case and
during stress tests in Jens's case.  It would also probably cause failures
if heavily stressed.  The solution, ironically enough, is to revert to
rcupreempt's code for initializing the dynticks state.  And the patch
even results in smaller code -- so what was I thinking???

This is 2.6.29 material, given that people really do suspend and resume
Linux these days.  ;-)

Reported-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Reported-by: Jens Axboe <jens.axboe@oracle.com>
Tested-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Tested-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 88d921c5c44..f2d8638e6c6 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -79,7 +79,10 @@ struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 #ifdef CONFIG_NO_HZ
-DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks);
+DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
+	.dynticks_nesting = 1,
+	.dynticks = 1,
+};
 #endif /* #ifdef CONFIG_NO_HZ */
 
 static int blimit = 10;		/* Maximum callbacks per softirq. */
@@ -1380,13 +1383,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 
 static void __cpuinit rcu_online_cpu(int cpu)
 {
-#ifdef CONFIG_NO_HZ
-	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
-	rdtp->dynticks_nesting = 1;
-	rdtp->dynticks |= 1; 	/* need consecutive #s even for hotplug. */
-	rdtp->dynticks_nmi = (rdtp->dynticks_nmi + 1) & ~0x1;
-#endif /* #ifdef CONFIG_NO_HZ */
 	rcu_init_percpu_data(cpu, &rcu_state);
 	rcu_init_percpu_data(cpu, &rcu_bh_state);
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
-- 
cgit v1.2.3


From ea7d3fef4222cd98556a0b386598268d4dbf6670 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 4 Jan 2009 13:03:02 -0800
Subject: rcu: eliminate synchronize_rcu_xxx macro

Impact: cleanup

Expand macro into two files.

The synchronize_rcu_xxx macro is quite ugly and it's only used by two
callers, so expand it instead.  This makes this code easier to change.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupdate.c   | 11 +++++++++--
 kernel/rcupreempt.c | 11 ++++++++++-
 2 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index ad63af8b252..d92a76a881a 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -77,8 +77,15 @@ void wakeme_after_rcu(struct rcu_head  *head)
  * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
  * and may be nested.
  */
-void synchronize_rcu(void);	/* Makes kernel-doc tools happy */
-synchronize_rcu_xxx(synchronize_rcu, call_rcu)
+void synchronize_rcu(void)
+{
+	struct rcu_synchronize rcu;
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished. */
+	call_rcu(&rcu.head, wakeme_after_rcu);
+	/* Wait for it. */
+	wait_for_completion(&rcu.completion);
+}
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 
 static void rcu_barrier_callback(struct rcu_head *notused)
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index f9dc8f3720f..33cfc50781f 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1177,7 +1177,16 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
  * in -rt this does -not- necessarily result in all currently executing
  * interrupt -handlers- having completed.
  */
-synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched)
+void __synchronize_sched(void)
+{
+	struct rcu_synchronize rcu;
+
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished. */
+	call_rcu_sched(&rcu.head, wakeme_after_rcu);
+	/* Wait for it. */
+	wait_for_completion(&rcu.completion);
+}
 EXPORT_SYMBOL_GPL(__synchronize_sched);
 
 /*
-- 
cgit v1.2.3


From c59ab97e9ecdee9084d2da09e5a8ceea9a396508 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 4 Jan 2009 18:28:27 -0800
Subject: rcu: fix rcutorture bug

Fix an rcutorture bug that prevents the shutdown notifier from ever
actually having any effect, due to the fact that kthreads ignore all
signals.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutorture.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 3245b40952c..1cff28db56b 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -136,7 +136,7 @@ static int stutter_pause_test = 0;
 #endif
 int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
 
-#define FULLSTOP_SIGNALED 1	/* Bail due to signal. */
+#define FULLSTOP_SHUTDOWN 1	/* Bail due to system shutdown/panic. */
 #define FULLSTOP_CLEANUP  2	/* Orderly shutdown. */
 static int fullstop;		/* stop generating callbacks at test end. */
 DEFINE_MUTEX(fullstop_mutex);	/* protect fullstop transitions and */
@@ -151,12 +151,10 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
 {
 	if (fullstop)
 		return NOTIFY_DONE;
-	if (signal_pending(current)) {
-		mutex_lock(&fullstop_mutex);
-		if (!ACCESS_ONCE(fullstop))
-			fullstop = FULLSTOP_SIGNALED;
-		mutex_unlock(&fullstop_mutex);
-	}
+	mutex_lock(&fullstop_mutex);
+	if (!fullstop)
+		fullstop = FULLSTOP_SHUTDOWN;
+	mutex_unlock(&fullstop_mutex);
 	return NOTIFY_DONE;
 }
 
@@ -624,7 +622,7 @@ rcu_torture_writer(void *arg)
 		rcu_stutter_wait();
 	} while (!kthread_should_stop() && !fullstop);
 	VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
-	while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
+	while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN)
 		schedule_timeout_uninterruptible(1);
 	return 0;
 }
@@ -649,7 +647,7 @@ rcu_torture_fakewriter(void *arg)
 	} while (!kthread_should_stop() && !fullstop);
 
 	VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
-	while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
+	while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN)
 		schedule_timeout_uninterruptible(1);
 	return 0;
 }
@@ -759,7 +757,7 @@ rcu_torture_reader(void *arg)
 	VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
 	if (irqreader && cur_ops->irqcapable)
 		del_timer_sync(&t);
-	while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
+	while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN)
 		schedule_timeout_uninterruptible(1);
 	return 0;
 }
-- 
cgit v1.2.3


From 8bdec955b0da2ffbd10eb9b200651dd1f9e366f2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 5 Jan 2009 11:28:19 +0100
Subject: hrtimer: splitout peek ahead functionality

Impact: cleanup

Provide a peek ahead function that assumes irqs disabled, allows for micro
optimizations.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hrtimer.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index eb2bfefa6dc..8f7001c97e0 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1243,6 +1243,22 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 	}
 }
 
+/*
+ * local version of hrtimer_peek_ahead_timers() called with interrupts
+ * disabled.
+ */
+static void __hrtimer_peek_ahead_timers(void)
+{
+	struct tick_device *td;
+
+	if (!hrtimer_hres_active())
+		return;
+
+	td = &__get_cpu_var(tick_cpu_device);
+	if (td && td->evtdev)
+		hrtimer_interrupt(td->evtdev);
+}
+
 /**
  * hrtimer_peek_ahead_timers -- run soft-expired timers now
  *
@@ -1254,16 +1270,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
  */
 void hrtimer_peek_ahead_timers(void)
 {
-	struct tick_device *td;
 	unsigned long flags;
 
-	if (!hrtimer_hres_active())
-		return;
-
 	local_irq_save(flags);
-	td = &__get_cpu_var(tick_cpu_device);
-	if (td && td->evtdev)
-		hrtimer_interrupt(td->evtdev);
+	__hrtimer_peek_ahead_timers();
 	local_irq_restore(flags);
 }
 
-- 
cgit v1.2.3


From d5fd43c4ae04523e1dcd7794f9c511b289851350 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 5 Jan 2009 11:28:20 +0100
Subject: hrtimer: fix HOTPLUG_CPU=n compile warning

Impact: cleanup

 kernel/hrtimer.c: In function 'hrtimer_cpu_notify':
 kernel/hrtimer.c:1574: warning: unused variable 'dcpu'

Introduced by commit 37810659ea7d9572c5ac284ade272f806ef8f788
("hrtimer: removing all ur callback modes, fix hotplug") from the
timers.  dcpu is only used if CONFIG_HOTPLUG_CPU is set.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hrtimer.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 8f7001c97e0..9c2bfa84128 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1504,6 +1504,11 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 
 #ifdef CONFIG_HOTPLUG_CPU
 
+static void tickle_timers(void *arg)
+{
+	hrtimer_peek_ahead_timers();
+}
+
 static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 				struct hrtimer_clock_base *new_base)
 {
@@ -1539,7 +1544,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 	}
 }
 
-static int migrate_hrtimers(int scpu)
+static void migrate_hrtimers(int scpu)
 {
 	struct hrtimer_cpu_base *old_base, *new_base;
 	int dcpu, i;
@@ -1567,12 +1572,7 @@ static int migrate_hrtimers(int scpu)
 	spin_unlock_irq(&new_base->lock);
 	put_cpu_var(hrtimer_bases);
 
-	return dcpu;
-}
-
-static void tickle_timers(void *arg)
-{
-	hrtimer_peek_ahead_timers();
+	smp_call_function_single(dcpu, tickle_timers, NULL, 0);
 }
 
 #endif /* CONFIG_HOTPLUG_CPU */
@@ -1593,11 +1593,8 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 	{
-		int dcpu;
-
 		clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
-		dcpu = migrate_hrtimers(scpu);
-		smp_call_function_single(dcpu, tickle_timers, NULL, 0);
+		migrate_hrtimers(scpu);
 		break;
 	}
 #endif
-- 
cgit v1.2.3


From 731a55ba0f17064f85903b7bf8e24849ec6cfa20 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 5 Jan 2009 11:28:21 +0100
Subject: hrtimer: simplify hotplug migration

Impact: cleanup

No need for a smp function call, which is likely to run on the same
CPU anyway. We can just call hrtimers_peek_ahead() in the interrupts
disabled section of migrate_hrtimers().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hrtimer.c | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 9c2bfa84128..8010a67cead 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1504,11 +1504,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 
 #ifdef CONFIG_HOTPLUG_CPU
 
-static void tickle_timers(void *arg)
-{
-	hrtimer_peek_ahead_timers();
-}
-
 static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 				struct hrtimer_clock_base *new_base)
 {
@@ -1547,20 +1542,19 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 static void migrate_hrtimers(int scpu)
 {
 	struct hrtimer_cpu_base *old_base, *new_base;
-	int dcpu, i;
+	int i;
 
 	BUG_ON(cpu_online(scpu));
-	old_base = &per_cpu(hrtimer_bases, scpu);
-	new_base = &get_cpu_var(hrtimer_bases);
-
-	dcpu = smp_processor_id();
-
 	tick_cancel_sched_timer(scpu);
+
+	local_irq_disable();
+	old_base = &per_cpu(hrtimer_bases, scpu);
+	new_base = &__get_cpu_var(hrtimer_bases);
 	/*
 	 * The caller is globally serialized and nobody else
 	 * takes two locks at once, deadlock is not possible.
 	 */
-	spin_lock_irq(&new_base->lock);
+	spin_lock(&new_base->lock);
 	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
@@ -1569,10 +1563,11 @@ static void migrate_hrtimers(int scpu)
 	}
 
 	spin_unlock(&old_base->lock);
-	spin_unlock_irq(&new_base->lock);
-	put_cpu_var(hrtimer_bases);
+	spin_unlock(&new_base->lock);
 
-	smp_call_function_single(dcpu, tickle_timers, NULL, 0);
+	/* Check, if we got expired work to do */
+	__hrtimer_peek_ahead_timers();
+	local_irq_enable();
 }
 
 #endif /* CONFIG_HOTPLUG_CPU */
-- 
cgit v1.2.3


From a6037b61c2f5fc99c57c15b26d7cfa58bbb34008 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 5 Jan 2009 11:28:22 +0100
Subject: hrtimer: fix recursion deadlock by re-introducing the softirq

Impact: fix rare runtime deadlock

There are a few sites that do:

  spin_lock_irq(&foo)
  hrtimer_start(&bar)
    __run_hrtimer(&bar)
      func()
        spin_lock(&foo)

which obviously deadlocks. In order to avoid this, never call __run_hrtimer()
from hrtimer_start*() context, but instead defer this to softirq context.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hrtimer.c | 60 +++++++++++++++++++++++++-------------------------------
 1 file changed, 27 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 8010a67cead..b68e98f4e4c 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -634,7 +634,6 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
 {
 }
 
-static void __run_hrtimer(struct hrtimer *timer);
 
 /*
  * When High resolution timers are active, try to reprogram. Note, that in case
@@ -646,13 +645,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 					    struct hrtimer_clock_base *base)
 {
 	if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
-		/*
-		 * XXX: recursion check?
-		 * hrtimer_forward() should round up with timer granularity
-		 * so that we never get into inf recursion here,
-		 * it doesn't do that though
-		 */
-		__run_hrtimer(timer);
+		spin_unlock(&base->cpu_base->lock);
+		raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+		spin_lock(&base->cpu_base->lock);
 		return 1;
 	}
 	return 0;
@@ -705,11 +700,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 }
 static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
 static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
-static inline int hrtimer_reprogram(struct hrtimer *timer,
-				    struct hrtimer_clock_base *base)
-{
-	return 0;
-}
 
 #endif /* CONFIG_HIGH_RES_TIMERS */
 
@@ -780,9 +770,11 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
  *
  * The timer is inserted in expiry order. Insertion into the
  * red black tree is O(log(n)). Must hold the base lock.
+ *
+ * Returns 1 when the new timer is the leftmost timer in the tree.
  */
-static void enqueue_hrtimer(struct hrtimer *timer,
-			    struct hrtimer_clock_base *base, int reprogram)
+static int enqueue_hrtimer(struct hrtimer *timer,
+			   struct hrtimer_clock_base *base)
 {
 	struct rb_node **link = &base->active.rb_node;
 	struct rb_node *parent = NULL;
@@ -814,20 +806,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
 	 * Insert the timer to the rbtree and check whether it
 	 * replaces the first pending timer
 	 */
-	if (leftmost) {
-		/*
-		 * Reprogram the clock event device. When the timer is already
-		 * expired hrtimer_enqueue_reprogram has either called the
-		 * callback or added it to the pending list and raised the
-		 * softirq.
-		 *
-		 * This is a NOP for !HIGHRES
-		 */
-		if (reprogram && hrtimer_enqueue_reprogram(timer, base))
-			return;
-
+	if (leftmost)
 		base->first = &timer->node;
-	}
 
 	rb_link_node(&timer->node, parent, link);
 	rb_insert_color(&timer->node, &base->active);
@@ -836,6 +816,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
 	 * state of a possibly running callback.
 	 */
 	timer->state |= HRTIMER_STATE_ENQUEUED;
+
+	return leftmost;
 }
 
 /*
@@ -912,7 +894,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
 {
 	struct hrtimer_clock_base *base, *new_base;
 	unsigned long flags;
-	int ret;
+	int ret, leftmost;
 
 	base = lock_hrtimer_base(timer, &flags);
 
@@ -940,12 +922,16 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
 
 	timer_stats_hrtimer_set_start_info(timer);
 
+	leftmost = enqueue_hrtimer(timer, new_base);
+
 	/*
 	 * Only allow reprogramming if the new base is on this CPU.
 	 * (it might still be on another CPU if the timer was pending)
+	 *
+	 * XXX send_remote_softirq() ?
 	 */
-	enqueue_hrtimer(timer, new_base,
-			new_base->cpu_base == &__get_cpu_var(hrtimer_bases));
+	if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
+		hrtimer_enqueue_reprogram(timer, new_base);
 
 	unlock_hrtimer_base(timer, &flags);
 
@@ -1163,7 +1149,7 @@ static void __run_hrtimer(struct hrtimer *timer)
 	 */
 	if (restart != HRTIMER_NORESTART) {
 		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
-		enqueue_hrtimer(timer, base, 0);
+		enqueue_hrtimer(timer, base);
 	}
 	timer->state &= ~HRTIMER_STATE_CALLBACK;
 }
@@ -1277,6 +1263,11 @@ void hrtimer_peek_ahead_timers(void)
 	local_irq_restore(flags);
 }
 
+static void run_hrtimer_softirq(struct softirq_action *h)
+{
+	hrtimer_peek_ahead_timers();
+}
+
 #endif	/* CONFIG_HIGH_RES_TIMERS */
 
 /*
@@ -1532,7 +1523,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		 * is done, which will run all expired timers and re-programm
 		 * the timer device.
 		 */
-		enqueue_hrtimer(timer, new_base, 0);
+		enqueue_hrtimer(timer, new_base);
 
 		/* Clear the migration state bit */
 		timer->state &= ~HRTIMER_STATE_MIGRATE;
@@ -1610,6 +1601,9 @@ void __init hrtimers_init(void)
 	hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
 			  (void *)(long)smp_processor_id());
 	register_cpu_notifier(&hrtimers_nb);
+#ifdef CONFIG_HIGH_RES_TIMERS
+	open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
+#endif
 }
 
 /**
-- 
cgit v1.2.3


From e3f1d883740b09e5116d4d4e30a6a6987264a83c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 5 Jan 2009 11:28:23 +0100
Subject: hrtimer: fixup comments

Clean up the comments

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hrtimer.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b68e98f4e4c..aa024f2af78 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1143,9 +1143,9 @@ static void __run_hrtimer(struct hrtimer *timer)
 	spin_lock(&cpu_base->lock);
 
 	/*
-	 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
-	 * reprogramming of the event hardware. This happens at the end of this
-	 * function anyway.
+	 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+	 * we do not reprogramm the event hardware. Happens either in
+	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
 	 */
 	if (restart != HRTIMER_NORESTART) {
 		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
@@ -1514,14 +1514,12 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
 		timer->base = new_base;
 		/*
-		 * Enqueue the timers on the new cpu, but do not reprogram 
-		 * the timer as that would enable a deadlock between
-		 * hrtimer_enqueue_reprogramm() running the timer and us still
-		 * holding a nested base lock.
-		 *
-		 * Instead we tickle the hrtimer interrupt after the migration
-		 * is done, which will run all expired timers and re-programm
-		 * the timer device.
+		 * Enqueue the timers on the new cpu. This does not
+		 * reprogram the event device in case the timer
+		 * expires before the earliest on this CPU, but we run
+		 * hrtimer_interrupt after we migrated everything to
+		 * sort out already expired timers and reprogram the
+		 * event device.
 		 */
 		enqueue_hrtimer(timer, new_base);
 
-- 
cgit v1.2.3


From 39aac64812da70f0af262f4700e67637338cbb3b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 5 Jan 2009 19:18:02 +0800
Subject: sched: mark sched_create_sysfs_power_savings_entries() as __init

Impact: cleanup

The only caller is cpu_dev_init() which is marked as __init.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 545c6fccd1d..9a8e296959c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8060,7 +8060,7 @@ static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
 		   sched_smt_power_savings_store);
 #endif
 
-int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 {
 	int err = 0;
 
-- 
cgit v1.2.3


From c70f22d203fc02c805b6ed4a3483b740dc36786b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 5 Jan 2009 19:07:50 +0800
Subject: sched: clean up arch_reinit_sched_domains()

- Make arch_reinit_sched_domains() static. It was exported to be used in
  s390, but now rebuild_sched_domains() is used instead.

- Make it return void.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 9a8e296959c..c5019a5dcaa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7987,7 +7987,7 @@ match2:
 }
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-int arch_reinit_sched_domains(void)
+static void arch_reinit_sched_domains(void)
 {
 	get_online_cpus();
 
@@ -7996,13 +7996,10 @@ int arch_reinit_sched_domains(void)
 
 	rebuild_sched_domains();
 	put_online_cpus();
-
-	return 0;
 }
 
 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 {
-	int ret;
 	unsigned int level = 0;
 
 	if (sscanf(buf, "%u", &level) != 1)
@@ -8023,9 +8020,9 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 	else
 		sched_mc_power_savings = level;
 
-	ret = arch_reinit_sched_domains();
+	arch_reinit_sched_domains();
 
-	return ret ? ret : count;
+	return count;
 }
 
 #ifdef CONFIG_SCHED_MC
-- 
cgit v1.2.3


From 82c5b7b527ccc4b5d3cf832437e842f9d2920a79 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 5 Jan 2009 14:11:10 +0100
Subject: hrtimer: splitout peek ahead functionality, fix

Impact: build fix on !CONFIG_HIGH_RES_TIMERS

Fix:

  kernel/hrtimer.c:1586: error: implicit declaration of function '__hrtimer_peek_ahead_timers'

Signen-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hrtimer.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index aa024f2af78..1455b7651b6 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1268,7 +1268,11 @@ static void run_hrtimer_softirq(struct softirq_action *h)
 	hrtimer_peek_ahead_timers();
 }
 
-#endif	/* CONFIG_HIGH_RES_TIMERS */
+#else /* CONFIG_HIGH_RES_TIMERS */
+
+static inline void __hrtimer_peek_ahead_timers(void) { }
+
+#endif	/* !CONFIG_HIGH_RES_TIMERS */
 
 /*
  * Called from timer softirq every jiffy, expire hrtimers:
-- 
cgit v1.2.3


From 56ff5efad96182f4d3cb3dc6b07396762c658f16 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 9 Dec 2008 09:34:39 -0500
Subject: zero i_uid/i_gid on inode allocation

... and don't bother in callers.  Don't bother with zeroing i_blocks,
while we are at it - it's already been zeroed.

i_mode is not worth the effort; it has no common default value.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/cgroup.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 48348dde6d8..f7c5099a057 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -573,7 +573,6 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
 		inode->i_mode = mode;
 		inode->i_uid = current_fsuid();
 		inode->i_gid = current_fsgid();
-		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
 	}
-- 
cgit v1.2.3


From 0c910d289567163dbe40ccc174b36afd1c7723bd Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 6 Jan 2009 17:39:06 +0800
Subject: sched: fix double kfree in failure path

It's not the responsibility of init_rootdomain() to free root_domain
allocated by alloc_rootdomain().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c5019a5dcaa..973f97362ce 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6970,7 +6970,7 @@ static int init_rootdomain(struct root_domain *rd, bool bootmem)
 	}
 
 	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
-		goto free_rd;
+		goto out;
 	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
 		goto free_span;
 	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
@@ -6986,8 +6986,7 @@ free_online:
 	free_cpumask_var(rd->online);
 free_span:
 	free_cpumask_var(rd->span);
-free_rd:
-	kfree(rd);
+out:
 	return -ENOMEM;
 }
 
-- 
cgit v1.2.3


From db2f59c8c9b315f2b88b1dac159b988c6009034d Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 6 Jan 2009 17:40:36 +0800
Subject: sched: fix section mismatch

init_rootdomain() calls alloc_bootmem_cpumask_var() at system boot,
so does cpupri_init().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c        | 2 +-
 kernel/sched_cpupri.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 973f97362ce..2e3545f57e7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6957,7 +6957,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
-static int init_rootdomain(struct root_domain *rd, bool bootmem)
+static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
 {
 	memset(rd, 0, sizeof(*rd));
 
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 018b7be1db2..1e00bfacf9b 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -151,7 +151,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
  *
  * Returns: -ENOMEM if memory fails.
  */
-int cpupri_init(struct cpupri *cp, bool bootmem)
+int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
 {
 	int i;
 
-- 
cgit v1.2.3


From edb123e16c6092bd08b67d1130ff03efeada0c89 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Thu, 4 Dec 2008 12:39:49 +0100
Subject: trivial: printk: fix indentation of new_text_line declaration

Remove bogus indentation of new_text_line declaration introduced in
commit ac60ad741.

Acked-by: Nick Andrew <nick@nick-andrew.net>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index e651ab05655..7015733793e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -619,7 +619,7 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu)
 static const char recursion_bug_msg [] =
 		KERN_CRIT "BUG: recent printk recursion!\n";
 static int recursion_bug;
-	static int new_text_line = 1;
+static int new_text_line = 1;
 static char printk_buf[1024];
 
 asmlinkage int vprintk(const char *fmt, va_list args)
-- 
cgit v1.2.3


From 025dfdafe77f20b3890981a394774baab7b9c827 Mon Sep 17 00:00:00 2001
From: Frederik Schwarzer <schwarzerf@gmail.com>
Date: Thu, 16 Oct 2008 19:02:37 +0200
Subject: trivial: fix then -> than typos in comments and documentation

- (better, more, bigger ...) then -> (...) than

Signed-off-by: Frederik Schwarzer <schwarzerf@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/pid.c          | 2 +-
 kernel/time/jiffies.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index 064e76afa50..af9224cdd6c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -475,7 +475,7 @@ pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
 EXPORT_SYMBOL(task_session_nr_ns);
 
 /*
- * Used by proc to find the first pid that is greater then or equal to nr.
+ * Used by proc to find the first pid that is greater than or equal to nr.
  *
  * If there is a pid at nr this function is exactly the same as find_pid_ns.
  */
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 1ca99557e92..06f197560f3 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -45,7 +45,7 @@
  *
  * The value 8 is somewhat carefully chosen, as anything
  * larger can result in overflows. NSEC_PER_JIFFY grows as
- * HZ shrinks, so values greater then 8 overflow 32bits when
+ * HZ shrinks, so values greater than 8 overflow 32bits when
  * HZ=100.
  */
 #define JIFFIES_SHIFT	8
-- 
cgit v1.2.3


From cd3772e6898c6386f21d2958346d6dd57d4204f5 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Sun, 16 Nov 2008 18:22:09 +0800
Subject: kernel/ksysfs.c:fix dependence on CONFIG_NET

Access to uevent_seqnum and uevent_helper does not need to
depend on CONFIG_NET, so remove it.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/ksysfs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 08dd8ed86c7..528dd78e7e7 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -24,7 +24,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
 static struct kobj_attribute _name##_attr = \
 	__ATTR(_name, 0644, _name##_show, _name##_store)
 
-#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
+#if defined(CONFIG_HOTPLUG)
 /* current uevent sequence number */
 static ssize_t uevent_seqnum_show(struct kobject *kobj,
 				  struct kobj_attribute *attr, char *buf)
@@ -137,7 +137,7 @@ struct kobject *kernel_kobj;
 EXPORT_SYMBOL_GPL(kernel_kobj);
 
 static struct attribute * kernel_attrs[] = {
-#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
+#if defined(CONFIG_HOTPLUG)
 	&uevent_seqnum_attr.attr,
 	&uevent_helper_attr.attr,
 #endif
-- 
cgit v1.2.3


From 81ff86a11f54c9e266c6a6bc3ecd2c9a0f1e11cc Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Tue, 6 Jan 2009 10:44:39 -0800
Subject: pm: struct device - replace bus_id with dev_name(), dev_set_name()

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/power/main.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 613f16941b8..23998887397 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -615,7 +615,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
 	/* this may fail if the RTC hasn't been initialized */
 	status = rtc_read_time(rtc, &alm.time);
 	if (status < 0) {
-		printk(err_readtime, rtc->dev.bus_id, status);
+		printk(err_readtime, dev_name(&rtc->dev), status);
 		return;
 	}
 	rtc_tm_to_time(&alm.time, &now);
@@ -626,7 +626,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
 
 	status = rtc_set_alarm(rtc, &alm);
 	if (status < 0) {
-		printk(err_wakealarm, rtc->dev.bus_id, status);
+		printk(err_wakealarm, dev_name(&rtc->dev), status);
 		return;
 	}
 
@@ -660,7 +660,7 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr)
 	if (!device_may_wakeup(candidate->dev.parent))
 		return 0;
 
-	*(char **)name_ptr = dev->bus_id;
+	*(const char **)name_ptr = dev_name(dev);
 	return 1;
 }
 
-- 
cgit v1.2.3


From 29881c4502ba05f46bc12ae8053d4e08d7e2615c Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Wed, 7 Jan 2009 09:21:54 +1100
Subject: Revert "CRED: Fix regression in cap_capable() as shown up by
 sys_faccessat() [ver #2]"

This reverts commit 14eaddc967b16017d4a1a24d2be6c28ecbe06ed8.

David has a better version to come.
---
 kernel/capability.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index df62f53f84a..36b4b4daebe 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -308,7 +308,7 @@ int capable(int cap)
 		BUG();
 	}
 
-	if (security_capable(cap) == 0) {
+	if (has_capability(current, cap)) {
 		current->flags |= PF_SUPERPRIV;
 		return 1;
 	}
-- 
cgit v1.2.3


From 3699c53c485bf0168e6500d0ed18bf931584dd7c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 6 Jan 2009 22:27:01 +0000
Subject: CRED: Fix regression in cap_capable() as shown up by sys_faccessat()
 [ver #3]

Fix a regression in cap_capable() due to:

	commit 3b11a1decef07c19443d24ae926982bc8ec9f4c0
	Author: David Howells <dhowells@redhat.com>
	Date:   Fri Nov 14 10:39:26 2008 +1100

	    CRED: Differentiate objective and effective subjective credentials on a task

The problem is that the above patch allows a process to have two sets of
credentials, and for the most part uses the subjective credentials when
accessing current's creds.

There is, however, one exception: cap_capable(), and thus capable(), uses the
real/objective credentials of the target task, whether or not it is the current
task.

Ordinarily this doesn't matter, since usually the two cred pointers in current
point to the same set of creds.  However, sys_faccessat() makes use of this
facility to override the credentials of the calling process to make its test,
without affecting the creds as seen from other processes.

One of the things sys_faccessat() does is to make an adjustment to the
effective capabilities mask, which cap_capable(), as it stands, then ignores.

The affected capability check is in generic_permission():

	if (!(mask & MAY_EXEC) || execute_ok(inode))
		if (capable(CAP_DAC_OVERRIDE))
			return 0;

This change passes the set of credentials to be tested down into the commoncap
and SELinux code.  The security functions called by capable() and
has_capability() select the appropriate set of credentials from the process
being checked.

This can be tested by compiling the following program from the XFS testsuite:

/*
 *  t_access_root.c - trivial test program to show permission bug.
 *
 *  Written by Michael Kerrisk - copyright ownership not pursued.
 *  Sourced from: http://linux.derkeiler.com/Mailing-Lists/Kernel/2003-10/6030.html
 */
#include <limits.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/stat.h>

#define UID 500
#define GID 100
#define PERM 0
#define TESTPATH "/tmp/t_access"

static void
errExit(char *msg)
{
    perror(msg);
    exit(EXIT_FAILURE);
} /* errExit */

static void
accessTest(char *file, int mask, char *mstr)
{
    printf("access(%s, %s) returns %d\n", file, mstr, access(file, mask));
} /* accessTest */

int
main(int argc, char *argv[])
{
    int fd, perm, uid, gid;
    char *testpath;
    char cmd[PATH_MAX + 20];

    testpath = (argc > 1) ? argv[1] : TESTPATH;
    perm = (argc > 2) ? strtoul(argv[2], NULL, 8) : PERM;
    uid = (argc > 3) ? atoi(argv[3]) : UID;
    gid = (argc > 4) ? atoi(argv[4]) : GID;

    unlink(testpath);

    fd = open(testpath, O_RDWR | O_CREAT, 0);
    if (fd == -1) errExit("open");

    if (fchown(fd, uid, gid) == -1) errExit("fchown");
    if (fchmod(fd, perm) == -1) errExit("fchmod");
    close(fd);

    snprintf(cmd, sizeof(cmd), "ls -l %s", testpath);
    system(cmd);

    if (seteuid(uid) == -1) errExit("seteuid");

    accessTest(testpath, 0, "0");
    accessTest(testpath, R_OK, "R_OK");
    accessTest(testpath, W_OK, "W_OK");
    accessTest(testpath, X_OK, "X_OK");
    accessTest(testpath, R_OK | W_OK, "R_OK | W_OK");
    accessTest(testpath, R_OK | X_OK, "R_OK | X_OK");
    accessTest(testpath, W_OK | X_OK, "W_OK | X_OK");
    accessTest(testpath, R_OK | W_OK | X_OK, "R_OK | W_OK | X_OK");

    exit(EXIT_SUCCESS);
} /* main */

This can be run against an Ext3 filesystem as well as against an XFS
filesystem.  If successful, it will show:

	[root@andromeda src]# ./t_access_root /tmp/xxx 0 4043 4043
	---------- 1 dhowells dhowells 0 2008-12-31 03:00 /tmp/xxx
	access(/tmp/xxx, 0) returns 0
	access(/tmp/xxx, R_OK) returns 0
	access(/tmp/xxx, W_OK) returns 0
	access(/tmp/xxx, X_OK) returns -1
	access(/tmp/xxx, R_OK | W_OK) returns 0
	access(/tmp/xxx, R_OK | X_OK) returns -1
	access(/tmp/xxx, W_OK | X_OK) returns -1
	access(/tmp/xxx, R_OK | W_OK | X_OK) returns -1

If unsuccessful, it will show:

	[root@andromeda src]# ./t_access_root /tmp/xxx 0 4043 4043
	---------- 1 dhowells dhowells 0 2008-12-31 02:56 /tmp/xxx
	access(/tmp/xxx, 0) returns 0
	access(/tmp/xxx, R_OK) returns -1
	access(/tmp/xxx, W_OK) returns -1
	access(/tmp/xxx, X_OK) returns -1
	access(/tmp/xxx, R_OK | W_OK) returns -1
	access(/tmp/xxx, R_OK | X_OK) returns -1
	access(/tmp/xxx, W_OK | X_OK) returns -1
	access(/tmp/xxx, R_OK | W_OK | X_OK) returns -1

I've also tested the fix with the SELinux and syscalls LTP testsuites.

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: J. Bruce Fields <bfields@citi.umich.edu>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/capability.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index 36b4b4daebe..df62f53f84a 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -308,7 +308,7 @@ int capable(int cap)
 		BUG();
 	}
 
-	if (has_capability(current, cap)) {
+	if (security_capable(cap) == 0) {
 		current->flags |= PF_SUPERPRIV;
 		return 1;
 	}
-- 
cgit v1.2.3


From 75aa199410359dc5fbcf9025ff7af98a9d20f0d5 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 6 Jan 2009 14:39:01 -0800
Subject: oom: print triggering task's cpuset and mems allowed

When cpusets are enabled, it's necessary to print the triggering task's
set of allowable nodes so the subsequently printed meminfo can be
interpreted correctly.

We also print the task's cpuset name for informational purposes.

[rientjes@google.com: task lock current before dereferencing cpuset]
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 39c1a4c1c5a..345ace5117d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -239,6 +239,17 @@ static struct cpuset top_cpuset = {
 
 static DEFINE_MUTEX(callback_mutex);
 
+/*
+ * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
+ * buffers.  They are statically allocated to prevent using excess stack
+ * when calling cpuset_print_task_mems_allowed().
+ */
+#define CPUSET_NAME_LEN		(128)
+#define	CPUSET_NODELIST_LEN	(256)
+static char cpuset_name[CPUSET_NAME_LEN];
+static char cpuset_nodelist[CPUSET_NODELIST_LEN];
+static DEFINE_SPINLOCK(cpuset_buffer_lock);
+
 /*
  * This is ugly, but preserves the userspace API for existing cpuset
  * users. If someone tries to mount the "cpuset" filesystem, we
@@ -2356,6 +2367,29 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
 	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
 }
 
+/**
+ * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
+ * @task: pointer to task_struct of some task.
+ *
+ * Description: Prints @task's name, cpuset name, and cached copy of its
+ * mems_allowed to the kernel log.  Must hold task_lock(task) to allow
+ * dereferencing task_cs(task).
+ */
+void cpuset_print_task_mems_allowed(struct task_struct *tsk)
+{
+	struct dentry *dentry;
+
+	dentry = task_cs(tsk)->css.cgroup->dentry;
+	spin_lock(&cpuset_buffer_lock);
+	snprintf(cpuset_name, CPUSET_NAME_LEN,
+		 dentry ? (const char *)dentry->d_name.name : "/");
+	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
+			   tsk->mems_allowed);
+	printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
+	       tsk->comm, cpuset_name, cpuset_nodelist);
+	spin_unlock(&cpuset_buffer_lock);
+}
+
 /*
  * Collection of memory_pressure is suppressed unless
  * this flag is enabled by writing "1" to the special
-- 
cgit v1.2.3


From e5991371ee0d1c0ce19e133c6f9075b49c5b4ae8 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Tue, 6 Jan 2009 14:39:22 -0800
Subject: mm: remove cgroup_mm_owner_callbacks

cgroup_mm_owner_callbacks() was brought in to support the memrlimit
controller, but sneaked into mainline ahead of it.  That controller has
now been shelved, and the mm_owner_changed() args were inadequate for it
anyway (they needed an mm pointer instead of a task pointer).

Remove the dead code, and restore mm_update_next_owner() locking to how it
was before: taking mmap_sem there does nothing for memcontrol.c, now the
only user of mm->owner.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Paul Menage <menage@google.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 33 ---------------------------------
 kernel/exit.c   | 16 ++++++----------
 2 files changed, 6 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 87bb0258fd2..f221446aa02 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -116,7 +116,6 @@ static int root_count;
  * be called.
  */
 static int need_forkexit_callback __read_mostly;
-static int need_mm_owner_callback __read_mostly;
 
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
@@ -2539,7 +2538,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
 
 	need_forkexit_callback |= ss->fork || ss->exit;
-	need_mm_owner_callback |= !!ss->mm_owner_changed;
 
 	/* At system boot, before all subsystems have been
 	 * registered, no tasks have been forked, so we don't
@@ -2789,37 +2787,6 @@ void cgroup_fork_callbacks(struct task_struct *child)
 	}
 }
 
-#ifdef CONFIG_MM_OWNER
-/**
- * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
- * @p: the new owner
- *
- * Called on every change to mm->owner. mm_init_owner() does not
- * invoke this routine, since it assigns the mm->owner the first time
- * and does not change it.
- *
- * The callbacks are invoked with mmap_sem held in read mode.
- */
-void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
-{
-	struct cgroup *oldcgrp, *newcgrp = NULL;
-
-	if (need_mm_owner_callback) {
-		int i;
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-			struct cgroup_subsys *ss = subsys[i];
-			oldcgrp = task_cgroup(old, ss->subsys_id);
-			if (new)
-				newcgrp = task_cgroup(new, ss->subsys_id);
-			if (oldcgrp == newcgrp)
-				continue;
-			if (ss->mm_owner_changed)
-				ss->mm_owner_changed(ss, oldcgrp, newcgrp, new);
-		}
-	}
-}
-#endif /* CONFIG_MM_OWNER */
-
 /**
  * cgroup_post_fork - called on a new task after adding it to the task list
  * @child: the task in question
diff --git a/kernel/exit.c b/kernel/exit.c
index c9e5a1c14e0..f923724ab3c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -642,35 +642,31 @@ retry:
 	/*
 	 * We found no owner yet mm_users > 1: this implies that we are
 	 * most likely racing with swapoff (try_to_unuse()) or /proc or
-	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL,
-	 * so that subsystems can understand the callback and take action.
+	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
 	 */
-	down_write(&mm->mmap_sem);
-	cgroup_mm_owner_callbacks(mm->owner, NULL);
 	mm->owner = NULL;
-	up_write(&mm->mmap_sem);
 	return;
 
 assign_new_owner:
 	BUG_ON(c == p);
 	get_task_struct(c);
-	read_unlock(&tasklist_lock);
-	down_write(&mm->mmap_sem);
 	/*
 	 * The task_lock protects c->mm from changing.
 	 * We always want mm->owner->mm == mm
 	 */
 	task_lock(c);
+	/*
+	 * Delay read_unlock() till we have the task_lock()
+	 * to ensure that c does not slip away underneath us
+	 */
+	read_unlock(&tasklist_lock);
 	if (c->mm != mm) {
 		task_unlock(c);
-		up_write(&mm->mmap_sem);
 		put_task_struct(c);
 		goto retry;
 	}
-	cgroup_mm_owner_callbacks(mm->owner, c);
 	mm->owner = c;
 	task_unlock(c);
-	up_write(&mm->mmap_sem);
 	put_task_struct(c);
 }
 #endif /* CONFIG_MM_OWNER */
-- 
cgit v1.2.3


From 2da02997e08d3efe8174c7a47696e6f7cbe69ba9 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 6 Jan 2009 14:39:31 -0800
Subject: mm: add dirty_background_bytes and dirty_bytes sysctls

This change introduces two new sysctls to /proc/sys/vm:
dirty_background_bytes and dirty_bytes.

dirty_background_bytes is the counterpart to dirty_background_ratio and
dirty_bytes is the counterpart to dirty_ratio.

With growing memory capacities of individual machines, it's no longer
sufficient to specify dirty thresholds as a percentage of the amount of
dirtyable memory over the entire system.

dirty_background_bytes and dirty_bytes specify quantities of memory, in
bytes, that represent the dirty limits for the entire system.  If either
of these values is set, its value represents the amount of dirty memory
that is needed to commence either background or direct writeback.

When a `bytes' or `ratio' file is written, its counterpart becomes a
function of the written value.  For example, if dirty_bytes is written to
be 8096, 8K of memory is required to commence direct writeback.
dirty_ratio is then functionally equivalent to 8K / the amount of
dirtyable memory:

	dirtyable_memory = free pages + mapped pages + file cache

	dirty_background_bytes = dirty_background_ratio * dirtyable_memory
		-or-
	dirty_background_ratio = dirty_background_bytes / dirtyable_memory

		AND

	dirty_bytes = dirty_ratio * dirtyable_memory
		-or-
	dirty_ratio = dirty_bytes / dirtyable_memory

Only one of dirty_background_bytes and dirty_background_ratio may be
specified at a time, and only one of dirty_bytes and dirty_ratio may be
specified.  When one sysctl is written, the other appears as 0 when read.

The `bytes' files operate on a page size granularity since dirty limits
are compared with ZVC values, which are in page units.

Prior to this change, the minimum dirty_ratio was 5 as implemented by
get_dirty_limits() although /proc/sys/vm/dirty_ratio would show any user
written value between 0 and 100.  This restriction is maintained, but
dirty_bytes has a lower limit of only one page.

Also prior to this change, the dirty_background_ratio could not equal or
exceed dirty_ratio.  This restriction is maintained in addition to
restricting dirty_background_bytes.  If either background threshold equals
or exceeds that of the dirty threshold, it is implicitly set to half the
dirty threshold.

Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andrea Righi <righi.andrea@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ff6d45c7626..92f6e5bc3c2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -87,10 +87,6 @@ extern int rcutorture_runnable;
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
 
 /* Constants used for minimum and  maximum */
-#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP)
-static int one = 1;
-#endif
-
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 static int sixty = 60;
 static int neg_one = -1;
@@ -101,6 +97,7 @@ static int two = 2;
 #endif
 
 static int zero;
+static int one = 1;
 static int one_hundred = 100;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
@@ -952,11 +949,21 @@ static struct ctl_table vm_table[] = {
 		.data		= &dirty_background_ratio,
 		.maxlen		= sizeof(dirty_background_ratio),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &dirty_background_ratio_handler,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "dirty_background_bytes",
+		.data		= &dirty_background_bytes,
+		.maxlen		= sizeof(dirty_background_bytes),
+		.mode		= 0644,
+		.proc_handler	= &dirty_background_bytes_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &one,
+	},
 	{
 		.ctl_name	= VM_DIRTY_RATIO,
 		.procname	= "dirty_ratio",
@@ -968,6 +975,16 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "dirty_bytes",
+		.data		= &vm_dirty_bytes,
+		.maxlen		= sizeof(vm_dirty_bytes),
+		.mode		= 0644,
+		.proc_handler	= &dirty_bytes_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &one,
+	},
 	{
 		.procname	= "dirty_writeback_centisecs",
 		.data		= &dirty_writeback_interval,
-- 
cgit v1.2.3


From 901608d9045146aec6f14a7777ea4b1501c379f0 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 6 Jan 2009 14:40:29 -0800
Subject: mm: introduce get_mm_hiwater_xxx(), fix taskstats->hiwater_xxx
 accounting

xacct_add_tsk() relies on do_exit()->update_hiwater_xxx() and uses
mm->hiwater_xxx directly, this leads to 2 problems:

- taskstats_user_cmd() can call fill_pid()->xacct_add_tsk() at any
  moment before the task exits, so we should check the current values of
  rss/vm anyway.

- do_exit()->update_hiwater_xxx() calls are racy.  An exiting thread can
  be preempted right before mm->hiwater_xxx = new_val, and another thread
  can use A_LOT of memory and exit in between.  When the first thread
  resumes it can be the last thread in the thread group, in that case we
  report the wrong hiwater_xxx values which do not take A_LOT into
  account.

Introduce get_mm_hiwater_rss() and get_mm_hiwater_vm() helpers and change
xacct_add_tsk() to use them.  The first helper will also be used by
rusage->ru_maxrss accounting.

Kill do_exit()->update_hiwater_xxx() calls.  Unless we are going to
decrease rss/vm there is no point to update mm->hiwater_xxx, and nobody
can look at this mm_struct when exit_mmap() actually unmaps the memory.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c   | 5 +----
 kernel/tsacct.c | 4 ++--
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index f923724ab3c..c7740fa3252 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1051,10 +1051,7 @@ NORET_TYPE void do_exit(long code)
 				preempt_count());
 
 	acct_update_integrals(tsk);
-	if (tsk->mm) {
-		update_hiwater_rss(tsk->mm);
-		update_hiwater_vm(tsk->mm);
-	}
+
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
 		hrtimer_cancel(&tsk->signal->real_timer);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 2dc06ab3571..43f891b05a4 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -92,8 +92,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 	mm = get_task_mm(p);
 	if (mm) {
 		/* adjust to KB unit */
-		stats->hiwater_rss   = mm->hiwater_rss * PAGE_SIZE / KB;
-		stats->hiwater_vm    = mm->hiwater_vm * PAGE_SIZE / KB;
+		stats->hiwater_rss   = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB;
+		stats->hiwater_vm    = get_mm_hiwater_vm(mm)  * PAGE_SIZE / KB;
 		mmput(mm);
 	}
 	stats->read_char	= p->ioac.rchar;
-- 
cgit v1.2.3


From f1883f86dea84fe47a71a39fc1afccc005915ed8 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 6 Jan 2009 14:40:45 -0800
Subject: Remove remaining unwinder code

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Gabor Gombas <gombasg@sztaki.hu>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>,
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/module.c | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index f47cce910f2..34b56cf0661 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -43,7 +43,6 @@
 #include <linux/device.h>
 #include <linux/string.h>
 #include <linux/mutex.h>
-#include <linux/unwind.h>
 #include <linux/rculist.h>
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -1449,8 +1448,6 @@ static void free_module(struct module *mod)
 	remove_sect_attrs(mod);
 	mod_kobject_remove(mod);
 
-	unwind_remove_table(mod->unwind_info, 0);
-
 	/* Arch-specific cleanup. */
 	module_arch_cleanup(mod);
 
@@ -1867,7 +1864,6 @@ static noinline struct module *load_module(void __user *umod,
 	unsigned int symindex = 0;
 	unsigned int strindex = 0;
 	unsigned int modindex, versindex, infoindex, pcpuindex;
-	unsigned int unwindex = 0;
 	unsigned int num_kp, num_mcount;
 	struct kernel_param *kp;
 	struct module *mod;
@@ -1957,9 +1953,6 @@ static noinline struct module *load_module(void __user *umod,
 	versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
 	infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
 	pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
-#ifdef ARCH_UNWIND_SECTION_NAME
-	unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
-#endif
 
 	/* Don't keep modinfo and version sections. */
 	sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -1969,8 +1962,6 @@ static noinline struct module *load_module(void __user *umod,
 	sechdrs[symindex].sh_flags |= SHF_ALLOC;
 	sechdrs[strindex].sh_flags |= SHF_ALLOC;
 #endif
-	if (unwindex)
-		sechdrs[unwindex].sh_flags |= SHF_ALLOC;
 
 	/* Check module struct version now, before we try to use module. */
 	if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -2267,11 +2258,6 @@ static noinline struct module *load_module(void __user *umod,
 	add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
 	add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
 
-	/* Size of section 0 is 0, so this works well if no unwind info. */
-	mod->unwind_info = unwind_add_table(mod,
-					    (void *)sechdrs[unwindex].sh_addr,
-					    sechdrs[unwindex].sh_size);
-
 	/* Get rid of temporary copy */
 	vfree(hdr);
 
@@ -2370,7 +2356,6 @@ sys_init_module(void __user *umod,
 	mutex_lock(&module_mutex);
 	/* Drop initial reference. */
 	module_put(mod);
-	unwind_remove_table(mod->unwind_info, 1);
 	module_free(mod, mod->module_init);
 	mod->module_init = NULL;
 	mod->init_size = 0;
-- 
cgit v1.2.3


From 60348802e9cb137ee86590c3e4c57c1ec2e8fc69 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Tue, 6 Jan 2009 14:40:46 -0800
Subject: fork.c: cleanup for copy_sighand()

Check CLONE_SIGHAND only is enough, because combination of CLONE_THREAD and
CLONE_SIGHAND is already done in copy_process().

Impact: cleanup, no functionality changed

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 43cbf30669e..23b91211667 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -758,7 +758,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
 {
 	struct sighand_struct *sig;
 
-	if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) {
+	if (clone_flags & CLONE_SIGHAND) {
 		atomic_inc(&current->sighand->count);
 		return 0;
 	}
-- 
cgit v1.2.3


From d6624f996ae539344e8d748cce1117ae7af06fbf Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Tue, 6 Jan 2009 14:40:54 -0800
Subject: oops: increment the oops UUID every time we oops

... because we do want repeated same-oops to be seen by automated
tools like kerneloops.org

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/panic.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 13f06349a78..2a2ff36ff44 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -299,6 +299,8 @@ static int init_oops_id(void)
 {
 	if (!oops_id)
 		get_random_bytes(&oops_id, sizeof(oops_id));
+	else
+		oops_id++;
 
 	return 0;
 }
-- 
cgit v1.2.3


From e3d5a27d5862b6425d0879272e24abecf7245105 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 6 Jan 2009 14:41:02 -0800
Subject: Allow times and time system calls to return small negative values

At the moment, the times() system call will appear to fail for a period
shortly after boot, while the value it want to return is between -4095 and
-1.  The same thing will also happen for the time() system call on 32-bit
platforms some time in 2106 or so.

On some platforms, such as x86, this is unavoidable because of the system
call ABI, but other platforms such as powerpc have a separate error
indication from the return value, so system calls can in fact return small
negative values without indicating an error.  On those platforms,
force_successful_syscall_return() provides a way to indicate that the
system call return value should not be treated as an error even if it is
in the range which would normally be taken as a negative error number.

This adds a force_successful_syscall_return() call to the time() and
times() system calls plus their 32-bit compat versions, so that they don't
erroneously indicate an error on those platforms whose system call ABI has
a separate error indication.  This will not affect anything on other
platforms.

Joakim Tjernlund added the fix for time() and the compat versions of
time() and times(), after I did the fix for times().

Signed-off-by: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/compat.c | 5 ++++-
 kernel/sys.c    | 2 ++
 kernel/time.c   | 4 +++-
 3 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index d52e2ec1deb..42d56544460 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -24,6 +24,7 @@
 #include <linux/migrate.h>
 #include <linux/posix-timers.h>
 #include <linux/times.h>
+#include <linux/ptrace.h>
 
 #include <asm/uaccess.h>
 
@@ -229,6 +230,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
 		if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
 			return -EFAULT;
 	}
+	force_successful_syscall_return();
 	return compat_jiffies_to_clock_t(jiffies);
 }
 
@@ -894,8 +896,9 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc)
 
 	if (tloc) {
 		if (put_user(i,tloc))
-			i = -EFAULT;
+			return -EFAULT;
 	}
+	force_successful_syscall_return();
 	return i;
 }
 
diff --git a/kernel/sys.c b/kernel/sys.c
index d356d79e84a..4a43617cd56 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -33,6 +33,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/seccomp.h>
 #include <linux/cpu.h>
+#include <linux/ptrace.h>
 
 #include <linux/compat.h>
 #include <linux/syscalls.h>
@@ -927,6 +928,7 @@ asmlinkage long sys_times(struct tms __user * tbuf)
 		if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
 			return -EFAULT;
 	}
+	force_successful_syscall_return();
 	return (long) jiffies_64_to_clock_t(get_jiffies_64());
 }
 
diff --git a/kernel/time.c b/kernel/time.c
index d63a4336fad..4886e3ce83a 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -37,6 +37,7 @@
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/math64.h>
+#include <linux/ptrace.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -65,8 +66,9 @@ asmlinkage long sys_time(time_t __user * tloc)
 
 	if (tloc) {
 		if (put_user(i,tloc))
-			i = -EFAULT;
+			return -EFAULT;
 	}
+	force_successful_syscall_return();
 	return i;
 }
 
-- 
cgit v1.2.3


From 26e5438e4b77f04a51870f9415ffed68004fac1d Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Tue, 6 Jan 2009 14:41:10 -0800
Subject: profile: don't include <asm/ptrace.h> twice.

Currently, kernel/profile.c include <asm/ptrace.h> twice.  It can be
removed.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/profile.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/profile.c b/kernel/profile.c
index d18e2d2654f..784933acf5b 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -445,7 +445,6 @@ void profile_tick(int type)
 #ifdef CONFIG_PROC_FS
 #include <linux/proc_fs.h>
 #include <asm/uaccess.h>
-#include <asm/ptrace.h>
 
 static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
 			int count, int *eof, void *data)
-- 
cgit v1.2.3


From bc2f70151fe7a117dbe8347edc5a877e749572a3 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 6 Jan 2009 14:41:46 -0800
Subject: kprobes: bugfix: try_module_get even if calling_mod is NULL

When someone called register_*probe() from kernel-core code(not from
module) and that probes a kernel module, users can remove the probed
module because kprobe doesn't increment reference counter of the module.
(on the other hand, if the kernel-module calls register_*probe, kprobe
increments refcount of the probed module.)

Currently, we have no register_*probe() calling from kernel-core(except
smoke-test, but the smoke-test doesn't probe module), so there is no real
bugs.  But the logic is wrong(or not fair) and it can causes a problem
when someone might want to probe module from kernel.

After this patch is applied, even if someone put register_*probe() call in
the kernel-core code, it increments the reference counter of the probed
module, and it prevents user to remove the module until stopping probing
it.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9f8a3f25259..3afd354c46f 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -634,7 +634,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
 		 * avoid incrementing the module refcount, so as to allow
 		 * unloading of self probing modules.
 		 */
-		if (calling_mod && calling_mod != probed_mod) {
+		if (calling_mod != probed_mod) {
 			if (unlikely(!try_module_get(probed_mod))) {
 				preempt_enable();
 				return -EINVAL;
-- 
cgit v1.2.3


From 8e1144050e49dd4ef19c117dc5626f212cfe73cf Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 6 Jan 2009 14:41:47 -0800
Subject: kprobes: indirectly call kprobe_target

Call kprobe_target indirectly.  This prevents gcc to unroll a noinline
function in caller function.

I ported patches which had been discussed on
http://sources.redhat.com/bugzilla/show_bug.cgi?id=3542

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: David Miller <davem@davemloft.net>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/test_kprobes.c | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 06b6395b45b..9c0127ead6a 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -22,21 +22,10 @@
 
 static u32 rand1, preh_val, posth_val, jph_val;
 static int errors, handler_errors, num_tests;
+static u32 (*target)(u32 value);
 
 static noinline u32 kprobe_target(u32 value)
 {
-	/*
-	 * gcc ignores noinline on some architectures unless we stuff
-	 * sufficient lard into the function. The get_kprobe() here is
-	 * just for that.
-	 *
-	 * NOTE: We aren't concerned about the correctness of get_kprobe()
-	 * here; hence, this call is neither under !preempt nor with the
-	 * kprobe_mutex held. This is fine(tm)
-	 */
-	if (get_kprobe((void *)0xdeadbeef))
-		printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n");
-
 	return (value / div_factor);
 }
 
@@ -74,7 +63,7 @@ static int test_kprobe(void)
 		return ret;
 	}
 
-	ret = kprobe_target(rand1);
+	ret = target(rand1);
 	unregister_kprobe(&kp);
 
 	if (preh_val == 0) {
@@ -121,7 +110,7 @@ static int test_jprobe(void)
 		return ret;
 	}
 
-	ret = kprobe_target(rand1);
+	ret = target(rand1);
 	unregister_jprobe(&jp);
 	if (jph_val == 0) {
 		printk(KERN_ERR "Kprobe smoke test failed: "
@@ -177,7 +166,7 @@ static int test_kretprobe(void)
 		return ret;
 	}
 
-	ret = kprobe_target(rand1);
+	ret = target(rand1);
 	unregister_kretprobe(&rp);
 	if (krph_val != rand1) {
 		printk(KERN_ERR "Kprobe smoke test failed: "
@@ -193,6 +182,8 @@ int init_test_probes(void)
 {
 	int ret;
 
+	target = kprobe_target;
+
 	do {
 		rand1 = random32();
 	} while (rand1 <= div_factor);
-- 
cgit v1.2.3


From 12da3b888b2035bb0f106122f1cc1b6d357fad53 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 6 Jan 2009 14:41:48 -0800
Subject: kprobes: add tests for register_kprobes

Add testcases for *probe batch registration (register_kprobes) to kprobes
sanity tests.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: David Miller <davem@davemloft.net>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/test_kprobes.c | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 189 insertions(+)

(limited to 'kernel')

diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 9c0127ead6a..4f104515a19 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -23,6 +23,7 @@
 static u32 rand1, preh_val, posth_val, jph_val;
 static int errors, handler_errors, num_tests;
 static u32 (*target)(u32 value);
+static u32 (*target2)(u32 value);
 
 static noinline u32 kprobe_target(u32 value)
 {
@@ -81,6 +82,84 @@ static int test_kprobe(void)
 	return 0;
 }
 
+static noinline u32 kprobe_target2(u32 value)
+{
+	return (value / div_factor) + 1;
+}
+
+static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs)
+{
+	preh_val = (rand1 / div_factor) + 1;
+	return 0;
+}
+
+static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs,
+		unsigned long flags)
+{
+	if (preh_val != (rand1 / div_factor) + 1) {
+		handler_errors++;
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"incorrect value in post_handler2\n");
+	}
+	posth_val = preh_val + div_factor;
+}
+
+static struct kprobe kp2 = {
+	.symbol_name = "kprobe_target2",
+	.pre_handler = kp_pre_handler2,
+	.post_handler = kp_post_handler2
+};
+
+static int test_kprobes(void)
+{
+	int ret;
+	struct kprobe *kps[2] = {&kp, &kp2};
+
+	kp.addr = 0; /* addr should be cleard for reusing kprobe. */
+	ret = register_kprobes(kps, 2);
+	if (ret < 0) {
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"register_kprobes returned %d\n", ret);
+		return ret;
+	}
+
+	preh_val = 0;
+	posth_val = 0;
+	ret = target(rand1);
+
+	if (preh_val == 0) {
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"kprobe pre_handler not called\n");
+		handler_errors++;
+	}
+
+	if (posth_val == 0) {
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"kprobe post_handler not called\n");
+		handler_errors++;
+	}
+
+	preh_val = 0;
+	posth_val = 0;
+	ret = target2(rand1);
+
+	if (preh_val == 0) {
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"kprobe pre_handler2 not called\n");
+		handler_errors++;
+	}
+
+	if (posth_val == 0) {
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"kprobe post_handler2 not called\n");
+		handler_errors++;
+	}
+
+	unregister_kprobes(kps, 2);
+	return 0;
+
+}
+
 static u32 j_kprobe_target(u32 value)
 {
 	if (value != rand1) {
@@ -121,6 +200,43 @@ static int test_jprobe(void)
 	return 0;
 }
 
+static struct jprobe jp2 = {
+	.entry          = j_kprobe_target,
+	.kp.symbol_name = "kprobe_target2"
+};
+
+static int test_jprobes(void)
+{
+	int ret;
+	struct jprobe *jps[2] = {&jp, &jp2};
+
+	jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */
+	ret = register_jprobes(jps, 2);
+	if (ret < 0) {
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"register_jprobes returned %d\n", ret);
+		return ret;
+	}
+
+	jph_val = 0;
+	ret = target(rand1);
+	if (jph_val == 0) {
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"jprobe handler not called\n");
+		handler_errors++;
+	}
+
+	jph_val = 0;
+	ret = target2(rand1);
+	if (jph_val == 0) {
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"jprobe handler2 not called\n");
+		handler_errors++;
+	}
+	unregister_jprobes(jps, 2);
+
+	return 0;
+}
 #ifdef CONFIG_KRETPROBES
 static u32 krph_val;
 
@@ -176,6 +292,63 @@ static int test_kretprobe(void)
 
 	return 0;
 }
+
+static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+	unsigned long ret = regs_return_value(regs);
+
+	if (ret != (rand1 / div_factor) + 1) {
+		handler_errors++;
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"incorrect value in kretprobe handler2\n");
+	}
+	if (krph_val == 0) {
+		handler_errors++;
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"call to kretprobe entry handler failed\n");
+	}
+
+	krph_val = rand1;
+	return 0;
+}
+
+static struct kretprobe rp2 = {
+	.handler	= return_handler2,
+	.entry_handler  = entry_handler,
+	.kp.symbol_name = "kprobe_target2"
+};
+
+static int test_kretprobes(void)
+{
+	int ret;
+	struct kretprobe *rps[2] = {&rp, &rp2};
+
+	rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */
+	ret = register_kretprobes(rps, 2);
+	if (ret < 0) {
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"register_kretprobe returned %d\n", ret);
+		return ret;
+	}
+
+	krph_val = 0;
+	ret = target(rand1);
+	if (krph_val != rand1) {
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"kretprobe handler not called\n");
+		handler_errors++;
+	}
+
+	krph_val = 0;
+	ret = target2(rand1);
+	if (krph_val != rand1) {
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"kretprobe handler2 not called\n");
+		handler_errors++;
+	}
+	unregister_kretprobes(rps, 2);
+	return 0;
+}
 #endif /* CONFIG_KRETPROBES */
 
 int init_test_probes(void)
@@ -183,6 +356,7 @@ int init_test_probes(void)
 	int ret;
 
 	target = kprobe_target;
+	target2 = kprobe_target2;
 
 	do {
 		rand1 = random32();
@@ -194,16 +368,31 @@ int init_test_probes(void)
 	if (ret < 0)
 		errors++;
 
+	num_tests++;
+	ret = test_kprobes();
+	if (ret < 0)
+		errors++;
+
 	num_tests++;
 	ret = test_jprobe();
 	if (ret < 0)
 		errors++;
 
+	num_tests++;
+	ret = test_jprobes();
+	if (ret < 0)
+		errors++;
+
 #ifdef CONFIG_KRETPROBES
 	num_tests++;
 	ret = test_kretprobe();
 	if (ret < 0)
 		errors++;
+
+	num_tests++;
+	ret = test_kretprobes();
+	if (ret < 0)
+		errors++;
 #endif /* CONFIG_KRETPROBES */
 
 	if (errors)
-- 
cgit v1.2.3


From a06f6211ef9b1785922f9d0e8766d63ac4e66de1 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 6 Jan 2009 14:41:49 -0800
Subject: module: add within_module_core() and within_module_init()

This series of patches allows kprobes to probe module's __init and __exit
functions.  This means, you can probe driver initialization and
terminating.

Currently, kprobes can't probe __init function because these functions are
freed after module initialization.  And it also can't probe module __exit
functions because kprobe increments reference count of target module and
user can't unload it.  this means __exit functions never be called unless
removing probes from the module.

To solve both cases, this series of patches introduces GONE flag and sets
it when the target code is freed(for this purpose, kprobes hooks
MODULE_STATE_* events).  This also removes refcount incrementing for
allowing user to unload target module.  Users can check which probes are
GONE by debugfs interface.  For taking timing of freeing module's .init
text, these also include a patch which adds module's notifier of
MODULE_STATE_LIVE event.

This patch:

Add within_module_core() and within_module_init() for checking whether an
address is in the module .init.text section or .text section, and replace
within() local inline functions in kernel/module.c with them.

kprobes uses these functions to check where the kprobe is inserted.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/module.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 34b56cf0661..cc79c942c57 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2390,7 +2390,7 @@ static const char *get_ksymbol(struct module *mod,
 	unsigned long nextval;
 
 	/* At worse, next value is at end of module */
-	if (within(addr, mod->module_init, mod->init_size))
+	if (within_module_init(addr, mod))
 		nextval = (unsigned long)mod->module_init+mod->init_text_size;
 	else
 		nextval = (unsigned long)mod->module_core+mod->core_text_size;
@@ -2438,8 +2438,8 @@ const char *module_address_lookup(unsigned long addr,
 
 	preempt_disable();
 	list_for_each_entry_rcu(mod, &modules, list) {
-		if (within(addr, mod->module_init, mod->init_size)
-		    || within(addr, mod->module_core, mod->core_size)) {
+		if (within_module_init(addr, mod) ||
+		    within_module_core(addr, mod)) {
 			if (modname)
 				*modname = mod->name;
 			ret = get_ksymbol(mod, addr, size, offset);
@@ -2461,8 +2461,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
 
 	preempt_disable();
 	list_for_each_entry_rcu(mod, &modules, list) {
-		if (within(addr, mod->module_init, mod->init_size) ||
-		    within(addr, mod->module_core, mod->core_size)) {
+		if (within_module_init(addr, mod) ||
+		    within_module_core(addr, mod)) {
 			const char *sym;
 
 			sym = get_ksymbol(mod, addr, NULL, NULL);
@@ -2485,8 +2485,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
 
 	preempt_disable();
 	list_for_each_entry_rcu(mod, &modules, list) {
-		if (within(addr, mod->module_init, mod->init_size) ||
-		    within(addr, mod->module_core, mod->core_size)) {
+		if (within_module_init(addr, mod) ||
+		    within_module_core(addr, mod)) {
 			const char *sym;
 
 			sym = get_ksymbol(mod, addr, size, offset);
@@ -2705,7 +2705,7 @@ int is_module_address(unsigned long addr)
 	preempt_disable();
 
 	list_for_each_entry_rcu(mod, &modules, list) {
-		if (within(addr, mod->module_core, mod->core_size)) {
+		if (within_module_core(addr, mod)) {
 			preempt_enable();
 			return 1;
 		}
-- 
cgit v1.2.3


From 129415607845d4daea11ddcba706005c69dcb942 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 6 Jan 2009 14:41:50 -0800
Subject: kprobes: add kprobe_insn_mutex and cleanup arch_remove_kprobe()

Add kprobe_insn_mutex for protecting kprobe_insn_pages hlist, and remove
kprobe_mutex from architecture dependent code.

This allows us to call arch_remove_kprobe() (and free_insn_slot) while
holding kprobe_mutex.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kprobes.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3afd354c46f..29e87921437 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -69,7 +69,7 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 /* NOTE: change this value only with kprobe_mutex held */
 static bool kprobe_enabled;
 
-DEFINE_MUTEX(kprobe_mutex);		/* Protects kprobe_table */
+static DEFINE_MUTEX(kprobe_mutex);	/* Protects kprobe_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
 static struct {
 	spinlock_t lock ____cacheline_aligned_in_smp;
@@ -115,6 +115,7 @@ enum kprobe_slot_state {
 	SLOT_USED = 2,
 };
 
+static DEFINE_MUTEX(kprobe_insn_mutex);	/* Protects kprobe_insn_pages */
 static struct hlist_head kprobe_insn_pages;
 static int kprobe_garbage_slots;
 static int collect_garbage_slots(void);
@@ -144,10 +145,10 @@ loop_end:
 }
 
 /**
- * get_insn_slot() - Find a slot on an executable page for an instruction.
+ * __get_insn_slot() - Find a slot on an executable page for an instruction.
  * We allocate an executable page if there's no room on existing ones.
  */
-kprobe_opcode_t __kprobes *get_insn_slot(void)
+static kprobe_opcode_t __kprobes *__get_insn_slot(void)
 {
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos;
@@ -196,6 +197,15 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
 	return kip->insns;
 }
 
+kprobe_opcode_t __kprobes *get_insn_slot(void)
+{
+	kprobe_opcode_t *ret;
+	mutex_lock(&kprobe_insn_mutex);
+	ret = __get_insn_slot();
+	mutex_unlock(&kprobe_insn_mutex);
+	return ret;
+}
+
 /* Return 1 if all garbages are collected, otherwise 0. */
 static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
 {
@@ -226,9 +236,13 @@ static int __kprobes collect_garbage_slots(void)
 {
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos, *next;
+	int safety;
 
 	/* Ensure no-one is preepmted on the garbages */
-	if (check_safety() != 0)
+	mutex_unlock(&kprobe_insn_mutex);
+	safety = check_safety();
+	mutex_lock(&kprobe_insn_mutex);
+	if (safety != 0)
 		return -EAGAIN;
 
 	hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
@@ -251,6 +265,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos;
 
+	mutex_lock(&kprobe_insn_mutex);
 	hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
 		if (kip->insns <= slot &&
 		    slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
@@ -267,6 +282,8 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
 
 	if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE)
 		collect_garbage_slots();
+
+	mutex_unlock(&kprobe_insn_mutex);
 }
 #endif
 
-- 
cgit v1.2.3


From 017c39bdb1b3ac1da6db339474a77b528043c05a Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 6 Jan 2009 14:41:51 -0800
Subject: kprobes: add __kprobes to kprobe internal functions

Add __kprobes to kprobes internal functions for protecting from probing by
kprobes itself.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kprobes.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 29e87921437..a1e233a1958 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -410,7 +410,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
 		hlist_add_head(&ri->hlist, head);
 }
 
-void kretprobe_hash_lock(struct task_struct *tsk,
+void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
 			 struct hlist_head **head, unsigned long *flags)
 {
 	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
@@ -421,13 +421,15 @@ void kretprobe_hash_lock(struct task_struct *tsk,
 	spin_lock_irqsave(hlist_lock, *flags);
 }
 
-static void kretprobe_table_lock(unsigned long hash, unsigned long *flags)
+static void __kprobes kretprobe_table_lock(unsigned long hash,
+	unsigned long *flags)
 {
 	spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
 	spin_lock_irqsave(hlist_lock, *flags);
 }
 
-void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags)
+void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
+	unsigned long *flags)
 {
 	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
 	spinlock_t *hlist_lock;
@@ -436,7 +438,7 @@ void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags)
 	spin_unlock_irqrestore(hlist_lock, *flags);
 }
 
-void kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
+void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
 {
 	spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
 	spin_unlock_irqrestore(hlist_lock, *flags);
@@ -762,7 +764,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
 	}
 }
 
-static int __register_kprobes(struct kprobe **kps, int num,
+static int __kprobes __register_kprobes(struct kprobe **kps, int num,
 	unsigned long called_from)
 {
 	int i, ret = 0;
@@ -828,7 +830,7 @@ unsigned long __weak arch_deref_entry_point(void *entry)
 	return (unsigned long)entry;
 }
 
-static int __register_jprobes(struct jprobe **jps, int num,
+static int __kprobes __register_jprobes(struct jprobe **jps, int num,
 	unsigned long called_from)
 {
 	struct jprobe *jp;
@@ -990,7 +992,7 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
 	return ret;
 }
 
-static int __register_kretprobes(struct kretprobe **rps, int num,
+static int __kprobes __register_kretprobes(struct kretprobe **rps, int num,
 	unsigned long called_from)
 {
 	int ret = 0, i;
-- 
cgit v1.2.3


From e8386a0cb22f4a2d439384212c494ad0bda848fe Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 6 Jan 2009 14:41:52 -0800
Subject: kprobes: support probing module __exit function

Allows kprobes to probe __exit routine.  This adds flags member to struct
kprobe.  When module is freed(kprobes hooks module_notifier to get this
event), kprobes which probe the functions in that module are set to "Gone"
flag to the flags member.  These "Gone" probes are never be enabled.
Users can check the GONE flag through debugfs.

This also removes mod_refcounted, because we couldn't free a module if
kprobe incremented the refcount of that module.

[akpm@linux-foundation.org: document some locking]
[mhiramat@redhat.com: bugfix: pass aggr_kprobe to arch_remove_kprobe]
[mhiramat@redhat.com: bugfix: release old_p's insn_slot before error return]
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kprobes.c | 159 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 119 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index a1e233a1958..cb732a9aa55 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -327,7 +327,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	struct kprobe *kp;
 
 	list_for_each_entry_rcu(kp, &p->list, list) {
-		if (kp->pre_handler) {
+		if (kp->pre_handler && !kprobe_gone(kp)) {
 			set_kprobe_instance(kp);
 			if (kp->pre_handler(kp, regs))
 				return 1;
@@ -343,7 +343,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
 	struct kprobe *kp;
 
 	list_for_each_entry_rcu(kp, &p->list, list) {
-		if (kp->post_handler) {
+		if (kp->post_handler && !kprobe_gone(kp)) {
 			set_kprobe_instance(kp);
 			kp->post_handler(kp, regs, flags);
 			reset_kprobe_instance();
@@ -545,9 +545,10 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
 	ap->addr = p->addr;
 	ap->pre_handler = aggr_pre_handler;
 	ap->fault_handler = aggr_fault_handler;
-	if (p->post_handler)
+	/* We don't care the kprobe which has gone. */
+	if (p->post_handler && !kprobe_gone(p))
 		ap->post_handler = aggr_post_handler;
-	if (p->break_handler)
+	if (p->break_handler && !kprobe_gone(p))
 		ap->break_handler = aggr_break_handler;
 
 	INIT_LIST_HEAD(&ap->list);
@@ -566,17 +567,41 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
 	int ret = 0;
 	struct kprobe *ap;
 
+	if (kprobe_gone(old_p)) {
+		/*
+		 * Attempting to insert new probe at the same location that
+		 * had a probe in the module vaddr area which already
+		 * freed. So, the instruction slot has already been
+		 * released. We need a new slot for the new probe.
+		 */
+		ret = arch_prepare_kprobe(old_p);
+		if (ret)
+			return ret;
+	}
 	if (old_p->pre_handler == aggr_pre_handler) {
 		copy_kprobe(old_p, p);
 		ret = add_new_kprobe(old_p, p);
+		ap = old_p;
 	} else {
 		ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
-		if (!ap)
+		if (!ap) {
+			if (kprobe_gone(old_p))
+				arch_remove_kprobe(old_p);
 			return -ENOMEM;
+		}
 		add_aggr_kprobe(ap, old_p);
 		copy_kprobe(ap, p);
 		ret = add_new_kprobe(ap, p);
 	}
+	if (kprobe_gone(old_p)) {
+		/*
+		 * If the old_p has gone, its breakpoint has been disarmed.
+		 * We have to arm it again after preparing real kprobes.
+		 */
+		ap->flags &= ~KPROBE_FLAG_GONE;
+		if (kprobe_enabled)
+			arch_arm_kprobe(ap);
+	}
 	return ret;
 }
 
@@ -639,8 +664,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
 		return -EINVAL;
 	}
 
-	p->mod_refcounted = 0;
-
+	p->flags = 0;
 	/*
 	 * Check if are we probing a module.
 	 */
@@ -649,16 +673,14 @@ static int __kprobes __register_kprobe(struct kprobe *p,
 		struct module *calling_mod;
 		calling_mod = __module_text_address(called_from);
 		/*
-		 * We must allow modules to probe themself and in this case
-		 * avoid incrementing the module refcount, so as to allow
-		 * unloading of self probing modules.
+		 * We must hold a refcount of the probed module while updating
+		 * its code to prohibit unexpected unloading.
 		 */
 		if (calling_mod != probed_mod) {
 			if (unlikely(!try_module_get(probed_mod))) {
 				preempt_enable();
 				return -EINVAL;
 			}
-			p->mod_refcounted = 1;
 		} else
 			probed_mod = NULL;
 	}
@@ -687,8 +709,9 @@ static int __kprobes __register_kprobe(struct kprobe *p,
 out:
 	mutex_unlock(&kprobe_mutex);
 
-	if (ret && probed_mod)
+	if (probed_mod)
 		module_put(probed_mod);
+
 	return ret;
 }
 
@@ -716,16 +739,16 @@ valid_p:
 	     list_is_singular(&old_p->list))) {
 		/*
 		 * Only probe on the hash list. Disarm only if kprobes are
-		 * enabled - otherwise, the breakpoint would already have
-		 * been removed. We save on flushing icache.
+		 * enabled and not gone - otherwise, the breakpoint would
+		 * already have been removed. We save on flushing icache.
 		 */
-		if (kprobe_enabled)
+		if (kprobe_enabled && !kprobe_gone(old_p))
 			arch_disarm_kprobe(p);
 		hlist_del_rcu(&old_p->hlist);
 	} else {
-		if (p->break_handler)
+		if (p->break_handler && !kprobe_gone(p))
 			old_p->break_handler = NULL;
-		if (p->post_handler) {
+		if (p->post_handler && !kprobe_gone(p)) {
 			list_for_each_entry_rcu(list_p, &old_p->list, list) {
 				if ((list_p != p) && (list_p->post_handler))
 					goto noclean;
@@ -740,27 +763,16 @@ noclean:
 
 static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
 {
-	struct module *mod;
 	struct kprobe *old_p;
 
-	if (p->mod_refcounted) {
-		/*
-		 * Since we've already incremented refcount,
-		 * we don't need to disable preemption.
-		 */
-		mod = module_text_address((unsigned long)p->addr);
-		if (mod)
-			module_put(mod);
-	}
-
-	if (list_empty(&p->list) || list_is_singular(&p->list)) {
-		if (!list_empty(&p->list)) {
-			/* "p" is the last child of an aggr_kprobe */
-			old_p = list_entry(p->list.next, struct kprobe, list);
-			list_del(&p->list);
-			kfree(old_p);
-		}
+	if (list_empty(&p->list))
 		arch_remove_kprobe(p);
+	else if (list_is_singular(&p->list)) {
+		/* "p" is the last child of an aggr_kprobe */
+		old_p = list_entry(p->list.next, struct kprobe, list);
+		list_del(&p->list);
+		arch_remove_kprobe(old_p);
+		kfree(old_p);
 	}
 }
 
@@ -1074,6 +1086,67 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
 
 #endif /* CONFIG_KRETPROBES */
 
+/* Set the kprobe gone and remove its instruction buffer. */
+static void __kprobes kill_kprobe(struct kprobe *p)
+{
+	struct kprobe *kp;
+	p->flags |= KPROBE_FLAG_GONE;
+	if (p->pre_handler == aggr_pre_handler) {
+		/*
+		 * If this is an aggr_kprobe, we have to list all the
+		 * chained probes and mark them GONE.
+		 */
+		list_for_each_entry_rcu(kp, &p->list, list)
+			kp->flags |= KPROBE_FLAG_GONE;
+		p->post_handler = NULL;
+		p->break_handler = NULL;
+	}
+	/*
+	 * Here, we can remove insn_slot safely, because no thread calls
+	 * the original probed function (which will be freed soon) any more.
+	 */
+	arch_remove_kprobe(p);
+}
+
+/* Module notifier call back, checking kprobes on the module */
+static int __kprobes kprobes_module_callback(struct notifier_block *nb,
+					     unsigned long val, void *data)
+{
+	struct module *mod = data;
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct kprobe *p;
+	unsigned int i;
+
+	if (val != MODULE_STATE_GOING)
+		return NOTIFY_DONE;
+
+	/*
+	 * module .text section will be freed. We need to
+	 * disable kprobes which have been inserted in the section.
+	 */
+	mutex_lock(&kprobe_mutex);
+	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+		head = &kprobe_table[i];
+		hlist_for_each_entry_rcu(p, node, head, hlist)
+			if (within_module_core((unsigned long)p->addr, mod)) {
+				/*
+				 * The vaddr this probe is installed will soon
+				 * be vfreed buy not synced to disk. Hence,
+				 * disarming the breakpoint isn't needed.
+				 */
+				kill_kprobe(p);
+			}
+	}
+	mutex_unlock(&kprobe_mutex);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block kprobe_module_nb = {
+	.notifier_call = kprobes_module_callback,
+	.priority = 0
+};
+
 static int __init init_kprobes(void)
 {
 	int i, err = 0;
@@ -1130,6 +1203,9 @@ static int __init init_kprobes(void)
 	err = arch_init_kprobes();
 	if (!err)
 		err = register_die_notifier(&kprobe_exceptions_nb);
+	if (!err)
+		err = register_module_notifier(&kprobe_module_nb);
+
 	kprobes_initialized = (err == 0);
 
 	if (!err)
@@ -1150,10 +1226,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
 	else
 		kprobe_type = "k";
 	if (sym)
-		seq_printf(pi, "%p  %s  %s+0x%x  %s\n", p->addr, kprobe_type,
-			sym, offset, (modname ? modname : " "));
+		seq_printf(pi, "%p  %s  %s+0x%x  %s %s\n", p->addr, kprobe_type,
+			sym, offset, (modname ? modname : " "),
+			(kprobe_gone(p) ? "[GONE]" : ""));
 	else
-		seq_printf(pi, "%p  %s  %p\n", p->addr, kprobe_type, p->addr);
+		seq_printf(pi, "%p  %s  %p %s\n", p->addr, kprobe_type, p->addr,
+			(kprobe_gone(p) ? "[GONE]" : ""));
 }
 
 static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1234,7 +1312,8 @@ static void __kprobes enable_all_kprobes(void)
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		head = &kprobe_table[i];
 		hlist_for_each_entry_rcu(p, node, head, hlist)
-			arch_arm_kprobe(p);
+			if (!kprobe_gone(p))
+				arch_arm_kprobe(p);
 	}
 
 	kprobe_enabled = true;
@@ -1263,7 +1342,7 @@ static void __kprobes disable_all_kprobes(void)
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		head = &kprobe_table[i];
 		hlist_for_each_entry_rcu(p, node, head, hlist) {
-			if (!arch_trampoline_kprobe(p))
+			if (!arch_trampoline_kprobe(p) && !kprobe_gone(p))
 				arch_disarm_kprobe(p);
 		}
 	}
-- 
cgit v1.2.3


From 49ad2fd76c97133fb396edc24ded7fe26093a578 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 6 Jan 2009 14:41:53 -0800
Subject: kprobes: remove called_from argument

Remove called_from argument from kprobes which had been used for
preventing self-refering of kernel module.  However, since we don't keep
module's refcount after registering kprobe any more, there is no reason to
check that.

This patch also simplifies registering/unregistering functions because we
don't need to use __builtin_return_address(0) which was passed to
called_from.

[ananth@in.ibm.com: build fix]
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kprobes.c | 72 +++++++++++---------------------------------------------
 1 file changed, 14 insertions(+), 58 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index cb732a9aa55..ddefb9fae0c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -644,8 +644,7 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
 	return (kprobe_opcode_t *)(((char *)addr) + p->offset);
 }
 
-static int __kprobes __register_kprobe(struct kprobe *p,
-	unsigned long called_from)
+int __kprobes register_kprobe(struct kprobe *p)
 {
 	int ret = 0;
 	struct kprobe *old_p;
@@ -670,19 +669,14 @@ static int __kprobes __register_kprobe(struct kprobe *p,
 	 */
 	probed_mod = __module_text_address((unsigned long) p->addr);
 	if (probed_mod) {
-		struct module *calling_mod;
-		calling_mod = __module_text_address(called_from);
 		/*
 		 * We must hold a refcount of the probed module while updating
 		 * its code to prohibit unexpected unloading.
 		 */
-		if (calling_mod != probed_mod) {
-			if (unlikely(!try_module_get(probed_mod))) {
-				preempt_enable();
-				return -EINVAL;
-			}
-		} else
-			probed_mod = NULL;
+		if (unlikely(!try_module_get(probed_mod))) {
+			preempt_enable();
+			return -EINVAL;
+		}
 	}
 	preempt_enable();
 
@@ -776,15 +770,14 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
 	}
 }
 
-static int __kprobes __register_kprobes(struct kprobe **kps, int num,
-	unsigned long called_from)
+int __kprobes register_kprobes(struct kprobe **kps, int num)
 {
 	int i, ret = 0;
 
 	if (num <= 0)
 		return -EINVAL;
 	for (i = 0; i < num; i++) {
-		ret = __register_kprobe(kps[i], called_from);
+		ret = register_kprobe(kps[i]);
 		if (ret < 0) {
 			if (i > 0)
 				unregister_kprobes(kps, i);
@@ -794,26 +787,11 @@ static int __kprobes __register_kprobes(struct kprobe **kps, int num,
 	return ret;
 }
 
-/*
- * Registration and unregistration functions for kprobe.
- */
-int __kprobes register_kprobe(struct kprobe *p)
-{
-	return __register_kprobes(&p, 1,
-				  (unsigned long)__builtin_return_address(0));
-}
-
 void __kprobes unregister_kprobe(struct kprobe *p)
 {
 	unregister_kprobes(&p, 1);
 }
 
-int __kprobes register_kprobes(struct kprobe **kps, int num)
-{
-	return __register_kprobes(kps, num,
-				  (unsigned long)__builtin_return_address(0));
-}
-
 void __kprobes unregister_kprobes(struct kprobe **kps, int num)
 {
 	int i;
@@ -842,8 +820,7 @@ unsigned long __weak arch_deref_entry_point(void *entry)
 	return (unsigned long)entry;
 }
 
-static int __kprobes __register_jprobes(struct jprobe **jps, int num,
-	unsigned long called_from)
+int __kprobes register_jprobes(struct jprobe **jps, int num)
 {
 	struct jprobe *jp;
 	int ret = 0, i;
@@ -861,7 +838,7 @@ static int __kprobes __register_jprobes(struct jprobe **jps, int num,
 			/* Todo: Verify probepoint is a function entry point */
 			jp->kp.pre_handler = setjmp_pre_handler;
 			jp->kp.break_handler = longjmp_break_handler;
-			ret = __register_kprobe(&jp->kp, called_from);
+			ret = register_kprobe(&jp->kp);
 		}
 		if (ret < 0) {
 			if (i > 0)
@@ -874,8 +851,7 @@ static int __kprobes __register_jprobes(struct jprobe **jps, int num,
 
 int __kprobes register_jprobe(struct jprobe *jp)
 {
-	return __register_jprobes(&jp, 1,
-		(unsigned long)__builtin_return_address(0));
+	return register_jprobes(&jp, 1);
 }
 
 void __kprobes unregister_jprobe(struct jprobe *jp)
@@ -883,12 +859,6 @@ void __kprobes unregister_jprobe(struct jprobe *jp)
 	unregister_jprobes(&jp, 1);
 }
 
-int __kprobes register_jprobes(struct jprobe **jps, int num)
-{
-	return __register_jprobes(jps, num,
-		(unsigned long)__builtin_return_address(0));
-}
-
 void __kprobes unregister_jprobes(struct jprobe **jps, int num)
 {
 	int i;
@@ -951,8 +921,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
 	return 0;
 }
 
-static int __kprobes __register_kretprobe(struct kretprobe *rp,
-					  unsigned long called_from)
+int __kprobes register_kretprobe(struct kretprobe *rp)
 {
 	int ret = 0;
 	struct kretprobe_instance *inst;
@@ -998,21 +967,20 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
 
 	rp->nmissed = 0;
 	/* Establish function entry probe point */
-	ret = __register_kprobe(&rp->kp, called_from);
+	ret = register_kprobe(&rp->kp);
 	if (ret != 0)
 		free_rp_inst(rp);
 	return ret;
 }
 
-static int __kprobes __register_kretprobes(struct kretprobe **rps, int num,
-	unsigned long called_from)
+int __kprobes register_kretprobes(struct kretprobe **rps, int num)
 {
 	int ret = 0, i;
 
 	if (num <= 0)
 		return -EINVAL;
 	for (i = 0; i < num; i++) {
-		ret = __register_kretprobe(rps[i], called_from);
+		ret = register_kretprobe(rps[i]);
 		if (ret < 0) {
 			if (i > 0)
 				unregister_kretprobes(rps, i);
@@ -1022,23 +990,11 @@ static int __kprobes __register_kretprobes(struct kretprobe **rps, int num,
 	return ret;
 }
 
-int __kprobes register_kretprobe(struct kretprobe *rp)
-{
-	return __register_kretprobes(&rp, 1,
-			(unsigned long)__builtin_return_address(0));
-}
-
 void __kprobes unregister_kretprobe(struct kretprobe *rp)
 {
 	unregister_kretprobes(&rp, 1);
 }
 
-int __kprobes register_kretprobes(struct kretprobe **rps, int num)
-{
-	return __register_kretprobes(rps, num,
-			(unsigned long)__builtin_return_address(0));
-}
-
 void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
 {
 	int i;
-- 
cgit v1.2.3


From 0deddf436a37c18ceb26c6e3b632fb9b5f58a0c1 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 6 Jan 2009 14:41:54 -0800
Subject: module: add MODULE_STATE_LIVE notify

Add a module notifier call which notifies that the state of a module
changes from MODULE_STATE_COMING to MODULE_STATE_LIVE.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/module.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index cc79c942c57..496dcb57b60 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2352,6 +2352,8 @@ sys_init_module(void __user *umod,
 	/* Now it's a first class citizen!  Wake up anyone waiting for it. */
 	mod->state = MODULE_STATE_LIVE;
 	wake_up(&module_wq);
+	blocking_notifier_call_chain(&module_notify_list,
+				     MODULE_STATE_LIVE, mod);
 
 	mutex_lock(&module_mutex);
 	/* Drop initial reference. */
-- 
cgit v1.2.3


From f24659d96f4e056125f14498285203d1427cb18e Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 6 Jan 2009 14:41:55 -0800
Subject: kprobes: support probing module __init function

Allow kprobes to probe module __init routines.  When __init functions are
freed, kprobes which probe those functions are set to "Gone" flag.  These
"Gone" probes are disarmed from the code and never be enabled.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kprobes.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ddefb9fae0c..1b9cbdc0127 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -677,6 +677,16 @@ int __kprobes register_kprobe(struct kprobe *p)
 			preempt_enable();
 			return -EINVAL;
 		}
+		/*
+		 * If the module freed .init.text, we couldn't insert
+		 * kprobes in there.
+		 */
+		if (within_module_init((unsigned long)p->addr, probed_mod) &&
+		    probed_mod->state != MODULE_STATE_COMING) {
+			module_put(probed_mod);
+			preempt_enable();
+			return -EINVAL;
+		}
 	}
 	preempt_enable();
 
@@ -1073,19 +1083,24 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb,
 	struct hlist_node *node;
 	struct kprobe *p;
 	unsigned int i;
+	int checkcore = (val == MODULE_STATE_GOING);
 
-	if (val != MODULE_STATE_GOING)
+	if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
 		return NOTIFY_DONE;
 
 	/*
-	 * module .text section will be freed. We need to
-	 * disable kprobes which have been inserted in the section.
+	 * When MODULE_STATE_GOING was notified, both of module .text and
+	 * .init.text sections would be freed. When MODULE_STATE_LIVE was
+	 * notified, only .init.text section would be freed. We need to
+	 * disable kprobes which have been inserted in the sections.
 	 */
 	mutex_lock(&kprobe_mutex);
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		head = &kprobe_table[i];
 		hlist_for_each_entry_rcu(p, node, head, hlist)
-			if (within_module_core((unsigned long)p->addr, mod)) {
+			if (within_module_init((unsigned long)p->addr, mod) ||
+			    (checkcore &&
+			     within_module_core((unsigned long)p->addr, mod))) {
 				/*
 				 * The vaddr this probe is installed will soon
 				 * be vfreed buy not synced to disk. Hence,
-- 
cgit v1.2.3


From bd4207c9016749f0a212faf7f7f49e5317d96d9b Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Tue, 6 Jan 2009 14:42:39 -0800
Subject: kmod: fix varargs kernel-doc

Fix varargs kernel-doc format in kmod.c:
Use @... instead of @varargs.

Warning(kernel/kmod.c:67): Excess function parameter or struct member 'varargs' description in 'request_module'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kmod.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index b46dbb90866..a27a5f64443 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -51,8 +51,8 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
 
 /**
  * request_module - try to load a kernel module
- * @fmt:     printf style format string for the name of the module
- * @varargs: arguements as specified in the format string
+ * @fmt: printf style format string for the name of the module
+ * @...: arguments as specified in the format string
  *
  * Load a module using the user mode module loader. The function returns
  * zero on success or a negative errno code on failure. Note that a
-- 
cgit v1.2.3


From 09bca05c90c639f57aae057e0c28f287e61f5a07 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Tue, 6 Jan 2009 14:42:45 -0800
Subject: SEND_SIG_NOINFO: masquerade si_pid when crossing pid-ns boundary

For SEND_SIG_NOINFO, si_pid is currently set to the pid of sender
in sender's active pid namespace. But if the receiver is in a
Eg: when parent sends the 'pdeath_signal' to a child that is in
a descendant pid namespace, we should set si_pid 0.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Acked-By: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 8e95855ff3c..31db63b3f88 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -858,7 +858,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			q->info.si_signo = sig;
 			q->info.si_errno = 0;
 			q->info.si_code = SI_USER;
-			q->info.si_pid = task_pid_vnr(current);
+			q->info.si_pid = task_pid_nr_ns(current,
+							task_active_pid_ns(t));
 			q->info.si_uid = current_uid();
 			break;
 		case (unsigned long) SEND_SIG_PRIV:
-- 
cgit v1.2.3


From 9cd4fd10437dda6b520cb1410b28f36967a34de8 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Tue, 6 Jan 2009 14:42:46 -0800
Subject: SEND_SIG_NOINFO: set si_pid to tgid instead of pid

POSIX requires the si_pid to be the process id of the sender, so ->si_pid
should really be set to 'tgid'.  This change does have following changes
in behavior:

	- When sending pdeath_signal on re-parent to a sub-thread, ->si_pid
	  cannot be used to identify the thread that did the re-parent since
	  it will now show the tgid instead of thread id.

	- A multi-threaded application that expects to find the specific
	  thread that encountered a SIGPIPE using the ->si_pid will now
	  break.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Acked-By: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 31db63b3f88..3152ac3b62e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -858,7 +858,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			q->info.si_signo = sig;
 			q->info.si_errno = 0;
 			q->info.si_code = SI_USER;
-			q->info.si_pid = task_pid_nr_ns(current,
+			q->info.si_pid = task_tgid_nr_ns(current,
 							task_active_pid_ns(t));
 			q->info.si_uid = current_uid();
 			break;
-- 
cgit v1.2.3


From 4cb0e11b15d2badad455fcd538af0cccf05dc012 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Tue, 6 Jan 2009 14:42:47 -0800
Subject: coredump_filter: permit changing of the default filter

Introduce a new kernel parameter `coredump_filter'.  Setting a value to
this parameter causes the default bitmask of coredump_filter to be
changed.

It is useful for users to change coredump_filter settings for the whole
system at boot time.  Without this parameter, users have to change
coredump_filter settings for each /proc/<pid>/ in an initializing script.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 23b91211667..7b8f2a78be3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -400,6 +400,18 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
 #define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
 
+static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
+
+static int __init coredump_filter_setup(char *s)
+{
+	default_dump_filter =
+		(simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
+		MMF_DUMP_FILTER_MASK;
+	return 1;
+}
+
+__setup("coredump_filter=", coredump_filter_setup);
+
 #include <linux/init_task.h>
 
 static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
@@ -408,8 +420,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 	atomic_set(&mm->mm_count, 1);
 	init_rwsem(&mm->mmap_sem);
 	INIT_LIST_HEAD(&mm->mmlist);
-	mm->flags = (current->mm) ? current->mm->flags
-				  : MMF_DUMP_FILTER_DEFAULT;
+	mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
 	mm->core_state = NULL;
 	mm->nr_ptes = 0;
 	set_mm_counter(mm, file_rss, 0);
-- 
cgit v1.2.3


From 0bef3c2dc7d0c8238330785c8f4504761b0e370b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 6 Jan 2009 14:43:08 -0800
Subject: dma_alloc_from_coherent(): fix fallback to generic memory

If bitmap_find_free_region() fails and DMA_MEMORY_EXCLUSIVE is not set,
the function will fail to write anything to *ret and will return 1.             This will cause dma_alloc_coherent() to return an uninitialised value,
crashing the kernel, perhaps via DMA to a random address.

Fix that by changing it to return zero in this case, so the caller will
proceed to allocate the memory from the generic memory allocator.

Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Dmitry Baryshkov <dbaryshkov@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/dma-coherent.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index f013a0c2e11..4bdcea822b4 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -116,11 +116,25 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
 		int page = bitmap_find_free_region(mem->bitmap, mem->size,
 						     order);
 		if (page >= 0) {
+			/*
+			 * Memory was found in the per-device arena.
+			 */
 			*dma_handle = mem->device_base + (page << PAGE_SHIFT);
 			*ret = mem->virt_base + (page << PAGE_SHIFT);
 			memset(*ret, 0, size);
-		} else if (mem->flags & DMA_MEMORY_EXCLUSIVE)
+		} else if (mem->flags & DMA_MEMORY_EXCLUSIVE) {
+			/*
+			 * The per-device arena is exhausted and we are not
+			 * permitted to fall back to generic memory.
+			 */
 			*ret = NULL;
+		} else {
+			/*
+			 * The per-device arena is exhausted and we are
+			 * permitted to fall back to generic memory.
+			 */
+			 return 0;
+		}
 	}
 	return (mem != NULL);
 }
-- 
cgit v1.2.3


From eccd83e116e7f414a1da3aae3745384b7b171883 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 6 Jan 2009 14:43:09 -0800
Subject: dma_alloc_coherent: clean it up

This thing was rather stupidly coded.  Rework it all prior to making
changes.

Also, rename local variable `page': kernel readers expect something called
`page' to have type `struct page *'.

Cc: Guennadi Liakhovetski <lg@denx.de>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Dmitry Baryshkov <dbaryshkov@gmail.com>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/dma-coherent.c | 54 +++++++++++++++++++++++++++------------------------
 1 file changed, 29 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index 4bdcea822b4..8056d081609 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -109,34 +109,38 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
 int dma_alloc_from_coherent(struct device *dev, ssize_t size,
 				       dma_addr_t *dma_handle, void **ret)
 {
-	struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+	struct dma_coherent_mem *mem;
 	int order = get_order(size);
+	int pageno;
 
-	if (mem) {
-		int page = bitmap_find_free_region(mem->bitmap, mem->size,
-						     order);
-		if (page >= 0) {
-			/*
-			 * Memory was found in the per-device arena.
-			 */
-			*dma_handle = mem->device_base + (page << PAGE_SHIFT);
-			*ret = mem->virt_base + (page << PAGE_SHIFT);
-			memset(*ret, 0, size);
-		} else if (mem->flags & DMA_MEMORY_EXCLUSIVE) {
-			/*
-			 * The per-device arena is exhausted and we are not
-			 * permitted to fall back to generic memory.
-			 */
-			*ret = NULL;
-		} else {
-			/*
-			 * The per-device arena is exhausted and we are
-			 * permitted to fall back to generic memory.
-			 */
-			 return 0;
-		}
+	if (!dev)
+		return 0;
+	mem = dev->dma_mem;
+	if (!mem)
+		return 0;
+
+	pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
+	if (pageno >= 0) {
+		/*
+		 * Memory was found in the per-device arena.
+		 */
+		*dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
+		*ret = mem->virt_base + (pageno << PAGE_SHIFT);
+		memset(*ret, 0, size);
+	} else if (mem->flags & DMA_MEMORY_EXCLUSIVE) {
+		/*
+		 * The per-device arena is exhausted and we are not
+		 * permitted to fall back to generic memory.
+		 */
+		*ret = NULL;
+	} else {
+		/*
+		 * The per-device arena is exhausted and we are
+		 * permitted to fall back to generic memory.
+		 */
+		 return 0;
 	}
-	return (mem != NULL);
+	return 1;
 }
 EXPORT_SYMBOL(dma_alloc_from_coherent);
 
-- 
cgit v1.2.3


From 58c6d3dfe436eb8cfb451981d8fdc9044eaf42da Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 6 Jan 2009 14:43:10 -0800
Subject: dma-coherent: catch oversized requests to dma_alloc_from_coherent()

Prevent passing an order to bitmap_find_free_region() that is larger than
the actual bitmap can represent.

These requests can come from device drivers that have no idea how big the
dma region is and need to rely on dma_alloc_from_coherent() to sort it out
for them.

Reported-by: Guennadi Liakhovetski <lg@denx.de>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Dmitry Baryshkov <dbaryshkov@gmail.com>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/dma-coherent.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index 8056d081609..038707404b7 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -118,6 +118,8 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
 	mem = dev->dma_mem;
 	if (!mem)
 		return 0;
+	if (unlikely(size > mem->size))
+ 		return 0;
 
 	pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
 	if (pageno >= 0) {
-- 
cgit v1.2.3


From da8d5089da6dfd54e5fd05d0c291a63c2bcf6885 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 7 Jan 2009 15:28:57 +0100
Subject: sched: fix possible recursive rq->lock

Vaidyanathan Srinivasan reported:

 > =============================================
 > [ INFO: possible recursive locking detected ]
 > 2.6.28-autotest-tip-sv #1
 > ---------------------------------------------
 > klogd/5062 is trying to acquire lock:
 >  (&rq->lock){++..}, at: [<ffffffff8022aca2>] task_rq_lock+0x45/0x7e
 >
 > but task is already holding lock:
 >  (&rq->lock){++..}, at: [<ffffffff805f7354>] schedule+0x158/0xa31

With sched_mc at 2. (it is default-off)

Strictly speaking we'll not deadlock, because ttwu will not be able to
place the migration task on our rq, but since the code can deal with
both rqs getting unlocked, this seems the easiest way out.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 2e3545f57e7..deb5ac8c12f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3728,8 +3728,13 @@ redo:
 		}
 
 		double_unlock_balance(this_rq, busiest);
+		/*
+		 * Should not call ttwu while holding a rq->lock
+		 */
+		spin_unlock(&this_rq->lock);
 		if (active_balance)
 			wake_up_process(busiest->migration_thread);
+		spin_lock(&this_rq->lock);
 
 	} else
 		sd->nr_balance_failed = 0;
-- 
cgit v1.2.3


From 22a9d645677feefd402befd02edd59b122289ef1 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Wed, 7 Jan 2009 08:45:46 -0800
Subject: async: Asynchronous function calls to speed up kernel boot

Right now, most of the kernel boot is strictly synchronous, such that
various hardware delays are done sequentially.

In order to make the kernel boot faster, this patch introduces
infrastructure to allow doing some of the initialization steps
asynchronously, which will hide significant portions of the hardware delays
in practice.

In order to not change device order and other similar observables, this
patch does NOT do full parallel initialization.

Rather, it operates more in the way an out of order CPU does; the work may
be done out of order and asynchronous, but the observable effects
(instruction retiring for the CPU) are still done in the original sequence.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 kernel/Makefile        |   3 +-
 kernel/async.c         | 321 +++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/irq/autoprobe.c |   5 +
 kernel/module.c        |   2 +
 4 files changed, 330 insertions(+), 1 deletion(-)
 create mode 100644 kernel/async.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index e1c5bf3365c..2921d90ce32 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,8 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o
+	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
+	    async.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
diff --git a/kernel/async.c b/kernel/async.c
new file mode 100644
index 00000000000..afaa8a653d5
--- /dev/null
+++ b/kernel/async.c
@@ -0,0 +1,321 @@
+/*
+ * async.c: Asynchronous function calls for boot performance
+ *
+ * (C) Copyright 2009 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+
+/*
+
+Goals and Theory of Operation
+
+The primary goal of this feature is to reduce the kernel boot time,
+by doing various independent hardware delays and discovery operations
+decoupled and not strictly serialized.
+
+More specifically, the asynchronous function call concept allows
+certain operations (primarily during system boot) to happen
+asynchronously, out of order, while these operations still
+have their externally visible parts happen sequentially and in-order.
+(not unlike how out-of-order CPUs retire their instructions in order)
+
+Key to the asynchronous function call implementation is the concept of
+a "sequence cookie" (which, although it has an abstracted type, can be
+thought of as a monotonically incrementing number).
+
+The async core will assign each scheduled event such a sequence cookie and
+pass this to the called functions.
+
+The asynchronously called function should before doing a globally visible
+operation, such as registering device numbers, call the
+async_synchronize_cookie() function and pass in its own cookie. The
+async_synchronize_cookie() function will make sure that all asynchronous
+operations that were scheduled prior to the operation corresponding with the
+cookie have completed.
+
+Subsystem/driver initialization code that scheduled asynchronous probe
+functions, but which shares global resources with other drivers/subsystems
+that do not use the asynchronous call feature, need to do a full
+synchronization with the async_synchronize_full() function, before returning
+from their init function. This is to maintain strict ordering between the
+asynchronous and synchronous parts of the kernel.
+
+*/
+
+#include <linux/async.h>
+#include <linux/module.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/kthread.h>
+#include <asm/atomic.h>
+
+static async_cookie_t next_cookie = 1;
+
+#define MAX_THREADS	256
+#define MAX_WORK	32768
+
+static LIST_HEAD(async_pending);
+static LIST_HEAD(async_running);
+static DEFINE_SPINLOCK(async_lock);
+
+struct async_entry {
+	struct list_head list;
+	async_cookie_t   cookie;
+	async_func_ptr	 *func;
+	void             *data;
+	struct list_head *running;
+};
+
+static DECLARE_WAIT_QUEUE_HEAD(async_done);
+static DECLARE_WAIT_QUEUE_HEAD(async_new);
+
+static atomic_t entry_count;
+static atomic_t thread_count;
+
+extern int initcall_debug;
+
+
+/*
+ * MUST be called with the lock held!
+ */
+static async_cookie_t  __lowest_in_progress(struct list_head *running)
+{
+	struct async_entry *entry;
+	if (!list_empty(&async_pending)) {
+		entry = list_first_entry(&async_pending,
+			struct async_entry, list);
+		return entry->cookie;
+	} else if (!list_empty(running)) {
+		entry = list_first_entry(running,
+			struct async_entry, list);
+		return entry->cookie;
+	} else {
+		/* nothing in progress... next_cookie is "infinity" */
+		return next_cookie;
+	}
+
+}
+/*
+ * pick the first pending entry and run it
+ */
+static void run_one_entry(void)
+{
+	unsigned long flags;
+	struct async_entry *entry;
+	ktime_t calltime, delta, rettime;
+
+	/* 1) pick one task from the pending queue */
+
+	spin_lock_irqsave(&async_lock, flags);
+	if (list_empty(&async_pending))
+		goto out;
+	entry = list_first_entry(&async_pending, struct async_entry, list);
+
+	/* 2) move it to the running queue */
+	list_del(&entry->list);
+	list_add_tail(&entry->list, &async_running);
+	spin_unlock_irqrestore(&async_lock, flags);
+
+	/* 3) run it (and print duration)*/
+	if (initcall_debug) {
+		printk("calling  %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current));
+		calltime = ktime_get();
+	}
+	entry->func(entry->data, entry->cookie);
+	if (initcall_debug) {
+		rettime = ktime_get();
+		delta = ktime_sub(rettime, calltime);
+		printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie,
+			entry->func, ktime_to_ns(delta) >> 10);
+	}
+
+	/* 4) remove it from the running queue */
+	spin_lock_irqsave(&async_lock, flags);
+	list_del(&entry->list);
+
+	/* 5) free the entry  */
+	kfree(entry);
+	atomic_dec(&entry_count);
+
+	spin_unlock_irqrestore(&async_lock, flags);
+
+	/* 6) wake up any waiters. */
+	wake_up(&async_done);
+	return;
+
+out:
+	spin_unlock_irqrestore(&async_lock, flags);
+}
+
+
+static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
+{
+	struct async_entry *entry;
+	unsigned long flags;
+	async_cookie_t newcookie;
+	
+
+	/* allow irq-off callers */
+	entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
+
+	/*
+	 * If we're out of memory or if there's too much work
+	 * pending already, we execute synchronously.
+	 */
+	if (!entry || atomic_read(&entry_count) > MAX_WORK) {
+		kfree(entry);
+		spin_lock_irqsave(&async_lock, flags);
+		newcookie = next_cookie++;
+		spin_unlock_irqrestore(&async_lock, flags);
+
+		/* low on memory.. run synchronously */
+		ptr(data, newcookie);
+		return newcookie;
+	}
+	entry->func = ptr;
+	entry->data = data;
+	entry->running = running;
+
+	spin_lock_irqsave(&async_lock, flags);
+	newcookie = entry->cookie = next_cookie++;
+	list_add_tail(&entry->list, &async_pending);
+	atomic_inc(&entry_count);
+	spin_unlock_irqrestore(&async_lock, flags);
+	wake_up(&async_new);
+	return newcookie;
+}
+
+async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
+{
+	return __async_schedule(ptr, data, &async_pending);
+}
+EXPORT_SYMBOL_GPL(async_schedule);
+
+async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running)
+{
+	return __async_schedule(ptr, data, running);
+}
+EXPORT_SYMBOL_GPL(async_schedule_special);
+
+void async_synchronize_full(void)
+{
+	async_synchronize_cookie(next_cookie);
+}
+EXPORT_SYMBOL_GPL(async_synchronize_full);
+
+void async_synchronize_full_special(struct list_head *list)
+{
+	async_synchronize_cookie_special(next_cookie, list);
+}
+EXPORT_SYMBOL_GPL(async_synchronize_full_special);
+
+void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running)
+{
+	ktime_t starttime, delta, endtime;
+
+	if (initcall_debug) {
+		printk("async_waiting @ %i\n", task_pid_nr(current));
+		starttime = ktime_get();
+	}
+
+	wait_event(async_done, __lowest_in_progress(running) >= cookie);
+
+	if (initcall_debug) {
+		endtime = ktime_get();
+		delta = ktime_sub(endtime, starttime);
+
+		printk("async_continuing @ %i after %lli usec\n",
+			task_pid_nr(current), ktime_to_ns(delta) >> 10);
+	}
+}
+EXPORT_SYMBOL_GPL(async_synchronize_cookie_special);
+
+void async_synchronize_cookie(async_cookie_t cookie)
+{
+	async_synchronize_cookie_special(cookie, &async_running);
+}
+EXPORT_SYMBOL_GPL(async_synchronize_cookie);
+
+
+static int async_thread(void *unused)
+{
+	DECLARE_WAITQUEUE(wq, current);
+	add_wait_queue(&async_new, &wq);
+
+	while (!kthread_should_stop()) {
+		int ret = HZ;
+		set_current_state(TASK_INTERRUPTIBLE);
+		/*
+		 * check the list head without lock.. false positives
+		 * are dealt with inside run_one_entry() while holding
+		 * the lock.
+		 */
+		rmb();
+		if (!list_empty(&async_pending))
+			run_one_entry();
+		else
+			ret = schedule_timeout(HZ);
+
+		if (ret == 0) {
+			/*
+			 * we timed out, this means we as thread are redundant.
+			 * we sign off and die, but we to avoid any races there
+			 * is a last-straw check to see if work snuck in.
+			 */
+			atomic_dec(&thread_count);
+			wmb(); /* manager must see our departure first */
+			if (list_empty(&async_pending))
+				break;
+			/*
+			 * woops work came in between us timing out and us
+			 * signing off; we need to stay alive and keep working.
+			 */
+			atomic_inc(&thread_count);
+		}
+	}
+	remove_wait_queue(&async_new, &wq);
+
+	return 0;
+}
+
+static int async_manager_thread(void *unused)
+{
+	DECLARE_WAITQUEUE(wq, current);
+	add_wait_queue(&async_new, &wq);
+
+	while (!kthread_should_stop()) {
+		int tc, ec;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		tc = atomic_read(&thread_count);
+		rmb();
+		ec = atomic_read(&entry_count);
+
+		while (tc < ec && tc < MAX_THREADS) {
+			kthread_run(async_thread, NULL, "async/%i", tc);
+			atomic_inc(&thread_count);
+			tc++;
+		}
+
+		schedule();
+	}
+	remove_wait_queue(&async_new, &wq);
+
+	return 0;
+}
+
+static int __init async_init(void)
+{
+	kthread_run(async_manager_thread, NULL, "async/mgr");
+	return 0;
+}
+
+core_initcall(async_init);
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index cc0f7321b8c..1de9700f416 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -10,6 +10,7 @@
 #include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
+#include <linux/async.h>
 
 #include "internals.h"
 
@@ -34,6 +35,10 @@ unsigned long probe_irq_on(void)
 	unsigned int status;
 	int i;
 
+	/*
+	 * quiesce the kernel, or at least the asynchronous portion
+	 */
+	async_synchronize_full();
 	mutex_lock(&probing_active);
 	/*
 	 * something may have generated an irq long ago and we want to
diff --git a/kernel/module.c b/kernel/module.c
index 496dcb57b60..c9332c90d5a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -50,6 +50,7 @@
 #include <asm/sections.h>
 #include <linux/tracepoint.h>
 #include <linux/ftrace.h>
+#include <linux/async.h>
 
 #if 0
 #define DEBUGP printk
@@ -816,6 +817,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
 		mod->exit();
 	blocking_notifier_call_chain(&module_notify_list,
 				     MODULE_STATE_GOING, mod);
+	async_synchronize_full();
 	mutex_lock(&module_mutex);
 	/* Store the name of the last unloaded module for diagnostic purposes */
 	strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
-- 
cgit v1.2.3


From ad160d23198193135cb2bcc75222e0816b5838c0 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Wed, 7 Jan 2009 09:28:53 -0800
Subject: async: don't do the initcall stuff post boot

while tracking the asynchronous calls during boot using the initcall_debug
convention is useful, doing it once the kernel is done is actually
bad now that we use asynchronous operations post boot as well...

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 kernel/async.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/async.c b/kernel/async.c
index afaa8a653d5..97373380c9e 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -124,12 +124,12 @@ static void run_one_entry(void)
 	spin_unlock_irqrestore(&async_lock, flags);
 
 	/* 3) run it (and print duration)*/
-	if (initcall_debug) {
+	if (initcall_debug && system_state == SYSTEM_BOOTING) {
 		printk("calling  %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current));
 		calltime = ktime_get();
 	}
 	entry->func(entry->data, entry->cookie);
-	if (initcall_debug) {
+	if (initcall_debug && system_state == SYSTEM_BOOTING) {
 		rettime = ktime_get();
 		delta = ktime_sub(rettime, calltime);
 		printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie,
@@ -220,14 +220,14 @@ void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *r
 {
 	ktime_t starttime, delta, endtime;
 
-	if (initcall_debug) {
+	if (initcall_debug && system_state == SYSTEM_BOOTING) {
 		printk("async_waiting @ %i\n", task_pid_nr(current));
 		starttime = ktime_get();
 	}
 
 	wait_event(async_done, __lowest_in_progress(running) >= cookie);
 
-	if (initcall_debug) {
+	if (initcall_debug && system_state == SYSTEM_BOOTING) {
 		endtime = ktime_get();
 		delta = ktime_sub(endtime, starttime);
 
-- 
cgit v1.2.3


From e8de1481fd7126ee9e93d6889da6f00c05e1e019 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Wed, 22 Oct 2008 19:55:31 -0700
Subject: resource: allow MMIO exclusivity for device drivers

Device drivers that use pci_request_regions() (and similar APIs) have a
reasonable expectation that they are the only ones accessing their device.
As part of the e1000e hunt, we were afraid that some userland (X or some
bootsplash stuff) was mapping the MMIO region that the driver thought it
had exclusively via /dev/mem or via various sysfs resource mappings.

This patch adds the option for device drivers to cause their reserved
regions to the "banned from /dev/mem use" list, so now both kernel memory
and device-exclusive MMIO regions are banned.
NOTE: This is only active when CONFIG_STRICT_DEVMEM is set.

In addition to the config option, a kernel parameter iomem=relaxed is
provided for the cases where developers want to diagnose, in the field,
drivers issues from userspace.

Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 kernel/resource.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 58 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index e633106b12f..ca6a1536b20 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -623,7 +623,7 @@ resource_size_t resource_alignment(struct resource *res)
  */
 struct resource * __request_region(struct resource *parent,
 				   resource_size_t start, resource_size_t n,
-				   const char *name)
+				   const char *name, int flags)
 {
 	struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
 
@@ -634,6 +634,7 @@ struct resource * __request_region(struct resource *parent,
 	res->start = start;
 	res->end = start + n - 1;
 	res->flags = IORESOURCE_BUSY;
+	res->flags |= flags;
 
 	write_lock(&resource_lock);
 
@@ -679,7 +680,7 @@ int __check_region(struct resource *parent, resource_size_t start,
 {
 	struct resource * res;
 
-	res = __request_region(parent, start, n, "check-region");
+	res = __request_region(parent, start, n, "check-region", 0);
 	if (!res)
 		return -EBUSY;
 
@@ -776,7 +777,7 @@ struct resource * __devm_request_region(struct device *dev,
 	dr->start = start;
 	dr->n = n;
 
-	res = __request_region(parent, start, n, name);
+	res = __request_region(parent, start, n, name, 0);
 	if (res)
 		devres_add(dev, dr);
 	else
@@ -876,3 +877,57 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
 
 	return err;
 }
+
+#ifdef CONFIG_STRICT_DEVMEM
+static int strict_iomem_checks = 1;
+#else
+static int strict_iomem_checks;
+#endif
+
+/*
+ * check if an address is reserved in the iomem resource tree
+ * returns 1 if reserved, 0 if not reserved.
+ */
+int iomem_is_exclusive(u64 addr)
+{
+	struct resource *p = &iomem_resource;
+	int err = 0;
+	loff_t l;
+	int size = PAGE_SIZE;
+
+	if (!strict_iomem_checks)
+		return 0;
+
+	addr = addr & PAGE_MASK;
+
+	read_lock(&resource_lock);
+	for (p = p->child; p ; p = r_next(NULL, p, &l)) {
+		/*
+		 * We can probably skip the resources without
+		 * IORESOURCE_IO attribute?
+		 */
+		if (p->start >= addr + size)
+			break;
+		if (p->end < addr)
+			continue;
+		if (p->flags & IORESOURCE_BUSY &&
+		     p->flags & IORESOURCE_EXCLUSIVE) {
+			err = 1;
+			break;
+		}
+	}
+	read_unlock(&resource_lock);
+
+	return err;
+}
+
+static int __init strict_iomem(char *str)
+{
+	if (strstr(str, "relaxed"))
+		strict_iomem_checks = 0;
+	if (strstr(str, "strict"))
+		strict_iomem_checks = 1;
+	return 1;
+}
+
+__setup("iomem=", strict_iomem);
-- 
cgit v1.2.3


From a0e280e0f33f6c859a235fb69a875ed8f3420388 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 7 Jan 2009 16:19:46 +0100
Subject: stop_machine/cpu hotplug: fix disable_nonboot_cpus

disable_nonboot_cpus calls _cpu_down. But _cpu_down requires that the
caller already created the stop_machine workqueue (like cpu_down does).
Otherwise a call to stop_machine will lead to accesses to random memory
regions.

When introducing this new interface (9ea09af3bd3090e8349ca2899ca2011bd94cda85
"stop_machine: introduce stop_machine_create/destroy") I missed the second
call site of _cpu_down.
So add the missing stop_machine_create/destroy calls to disable_nonboot_cpus
as well.

Fixes suspend-to-ram/disk and also this bug:

[  286.547348] BUG: unable to handle kernel paging request at 6b6b6b6b
[  286.548940] IP: [<c0150ca4>] __stop_machine+0x88/0xe3
[  286.550598] Oops: 0002 [#1] SMP
[  286.560580] Pid: 3273, comm: halt Not tainted (2.6.28-06127-g238c6d5
[  286.560580] EIP: is at __stop_machine+0x88/0xe3
[  286.560580] Process halt (pid: 3273, ti=f1a28000 task=f4530f30
[  286.560580] Call Trace:
[  286.560580]  [<c03d04e4>] ? _cpu_down+0x10f/0x234
[  286.560580]  [<c012a57e>] ? disable_nonboot_cpus+0x58/0xdc
[  286.560580]  [<c01360c0>] ? kernel_poweroff+0x22/0x39
[  286.560580]  [<c0136301>] ? sys_reboot+0xde/0x14c
[  286.560580]  [<c01331b2>] ? complete_signal+0x179/0x191
[  286.560580]  [<c0133396>] ? send_signal+0x1cc/0x1e1
[  286.560580]  [<c03de418>] ? _spin_unlock_irqrestore+0x2d/0x3c
[  286.560580]  [<c0133b65>] ? group_send_signal_info+0x58/0x61
[  286.560580]  [<c0133b9e>] ? kill_pid_info+0x30/0x3a
[  286.560580]  [<c0133d49>] ? sys_kill+0x75/0x13a
[  286.560580]  [<c01a06cb>] ? mntput_no_expire+ox1f/0x101
[  286.560580]  [<c019b3b3>] ? dput+0x1e/0x105
[  286.560580]  [<c018ef87>] ?  __fput+0x150/0x158
[  286.560580]  [<c0157abf>] ? audit_syscall_entry+0x137/0x159
[  286.560580]  [<c010329f>] ? sysenter_do_call+0x12/0x34

Reported-and-tested-by: "Justin P. Mattock" <justinmattock@gmail.com>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Tested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpu.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 30e74dd6d01..79e40f00dcb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -379,8 +379,11 @@ static cpumask_var_t frozen_cpus;
 
 int disable_nonboot_cpus(void)
 {
-	int cpu, first_cpu, error = 0;
+	int cpu, first_cpu, error;
 
+	error = stop_machine_create();
+	if (error)
+		return error;
 	cpu_maps_update_begin();
 	first_cpu = cpumask_first(cpu_online_mask);
 	/* We take down all of the non-boot CPUs in one shot to avoid races
@@ -409,6 +412,7 @@ int disable_nonboot_cpus(void)
 		printk(KERN_ERR "Non-boot CPUs are not disabled\n");
 	}
 	cpu_maps_update_done();
+	stop_machine_destroy();
 	return error;
 }
 
-- 
cgit v1.2.3


From 465634adc1d09b490c8ee31885575be39d375d53 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 7 Jan 2009 15:32:11 +0100
Subject: ring_buffer: fix ring_buffer_event_length()

Function ring_buffer_event_length() provides an interface to detect
the length of data stored in an entry. However, the length contains
offsets depending on the internal usage. This makes it unusable. This
patch fixes this and now ring_buffer_event_length() returns the
alligned length that has been used in ring_buffer_lock_reserve().

Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 kernel/trace/ring_buffer.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 30d57dd01a8..d42b882dfe4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -117,7 +117,13 @@ rb_event_length(struct ring_buffer_event *event)
  */
 unsigned ring_buffer_event_length(struct ring_buffer_event *event)
 {
-	return rb_event_length(event);
+	unsigned length = rb_event_length(event);
+	if (event->type != RINGBUF_TYPE_DATA)
+		return length;
+	length -= RB_EVNT_HDR_SIZE;
+	if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
+                length -= sizeof(event->array[0]);
+	return length;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_event_length);
 
-- 
cgit v1.2.3


From c9d557c19f94df42db78d4a5de4d25feee694bad Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 7 Jan 2009 14:33:30 -0800
Subject: rcu: fix bug in rcutorture system-shutdown code

This patch fixes an rcutorture bug found by Eric Sesterhenn that
resulted in oopses in response to "rmmod rcutorture".  The problem
was in some new code that attempted to handle the case where a system
is shut down while rcutorture is still running, for example, when
rcutorture is built into the kernel so that it cannot be removed.
The fix causes the rcutorture threads to "park" in an
schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT) rather than
trying to get them to terminate cleanly.  Concurrent shutdown and
rmmod is illegal.

I believe that this is 2.6.29 material, as it is used in some testing
setups.

For reference, here are the rcutorture operating modes:

CONFIG_RCU_TORTURE_TEST=m

	This is the normal rcutorture build.  Use "modprobe rcutorture"
	(with optional arguments) to start, and "rmmod rcutorture" to
	stop.  If you shut the system down without doing the rmmod, you
	should see console output like:

	rcutorture thread rcu_torture_writer parking due to system shutdown

	One for each rcutorture kthread.

CONFIG_RCU_TORTURE_TEST=y
CONFIG_RCU_TORTURE_TEST_RUNNABLE=n

	Use this if you want rcutorture built in, but don't want the
	test to start running during early boot.  To start the
	torturing:

		echo 1 > /proc/sys/kernel/rcutorture_runnable

	To stop the torturing, s/1/0/

	You will get "parking" console messages as noted above when
	you shut the system down.

CONFIG_RCU_TORTURE_TEST=y
CONFIG_RCU_TORTURE_TEST_RUNNABLE=y

	Same as above, except that the torturing starts during early
	boot.  Only for the stout of heart and strong of stomach.
	The same /proc entry noted above may be used to control the
	test.

Located-by: Eric Sesterhenn <snakebyte@gmx.de>
Tested-by: Eric Sesterhenn <snakebyte@gmx.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutorture.c | 113 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 68 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 1cff28db56b..7c4142a79f0 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -136,28 +136,46 @@ static int stutter_pause_test = 0;
 #endif
 int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
 
-#define FULLSTOP_SHUTDOWN 1	/* Bail due to system shutdown/panic. */
-#define FULLSTOP_CLEANUP  2	/* Orderly shutdown. */
-static int fullstop;		/* stop generating callbacks at test end. */
-DEFINE_MUTEX(fullstop_mutex);	/* protect fullstop transitions and */
-				/*  spawning of kthreads. */
+/* Mediate rmmod and system shutdown.  Concurrent rmmod & shutdown illegal! */
+
+#define FULLSTOP_DONTSTOP 0	/* Normal operation. */
+#define FULLSTOP_SHUTDOWN 1	/* System shutdown with rcutorture running. */
+#define FULLSTOP_RMMOD    2	/* Normal rmmod of rcutorture. */
+static int fullstop = FULLSTOP_RMMOD;
+DEFINE_MUTEX(fullstop_mutex);	/* Protect fullstop transitions and spawning */
+				/*  of kthreads. */
 
 /*
- * Detect and respond to a signal-based shutdown.
+ * Detect and respond to a system shutdown.
  */
 static int
 rcutorture_shutdown_notify(struct notifier_block *unused1,
 			   unsigned long unused2, void *unused3)
 {
-	if (fullstop)
-		return NOTIFY_DONE;
 	mutex_lock(&fullstop_mutex);
-	if (!fullstop)
+	if (fullstop == FULLSTOP_DONTSTOP)
 		fullstop = FULLSTOP_SHUTDOWN;
+	else
+		printk(KERN_WARNING /* but going down anyway, so... */
+		       "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
 	mutex_unlock(&fullstop_mutex);
 	return NOTIFY_DONE;
 }
 
+/*
+ * Absorb kthreads into a kernel function that won't return, so that
+ * they won't ever access module text or data again.
+ */
+static void rcutorture_shutdown_absorb(char *title)
+{
+	if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+		printk(KERN_NOTICE
+		       "rcutorture thread %s parking due to system shutdown\n",
+		       title);
+		schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
+	}
+}
+
 /*
  * Allocate an element from the rcu_tortures pool.
  */
@@ -219,13 +237,14 @@ rcu_random(struct rcu_random_state *rrsp)
 }
 
 static void
-rcu_stutter_wait(void)
+rcu_stutter_wait(char *title)
 {
-	while ((stutter_pause_test || !rcutorture_runnable) && !fullstop) {
+	while (stutter_pause_test || !rcutorture_runnable) {
 		if (rcutorture_runnable)
 			schedule_timeout_interruptible(1);
 		else
 			schedule_timeout_interruptible(round_jiffies_relative(HZ));
+		rcutorture_shutdown_absorb(title);
 	}
 }
 
@@ -287,7 +306,7 @@ rcu_torture_cb(struct rcu_head *p)
 	int i;
 	struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
 
-	if (fullstop) {
+	if (fullstop != FULLSTOP_DONTSTOP) {
 		/* Test is ending, just drop callbacks on the floor. */
 		/* The next initialization will pick up the pieces. */
 		return;
@@ -619,10 +638,11 @@ rcu_torture_writer(void *arg)
 		}
 		rcu_torture_current_version++;
 		oldbatch = cur_ops->completed();
-		rcu_stutter_wait();
-	} while (!kthread_should_stop() && !fullstop);
+		rcu_stutter_wait("rcu_torture_writer");
+	} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
 	VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
-	while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN)
+	rcutorture_shutdown_absorb("rcu_torture_writer");
+	while (!kthread_should_stop())
 		schedule_timeout_uninterruptible(1);
 	return 0;
 }
@@ -643,11 +663,12 @@ rcu_torture_fakewriter(void *arg)
 		schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
 		udelay(rcu_random(&rand) & 0x3ff);
 		cur_ops->sync();
-		rcu_stutter_wait();
-	} while (!kthread_should_stop() && !fullstop);
+		rcu_stutter_wait("rcu_torture_fakewriter");
+	} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
 
 	VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
-	while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN)
+	rcutorture_shutdown_absorb("rcu_torture_fakewriter");
+	while (!kthread_should_stop())
 		schedule_timeout_uninterruptible(1);
 	return 0;
 }
@@ -752,12 +773,13 @@ rcu_torture_reader(void *arg)
 		preempt_enable();
 		cur_ops->readunlock(idx);
 		schedule();
-		rcu_stutter_wait();
-	} while (!kthread_should_stop() && !fullstop);
+		rcu_stutter_wait("rcu_torture_reader");
+	} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
 	VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
+	rcutorture_shutdown_absorb("rcu_torture_reader");
 	if (irqreader && cur_ops->irqcapable)
 		del_timer_sync(&t);
-	while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN)
+	while (!kthread_should_stop())
 		schedule_timeout_uninterruptible(1);
 	return 0;
 }
@@ -854,7 +876,8 @@ rcu_torture_stats(void *arg)
 	do {
 		schedule_timeout_interruptible(stat_interval * HZ);
 		rcu_torture_stats_print();
-	} while (!kthread_should_stop() && !fullstop);
+		rcutorture_shutdown_absorb("rcu_torture_stats");
+	} while (!kthread_should_stop());
 	VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
 	return 0;
 }
@@ -866,52 +889,49 @@ static int rcu_idle_cpu;	/* Force all torture tasks off this CPU */
  */
 static void rcu_torture_shuffle_tasks(void)
 {
-	cpumask_var_t tmp_mask;
+	cpumask_t tmp_mask;
 	int i;
 
-	if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL))
-		BUG();
-
-	cpumask_setall(tmp_mask);
+	cpus_setall(tmp_mask);
 	get_online_cpus();
 
 	/* No point in shuffling if there is only one online CPU (ex: UP) */
-	if (num_online_cpus() == 1)
-		goto out;
+	if (num_online_cpus() == 1) {
+		put_online_cpus();
+		return;
+	}
 
 	if (rcu_idle_cpu != -1)
-		cpumask_clear_cpu(rcu_idle_cpu, tmp_mask);
+		cpu_clear(rcu_idle_cpu, tmp_mask);
 
-	set_cpus_allowed_ptr(current, tmp_mask);
+	set_cpus_allowed_ptr(current, &tmp_mask);
 
 	if (reader_tasks) {
 		for (i = 0; i < nrealreaders; i++)
 			if (reader_tasks[i])
 				set_cpus_allowed_ptr(reader_tasks[i],
-						     tmp_mask);
+						     &tmp_mask);
 	}
 
 	if (fakewriter_tasks) {
 		for (i = 0; i < nfakewriters; i++)
 			if (fakewriter_tasks[i])
 				set_cpus_allowed_ptr(fakewriter_tasks[i],
-						     tmp_mask);
+						     &tmp_mask);
 	}
 
 	if (writer_task)
-		set_cpus_allowed_ptr(writer_task, tmp_mask);
+		set_cpus_allowed_ptr(writer_task, &tmp_mask);
 
 	if (stats_task)
-		set_cpus_allowed_ptr(stats_task, tmp_mask);
+		set_cpus_allowed_ptr(stats_task, &tmp_mask);
 
 	if (rcu_idle_cpu == -1)
 		rcu_idle_cpu = num_online_cpus() - 1;
 	else
 		rcu_idle_cpu--;
 
-out:
 	put_online_cpus();
-	free_cpumask_var(tmp_mask);
 }
 
 /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
@@ -925,7 +945,8 @@ rcu_torture_shuffle(void *arg)
 	do {
 		schedule_timeout_interruptible(shuffle_interval * HZ);
 		rcu_torture_shuffle_tasks();
-	} while (!kthread_should_stop() && !fullstop);
+		rcutorture_shutdown_absorb("rcu_torture_shuffle");
+	} while (!kthread_should_stop());
 	VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
 	return 0;
 }
@@ -940,10 +961,11 @@ rcu_torture_stutter(void *arg)
 	do {
 		schedule_timeout_interruptible(stutter * HZ);
 		stutter_pause_test = 1;
-		if (!kthread_should_stop() && !fullstop)
+		if (!kthread_should_stop())
 			schedule_timeout_interruptible(stutter * HZ);
 		stutter_pause_test = 0;
-	} while (!kthread_should_stop() && !fullstop);
+		rcutorture_shutdown_absorb("rcu_torture_stutter");
+	} while (!kthread_should_stop());
 	VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
 	return 0;
 }
@@ -970,15 +992,16 @@ rcu_torture_cleanup(void)
 	int i;
 
 	mutex_lock(&fullstop_mutex);
-	if (!fullstop) {
-		/* If being signaled, let it happen, then exit. */
+	if (fullstop == FULLSTOP_SHUTDOWN) {
+		printk(KERN_WARNING /* but going down anyway, so... */
+		       "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
 		mutex_unlock(&fullstop_mutex);
-		schedule_timeout_interruptible(10 * HZ);
+		schedule_timeout_uninterruptible(10);
 		if (cur_ops->cb_barrier != NULL)
 			cur_ops->cb_barrier();
 		return;
 	}
-	fullstop = FULLSTOP_CLEANUP;
+	fullstop = FULLSTOP_RMMOD;
 	mutex_unlock(&fullstop_mutex);
 	unregister_reboot_notifier(&rcutorture_nb);
 	if (stutter_task) {
@@ -1078,7 +1101,7 @@ rcu_torture_init(void)
 	else
 		nrealreaders = 2 * num_online_cpus();
 	rcu_torture_print_module_parms("Start of test");
-	fullstop = 0;
+	fullstop = FULLSTOP_DONTSTOP;
 
 	/* Set up the freelist. */
 
-- 
cgit v1.2.3


From 8feae13110d60cc6287afabc2887366b0eb226c2 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 8 Jan 2009 12:04:47 +0000
Subject: NOMMU: Make VMAs per MM as for MMU-mode linux

Make VMAs per mm_struct as for MMU-mode linux.  This solves two problems:

 (1) In SYSV SHM where nattch for a segment does not reflect the number of
     shmat's (and forks) done.

 (2) In mmap() where the VMA's vm_mm is set to point to the parent mm by an
     exec'ing process when VM_EXECUTABLE is specified, regardless of the fact
     that a VMA might be shared and already have its vm_mm assigned to another
     process or a dead process.

A new struct (vm_region) is introduced to track a mapped region and to remember
the circumstances under which it may be shared and the vm_list_struct structure
is discarded as it's no longer required.

This patch makes the following additional changes:

 (1) Regions are now allocated with alloc_pages() rather than kmalloc() and
     with no recourse to __GFP_COMP, so the pages are not composite.  Instead,
     each page has a reference on it held by the region.  Anything else that is
     interested in such a page will have to get a reference on it to retain it.
     When the pages are released due to unmapping, each page is passed to
     put_page() and will be freed when the page usage count reaches zero.

 (2) Excess pages are trimmed after an allocation as the allocation must be
     made as a power-of-2 quantity of pages.

 (3) VMAs are added to the parent MM's R/B tree and mmap lists.  As an MM may
     end up with overlapping VMAs within the tree, the VMA struct address is
     appended to the sort key.

 (4) Non-anonymous VMAs are now added to the backing inode's prio list.

 (5) Holes may be punched in anonymous VMAs with munmap(), releasing parts of
     the backing region.  The VMA and region structs will be split if
     necessary.

 (6) sys_shmdt() only releases one attachment to a SYSV IPC shared memory
     segment instead of all the attachments at that addresss.  Multiple
     shmat()'s return the same address under NOMMU-mode instead of different
     virtual addresses as under MMU-mode.

 (7) Core dumping for ELF-FDPIC requires fewer exceptions for NOMMU-mode.

 (8) /proc/maps is now the global list of mapped regions, and may list bits
     that aren't actually mapped anywhere.

 (9) /proc/meminfo gains a line (tagged "MmapCopy") that indicates the amount
     of RAM currently allocated by mmap to hold mappable regions that can't be
     mapped directly.  These are copies of the backing device or file if not
     anonymous.

These changes make NOMMU mode more similar to MMU mode.  The downside is that
NOMMU mode requires some extra memory to track things over NOMMU without this
patch (VMAs are no longer shared, and there are now region structs).

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Mike Frysinger <vapier.adi@gmail.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
---
 kernel/fork.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 7b8f2a78be3..0bce4a43bb3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1481,12 +1481,10 @@ void __init proc_caches_init(void)
 	fs_cachep = kmem_cache_create("fs_cache",
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-	vm_area_cachep = kmem_cache_create("vm_area_struct",
-			sizeof(struct vm_area_struct), 0,
-			SLAB_PANIC, NULL);
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+	mmap_init();
 }
 
 /*
-- 
cgit v1.2.3


From dd8632a12e500a684478fea0951f380478d56fed Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 8 Jan 2009 12:04:47 +0000
Subject: NOMMU: Make mmap allocation page trimming behaviour configurable.

NOMMU mmap allocates a piece of memory for an mmap that's rounded up in size to
the nearest power-of-2 number of pages.  Currently it then discards the excess
pages back to the page allocator, making that memory available for use by other
things.  This can, however, cause greater amount of fragmentation.

To counter this, a sysctl is added in order to fine-tune the trimming
behaviour.  The default behaviour remains to trim pages aggressively, while
this can either be disabled completely or set to a higher page-granular
watermark in order to have finer-grained control.

vm region vm_top bits taken from an earlier patch by David Howells.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Mike Frysinger <vapier.adi@gmail.com>
---
 kernel/sysctl.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 92f6e5bc3c2..89d74436318 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -82,6 +82,9 @@ extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int latencytop_enabled;
 extern int sysctl_nr_open_min, sysctl_nr_open_max;
+#ifndef CONFIG_MMU
+extern int sysctl_nr_trim_pages;
+#endif
 #ifdef CONFIG_RCU_TORTURE_TEST
 extern int rcutorture_runnable;
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
@@ -1102,6 +1105,17 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+#else
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nr_trim_pages",
+		.data		= &sysctl_nr_trim_pages,
+		.maxlen		= sizeof(sysctl_nr_trim_pages),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
 #endif
 	{
 		.ctl_name	= VM_LAPTOP_MODE,
-- 
cgit v1.2.3


From b9456371a73871d001e67b5f4eac118c2c278e1c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 8 Jan 2009 11:18:31 +0000
Subject: CRED: Fix commit_creds() on a process that has no mm

Fix commit_creds()'s handling of a process that has no mm (such as one that is
calling or has called daemonize()).  commit_creds() should check to see if
task->mm is not NULL before calling set_dumpable() on it.

Reported-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/cred.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index ff7bc071991..480a61aec80 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -372,7 +372,8 @@ int commit_creds(struct cred *new)
 	    old->fsuid != new->fsuid ||
 	    old->fsgid != new->fsgid ||
 	    !cap_issubset(new->cap_permitted, old->cap_permitted)) {
-		set_dumpable(task->mm, suid_dumpable);
+		if (task->mm)
+			set_dumpable(task->mm, suid_dumpable);
 		task->pdeath_signal = 0;
 		smp_wmb();
 	}
-- 
cgit v1.2.3


From 75139b8274c3e30354daea623f14b43a482a0bb5 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:07:33 -0800
Subject: cgroups: remove some redundant NULL checks

- In cgroup_clone(), if vfs_mkdir() returns successfully,
  dentry->d_fsdata will be the pointer to the newly created
  cgroup and won't be NULL.

- a cgroup file's dentry->d_fsdata won't be NULL, guaranteed
  by cgroup_add_file().

- When walking through the subsystems of a cgroup_fs (using
  for_each_subsys), cgrp->subsys[ss->subsys_id] won't be NULL,
  guaranteed by cgroup_create().

(Also remove 2 unused variables in cgroup_rmdir().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 29 +++++++----------------------
 1 file changed, 7 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f221446aa02..220e0fd659f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -586,7 +586,7 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp)
 {
 	struct cgroup_subsys *ss;
 	for_each_subsys(cgrp->root, ss)
-		if (ss->pre_destroy && cgrp->subsys[ss->subsys_id])
+		if (ss->pre_destroy)
 			ss->pre_destroy(ss, cgrp);
 	return;
 }
@@ -610,10 +610,8 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		/*
 		 * Release the subsystem state objects.
 		 */
-		for_each_subsys(cgrp->root, ss) {
-			if (cgrp->subsys[ss->subsys_id])
-				ss->destroy(ss, cgrp);
-		}
+		for_each_subsys(cgrp->root, ss)
+			ss->destroy(ss, cgrp);
 
 		cgrp->root->number_of_cgroups--;
 		mutex_unlock(&cgroup_mutex);
@@ -1445,7 +1443,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
 	struct cftype *cft = __d_cft(file->f_dentry);
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
 
-	if (!cft || cgroup_is_removed(cgrp))
+	if (cgroup_is_removed(cgrp))
 		return -ENODEV;
 	if (cft->write)
 		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -1490,7 +1488,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
 	struct cftype *cft = __d_cft(file->f_dentry);
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
 
-	if (!cft || cgroup_is_removed(cgrp))
+	if (cgroup_is_removed(cgrp))
 		return -ENODEV;
 
 	if (cft->read)
@@ -1554,10 +1552,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
 	err = generic_file_open(inode, file);
 	if (err)
 		return err;
-
 	cft = __d_cft(file->f_dentry);
-	if (!cft)
-		return -ENODEV;
+
 	if (cft->read_map || cft->read_seq_string) {
 		struct cgroup_seqfile_state *state =
 			kzalloc(sizeof(*state), GFP_USER);
@@ -2463,8 +2459,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	struct cgroup *cgrp = dentry->d_fsdata;
 	struct dentry *d;
 	struct cgroup *parent;
-	struct super_block *sb;
-	struct cgroupfs_root *root;
 
 	/* the vfs holds both inode->i_mutex already */
 
@@ -2487,8 +2481,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 
 	mutex_lock(&cgroup_mutex);
 	parent = cgrp->parent;
-	root = cgrp->root;
-	sb = root->sb;
 
 	if (atomic_read(&cgrp->count)
 	    || !list_empty(&cgrp->children)
@@ -2937,7 +2929,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
 	}
 
 	/* Create the cgroup directory, which also creates the cgroup */
-	ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755);
+	ret = vfs_mkdir(inode, dentry, 0755);
 	child = __d_cgrp(dentry);
 	dput(dentry);
 	if (ret) {
@@ -2947,13 +2939,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
 		goto out_release;
 	}
 
-	if (!child) {
-		printk(KERN_INFO
-		       "Couldn't find new cgroup %s\n", nodename);
-		ret = -ENOMEM;
-		goto out_release;
-	}
-
 	/* The cgroup now exists. Retake cgroup_mutex and check
 	 * that we're still in the same state that we thought we
 	 * were. */
-- 
cgit v1.2.3


From cae7a366f77ea5c9f54ae98c5fc65056877a89ed Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:07:34 -0800
Subject: ns_cgroup: remove unused spinlock

I happened to find the spinlock in struct ns_cgroup is never used.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ns_cgroup.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 43c2111cd54..78bc3fdac0d 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -13,7 +13,6 @@
 
 struct ns_cgroup {
 	struct cgroup_subsys_state css;
-	spinlock_t lock;
 };
 
 struct cgroup_subsys ns_subsys;
@@ -84,7 +83,6 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
 	ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
 	if (!ns_cgroup)
 		return ERR_PTR(-ENOMEM);
-	spin_lock_init(&ns_cgroup->lock);
 	return &ns_cgroup->css;
 }
 
-- 
cgit v1.2.3


From b12b533fa523e94e0cc9dc23274ae4f9439f1313 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:07:36 -0800
Subject: cgroups: add lock for child->cgroups in cgroup_post_fork()

When cgroup_post_fork() is called, child is seen by find_task_by_vpid(),
so child->cgroups maybe be changed, It'll incorrect.

child->cgroups<old>'s refcnt is decreased
child->cgroups<new>'s refcnt is increased
but child->cg_list is added to child->cgroups<old>'s list.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Reviewed-by: Paul Menage <menage@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 220e0fd659f..d7ab4ffd8fd 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2792,8 +2792,10 @@ void cgroup_post_fork(struct task_struct *child)
 {
 	if (use_task_css_set_links) {
 		write_lock(&css_set_lock);
+		task_lock(child);
 		if (list_empty(&child->cg_list))
 			list_add(&child->cg_list, &child->cgroups->tasks);
+		task_unlock(child);
 		write_unlock(&css_set_lock);
 	}
 }
-- 
cgit v1.2.3


From 2019f634ce5904c19eba4e86f51b1a119a53a9f1 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:07:36 -0800
Subject: cgroups: fix cgroup_iter_next() bug

We access res->cgroups without the task_lock(), so res->cgroups may be
changed.  it's unreliable, and "if (l == &res->cgroups->tasks)" may be
false forever.

We don't need add any lock for fixing this bug.  we just access to struct
css_set by struct cg_cgroup_link, not by struct task_struct.

Since we hold css_set_lock, struct cg_cgroup_link is reliable.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Reviewed-by: Paul Menage <menage@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d7ab4ffd8fd..a391ab3bdfc 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1808,6 +1808,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
 {
 	struct task_struct *res;
 	struct list_head *l = it->task;
+	struct cg_cgroup_link *link;
 
 	/* If the iterator cg is NULL, we have no tasks */
 	if (!it->cg_link)
@@ -1815,7 +1816,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
 	res = list_entry(l, struct task_struct, cg_list);
 	/* Advance iterator to find next entry */
 	l = l->next;
-	if (l == &res->cgroups->tasks) {
+	link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
+	if (l == &link->cg->tasks) {
 		/* We reached the end of this task list - move on to
 		 * the next cg_cgroup_link */
 		cgroup_advance_iter(cgrp, it);
-- 
cgit v1.2.3


From b2aa30f7bb381e04c93eed106089ba55553955f1 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:07:37 -0800
Subject: cgroups: don't put struct cgroupfs_root protected by RCU

We don't access struct cgroupfs_root in fast path, so we should not put
struct cgroupfs_root protected by RCU

But the comment in struct cgroup_subsys.root confuse us.

struct cgroup_subsys.root is used in these places:

1 find_css_set(): if (ss->root->subsys_list.next == &ss->sibling)
2 rebind_subsystems(): if (ss->root != &rootnode)
                       rcu_assign_pointer(ss->root, root);
                       rcu_assign_pointer(subsys[i]->root, &rootnode);
3 cgroup_has_css_refs(): if (ss->root != cgrp->root)
4 cgroup_init_subsys(): ss->root = &rootnode;
5 proc_cgroupstats_show(): ss->name, ss->root->subsys_bits,
                           ss->root->number_of_cgroups, !ss->disabled);
6 cgroup_clone(): root = subsys->root;
                  if ((root != subsys->root) ||

All these place we have held cgroup_lock() or we don't dereference to
struct cgroupfs_root.  It's means wo don't need RCU when use struct
cgroup_subsys.root, and we should not put struct cgroupfs_root protected
by RCU.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Reviewed-by: Paul Menage <menage@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a391ab3bdfc..a288da176e4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -713,7 +713,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			cgrp->subsys[i] = dummytop->subsys[i];
 			cgrp->subsys[i]->cgroup = cgrp;
 			list_add(&ss->sibling, &root->subsys_list);
-			rcu_assign_pointer(ss->root, root);
+			ss->root = root;
 			if (ss->bind)
 				ss->bind(ss, cgrp);
 
@@ -725,7 +725,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 				ss->bind(ss, dummytop);
 			dummytop->subsys[i]->cgroup = dummytop;
 			cgrp->subsys[i] = NULL;
-			rcu_assign_pointer(subsys[i]->root, &rootnode);
+			subsys[i]->root = &rootnode;
 			list_del(&ss->sibling);
 		} else if (bit & final_bits) {
 			/* Subsystem state should already exist */
-- 
cgit v1.2.3


From 104cbd55377029e70fc2cee01089e84b9c36e5dc Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:07:38 -0800
Subject: cgroups: use task_lock() for access tsk->cgroups safe in
 cgroup_clone()

Use task_lock() protect tsk->cgroups and get_css_set(tsk->cgroups).

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a288da176e4..00d5136d38c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2903,6 +2903,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
 		mutex_unlock(&cgroup_mutex);
 		return 0;
 	}
+	task_lock(tsk);
 	cg = tsk->cgroups;
 	parent = task_cgroup(tsk, subsys->subsys_id);
 
@@ -2915,6 +2916,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
 
 	/* Keep the cgroup alive */
 	get_css_set(cg);
+	task_unlock(tsk);
 	mutex_unlock(&cgroup_mutex);
 
 	/* Now do the VFS work to create a cgroup */
-- 
cgit v1.2.3


From 77efecd9e0526327548152df715ab8644ecb5ba0 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:07:39 -0800
Subject: cgroups: call find_css_set() safely in cgroup_attach_task()

In cgroup_attach_task(), tsk maybe exit when we call find_css_set().  and
find_css_set() will access to invalid css_set.

This patch increases the count before get_css_set(), and decreases it
after find_css_set().

NOTE:

css_set's refcount is also taskcount, after this patch applied, taskcount
may be off-by-one WHEN cgroup_lock() is not held.  but I reviewed other
code which use taskcount, they are still correct.  No regression found by
reviewing and simply testing.

So I do not use two counters in css_set.  (one counter for taskcount, the
other for refcount.  like struct mm_struct) If this fix cause regression,
we will use two counters in css_set.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 00d5136d38c..61e92c5867e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1214,7 +1214,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	int retval = 0;
 	struct cgroup_subsys *ss;
 	struct cgroup *oldcgrp;
-	struct css_set *cg = tsk->cgroups;
+	struct css_set *cg;
 	struct css_set *newcg;
 	struct cgroupfs_root *root = cgrp->root;
 	int subsys_id;
@@ -1234,11 +1234,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 		}
 	}
 
+	task_lock(tsk);
+	cg = tsk->cgroups;
+	get_css_set(cg);
+	task_unlock(tsk);
 	/*
 	 * Locate or allocate a new css_set for this task,
 	 * based on its final set of cgroups
 	 */
 	newcg = find_css_set(cg, cgrp);
+	put_css_set(cg);
 	if (!newcg)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From 7534432dcc3c654a8671b6b0cdffd1dbdbc73074 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:07:40 -0800
Subject: cgroups: remove rcu_read_lock() in cgroupstats_build()

cgroup_iter_* do not need rcu_read_lock().

In cgroup_enable_task_cg_lists(), do_each_thread() and while_each_thread()
are protected by RCU, it's OK, for write_lock(&css_set_lock) implies
rcu_read_lock() in non-RT kernel.

If we need explicit rcu_read_lock(), we should add rcu_read_lock() in
cgroup_enable_task_cg_lists(), not cgroup_iter_*.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 61e92c5867e..f55af3daffc 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2055,7 +2055,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
 
 	ret = 0;
 	cgrp = dentry->d_fsdata;
-	rcu_read_lock();
 
 	cgroup_iter_start(cgrp, &it);
 	while ((tsk = cgroup_iter_next(cgrp, &it))) {
@@ -2080,7 +2079,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
 	}
 	cgroup_iter_end(cgrp, &it);
 
-	rcu_read_unlock();
 err:
 	return ret;
 }
-- 
cgit v1.2.3


From e5f6a8609bab0c2d7543ab1505105e011832afd7 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:07:41 -0800
Subject: cgroups: make root_list contains active hierarchies only

Don't link rootnode to the root list, so root_list contains active
hierarchies only as the comment indicates.  And rename for_each_root() to
for_each_active_root().

Also remove redundant check in cgroup_kill_sb().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f55af3daffc..fd572d05769 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -84,7 +84,7 @@ struct cgroupfs_root {
 	/* Tracks how many cgroups are currently defined in hierarchy.*/
 	int number_of_cgroups;
 
-	/* A list running through the mounted hierarchies */
+	/* A list running through the active hierarchies */
 	struct list_head root_list;
 
 	/* Hierarchy-specific flags */
@@ -148,8 +148,8 @@ static int notify_on_release(const struct cgroup *cgrp)
 #define for_each_subsys(_root, _ss) \
 list_for_each_entry(_ss, &_root->subsys_list, sibling)
 
-/* for_each_root() allows you to iterate across the active hierarchies */
-#define for_each_root(_root) \
+/* for_each_active_root() allows you to iterate across the active hierarchies */
+#define for_each_active_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
 /* the list of cgroups eligible for automatic release. Protected by
@@ -1111,10 +1111,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	}
 	write_unlock(&css_set_lock);
 
-	if (!list_empty(&root->root_list)) {
-		list_del(&root->root_list);
-		root_count--;
-	}
+	list_del(&root->root_list);
+	root_count--;
+
 	mutex_unlock(&cgroup_mutex);
 
 	kfree(root);
@@ -2559,7 +2558,6 @@ int __init cgroup_init_early(void)
 	INIT_HLIST_NODE(&init_css_set.hlist);
 	css_set_count = 1;
 	init_cgroup_root(&rootnode);
-	list_add(&rootnode.root_list, &roots);
 	root_count = 1;
 	init_task.cgroups = &init_css_set;
 
@@ -2666,15 +2664,12 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
 
 	mutex_lock(&cgroup_mutex);
 
-	for_each_root(root) {
+	for_each_active_root(root) {
 		struct cgroup_subsys *ss;
 		struct cgroup *cgrp;
 		int subsys_id;
 		int count = 0;
 
-		/* Skip this hierarchy if it has no active subsystems */
-		if (!root->actual_subsys_bits)
-			continue;
 		seq_printf(m, "%lu:", root->subsys_bits);
 		for_each_subsys(root, ss)
 			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
-- 
cgit v1.2.3


From 33a68ac1c1b695216e873ee12e819adbe73e4d9f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:07:42 -0800
Subject: cgroups: add inactive subsystems to rootnode.subsys_list

Though for an inactive hierarchy, we have subsys->root == &rootnode, but
rootnode's subsys_list is always empty.

This conflicts with the code in find_css_set():

	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		...
		if (ss->root->subsys_list.next == &ss->sibling) {
			...
		}
	}
	if (list_empty(&rootnode.subsys_list)) {
		...
	}

The above code assumes rootnode.subsys_list links all inactive
hierarchies.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fd572d05769..abf7248f501 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -712,7 +712,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
 			cgrp->subsys[i] = dummytop->subsys[i];
 			cgrp->subsys[i]->cgroup = cgrp;
-			list_add(&ss->sibling, &root->subsys_list);
+			list_move(&ss->sibling, &root->subsys_list);
 			ss->root = root;
 			if (ss->bind)
 				ss->bind(ss, cgrp);
@@ -726,7 +726,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			dummytop->subsys[i]->cgroup = dummytop;
 			cgrp->subsys[i] = NULL;
 			subsys[i]->root = &rootnode;
-			list_del(&ss->sibling);
+			list_move(&ss->sibling, &rootnode.subsys_list);
 		} else if (bit & final_bits) {
 			/* Subsystem state should already exist */
 			BUG_ON(!cgrp->subsys[i]);
@@ -2521,6 +2521,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
 
 	/* Create the top cgroup state for this subsystem */
+	list_add(&ss->sibling, &rootnode.subsys_list);
 	ss->root = &rootnode;
 	css = ss->create(ss, dummytop);
 	/* We don't handle early failures gracefully */
-- 
cgit v1.2.3


From c12f65d4396e05c51ce3af7f159ead98574a587c Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:07:42 -0800
Subject: cgroups: introduce link_css_set() to remove duplicate code

Add a common function link_css_set() to link a css_set to a cgroup.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 68 +++++++++++++++++++++++++--------------------------------
 1 file changed, 30 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index abf7248f501..4c475ce4e22 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -384,6 +384,25 @@ static int allocate_cg_links(int count, struct list_head *tmp)
 	return 0;
 }
 
+/**
+ * link_css_set - a helper function to link a css_set to a cgroup
+ * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
+ * @cg: the css_set to be linked
+ * @cgrp: the destination cgroup
+ */
+static void link_css_set(struct list_head *tmp_cg_links,
+			 struct css_set *cg, struct cgroup *cgrp)
+{
+	struct cg_cgroup_link *link;
+
+	BUG_ON(list_empty(tmp_cg_links));
+	link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
+				cgrp_link_list);
+	link->cg = cg;
+	list_move(&link->cgrp_link_list, &cgrp->css_sets);
+	list_add(&link->cg_link_list, &cg->cg_links);
+}
+
 /*
  * find_css_set() takes an existing cgroup group and a
  * cgroup object, and returns a css_set object that's
@@ -399,7 +418,6 @@ static struct css_set *find_css_set(
 	int i;
 
 	struct list_head tmp_cg_links;
-	struct cg_cgroup_link *link;
 
 	struct hlist_head *hhead;
 
@@ -444,26 +462,11 @@ static struct css_set *find_css_set(
 		 * only do it for the first subsystem in each
 		 * hierarchy
 		 */
-		if (ss->root->subsys_list.next == &ss->sibling) {
-			BUG_ON(list_empty(&tmp_cg_links));
-			link = list_entry(tmp_cg_links.next,
-					  struct cg_cgroup_link,
-					  cgrp_link_list);
-			list_del(&link->cgrp_link_list);
-			list_add(&link->cgrp_link_list, &cgrp->css_sets);
-			link->cg = res;
-			list_add(&link->cg_link_list, &res->cg_links);
-		}
-	}
-	if (list_empty(&rootnode.subsys_list)) {
-		link = list_entry(tmp_cg_links.next,
-				  struct cg_cgroup_link,
-				  cgrp_link_list);
-		list_del(&link->cgrp_link_list);
-		list_add(&link->cgrp_link_list, &dummytop->css_sets);
-		link->cg = res;
-		list_add(&link->cg_link_list, &res->cg_links);
+		if (ss->root->subsys_list.next == &ss->sibling)
+			link_css_set(&tmp_cg_links, res, cgrp);
 	}
+	if (list_empty(&rootnode.subsys_list))
+		link_css_set(&tmp_cg_links, res, dummytop);
 
 	BUG_ON(!list_empty(&tmp_cg_links));
 
@@ -988,7 +991,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 		root = NULL;
 	} else {
 		/* New superblock */
-		struct cgroup *cgrp = &root->top_cgroup;
+		struct cgroup *root_cgrp = &root->top_cgroup;
 		struct inode *inode;
 		int i;
 
@@ -1029,7 +1032,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 		list_add(&root->root_list, &roots);
 		root_count++;
 
-		sb->s_root->d_fsdata = &root->top_cgroup;
+		sb->s_root->d_fsdata = root_cgrp;
 		root->top_cgroup.dentry = sb->s_root;
 
 		/* Link the top cgroup in this hierarchy into all
@@ -1040,29 +1043,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 			struct hlist_node *node;
 			struct css_set *cg;
 
-			hlist_for_each_entry(cg, node, hhead, hlist) {
-				struct cg_cgroup_link *link;
-
-				BUG_ON(list_empty(&tmp_cg_links));
-				link = list_entry(tmp_cg_links.next,
-						  struct cg_cgroup_link,
-						  cgrp_link_list);
-				list_del(&link->cgrp_link_list);
-				link->cg = cg;
-				list_add(&link->cgrp_link_list,
-					 &root->top_cgroup.css_sets);
-				list_add(&link->cg_link_list, &cg->cg_links);
-			}
+			hlist_for_each_entry(cg, node, hhead, hlist)
+				link_css_set(&tmp_cg_links, cg, root_cgrp);
 		}
 		write_unlock(&css_set_lock);
 
 		free_cg_links(&tmp_cg_links);
 
-		BUG_ON(!list_empty(&cgrp->sibling));
-		BUG_ON(!list_empty(&cgrp->children));
+		BUG_ON(!list_empty(&root_cgrp->sibling));
+		BUG_ON(!list_empty(&root_cgrp->children));
 		BUG_ON(root->number_of_cgroups != 1);
 
-		cgroup_populate_dir(cgrp);
+		cgroup_populate_dir(root_cgrp);
 		mutex_unlock(&inode->i_mutex);
 		mutex_unlock(&cgroup_mutex);
 	}
-- 
cgit v1.2.3


From e7b80bb695a5b64c92e314838e083b2f3bdf29b2 Mon Sep 17 00:00:00 2001
From: Gowrishankar M <gowrishankar.m@in.ibm.com>
Date: Wed, 7 Jan 2009 18:07:43 -0800
Subject: cgroups: skip processes from other namespaces when listing a cgroup

Once tasks are populated from system namespace inside cgroup, container
replaces other namespace task with 0 while listing tasks, inside
container.

Though this is expected behaviour from container end, there is no use of
showing unwanted 0s.

In this patch, we check if a process is in same namespace before loading
into pid array.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Gowrishankar M <gowrishankar.m@in.ibm.com>
Acked-by: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4c475ce4e22..cb7c72b91f4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2007,14 +2007,16 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
  */
 static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
 {
-	int n = 0;
+	int n = 0, pid;
 	struct cgroup_iter it;
 	struct task_struct *tsk;
 	cgroup_iter_start(cgrp, &it);
 	while ((tsk = cgroup_iter_next(cgrp, &it))) {
 		if (unlikely(n == npids))
 			break;
-		pidarray[n++] = task_pid_vnr(tsk);
+		pid = task_pid_vnr(tsk);
+		if (pid > 0)
+			pidarray[n++] = pid;
 	}
 	cgroup_iter_end(cgrp, &it);
 	return n;
-- 
cgit v1.2.3


From a47295e6bc42ad35f9c15ac66f598aa24debd4e2 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Wed, 7 Jan 2009 18:07:44 -0800
Subject: cgroups: make cgroup_path() RCU-safe

Fix races between /proc/sched_debug by freeing cgroup objects via an RCU
callback.  Thus any cgroup reference obtained from an RCU-safe source will
remain valid during the RCU section.  Since dentries are also RCU-safe,
this allows us to traverse up the tree safely.

Additionally, make cgroup_path() check for a NULL cgrp->dentry to avoid
trying to report a path for a partially-created cgroup.

[lizf@cn.fujitsu.com: call deactive_super() in cgroup_diput()]
Signed-off-by: Paul Menage <menage@google.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Tested-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index cb7c72b91f4..83ea4f524be 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -271,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
 
 	rcu_read_lock();
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		struct cgroup *cgrp = cg->subsys[i]->cgroup;
+		struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
 		if (atomic_dec_and_test(&cgrp->count) &&
 		    notify_on_release(cgrp)) {
 			if (taskexit)
@@ -594,6 +594,13 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp)
 	return;
 }
 
+static void free_cgroup_rcu(struct rcu_head *obj)
+{
+	struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
+
+	kfree(cgrp);
+}
+
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 {
 	/* is dentry a directory ? if so, kfree() associated cgroup */
@@ -619,11 +626,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		cgrp->root->number_of_cgroups--;
 		mutex_unlock(&cgroup_mutex);
 
-		/* Drop the active superblock reference that we took when we
-		 * created the cgroup */
+		/*
+		 * Drop the active superblock reference that we took when we
+		 * created the cgroup
+		 */
 		deactivate_super(cgrp->root->sb);
 
-		kfree(cgrp);
+		call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
 	}
 	iput(inode);
 }
@@ -1134,14 +1143,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
  * @buf: the buffer to write the path into
  * @buflen: the length of the buffer
  *
- * Called with cgroup_mutex held. Writes path of cgroup into buf.
- * Returns 0 on success, -errno on error.
+ * Called with cgroup_mutex held or else with an RCU-protected cgroup
+ * reference.  Writes path of cgroup into buf.  Returns 0 on success,
+ * -errno on error.
  */
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 {
 	char *start;
+	struct dentry *dentry = rcu_dereference(cgrp->dentry);
 
-	if (cgrp == dummytop) {
+	if (!dentry || cgrp == dummytop) {
 		/*
 		 * Inactive subsystems have no dentry for their root
 		 * cgroup
@@ -1154,13 +1165,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 
 	*--start = '\0';
 	for (;;) {
-		int len = cgrp->dentry->d_name.len;
+		int len = dentry->d_name.len;
 		if ((start -= len) < buf)
 			return -ENAMETOOLONG;
 		memcpy(start, cgrp->dentry->d_name.name, len);
 		cgrp = cgrp->parent;
 		if (!cgrp)
 			break;
+		dentry = rcu_dereference(cgrp->dentry);
 		if (!cgrp->parent)
 			continue;
 		if (--start < buf)
@@ -1663,7 +1675,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
 	if (!error) {
 		dentry->d_fsdata = cgrp;
 		inc_nlink(parent->d_inode);
-		cgrp->dentry = dentry;
+		rcu_assign_pointer(cgrp->dentry, dentry);
 		dget(dentry);
 	}
 	dput(dentry);
-- 
cgit v1.2.3


From 28dbc4b6a01fb579a9441c7b81e3d3413dc452df Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbir@linux.vnet.ibm.com>
Date: Wed, 7 Jan 2009 18:08:05 -0800
Subject: memcg: memory cgroup resource counters for hierarchy

Add support for building hierarchies in resource counters.  Cgroups allows
us to build a deep hierarchy, but we currently don't link the resource
counters belonging to the memory controller control groups, in the same
fashion as the corresponding cgroup entries in the cgroup hierarchy.  This
patch provides the infrastructure for resource counters that have the same
hiearchy as their cgroup counter parts.

These set of patches are based on the resource counter hiearchy patches
posted by Pavel Emelianov.

NOTE: Building hiearchies is expensive, deeper hierarchies imply charging
the all the way up to the root.  It is known that hiearchies are
expensive, so the user needs to be careful and aware of the trade-offs
before creating very deep ones.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/res_counter.c | 44 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index f275c8eca77..bf8e7534c80 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -15,10 +15,11 @@
 #include <linux/uaccess.h>
 #include <linux/mm.h>
 
-void res_counter_init(struct res_counter *counter)
+void res_counter_init(struct res_counter *counter, struct res_counter *parent)
 {
 	spin_lock_init(&counter->lock);
 	counter->limit = (unsigned long long)LLONG_MAX;
+	counter->parent = parent;
 }
 
 int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
@@ -34,14 +35,34 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
 	return 0;
 }
 
-int res_counter_charge(struct res_counter *counter, unsigned long val)
+int res_counter_charge(struct res_counter *counter, unsigned long val,
+			struct res_counter **limit_fail_at)
 {
 	int ret;
 	unsigned long flags;
-
-	spin_lock_irqsave(&counter->lock, flags);
-	ret = res_counter_charge_locked(counter, val);
-	spin_unlock_irqrestore(&counter->lock, flags);
+	struct res_counter *c, *u;
+
+	*limit_fail_at = NULL;
+	local_irq_save(flags);
+	for (c = counter; c != NULL; c = c->parent) {
+		spin_lock(&c->lock);
+		ret = res_counter_charge_locked(c, val);
+		spin_unlock(&c->lock);
+		if (ret < 0) {
+			*limit_fail_at = c;
+			goto undo;
+		}
+	}
+	ret = 0;
+	goto done;
+undo:
+	for (u = counter; u != c; u = u->parent) {
+		spin_lock(&u->lock);
+		res_counter_uncharge_locked(u, val);
+		spin_unlock(&u->lock);
+	}
+done:
+	local_irq_restore(flags);
 	return ret;
 }
 
@@ -56,10 +77,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
 void res_counter_uncharge(struct res_counter *counter, unsigned long val)
 {
 	unsigned long flags;
+	struct res_counter *c;
 
-	spin_lock_irqsave(&counter->lock, flags);
-	res_counter_uncharge_locked(counter, val);
-	spin_unlock_irqrestore(&counter->lock, flags);
+	local_irq_save(flags);
+	for (c = counter; c != NULL; c = c->parent) {
+		spin_lock(&c->lock);
+		res_counter_uncharge_locked(c, val);
+		spin_unlock(&c->lock);
+	}
+	local_irq_restore(flags);
 }
 
 
-- 
cgit v1.2.3


From 999cd8a450f8f93701669a61cac4d3b19eca07e8 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Wed, 7 Jan 2009 18:08:36 -0800
Subject: cgroups: add a per-subsystem hierarchy_mutex

These patches introduce new locking/refcount support for cgroups to
reduce the need for subsystems to call cgroup_lock(). This will
ultimately allow the atomicity of cgroup_rmdir() (which was removed
recently) to be restored.

These three patches give:

1/3 - introduce a per-subsystem hierarchy_mutex which a subsystem can
     use to prevent changes to its own cgroup tree

2/3 - use hierarchy_mutex in place of calling cgroup_lock() in the
     memory controller

3/3 - introduce a css_tryget() function similar to the one recently
      proposed by Kamezawa, but avoiding spurious refcount failures in
      the event of a race between a css_tryget() and an unsuccessful
      cgroup_rmdir()

Future patches will likely involve:

- using hierarchy mutex in place of cgroup_lock() in more subsystems
 where appropriate

- restoring the atomicity of cgroup_rmdir() with respect to cgroup_create()

This patch:

Add a hierarchy_mutex to the cgroup_subsys object that protects changes to
the hierarchy observed by that subsystem.  It is taken by the cgroup
subsystem (in addition to cgroup_mutex) for the following operations:

- linking a cgroup into that subsystem's cgroup tree
- unlinking a cgroup from that subsystem's cgroup tree
- moving the subsystem to/from a hierarchy (including across the
  bind() callback)

Thus if the subsystem holds its own hierarchy_mutex, it can safely
traverse its own hierarchy.

Signed-off-by: Paul Menage <menage@google.com>
Tested-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 83ea4f524be..8b6379cdf63 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -722,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			BUG_ON(cgrp->subsys[i]);
 			BUG_ON(!dummytop->subsys[i]);
 			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
+			mutex_lock(&ss->hierarchy_mutex);
 			cgrp->subsys[i] = dummytop->subsys[i];
 			cgrp->subsys[i]->cgroup = cgrp;
 			list_move(&ss->sibling, &root->subsys_list);
 			ss->root = root;
 			if (ss->bind)
 				ss->bind(ss, cgrp);
-
+			mutex_unlock(&ss->hierarchy_mutex);
 		} else if (bit & removed_bits) {
 			/* We're removing this subsystem */
 			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
 			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+			mutex_lock(&ss->hierarchy_mutex);
 			if (ss->bind)
 				ss->bind(ss, dummytop);
 			dummytop->subsys[i]->cgroup = dummytop;
 			cgrp->subsys[i] = NULL;
 			subsys[i]->root = &rootnode;
 			list_move(&ss->sibling, &rootnode.subsys_list);
+			mutex_unlock(&ss->hierarchy_mutex);
 		} else if (bit & final_bits) {
 			/* Subsystem state should already exist */
 			BUG_ON(!cgrp->subsys[i]);
@@ -2338,6 +2341,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 	cgrp->subsys[ss->subsys_id] = css;
 }
 
+static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
+{
+	/* We need to take each hierarchy_mutex in a consistent order */
+	int i;
+
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		struct cgroup_subsys *ss = subsys[i];
+		if (ss->root == root)
+			mutex_lock_nested(&ss->hierarchy_mutex, i);
+	}
+}
+
+static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
+{
+	int i;
+
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		struct cgroup_subsys *ss = subsys[i];
+		if (ss->root == root)
+			mutex_unlock(&ss->hierarchy_mutex);
+	}
+}
+
 /*
  * cgroup_create - create a cgroup
  * @parent: cgroup that will be parent of the new cgroup
@@ -2386,7 +2412,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		init_cgroup_css(css, ss, cgrp);
 	}
 
+	cgroup_lock_hierarchy(root);
 	list_add(&cgrp->sibling, &cgrp->parent->children);
+	cgroup_unlock_hierarchy(root);
 	root->number_of_cgroups++;
 
 	err = cgroup_create_dir(cgrp, dentry, mode);
@@ -2504,8 +2532,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	if (!list_empty(&cgrp->release_list))
 		list_del(&cgrp->release_list);
 	spin_unlock(&release_list_lock);
-	/* delete my sibling from parent->children */
+
+	cgroup_lock_hierarchy(cgrp->root);
+	/* delete this cgroup from parent->children */
 	list_del(&cgrp->sibling);
+	cgroup_unlock_hierarchy(cgrp->root);
+
 	spin_lock(&cgrp->dentry->d_lock);
 	d = dget(cgrp->dentry);
 	spin_unlock(&d->d_lock);
@@ -2547,6 +2579,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	 * need to invoke fork callbacks here. */
 	BUG_ON(!list_empty(&init_task.tasks));
 
+	mutex_init(&ss->hierarchy_mutex);
 	ss->active = 1;
 }
 
-- 
cgit v1.2.3


From e7c5ec9193d32b9559a3bb8893ceedbda85201ff Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Wed, 7 Jan 2009 18:08:38 -0800
Subject: cgroups: add css_tryget()

Add css_tryget(), that obtains a counted reference on a CSS.  It is used
in situations where the caller has a "weak" reference to the CSS, i.e.
one that does not protect the cgroup from removal via a reference count,
but would instead be cleaned up by a destroy() callback.

css_tryget() will return true on success, or false if the cgroup is being
removed.

This is similar to Kamezawa Hiroyuki's patch from a week or two ago, but
with the difference that in the event of css_tryget() racing with a
cgroup_rmdir(), css_tryget() will only return false if the cgroup really
does get removed.

This implementation is done by biasing css->refcnt, so that a refcnt of 1
means "releasable" and 0 means "released or releasing".  In the event of a
race, css_tryget() distinguishes between "released" and "releasing" by
checking for the CSS_REMOVED flag in css->flags.

Signed-off-by: Paul Menage <menage@google.com>
Tested-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 56 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8b6379cdf63..c29831076e7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2333,7 +2333,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 			       struct cgroup *cgrp)
 {
 	css->cgroup = cgrp;
-	atomic_set(&css->refcnt, 0);
+	atomic_set(&css->refcnt, 1);
 	css->flags = 0;
 	if (cgrp == dummytop)
 		set_bit(CSS_ROOT, &css->flags);
@@ -2465,7 +2465,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
 {
 	/* Check the reference count on each subsystem. Since we
 	 * already established that there are no tasks in the
-	 * cgroup, if the css refcount is also 0, then there should
+	 * cgroup, if the css refcount is also 1, then there should
 	 * be no outstanding references, so the subsystem is safe to
 	 * destroy. We scan across all subsystems rather than using
 	 * the per-hierarchy linked list of mounted subsystems since
@@ -2486,12 +2486,62 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
 		 * matter, since it can only happen if the cgroup
 		 * has been deleted and hence no longer needs the
 		 * release agent to be called anyway. */
-		if (css && atomic_read(&css->refcnt))
+		if (css && (atomic_read(&css->refcnt) > 1))
 			return 1;
 	}
 	return 0;
 }
 
+/*
+ * Atomically mark all (or else none) of the cgroup's CSS objects as
+ * CSS_REMOVED. Return true on success, or false if the cgroup has
+ * busy subsystems. Call with cgroup_mutex held
+ */
+
+static int cgroup_clear_css_refs(struct cgroup *cgrp)
+{
+	struct cgroup_subsys *ss;
+	unsigned long flags;
+	bool failed = false;
+	local_irq_save(flags);
+	for_each_subsys(cgrp->root, ss) {
+		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+		int refcnt;
+		do {
+			/* We can only remove a CSS with a refcnt==1 */
+			refcnt = atomic_read(&css->refcnt);
+			if (refcnt > 1) {
+				failed = true;
+				goto done;
+			}
+			BUG_ON(!refcnt);
+			/*
+			 * Drop the refcnt to 0 while we check other
+			 * subsystems. This will cause any racing
+			 * css_tryget() to spin until we set the
+			 * CSS_REMOVED bits or abort
+			 */
+		} while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt);
+	}
+ done:
+	for_each_subsys(cgrp->root, ss) {
+		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+		if (failed) {
+			/*
+			 * Restore old refcnt if we previously managed
+			 * to clear it from 1 to 0
+			 */
+			if (!atomic_read(&css->refcnt))
+				atomic_set(&css->refcnt, 1);
+		} else {
+			/* Commit the fact that the CSS is removed */
+			set_bit(CSS_REMOVED, &css->flags);
+		}
+	}
+	local_irq_restore(flags);
+	return !failed;
+}
+
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 {
 	struct cgroup *cgrp = dentry->d_fsdata;
@@ -2522,7 +2572,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 
 	if (atomic_read(&cgrp->count)
 	    || !list_empty(&cgrp->children)
-	    || cgroup_has_css_refs(cgrp)) {
+	    || !cgroup_clear_css_refs(cgrp)) {
 		mutex_unlock(&cgroup_mutex);
 		return -EBUSY;
 	}
@@ -3078,7 +3128,8 @@ void __css_put(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
 	rcu_read_lock();
-	if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) {
+	if ((atomic_dec_return(&css->refcnt) == 1) &&
+	    notify_on_release(cgrp)) {
 		set_bit(CGRP_RELEASABLE, &cgrp->flags);
 		check_for_release(cgrp);
 	}
-- 
cgit v1.2.3


From 13337714f3b0307dc7f75ef5d83ecf0db2abbd65 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:08:39 -0800
Subject: cpuset: rcu_read_lock() to protect task_cs()

task_cs() calls task_subsys_state().

We must use rcu_read_lock() to protect cgroup_subsys_state().

It's correct that top_cpuset is never freed, but cgroup_subsys_state()
accesses css_set, this css_set maybe freed when task_cs() called.

We use use rcu_read_lock() to protect it.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 345ace5117d..a841b5c01ef 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -375,14 +375,9 @@ void cpuset_update_task_memory_state(void)
 	struct task_struct *tsk = current;
 	struct cpuset *cs;
 
-	if (task_cs(tsk) == &top_cpuset) {
-		/* Don't need rcu for top_cpuset.  It's never freed. */
-		my_cpusets_mem_gen = top_cpuset.mems_generation;
-	} else {
-		rcu_read_lock();
-		my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
-		rcu_read_unlock();
-	}
+	rcu_read_lock();
+	my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
+	rcu_read_unlock();
 
 	if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
 		mutex_lock(&callback_mutex);
-- 
cgit v1.2.3


From f5813d94279a18ff5936d675e24b44b44a571197 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:08:40 -0800
Subject: cpusets: set task's cpu_allowed to cpu_possible_map when attaching it
 into top cpuset

I found a bug on my dual-cpu box.  I created a sub cpuset in top cpuset
and assign 1 to its cpus.  And then we attach some tasks into this sub
cpuset.  After this, we offline CPU1.  Now, the tasks in this new cpuset
are moved into top cpuset automatically because there is no cpu in sub
cpuset.  Then we online CPU1, we find all the tasks which doesn't belong
to top cpuset originally just run on CPU0.

We fix this bug by setting task's cpu_allowed to cpu_possible_map when
attaching it into top cpuset.  This method needn't modify the current
behavior of cpusets on CPU hotplug, and all of tasks in top cpuset use
cpu_possible_map to initialize their cpu_allowed.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a841b5c01ef..6012e326e85 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1338,10 +1338,14 @@ static void cpuset_attach(struct cgroup_subsys *ss,
 	struct cpuset *oldcs = cgroup_cs(oldcont);
 	int err;
 
-	mutex_lock(&callback_mutex);
-	guarantee_online_cpus(cs, &cpus);
+	if (cs == &top_cpuset) {
+		cpus = cpu_possible_map;
+	} else {
+		mutex_lock(&callback_mutex);
+		guarantee_online_cpus(cs, &cpus);
+		mutex_unlock(&callback_mutex);
+	}
 	err = set_cpus_allowed_ptr(tsk, &cpus);
-	mutex_unlock(&callback_mutex);
 	if (err)
 		return;
 
-- 
cgit v1.2.3


From 5a7625df725a486ad600b0036b6111dbd6321c41 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:08:41 -0800
Subject: cpuset: remove on stack cpumask_t in cpuset_sprintf_cpulist()

This patchset converts cpuset to use new cpumask API, and thus
remove on stack cpumask_t to reduce stack usage.

Before:
 # cat kernel/cpuset.c include/linux/cpuset.h | grep -c cpumask_t
 21
After:
 # cat kernel/cpuset.c include/linux/cpuset.h | grep -c cpumask_t
 0

This patch:

Impact: reduce stack usage

It's safe to call cpulist_scnprintf inside callback_mutex, and thus we can
just remove the cpumask_t and no need to allocate a cpumask_var_t.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Mike Travis <travis@sgi.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6012e326e85..41c2343df97 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1486,13 +1486,13 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
 
 static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 {
-	cpumask_t mask;
+	int ret;
 
 	mutex_lock(&callback_mutex);
-	mask = cs->cpus_allowed;
+	ret = cpulist_scnprintf(page, PAGE_SIZE, &cs->cpus_allowed);
 	mutex_unlock(&callback_mutex);
 
-	return cpulist_scnprintf(page, PAGE_SIZE, &mask);
+	return ret;
 }
 
 static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
-- 
cgit v1.2.3


From 5771f0a2236df69683e9abea87f35f522c97ff94 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:08:41 -0800
Subject: cpuset: remove on stack cpumask_t in cpuset_can_attach()

Impact: reduce stack usage

Just use cs->cpus_allowed, and no need to allocate a cpumask_var_t.

Signed-off-by: Li Zefan <lizf@cn.fujistu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Mike Travis <travis@sgi.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 41c2343df97..afa29cfc5bb 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1311,20 +1311,19 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
 			     struct cgroup *cont, struct task_struct *tsk)
 {
 	struct cpuset *cs = cgroup_cs(cont);
+	int ret = 0;
 
 	if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
 		return -ENOSPC;
-	if (tsk->flags & PF_THREAD_BOUND) {
-		cpumask_t mask;
 
+	if (tsk->flags & PF_THREAD_BOUND) {
 		mutex_lock(&callback_mutex);
-		mask = cs->cpus_allowed;
+		if (!cpus_equal(tsk->cpus_allowed, cs->cpus_allowed))
+			ret = -EINVAL;
 		mutex_unlock(&callback_mutex);
-		if (!cpus_equal(tsk->cpus_allowed, mask))
-			return -EINVAL;
 	}
 
-	return security_task_setscheduler(tsk, 0, NULL);
+	return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL);
 }
 
 static void cpuset_attach(struct cgroup_subsys *ss,
-- 
cgit v1.2.3


From 2341d1b6598c7146d64a5050b53a72a5a819617f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:08:42 -0800
Subject: cpuset: convert cpuset_attach() to use cpumask_var_t

Impact: reduce stack usage

Allocate a global cpumask_var_t at boot, and use it in cpuset_attach(), so
we won't fail cpuset_attach().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Mike Travis <travis@sgi.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index afa29cfc5bb..1e32e6b380a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1306,6 +1306,9 @@ static int fmeter_getrate(struct fmeter *fmp)
 	return val;
 }
 
+/* Protected by cgroup_lock */
+static cpumask_var_t cpus_attach;
+
 /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
 static int cpuset_can_attach(struct cgroup_subsys *ss,
 			     struct cgroup *cont, struct task_struct *tsk)
@@ -1330,7 +1333,6 @@ static void cpuset_attach(struct cgroup_subsys *ss,
 			  struct cgroup *cont, struct cgroup *oldcont,
 			  struct task_struct *tsk)
 {
-	cpumask_t cpus;
 	nodemask_t from, to;
 	struct mm_struct *mm;
 	struct cpuset *cs = cgroup_cs(cont);
@@ -1338,13 +1340,13 @@ static void cpuset_attach(struct cgroup_subsys *ss,
 	int err;
 
 	if (cs == &top_cpuset) {
-		cpus = cpu_possible_map;
+		cpumask_copy(cpus_attach, cpu_possible_mask);
 	} else {
 		mutex_lock(&callback_mutex);
-		guarantee_online_cpus(cs, &cpus);
+		guarantee_online_cpus(cs, cpus_attach);
 		mutex_unlock(&callback_mutex);
 	}
-	err = set_cpus_allowed_ptr(tsk, &cpus);
+	err = set_cpus_allowed_ptr(tsk, cpus_attach);
 	if (err)
 		return;
 
@@ -1357,7 +1359,6 @@ static void cpuset_attach(struct cgroup_subsys *ss,
 			cpuset_migrate_mm(mm, &from, &to);
 		mmput(mm);
 	}
-
 }
 
 /* The various types of files and directories in a cpuset file system */
@@ -1838,6 +1839,9 @@ int __init cpuset_init(void)
 	if (err < 0)
 		return err;
 
+	if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
+		BUG();
+
 	number_of_cpusets = 1;
 	return 0;
 }
-- 
cgit v1.2.3


From 645fcc9d2f6946f97a41c8d00edee38f8a6f0060 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:08:43 -0800
Subject: cpuset: don't allocate trial cpuset on stack

Impact: cleanups, reduce stack usage

This patch prepares for the next patch.  When we convert
cpuset.cpus_allowed to cpumask_var_t, (trialcs = *cs) no longer works.

Another result of this patch is reducing stack usage of trialcs.
sizeof(*cs) can be as large as 148 bytes on x86_64, so it's really not
good to have it on stack.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Mike Travis <travis@sgi.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 93 +++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 60 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1e32e6b380a..f66527bfd21 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -415,6 +415,24 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
 		is_mem_exclusive(p) <= is_mem_exclusive(q);
 }
 
+/**
+ * alloc_trial_cpuset - allocate a trial cpuset
+ * @cs: the cpuset that the trial cpuset duplicates
+ */
+static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
+{
+	return kmemdup(cs, sizeof(*cs), GFP_KERNEL);
+}
+
+/**
+ * free_trial_cpuset - free the trial cpuset
+ * @trial: the trial cpuset to be freed
+ */
+static void free_trial_cpuset(struct cpuset *trial)
+{
+	kfree(trial);
+}
+
 /*
  * validate_change() - Used to validate that any proposed cpuset change
  *		       follows the structural rules for cpusets.
@@ -880,10 +898,10 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
  * @cs: the cpuset to consider
  * @buf: buffer of cpu numbers written to this cpuset
  */
-static int update_cpumask(struct cpuset *cs, const char *buf)
+static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+			  const char *buf)
 {
 	struct ptr_heap heap;
-	struct cpuset trialcs;
 	int retval;
 	int is_load_balanced;
 
@@ -891,8 +909,6 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
 	if (cs == &top_cpuset)
 		return -EACCES;
 
-	trialcs = *cs;
-
 	/*
 	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
 	 * Since cpulist_parse() fails on an empty mask, we special case
@@ -900,31 +916,31 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
 	 * with tasks have cpus.
 	 */
 	if (!*buf) {
-		cpus_clear(trialcs.cpus_allowed);
+		cpus_clear(trialcs->cpus_allowed);
 	} else {
-		retval = cpulist_parse(buf, &trialcs.cpus_allowed);
+		retval = cpulist_parse(buf, &trialcs->cpus_allowed);
 		if (retval < 0)
 			return retval;
 
-		if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map))
+		if (!cpus_subset(trialcs->cpus_allowed, cpu_online_map))
 			return -EINVAL;
 	}
-	retval = validate_change(cs, &trialcs);
+	retval = validate_change(cs, trialcs);
 	if (retval < 0)
 		return retval;
 
 	/* Nothing to do if the cpus didn't change */
-	if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
+	if (cpus_equal(cs->cpus_allowed, trialcs->cpus_allowed))
 		return 0;
 
 	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
 	if (retval)
 		return retval;
 
-	is_load_balanced = is_sched_load_balance(&trialcs);
+	is_load_balanced = is_sched_load_balance(trialcs);
 
 	mutex_lock(&callback_mutex);
-	cs->cpus_allowed = trialcs.cpus_allowed;
+	cs->cpus_allowed = trialcs->cpus_allowed;
 	mutex_unlock(&callback_mutex);
 
 	/*
@@ -1099,9 +1115,9 @@ done:
  * lock each such tasks mm->mmap_sem, scan its vma's and rebind
  * their mempolicies to the cpusets new mems_allowed.
  */
-static int update_nodemask(struct cpuset *cs, const char *buf)
+static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
+			   const char *buf)
 {
-	struct cpuset trialcs;
 	nodemask_t oldmem;
 	int retval;
 
@@ -1112,8 +1128,6 @@ static int update_nodemask(struct cpuset *cs, const char *buf)
 	if (cs == &top_cpuset)
 		return -EACCES;
 
-	trialcs = *cs;
-
 	/*
 	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
 	 * Since nodelist_parse() fails on an empty mask, we special case
@@ -1121,27 +1135,27 @@ static int update_nodemask(struct cpuset *cs, const char *buf)
 	 * with tasks have memory.
 	 */
 	if (!*buf) {
-		nodes_clear(trialcs.mems_allowed);
+		nodes_clear(trialcs->mems_allowed);
 	} else {
-		retval = nodelist_parse(buf, trialcs.mems_allowed);
+		retval = nodelist_parse(buf, trialcs->mems_allowed);
 		if (retval < 0)
 			goto done;
 
-		if (!nodes_subset(trialcs.mems_allowed,
+		if (!nodes_subset(trialcs->mems_allowed,
 				node_states[N_HIGH_MEMORY]))
 			return -EINVAL;
 	}
 	oldmem = cs->mems_allowed;
-	if (nodes_equal(oldmem, trialcs.mems_allowed)) {
+	if (nodes_equal(oldmem, trialcs->mems_allowed)) {
 		retval = 0;		/* Too easy - nothing to do */
 		goto done;
 	}
-	retval = validate_change(cs, &trialcs);
+	retval = validate_change(cs, trialcs);
 	if (retval < 0)
 		goto done;
 
 	mutex_lock(&callback_mutex);
-	cs->mems_allowed = trialcs.mems_allowed;
+	cs->mems_allowed = trialcs->mems_allowed;
 	cs->mems_generation = cpuset_mems_generation++;
 	mutex_unlock(&callback_mutex);
 
@@ -1181,31 +1195,36 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 		       int turning_on)
 {
-	struct cpuset trialcs;
+	struct cpuset *trialcs;
 	int err;
 	int balance_flag_changed;
 
-	trialcs = *cs;
+	trialcs = alloc_trial_cpuset(cs);
+	if (!trialcs)
+		return -ENOMEM;
+
 	if (turning_on)
-		set_bit(bit, &trialcs.flags);
+		set_bit(bit, &trialcs->flags);
 	else
-		clear_bit(bit, &trialcs.flags);
+		clear_bit(bit, &trialcs->flags);
 
-	err = validate_change(cs, &trialcs);
+	err = validate_change(cs, trialcs);
 	if (err < 0)
-		return err;
+		goto out;
 
 	balance_flag_changed = (is_sched_load_balance(cs) !=
-		 			is_sched_load_balance(&trialcs));
+				is_sched_load_balance(trialcs));
 
 	mutex_lock(&callback_mutex);
-	cs->flags = trialcs.flags;
+	cs->flags = trialcs->flags;
 	mutex_unlock(&callback_mutex);
 
-	if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed)
+	if (!cpus_empty(trialcs->cpus_allowed) && balance_flag_changed)
 		async_rebuild_sched_domains();
 
-	return 0;
+out:
+	free_trial_cpuset(trialcs);
+	return err;
 }
 
 /*
@@ -1453,21 +1472,29 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
 				const char *buf)
 {
 	int retval = 0;
+	struct cpuset *cs = cgroup_cs(cgrp);
+	struct cpuset *trialcs;
 
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
 
+	trialcs = alloc_trial_cpuset(cs);
+	if (!trialcs)
+		return -ENOMEM;
+
 	switch (cft->private) {
 	case FILE_CPULIST:
-		retval = update_cpumask(cgroup_cs(cgrp), buf);
+		retval = update_cpumask(cs, trialcs, buf);
 		break;
 	case FILE_MEMLIST:
-		retval = update_nodemask(cgroup_cs(cgrp), buf);
+		retval = update_nodemask(cs, trialcs, buf);
 		break;
 	default:
 		retval = -EINVAL;
 		break;
 	}
+
+	free_trial_cpuset(trialcs);
 	cgroup_unlock();
 	return retval;
 }
-- 
cgit v1.2.3


From 300ed6cbb70718872cb4936d1d22ef295f9ba44d Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:08:44 -0800
Subject: cpuset: convert cpuset->cpus_allowed to cpumask_var_t

Impact: use new cpumask API

This patch mainly does the following things:
- change cs->cpus_allowed from cpumask_t to cpumask_var_t
- call alloc_bootmem_cpumask_var() for top_cpuset in cpuset_init_early()
- call alloc_cpumask_var() for other cpusets
- replace cpus_xxx() to cpumask_xxx()

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Mike Travis <travis@sgi.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 100 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 60 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f66527bfd21..fc294aa9a97 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -84,7 +84,7 @@ struct cpuset {
 	struct cgroup_subsys_state css;
 
 	unsigned long flags;		/* "unsigned long" so bitops work */
-	cpumask_t cpus_allowed;		/* CPUs allowed to tasks in cpuset */
+	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */
 	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */
 
 	struct cpuset *parent;		/* my parent */
@@ -195,8 +195,6 @@ static int cpuset_mems_generation;
 
 static struct cpuset top_cpuset = {
 	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
-	.cpus_allowed = CPU_MASK_ALL,
-	.mems_allowed = NODE_MASK_ALL,
 };
 
 /*
@@ -278,7 +276,7 @@ static struct file_system_type cpuset_fs_type = {
 };
 
 /*
- * Return in *pmask the portion of a cpusets's cpus_allowed that
+ * Return in pmask the portion of a cpusets's cpus_allowed that
  * are online.  If none are online, walk up the cpuset hierarchy
  * until we find one that does have some online cpus.  If we get
  * all the way to the top and still haven't found any online cpus,
@@ -293,13 +291,13 @@ static struct file_system_type cpuset_fs_type = {
 
 static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
 {
-	while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map))
+	while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
 		cs = cs->parent;
 	if (cs)
-		cpus_and(*pmask, cs->cpus_allowed, cpu_online_map);
+		cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
 	else
-		*pmask = cpu_online_map;
-	BUG_ON(!cpus_intersects(*pmask, cpu_online_map));
+		cpumask_copy(pmask, cpu_online_mask);
+	BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
 }
 
 /*
@@ -409,7 +407,7 @@ void cpuset_update_task_memory_state(void)
 
 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
 {
-	return	cpus_subset(p->cpus_allowed, q->cpus_allowed) &&
+	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
 		nodes_subset(p->mems_allowed, q->mems_allowed) &&
 		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
 		is_mem_exclusive(p) <= is_mem_exclusive(q);
@@ -421,7 +419,19 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
  */
 static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
 {
-	return kmemdup(cs, sizeof(*cs), GFP_KERNEL);
+	struct cpuset *trial;
+
+	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
+	if (!trial)
+		return NULL;
+
+	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
+		kfree(trial);
+		return NULL;
+	}
+	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+
+	return trial;
 }
 
 /**
@@ -430,6 +440,7 @@ static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
  */
 static void free_trial_cpuset(struct cpuset *trial)
 {
+	free_cpumask_var(trial->cpus_allowed);
 	kfree(trial);
 }
 
@@ -482,7 +493,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 		c = cgroup_cs(cont);
 		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
 		    c != cur &&
-		    cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
+		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
 			return -EINVAL;
 		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
 		    c != cur &&
@@ -492,7 +503,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 
 	/* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
 	if (cgroup_task_count(cur->css.cgroup)) {
-		if (cpus_empty(trial->cpus_allowed) ||
+		if (cpumask_empty(trial->cpus_allowed) ||
 		    nodes_empty(trial->mems_allowed)) {
 			return -ENOSPC;
 		}
@@ -507,7 +518,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
  */
 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 {
-	return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
+	return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
 }
 
 static void
@@ -532,7 +543,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
 		cp = list_first_entry(&q, struct cpuset, stack_list);
 		list_del(q.next);
 
-		if (cpus_empty(cp->cpus_allowed))
+		if (cpumask_empty(cp->cpus_allowed))
 			continue;
 
 		if (is_sched_load_balance(cp))
@@ -627,7 +638,7 @@ static int generate_sched_domains(cpumask_t **domains,
 			*dattr = SD_ATTR_INIT;
 			update_domain_attr_tree(dattr, &top_cpuset);
 		}
-		*doms = top_cpuset.cpus_allowed;
+		cpumask_copy(doms, top_cpuset.cpus_allowed);
 
 		ndoms = 1;
 		goto done;
@@ -646,7 +657,7 @@ static int generate_sched_domains(cpumask_t **domains,
 		cp = list_first_entry(&q, struct cpuset, stack_list);
 		list_del(q.next);
 
-		if (cpus_empty(cp->cpus_allowed))
+		if (cpumask_empty(cp->cpus_allowed))
 			continue;
 
 		/*
@@ -739,7 +750,7 @@ restart:
 			struct cpuset *b = csa[j];
 
 			if (apn == b->pn) {
-				cpus_or(*dp, *dp, b->cpus_allowed);
+				cpumask_or(dp, dp, b->cpus_allowed);
 				if (dattr)
 					update_domain_attr_tree(dattr + nslot, b);
 
@@ -848,7 +859,7 @@ void rebuild_sched_domains(void)
 static int cpuset_test_cpumask(struct task_struct *tsk,
 			       struct cgroup_scanner *scan)
 {
-	return !cpus_equal(tsk->cpus_allowed,
+	return !cpumask_equal(&tsk->cpus_allowed,
 			(cgroup_cs(scan->cg))->cpus_allowed);
 }
 
@@ -866,7 +877,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
 static void cpuset_change_cpumask(struct task_struct *tsk,
 				  struct cgroup_scanner *scan)
 {
-	set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed));
+	set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
 }
 
 /**
@@ -916,13 +927,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	 * with tasks have cpus.
 	 */
 	if (!*buf) {
-		cpus_clear(trialcs->cpus_allowed);
+		cpumask_clear(trialcs->cpus_allowed);
 	} else {
-		retval = cpulist_parse(buf, &trialcs->cpus_allowed);
+		retval = cpulist_parse(buf, trialcs->cpus_allowed);
 		if (retval < 0)
 			return retval;
 
-		if (!cpus_subset(trialcs->cpus_allowed, cpu_online_map))
+		if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask))
 			return -EINVAL;
 	}
 	retval = validate_change(cs, trialcs);
@@ -930,7 +941,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 		return retval;
 
 	/* Nothing to do if the cpus didn't change */
-	if (cpus_equal(cs->cpus_allowed, trialcs->cpus_allowed))
+	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
 		return 0;
 
 	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
@@ -940,7 +951,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	is_load_balanced = is_sched_load_balance(trialcs);
 
 	mutex_lock(&callback_mutex);
-	cs->cpus_allowed = trialcs->cpus_allowed;
+	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
 	mutex_unlock(&callback_mutex);
 
 	/*
@@ -1028,7 +1039,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */
 
 	fudge = 10;				/* spare mmarray[] slots */
-	fudge += cpus_weight(cs->cpus_allowed);	/* imagine one fork-bomb/cpu */
+	fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
 	retval = -ENOMEM;
 
 	/*
@@ -1176,7 +1187,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 
 	if (val != cs->relax_domain_level) {
 		cs->relax_domain_level = val;
-		if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
+		if (!cpumask_empty(cs->cpus_allowed) &&
+		    is_sched_load_balance(cs))
 			async_rebuild_sched_domains();
 	}
 
@@ -1219,7 +1231,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	cs->flags = trialcs->flags;
 	mutex_unlock(&callback_mutex);
 
-	if (!cpus_empty(trialcs->cpus_allowed) && balance_flag_changed)
+	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
 		async_rebuild_sched_domains();
 
 out:
@@ -1335,12 +1347,12 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
 	struct cpuset *cs = cgroup_cs(cont);
 	int ret = 0;
 
-	if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
 		return -ENOSPC;
 
 	if (tsk->flags & PF_THREAD_BOUND) {
 		mutex_lock(&callback_mutex);
-		if (!cpus_equal(tsk->cpus_allowed, cs->cpus_allowed))
+		if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed))
 			ret = -EINVAL;
 		mutex_unlock(&callback_mutex);
 	}
@@ -1516,7 +1528,7 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 	int ret;
 
 	mutex_lock(&callback_mutex);
-	ret = cpulist_scnprintf(page, PAGE_SIZE, &cs->cpus_allowed);
+	ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
 	mutex_unlock(&callback_mutex);
 
 	return ret;
@@ -1755,7 +1767,7 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
 	parent_cs = cgroup_cs(parent);
 
 	cs->mems_allowed = parent_cs->mems_allowed;
-	cs->cpus_allowed = parent_cs->cpus_allowed;
+	cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
 	return;
 }
 
@@ -1781,6 +1793,10 @@ static struct cgroup_subsys_state *cpuset_create(
 	cs = kmalloc(sizeof(*cs), GFP_KERNEL);
 	if (!cs)
 		return ERR_PTR(-ENOMEM);
+	if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
+		kfree(cs);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	cpuset_update_task_memory_state();
 	cs->flags = 0;
@@ -1789,7 +1805,7 @@ static struct cgroup_subsys_state *cpuset_create(
 	if (is_spread_slab(parent))
 		set_bit(CS_SPREAD_SLAB, &cs->flags);
 	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
-	cpus_clear(cs->cpus_allowed);
+	cpumask_clear(cs->cpus_allowed);
 	nodes_clear(cs->mems_allowed);
 	cs->mems_generation = cpuset_mems_generation++;
 	fmeter_init(&cs->fmeter);
@@ -1816,6 +1832,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
 		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
 
 	number_of_cpusets--;
+	free_cpumask_var(cs->cpus_allowed);
 	kfree(cs);
 }
 
@@ -1839,6 +1856,8 @@ struct cgroup_subsys cpuset_subsys = {
 
 int __init cpuset_init_early(void)
 {
+	alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
+
 	top_cpuset.mems_generation = cpuset_mems_generation++;
 	return 0;
 }
@@ -1854,7 +1873,7 @@ int __init cpuset_init(void)
 {
 	int err = 0;
 
-	cpus_setall(top_cpuset.cpus_allowed);
+	cpumask_setall(top_cpuset.cpus_allowed);
 	nodes_setall(top_cpuset.mems_allowed);
 
 	fmeter_init(&top_cpuset.fmeter);
@@ -1943,7 +1962,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 	 * has online cpus, so can't be empty).
 	 */
 	parent = cs->parent;
-	while (cpus_empty(parent->cpus_allowed) ||
+	while (cpumask_empty(parent->cpus_allowed) ||
 			nodes_empty(parent->mems_allowed))
 		parent = parent->parent;
 
@@ -1984,7 +2003,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 		}
 
 		/* Continue past cpusets with all cpus, mems online */
-		if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
+		if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) &&
 		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
 			continue;
 
@@ -1992,13 +2011,14 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 
 		/* Remove offline cpus and mems from this cpuset. */
 		mutex_lock(&callback_mutex);
-		cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
+		cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
+			    cpu_online_mask);
 		nodes_and(cp->mems_allowed, cp->mems_allowed,
 						node_states[N_HIGH_MEMORY]);
 		mutex_unlock(&callback_mutex);
 
 		/* Move tasks from the empty cpuset to a parent */
-		if (cpus_empty(cp->cpus_allowed) ||
+		if (cpumask_empty(cp->cpus_allowed) ||
 		     nodes_empty(cp->mems_allowed))
 			remove_tasks_in_empty_cpuset(cp);
 		else {
@@ -2039,7 +2059,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 	}
 
 	cgroup_lock();
-	top_cpuset.cpus_allowed = cpu_online_map;
+	cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
 	scan_for_empty_cpusets(&top_cpuset);
 	ndoms = generate_sched_domains(&doms, &attr);
 	cgroup_unlock();
@@ -2084,7 +2104,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
 
 void __init cpuset_init_smp(void)
 {
-	top_cpuset.cpus_allowed = cpu_online_map;
+	cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 
 	hotcpu_notifier(cpuset_track_online_cpus, 0);
@@ -2096,7 +2116,7 @@ void __init cpuset_init_smp(void)
  * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
  * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
  *
- * Description: Returns the cpumask_t cpus_allowed of the cpuset
+ * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
  * attached to the specified @tsk.  Guaranteed to return some non-empty
  * subset of cpu_online_map, even if this means going outside the
  * tasks cpuset.
-- 
cgit v1.2.3


From 6af866af34a96fed24a55979a78b6f73bd4e8e87 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:08:45 -0800
Subject: cpuset: remove remaining pointers to cpumask_t

Impact: cleanups, use new cpumask API

Final trivial cleanups: mainly s/cpumask_t/struct cpumask

Note there is a FIXME in generate_sched_domains(). A future patch will
change struct cpumask *doms to struct cpumask *doms[].
(I suppose Rusty will do this.)

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Mike Travis <travis@sgi.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index fc294aa9a97..647c77a88fc 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -289,7 +289,8 @@ static struct file_system_type cpuset_fs_type = {
  * Call with callback_mutex held.
  */
 
-static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
+static void guarantee_online_cpus(const struct cpuset *cs,
+				  struct cpumask *pmask)
 {
 	while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
 		cs = cs->parent;
@@ -610,7 +611,8 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
  *	element of the partition (one sched domain) to be passed to
  *	partition_sched_domains().
  */
-static int generate_sched_domains(cpumask_t **domains,
+/* FIXME: see the FIXME in partition_sched_domains() */
+static int generate_sched_domains(struct cpumask **domains,
 			struct sched_domain_attr **attributes)
 {
 	LIST_HEAD(q);		/* queue of cpusets to be scanned */
@@ -618,10 +620,10 @@ static int generate_sched_domains(cpumask_t **domains,
 	struct cpuset **csa;	/* array of all cpuset ptrs */
 	int csn;		/* how many cpuset ptrs in csa so far */
 	int i, j, k;		/* indices for partition finding loops */
-	cpumask_t *doms;	/* resulting partition; i.e. sched domains */
+	struct cpumask *doms;	/* resulting partition; i.e. sched domains */
 	struct sched_domain_attr *dattr;  /* attributes for custom domains */
 	int ndoms = 0;		/* number of sched domains in result */
-	int nslot;		/* next empty doms[] cpumask_t slot */
+	int nslot;		/* next empty doms[] struct cpumask slot */
 
 	doms = NULL;
 	dattr = NULL;
@@ -629,7 +631,7 @@ static int generate_sched_domains(cpumask_t **domains,
 
 	/* Special case for the 99% of systems with one, full, sched domain */
 	if (is_sched_load_balance(&top_cpuset)) {
-		doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+		doms = kmalloc(cpumask_size(), GFP_KERNEL);
 		if (!doms)
 			goto done;
 
@@ -708,7 +710,7 @@ restart:
 	 * Now we know how many domains to create.
 	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
 	 */
-	doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
+	doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL);
 	if (!doms)
 		goto done;
 
@@ -720,7 +722,7 @@ restart:
 
 	for (nslot = 0, i = 0; i < csn; i++) {
 		struct cpuset *a = csa[i];
-		cpumask_t *dp;
+		struct cpumask *dp;
 		int apn = a->pn;
 
 		if (apn < 0) {
@@ -743,7 +745,7 @@ restart:
 			continue;
 		}
 
-		cpus_clear(*dp);
+		cpumask_clear(dp);
 		if (dattr)
 			*(dattr + nslot) = SD_ATTR_INIT;
 		for (j = i; j < csn; j++) {
@@ -790,7 +792,7 @@ done:
 static void do_rebuild_sched_domains(struct work_struct *unused)
 {
 	struct sched_domain_attr *attr;
-	cpumask_t *doms;
+	struct cpumask *doms;
 	int ndoms;
 
 	get_online_cpus();
@@ -2044,7 +2046,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 				unsigned long phase, void *unused_cpu)
 {
 	struct sched_domain_attr *attr;
-	cpumask_t *doms;
+	struct cpumask *doms;
 	int ndoms;
 
 	switch (phase) {
@@ -2114,7 +2116,7 @@ void __init cpuset_init_smp(void)
 /**
  * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
  * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
- * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
+ * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
  *
  * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
  * attached to the specified @tsk.  Guaranteed to return some non-empty
@@ -2122,7 +2124,7 @@ void __init cpuset_init_smp(void)
  * tasks cpuset.
  **/
 
-void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask)
+void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
 	mutex_lock(&callback_mutex);
 	cpuset_cpus_allowed_locked(tsk, pmask);
@@ -2133,7 +2135,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask)
  * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
  * Must be called with callback_mutex held.
  **/
-void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask)
+void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
 {
 	task_lock(tsk);
 	guarantee_online_cpus(task_cs(tsk), pmask);
-- 
cgit v1.2.3


From 61bce0f1371cfff497fe85594fd39d1a0b15ebe1 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 7 Jan 2009 18:08:49 -0800
Subject: pid: generalize task_active_pid_ns

Currently task_active_pid_ns is not safe to call after a task becomes a
zombie and exit_task_namespaces is called, as nsproxy becomes NULL.  By
reading the pid namespace from the pid of the task we can trivially solve
this problem at the cost of one extra memory read in what should be the
same cacheline as we read the namespace from.

When moving things around I have made task_active_pid_ns out of line
because keeping it in pid_namespace.h would require adding includes of
pid.h and sched.h that I don't think we want.

This change does make task_active_pid_ns unsafe to call during
copy_process until we attach a pid on the task_struct which seems to be a
reasonable trade off.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Bastian Blank <bastian@waldi.eu.org>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Nadia Derbey <Nadia.Derbey@bull.net>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 4 ++--
 kernel/pid.c  | 6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 7b8f2a78be3..4018308048c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1126,12 +1126,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	if (pid != &init_struct_pid) {
 		retval = -ENOMEM;
-		pid = alloc_pid(task_active_pid_ns(p));
+		pid = alloc_pid(p->nsproxy->pid_ns);
 		if (!pid)
 			goto bad_fork_cleanup_io;
 
 		if (clone_flags & CLONE_NEWPID) {
-			retval = pid_ns_prepare_proc(task_active_pid_ns(p));
+			retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
 			if (retval < 0)
 				goto bad_fork_free_pid;
 		}
diff --git a/kernel/pid.c b/kernel/pid.c
index af9224cdd6c..1b3586fe753 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -474,6 +474,12 @@ pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
 }
 EXPORT_SYMBOL(task_session_nr_ns);
 
+struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
+{
+	return ns_of_pid(task_pid(tsk));
+}
+EXPORT_SYMBOL_GPL(task_active_pid_ns);
+
 /*
  * Used by proc to find the first pid that is greater than or equal to nr.
  *
-- 
cgit v1.2.3


From df4927bf6ccf6278a97a44bd107080c71b269cb5 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 7 Jan 2009 18:09:14 -0800
Subject: generic swap(): sched: remove local swap() macro

Use the new generic implementation.

Signed-off-by: Wu Fengguang <wfg@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched_fair.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e0c0b4bc3f0..8e1352c7555 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1617,8 +1617,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 	}
 }
 
-#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
-
 /*
  * Share the fairness runtime between parent and child, thus the
  * total amount of pressure for CPU stays equal - new tasks
-- 
cgit v1.2.3


From 33b04b9308959af7febc1c111c766fa3fd8b1934 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Thu, 8 Jan 2009 12:35:11 -0800
Subject: async: make async_synchronize_full() more serializing

turns out that there are real problems with allowing async
tasks that are scheduled from async tasks to run after
the async_synchronize_full() returns.

This patch makes the _full more strict and a complete
synchronization. Later I might need to add back a lighter
form of synchronization for other uses.. but not right now.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/async.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/async.c b/kernel/async.c
index 97373380c9e..64cc916299a 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -206,7 +206,9 @@ EXPORT_SYMBOL_GPL(async_schedule_special);
 
 void async_synchronize_full(void)
 {
-	async_synchronize_cookie(next_cookie);
+	do {
+		async_synchronize_cookie(next_cookie);
+	} while (!list_empty(&async_running) || !list_empty(&async_pending));
 }
 EXPORT_SYMBOL_GPL(async_synchronize_full);
 
-- 
cgit v1.2.3


From 0de336814107358bc8c4173bf9ce2d42445173fe Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 9 Jan 2009 16:13:41 +0000
Subject: CRED: Missing put_cred() in prepare_kernel_cred()

Missing put_cred() in the error handling path of prepare_kernel_cred().

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Steve Dickson <steved@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cred.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index ff7bc071991..fc222e4acfb 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -529,6 +529,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 
 error:
 	put_cred(new);
+	put_cred(old);
 	return NULL;
 }
 EXPORT_SYMBOL(prepare_kernel_cred);
-- 
cgit v1.2.3


From 43529c97122f2c851126447963eedcb8cba74fbe Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 9 Jan 2009 16:13:46 +0000
Subject: CRED: Must initialise the new creds in prepare_kernel_cred()

The newly allocated creds in prepare_kernel_cred() must be initialised
before get_uid() and get_group_info() can access them.  They should be
copied from the old credentials.

Reported-by: Steve Dickson <steved@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Steve Dickson <steved@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cred.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index fc222e4acfb..043f78c133c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -506,6 +506,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 	else
 		old = get_cred(&init_cred);
 
+	*new = *old;
 	get_uid(new->user);
 	get_group_info(new->group_info);
 
-- 
cgit v1.2.3


From cdb80f630be5cbc23d82331f24dc4704f75b64f4 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 9 Jan 2009 13:23:45 -0800
Subject: async: make async a command line option for now

... and have it default off.
This does allow people to work with it for testing.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 kernel/async.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/async.c b/kernel/async.c
index 64cc916299a..f286e9f2b73 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -65,6 +65,8 @@ static LIST_HEAD(async_pending);
 static LIST_HEAD(async_running);
 static DEFINE_SPINLOCK(async_lock);
 
+static int async_enabled = 0;
+
 struct async_entry {
 	struct list_head list;
 	async_cookie_t   cookie;
@@ -169,7 +171,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
 	 * If we're out of memory or if there's too much work
 	 * pending already, we execute synchronously.
 	 */
-	if (!entry || atomic_read(&entry_count) > MAX_WORK) {
+	if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) {
 		kfree(entry);
 		spin_lock_irqsave(&async_lock, flags);
 		newcookie = next_cookie++;
@@ -316,8 +318,18 @@ static int async_manager_thread(void *unused)
 
 static int __init async_init(void)
 {
-	kthread_run(async_manager_thread, NULL, "async/mgr");
+	if (async_enabled)
+		kthread_run(async_manager_thread, NULL, "async/mgr");
 	return 0;
 }
 
+static int __init setup_async(char *str)
+{
+	async_enabled = 1;
+	return 1;
+}
+
+__setup("fastboot", setup_async);
+
+
 core_initcall(async_init);
-- 
cgit v1.2.3


From 62ea9ceb17a74bc7544211bfeecf4170c554ac4f Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sun, 11 Jan 2009 01:04:16 +0100
Subject: cpumask: fix CONFIG_NUMA=y sched.c

Impact: fix panic on ia64 with NR_CPUS=1024

struct sched_domain is now a dangling structure; where we really want
static ones, we need to use static_sched_domain.

(As the FIXME in this file says, cpumask_var_t would be better, but
this code is hairy enough without trying to add initialization code to
the right places).

Reported-by: Mike Travis <travis@sgi.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index deb5ac8c12f..f0c0a81d763 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7282,10 +7282,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
  * groups, so roll our own. Now each node has its own list of groups which
  * gets dynamically allocated.
  */
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
 static struct sched_group ***sched_group_nodes_bycpu;
 
-static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
 
 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
@@ -7560,7 +7560,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 #ifdef CONFIG_NUMA
 		if (cpumask_weight(cpu_map) >
 				SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
-			sd = &per_cpu(allnodes_domains, i);
+			sd = &per_cpu(allnodes_domains, i).sd;
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
 			cpumask_copy(sched_domain_span(sd), cpu_map);
@@ -7570,7 +7570,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		} else
 			p = NULL;
 
-		sd = &per_cpu(node_domains, i);
+		sd = &per_cpu(node_domains, i).sd;
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
 		sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
@@ -7688,7 +7688,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		for_each_cpu(j, nodemask) {
 			struct sched_domain *sd;
 
-			sd = &per_cpu(node_domains, j);
+			sd = &per_cpu(node_domains, j).sd;
 			sd->groups = sg;
 		}
 		sg->__cpu_power = 0;
-- 
cgit v1.2.3


From 805194c35b91999b139e4d6b6145f4f84fd4c814 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Sat, 10 Jan 2009 15:43:15 +0800
Subject: sched: partly revert "sched debug: remove NULL checking in
 print_cfs_rt_rq()"

Impact: avoid accessing NULL tg.css->cgroup

In commit 0a0db8f5c9d4bbb9bbfcc2b6cb6bce2d0ef4d73d, I removed checking
NULL tg.css->cgroup, but I realized I was wrong when I found reading
/proc/sched_debug can race with cgroup_create().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_debug.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 4293cfa9681..16eeba4e416 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -145,6 +145,19 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 	read_unlock_irqrestore(&tasklist_lock, flags);
 }
 
+#if defined(CONFIG_CGROUP_SCHED) && \
+	(defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
+static void task_group_path(struct task_group *tg, char *buf, int buflen)
+{
+	/* may be NULL if the underlying cgroup isn't fully-created yet */
+	if (!tg->css.cgroup) {
+		buf[0] = '\0';
+		return;
+	}
+	cgroup_path(tg->css.cgroup, buf, buflen);
+}
+#endif
+
 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 {
 	s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -154,10 +167,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	unsigned long flags;
 
 #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
-	char path[128] = "";
+	char path[128];
 	struct task_group *tg = cfs_rq->tg;
 
-	cgroup_path(tg->css.cgroup, path, sizeof(path));
+	task_group_path(tg, path, sizeof(path));
 
 	SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
 #elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
@@ -208,10 +221,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
 #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
-	char path[128] = "";
+	char path[128];
 	struct task_group *tg = rt_rq->tg;
 
-	cgroup_path(tg->css.cgroup, path, sizeof(path));
+	task_group_path(tg, path, sizeof(path));
 
 	SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
 #else
-- 
cgit v1.2.3


From 53ce3d9564908794ae7dd32969089b57df5fc098 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 9 Jan 2009 12:27:08 -0800
Subject: smp_call_function_single(): be slightly less stupid

If you do

	smp_call_function_single(expression-with-side-effects, ...)

then expression-with-side-effects never gets evaluated on UP builds.

As always, implementing it in C is the correct thing to do.

While we're there, uninline it for size and possible header dependency
reasons.

And create a new kernel/up.c, as a place in which to put
uniprocessor-specific code and storage.  It should mirror kernel/smp.c.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile |  6 +++++-
 kernel/up.c     | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)
 create mode 100644 kernel/up.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 2921d90ce32..2aebc4cd787 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -40,7 +40,11 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
+ifeq ($(CONFIG_USE_GENERIC_SMP_HELPERS),y)
+obj-y += smp.o
+else
+obj-y += up.o
+endif
 obj-$(CONFIG_SMP) += spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
diff --git a/kernel/up.c b/kernel/up.c
new file mode 100644
index 00000000000..ce62cc9e9f7
--- /dev/null
+++ b/kernel/up.c
@@ -0,0 +1,18 @@
+/*
+ * Uniprocessor-only support functions.  The counterpart to kernel/smp.c
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/smp.h>
+
+int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+				int wait)
+{
+	WARN_ON(cpuid != 0);
+	local_irq_disable();
+	(func)(info);
+	local_irq_enable();
+	return 0;
+}
+EXPORT_SYMBOL(smp_call_function_single);
-- 
cgit v1.2.3


From 93423b8665f43a0c7a006a1d5be048b99db56d32 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 11 Jan 2009 05:15:21 +0100
Subject: smp_call_function_single(): be slightly less stupid, fix

Impact: build fix on Alpha

 kernel/up.c: In function 'smp_call_function_single':
 kernel/up.c:12: error: 'cpuid' undeclared (first use in this function)
 kernel/up.c:12: error: (Each undeclared identifier is reported only once
 kernel/up.c:12: error: for each function it appears in.)

The typo didnt show up on x86 because 'cpuid' happens to be a
function address as well ...

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/up.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/up.c b/kernel/up.c
index ce62cc9e9f7..c04b9dcfceb 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -9,10 +9,12 @@
 int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 				int wait)
 {
-	WARN_ON(cpuid != 0);
+	WARN_ON(cpu != 0);
+
 	local_irq_disable();
 	(func)(info);
 	local_irq_enable();
+
 	return 0;
 }
 EXPORT_SYMBOL(smp_call_function_single);
-- 
cgit v1.2.3


From fd2ab30b65e961b974ae0bc71e0d47d6b35e0968 Mon Sep 17 00:00:00 2001
From: Steven Noonan <steven@uplinklabs.net>
Date: Sun, 11 Jan 2009 01:04:22 -0800
Subject: kernel/sched.c: add missing forward declaration for 'double_rq_lock'

Impact: build fix on certain configs

Added 'double_rq_lock' forward declaration, allowing double_rq_lock
to be used in _double_lock_balance().

Signed-off-by: Steven Noonan <steven@uplinklabs.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f0c0a81d763..8be2c13b50d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -125,6 +125,9 @@ DEFINE_TRACE(sched_switch);
 DEFINE_TRACE(sched_migrate_task);
 
 #ifdef CONFIG_SMP
+
+static void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
 /*
  * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
  * Since cpu_power is a 'constant', we can use a reciprocal divide.
-- 
cgit v1.2.3


From 01e3eb82278bf45221fc38b391bc5ee0f6a314d6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 Jan 2009 13:00:50 +0100
Subject: Revert "sched: improve preempt debugging"

This reverts commit 7317d7b87edb41a9135e30be1ec3f7ef817c53dd.

This has been reported (and bisected) by Alexey Zaytsev and
Kamalesh Babulal to produce annoying warnings during bootup
on both x86 and powerpc.

kernel_locked() is not a valid test in IRQ context (we update the
BKL's ->lock_depth and the preempt count separately and non-atomicalyy),
so we cannot put it into the generic preempt debugging checks which
can run in IRQ contexts too.

Reported-and-bisected-by: Alexey Zaytsev <alexey.zaytsev@gmail.com>
Reported-and-bisected-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8be2c13b50d..3b630d88266 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4440,7 +4440,7 @@ void __kprobes sub_preempt_count(int val)
 	/*
 	 * Underflow?
 	 */
-       if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
+	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
 		return;
 	/*
 	 * Is the spinlock portion underflowing?
-- 
cgit v1.2.3


From 6e96281412f2f757abe623e08a9577e2bbd3402f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 Jan 2009 16:04:37 +0100
Subject: smp_call_function_single(): be slightly less stupid, fix #2

fix m68k build failure:

 tip/kernel/up.c: In function 'smp_call_function_single':
 tip/kernel/up.c:16: error: dereferencing pointer to incomplete type
 make[2]: *** [kernel/up.o] Error 1

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/up.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/up.c b/kernel/up.c
index c04b9dcfceb..1ff27a28bb7 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -2,6 +2,7 @@
  * Uniprocessor-only support functions.  The counterpart to kernel/smp.c
  */
 
+#include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/smp.h>
-- 
cgit v1.2.3


From 37a76bd4f1b716949fc38a6842e89f0ccb8384d0 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 11 Jan 2009 15:35:01 +0000
Subject: async: fix __lowest_in_progress()

At 37000 feet somewhere near Greenland I woke up from a half-sleep with the
realisation that __lowest_in_progress() is buggy. After landing I checked
and there were indeed 2 problems with it; this patch fixes both:
* The order of the list checks was wrong
* The locking was not correct.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/async.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/async.c b/kernel/async.c
index f286e9f2b73..608b32b4281 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -90,12 +90,12 @@ extern int initcall_debug;
 static async_cookie_t  __lowest_in_progress(struct list_head *running)
 {
 	struct async_entry *entry;
-	if (!list_empty(&async_pending)) {
-		entry = list_first_entry(&async_pending,
+	if (!list_empty(running)) {
+		entry = list_first_entry(running,
 			struct async_entry, list);
 		return entry->cookie;
-	} else if (!list_empty(running)) {
-		entry = list_first_entry(running,
+	} else if (!list_empty(&async_pending)) {
+		entry = list_first_entry(&async_pending,
 			struct async_entry, list);
 		return entry->cookie;
 	} else {
@@ -104,6 +104,17 @@ static async_cookie_t  __lowest_in_progress(struct list_head *running)
 	}
 
 }
+
+static async_cookie_t  lowest_in_progress(struct list_head *running)
+{
+	unsigned long flags;
+	async_cookie_t ret;
+
+	spin_lock_irqsave(&async_lock, flags);
+	ret = __lowest_in_progress(running);
+	spin_unlock_irqrestore(&async_lock, flags);
+	return ret;
+}
 /*
  * pick the first pending entry and run it
  */
@@ -229,7 +240,7 @@ void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *r
 		starttime = ktime_get();
 	}
 
-	wait_event(async_done, __lowest_in_progress(running) >= cookie);
+	wait_event(async_done, lowest_in_progress(running) >= cookie);
 
 	if (initcall_debug && system_state == SYSTEM_BOOTING) {
 		endtime = ktime_get();
-- 
cgit v1.2.3


From 2ed7c03ec17779afb4fcfa3b8c61df61bd4879ba Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:13:54 +0100
Subject: [CVE-2009-0029] Convert all system calls to return a long

Convert all system calls to return a long. This should be a NOP since all
converted types should have the same size anyway.
With the exception of sys_exit_group which returned void. But that doesn't
matter since the system call doesn't return.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/exit.c   | 4 +++-
 kernel/signal.c | 2 +-
 kernel/timer.c  | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index c7740fa3252..fac9b040af2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1182,9 +1182,11 @@ do_group_exit(int exit_code)
  * wait4()-ing process will get the correct exit code - even if this
  * thread is not the thread group leader.
  */
-asmlinkage void sys_exit_group(int error_code)
+asmlinkage long sys_exit_group(int error_code)
 {
 	do_group_exit((error_code & 0xff) << 8);
+	/* NOTREACHED */
+	return 0;
 }
 
 static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
diff --git a/kernel/signal.c b/kernel/signal.c
index 3152ac3b62e..856a5479d49 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2559,7 +2559,7 @@ sys_ssetmask(int newmask)
 /*
  * For backwards compatibility.  Functionality superseded by sigaction.
  */
-asmlinkage unsigned long
+asmlinkage long
 sys_signal(int sig, __sighandler_t handler)
 {
 	struct k_sigaction new_sa, old_sa;
diff --git a/kernel/timer.c b/kernel/timer.c
index dee3f641a7a..7b8697d7f04 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1129,7 +1129,7 @@ void do_timer(unsigned long ticks)
  * For backwards compatibility?  This can be done in libc so Alpha
  * and all newer ports shouldn't need it.
  */
-asmlinkage unsigned long sys_alarm(unsigned int seconds)
+asmlinkage long sys_alarm(unsigned int seconds)
 {
 	return alarm_setitimer(seconds);
 }
-- 
cgit v1.2.3


From f627a741d24f12955fa2d9f8831c3b12860635bd Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:13:58 +0100
Subject: [CVE-2009-0029] Make sys_syslog a conditional system call

Remove the -ENOSYS implementation for !CONFIG_PRINTK and use
the cond_syscall infrastructure instead.

Acked-by: Kyle McMartin <kyle@redhat.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/printk.c | 5 -----
 kernel/sys_ni.c | 1 +
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 7015733793e..e48cf33783f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -742,11 +742,6 @@ EXPORT_SYMBOL(vprintk);
 
 #else
 
-asmlinkage long sys_syslog(int type, char __user *buf, int len)
-{
-	return -ENOSYS;
-}
-
 static void call_console_drivers(unsigned start, unsigned end)
 {
 }
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e14a2328170..27dad296738 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -131,6 +131,7 @@ cond_syscall(sys_io_destroy);
 cond_syscall(sys_io_submit);
 cond_syscall(sys_io_cancel);
 cond_syscall(sys_io_getevents);
+cond_syscall(sys_syslog);
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read);
-- 
cgit v1.2.3


From 58fd3aa288939d3097fa04505b25c2f5e6e144d1 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:03 +0100
Subject: [CVE-2009-0029] System call wrappers part 01

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/hrtimer.c |  4 ++--
 kernel/sys.c     |  2 +-
 kernel/time.c    | 14 +++++++-------
 kernel/timer.c   |  6 +++---
 4 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 1455b7651b6..2dc30c59c5f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1467,8 +1467,8 @@ out:
 	return ret;
 }
 
-asmlinkage long
-sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
+SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
+		struct timespec __user *, rmtp)
 {
 	struct timespec tu;
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 763c3c17ded..37165e55233 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -919,7 +919,7 @@ void do_sys_times(struct tms *tms)
 	tms->tms_cstime = cputime_to_clock_t(cstime);
 }
 
-asmlinkage long sys_times(struct tms __user * tbuf)
+SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
 {
 	if (tbuf) {
 		struct tms tmp;
diff --git a/kernel/time.c b/kernel/time.c
index 4886e3ce83a..29511943871 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -60,7 +60,7 @@ EXPORT_SYMBOL(sys_tz);
  * why not move it into the appropriate arch directory (for those
  * architectures that need it).
  */
-asmlinkage long sys_time(time_t __user * tloc)
+SYSCALL_DEFINE1(time, time_t __user *, tloc)
 {
 	time_t i = get_seconds();
 
@@ -79,7 +79,7 @@ asmlinkage long sys_time(time_t __user * tloc)
  * architectures that need it).
  */
 
-asmlinkage long sys_stime(time_t __user *tptr)
+SYSCALL_DEFINE1(stime, time_t __user *, tptr)
 {
 	struct timespec tv;
 	int err;
@@ -99,8 +99,8 @@ asmlinkage long sys_stime(time_t __user *tptr)
 
 #endif /* __ARCH_WANT_SYS_TIME */
 
-asmlinkage long sys_gettimeofday(struct timeval __user *tv,
-				 struct timezone __user *tz)
+SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
+		struct timezone __user *, tz)
 {
 	if (likely(tv != NULL)) {
 		struct timeval ktv;
@@ -184,8 +184,8 @@ int do_sys_settimeofday(struct timespec *tv, struct timezone *tz)
 	return 0;
 }
 
-asmlinkage long sys_settimeofday(struct timeval __user *tv,
-				struct timezone __user *tz)
+SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
+		struct timezone __user *, tz)
 {
 	struct timeval user_tv;
 	struct timespec	new_ts;
@@ -205,7 +205,7 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv,
 	return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
 }
 
-asmlinkage long sys_adjtimex(struct timex __user *txc_p)
+SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
 {
 	struct timex txc;		/* Local copy of parameter */
 	int ret;
diff --git a/kernel/timer.c b/kernel/timer.c
index 7b8697d7f04..76041df06c5 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1129,7 +1129,7 @@ void do_timer(unsigned long ticks)
  * For backwards compatibility?  This can be done in libc so Alpha
  * and all newer ports shouldn't need it.
  */
-asmlinkage long sys_alarm(unsigned int seconds)
+SYSCALL_DEFINE1(alarm, unsigned int, seconds)
 {
 	return alarm_setitimer(seconds);
 }
@@ -1152,7 +1152,7 @@ asmlinkage long sys_alarm(unsigned int seconds)
  *
  * This is SMP safe as current->tgid does not change.
  */
-asmlinkage long sys_getpid(void)
+SYSCALL_DEFINE0(getpid)
 {
 	return task_tgid_vnr(current);
 }
@@ -1308,7 +1308,7 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
 EXPORT_SYMBOL(schedule_timeout_uninterruptible);
 
 /* Thread ID - the internal kernel "pid" */
-asmlinkage long sys_gettid(void)
+SYSCALL_DEFINE0(gettid)
 {
 	return task_pid_vnr(current);
 }
-- 
cgit v1.2.3


From dbf040d9d1cbf1ef6250bdb095c5c118950bcde8 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:04 +0100
Subject: [CVE-2009-0029] System call wrappers part 02

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/sys.c   | 10 +++++-----
 kernel/timer.c | 10 +++++-----
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 37165e55233..4c33555f8d9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -756,7 +756,7 @@ error:
 	return retval;
 }
 
-asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid)
+SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid)
 {
 	const struct cred *cred = current_cred();
 	int retval;
@@ -814,7 +814,7 @@ error:
 	return retval;
 }
 
-asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid)
+SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid)
 {
 	const struct cred *cred = current_cred();
 	int retval;
@@ -1015,7 +1015,7 @@ out:
 	return err;
 }
 
-asmlinkage long sys_getpgid(pid_t pid)
+SYSCALL_DEFINE1(getpgid, pid_t, pid)
 {
 	struct task_struct *p;
 	struct pid *grp;
@@ -1045,14 +1045,14 @@ out:
 
 #ifdef __ARCH_WANT_SYS_GETPGRP
 
-asmlinkage long sys_getpgrp(void)
+SYSCALL_DEFINE0(getpgrp)
 {
 	return sys_getpgid(0);
 }
 
 #endif
 
-asmlinkage long sys_getsid(pid_t pid)
+SYSCALL_DEFINE1(getsid, pid_t, pid)
 {
 	struct task_struct *p;
 	struct pid *sid;
diff --git a/kernel/timer.c b/kernel/timer.c
index 76041df06c5..14a51530a4c 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1163,7 +1163,7 @@ SYSCALL_DEFINE0(getpid)
  * value of ->real_parent under rcu_read_lock(), see
  * release_task()->call_rcu(delayed_put_task_struct).
  */
-asmlinkage long sys_getppid(void)
+SYSCALL_DEFINE0(getppid)
 {
 	int pid;
 
@@ -1174,25 +1174,25 @@ asmlinkage long sys_getppid(void)
 	return pid;
 }
 
-asmlinkage long sys_getuid(void)
+SYSCALL_DEFINE0(getuid)
 {
 	/* Only we change this so SMP safe */
 	return current_uid();
 }
 
-asmlinkage long sys_geteuid(void)
+SYSCALL_DEFINE0(geteuid)
 {
 	/* Only we change this so SMP safe */
 	return current_euid();
 }
 
-asmlinkage long sys_getgid(void)
+SYSCALL_DEFINE0(getgid)
 {
 	/* Only we change this so SMP safe */
 	return current_gid();
 }
 
-asmlinkage long sys_getegid(void)
+SYSCALL_DEFINE0(getegid)
 {
 	/* Only we change this so SMP safe */
 	return  current_egid();
-- 
cgit v1.2.3


From ae1251ab785f6da87219df8352ffdac68bba23e4 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:05 +0100
Subject: [CVE-2009-0029] System call wrappers part 03

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/sys.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 4c33555f8d9..ace9ced598b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -478,7 +478,7 @@ void ctrl_alt_del(void)
  * SMP: There are not races, the GIDs are checked only by filesystem
  *      operations (as far as semantic preservation is concerned).
  */
-asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
+SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
 {
 	const struct cred *old;
 	struct cred *new;
@@ -529,7 +529,7 @@ error:
  *
  * SMP: Same implicit races as above.
  */
-asmlinkage long sys_setgid(gid_t gid)
+SYSCALL_DEFINE1(setgid, gid_t, gid)
 {
 	const struct cred *old;
 	struct cred *new;
@@ -597,7 +597,7 @@ static int set_user(struct cred *new)
  * 100% compatible with BSD.  A program which uses just setuid() will be
  * 100% compatible with POSIX with saved IDs. 
  */
-asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
+SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
 {
 	const struct cred *old;
 	struct cred *new;
@@ -661,7 +661,7 @@ error:
  * will allow a root program to temporarily drop privileges and be able to
  * regain them by swapping the real and effective uid.  
  */
-asmlinkage long sys_setuid(uid_t uid)
+SYSCALL_DEFINE1(setuid, uid_t, uid)
 {
 	const struct cred *old;
 	struct cred *new;
@@ -705,7 +705,7 @@ error:
  * This function implements a generic ability to update ruid, euid,
  * and suid.  This allows you to implement the 4.4 compatible seteuid().
  */
-asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
+SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
 {
 	const struct cred *old;
 	struct cred *new;
@@ -771,7 +771,7 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __u
 /*
  * Same as above, but for rgid, egid, sgid.
  */
-asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
+SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
 {
 	const struct cred *old;
 	struct cred *new;
@@ -833,7 +833,7 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __u
  * whatever uid it wants to). It normally shadows "euid", except when
  * explicitly set by setfsuid() or for access..
  */
-asmlinkage long sys_setfsuid(uid_t uid)
+SYSCALL_DEFINE1(setfsuid, uid_t, uid)
 {
 	const struct cred *old;
 	struct cred *new;
@@ -870,7 +870,7 @@ change_okay:
 /*
  * Samma på svenska..
  */
-asmlinkage long sys_setfsgid(gid_t gid)
+SYSCALL_DEFINE1(setfsgid, gid_t, gid)
 {
 	const struct cred *old;
 	struct cred *new;
@@ -1311,7 +1311,7 @@ int set_current_groups(struct group_info *group_info)
 
 EXPORT_SYMBOL(set_current_groups);
 
-asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
+SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
 {
 	const struct cred *cred = current_cred();
 	int i;
-- 
cgit v1.2.3


From b290ebe2c46d01b742b948ce03f09e8a3efb9a92 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:06 +0100
Subject: [CVE-2009-0029] System call wrappers part 04

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/acct.c        | 2 +-
 kernel/capability.c  | 4 ++--
 kernel/exec_domain.c | 3 +--
 kernel/itimer.c      | 2 +-
 kernel/signal.c      | 7 +++----
 kernel/sys.c         | 6 +++---
 6 files changed, 11 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index d57b7cbb98b..7afa3156416 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -277,7 +277,7 @@ static int acct_on(char *name)
  * should be written. If the filename is NULL, accounting will be
  * shutdown.
  */
-asmlinkage long sys_acct(const char __user *name)
+SYSCALL_DEFINE1(acct, const char __user *, name)
 {
 	int error;
 
diff --git a/kernel/capability.c b/kernel/capability.c
index 688926e496b..4e17041963f 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -161,7 +161,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
  *
  * Returns 0 on success and < 0 on error.
  */
-asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
+SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
 {
 	int ret = 0;
 	pid_t pid;
@@ -235,7 +235,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
  *
  * Returns 0 on success and < 0 on error.
  */
-asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
+SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
 {
 	struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
 	unsigned i, tocopy;
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 0511716e942..667c841c295 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -209,8 +209,7 @@ static int __init proc_execdomains_init(void)
 module_init(proc_execdomains_init);
 #endif
 
-asmlinkage long
-sys_personality(u_long personality)
+SYSCALL_DEFINE1(personality, u_long, personality)
 {
 	u_long old = current->personality;
 
diff --git a/kernel/itimer.c b/kernel/itimer.c
index db7c358b9a0..7e0663ea94f 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -100,7 +100,7 @@ int do_getitimer(int which, struct itimerval *value)
 	return 0;
 }
 
-asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
+SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value)
 {
 	int error = -EFAULT;
 	struct itimerval get_buffer;
diff --git a/kernel/signal.c b/kernel/signal.c
index 856a5479d49..3fe08eaa5de 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2434,8 +2434,7 @@ out:
 
 #ifdef __ARCH_WANT_SYS_SIGPENDING
 
-asmlinkage long
-sys_sigpending(old_sigset_t __user *set)
+SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
 {
 	return do_sigpending(set, sizeof(*set));
 }
@@ -2446,8 +2445,8 @@ sys_sigpending(old_sigset_t __user *set)
 /* Some platforms have their own version with special arguments others
    support only sys_rt_sigprocmask.  */
 
-asmlinkage long
-sys_sigprocmask(int how, old_sigset_t __user *set, old_sigset_t __user *oset)
+SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set,
+		old_sigset_t __user *, oset)
 {
 	int error;
 	old_sigset_t old_set, new_set;
diff --git a/kernel/sys.c b/kernel/sys.c
index ace9ced598b..cbe4502c28a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -944,7 +944,7 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
  * Auch. Had to add the 'did_exec' flag to conform completely to POSIX.
  * LBT 04.03.94
  */
-asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
+SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
 {
 	struct task_struct *p;
 	struct task_struct *group_leader = current->group_leader;
@@ -1080,7 +1080,7 @@ out:
 	return retval;
 }
 
-asmlinkage long sys_setsid(void)
+SYSCALL_DEFINE0(setsid)
 {
 	struct task_struct *group_leader = current->group_leader;
 	struct pid *sid = task_pid(group_leader);
@@ -1340,7 +1340,7 @@ out:
  *	without another task interfering.
  */
  
-asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist)
+SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
 {
 	struct group_info *group_info;
 	int retval;
-- 
cgit v1.2.3


From 362e9c07c7220c0a78c88826fc0d2bf7e4a4bb68 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:07 +0100
Subject: [CVE-2009-0029] System call wrappers part 05

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/itimer.c       |  5 ++---
 kernel/posix-timers.c | 43 +++++++++++++++++++------------------------
 2 files changed, 21 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/itimer.c b/kernel/itimer.c
index 7e0663ea94f..6a5fe93dd8b 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -260,9 +260,8 @@ unsigned int alarm_setitimer(unsigned int seconds)
 	return it_old.it_value.tv_sec;
 }
 
-asmlinkage long sys_setitimer(int which,
-			      struct itimerval __user *value,
-			      struct itimerval __user *ovalue)
+SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
+		struct itimerval __user *, ovalue)
 {
 	struct itimerval set_buffer, get_buffer;
 	int error;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 887c63787de..052ec4d195c 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -477,10 +477,9 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
 
 /* Create a POSIX.1b interval timer. */
 
-asmlinkage long
-sys_timer_create(const clockid_t which_clock,
-		 struct sigevent __user *timer_event_spec,
-		 timer_t __user * created_timer_id)
+SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
+		struct sigevent __user *, timer_event_spec,
+		timer_t __user *, created_timer_id)
 {
 	struct k_itimer *new_timer;
 	int error, new_timer_id;
@@ -661,8 +660,8 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
 }
 
 /* Get the time remaining on a POSIX.1b interval timer. */
-asmlinkage long
-sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting)
+SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
+		struct itimerspec __user *, setting)
 {
 	struct k_itimer *timr;
 	struct itimerspec cur_setting;
@@ -691,8 +690,7 @@ sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting)
  * the call back to do_schedule_next_timer().  So all we need to do is
  * to pick up the frozen overrun.
  */
-asmlinkage long
-sys_timer_getoverrun(timer_t timer_id)
+SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
 {
 	struct k_itimer *timr;
 	int overrun;
@@ -760,10 +758,9 @@ common_timer_set(struct k_itimer *timr, int flags,
 }
 
 /* Set a POSIX.1b interval timer */
-asmlinkage long
-sys_timer_settime(timer_t timer_id, int flags,
-		  const struct itimerspec __user *new_setting,
-		  struct itimerspec __user *old_setting)
+SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
+		const struct itimerspec __user *, new_setting,
+		struct itimerspec __user *, old_setting)
 {
 	struct k_itimer *timr;
 	struct itimerspec new_spec, old_spec;
@@ -816,8 +813,7 @@ static inline int timer_delete_hook(struct k_itimer *timer)
 }
 
 /* Delete a POSIX.1b interval timer. */
-asmlinkage long
-sys_timer_delete(timer_t timer_id)
+SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
 {
 	struct k_itimer *timer;
 	unsigned long flags;
@@ -903,8 +899,8 @@ int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
 }
 EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
 
-asmlinkage long sys_clock_settime(const clockid_t which_clock,
-				  const struct timespec __user *tp)
+SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
+		const struct timespec __user *, tp)
 {
 	struct timespec new_tp;
 
@@ -916,8 +912,8 @@ asmlinkage long sys_clock_settime(const clockid_t which_clock,
 	return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp));
 }
 
-asmlinkage long
-sys_clock_gettime(const clockid_t which_clock, struct timespec __user *tp)
+SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
+		struct timespec __user *,tp)
 {
 	struct timespec kernel_tp;
 	int error;
@@ -933,8 +929,8 @@ sys_clock_gettime(const clockid_t which_clock, struct timespec __user *tp)
 
 }
 
-asmlinkage long
-sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp)
+SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
+		struct timespec __user *, tp)
 {
 	struct timespec rtn_tp;
 	int error;
@@ -963,10 +959,9 @@ static int common_nsleep(const clockid_t which_clock, int flags,
 				 which_clock);
 }
 
-asmlinkage long
-sys_clock_nanosleep(const clockid_t which_clock, int flags,
-		    const struct timespec __user *rqtp,
-		    struct timespec __user *rmtp)
+SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
+		const struct timespec __user *, rqtp,
+		struct timespec __user *, rmtp)
 {
 	struct timespec t;
 
-- 
cgit v1.2.3


From 5add95d4f7cf08f6f62510f19576992912387501 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:08 +0100
Subject: [CVE-2009-0029] System call wrappers part 06

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/sched.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8be2c13b50d..1a0fdfa5ddf 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5126,7 +5126,7 @@ int can_nice(const struct task_struct *p, const int nice)
  * sys_setpriority is a more generic, but much slower function that
  * does similar things.
  */
-asmlinkage long sys_nice(int increment)
+SYSCALL_DEFINE1(nice, int, increment)
 {
 	long nice, retval;
 
@@ -5433,8 +5433,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  * @policy: new policy.
  * @param: structure containing the new RT priority.
  */
-asmlinkage long
-sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
+		struct sched_param __user *, param)
 {
 	/* negative values for policy are not valid */
 	if (policy < 0)
@@ -5448,7 +5448,7 @@ sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  * @pid: the pid in question.
  * @param: structure containing the new RT priority.
  */
-asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
 {
 	return do_sched_setscheduler(pid, -1, param);
 }
@@ -5457,7 +5457,7 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
  * @pid: the pid in question.
  */
-asmlinkage long sys_sched_getscheduler(pid_t pid)
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
 {
 	struct task_struct *p;
 	int retval;
@@ -5482,7 +5482,7 @@ asmlinkage long sys_sched_getscheduler(pid_t pid)
  * @pid: the pid in question.
  * @param: structure containing the RT priority.
  */
-asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 {
 	struct sched_param lp;
 	struct task_struct *p;
@@ -5600,8 +5600,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  * @user_mask_ptr: user-space pointer to the new cpu mask
  */
-asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
-				      unsigned long __user *user_mask_ptr)
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
 {
 	cpumask_var_t new_mask;
 	int retval;
@@ -5648,8 +5648,8 @@ out_unlock:
  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  * @user_mask_ptr: user-space pointer to hold the current cpu mask
  */
-asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
-				      unsigned long __user *user_mask_ptr)
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
 {
 	int ret;
 	cpumask_var_t mask;
@@ -5678,7 +5678,7 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
  * This function yields the current CPU to other tasks. If there are no
  * other threads running on this CPU then this function will return.
  */
-asmlinkage long sys_sched_yield(void)
+SYSCALL_DEFINE0(sched_yield)
 {
 	struct rq *rq = this_rq_lock();
 
@@ -5819,7 +5819,7 @@ long __sched io_schedule_timeout(long timeout)
  * this syscall returns the maximum rt_priority that can be used
  * by a given scheduling class.
  */
-asmlinkage long sys_sched_get_priority_max(int policy)
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
 {
 	int ret = -EINVAL;
 
@@ -5844,7 +5844,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
  * this syscall returns the minimum rt_priority that can be used
  * by a given scheduling class.
  */
-asmlinkage long sys_sched_get_priority_min(int policy)
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
 {
 	int ret = -EINVAL;
 
-- 
cgit v1.2.3


From 754fe8d297bfae7b77f7ce866e2fb0c5fb186506 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:09 +0100
Subject: [CVE-2009-0029] System call wrappers part 07

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/exit.c   | 8 ++++----
 kernel/kexec.c  | 5 ++---
 kernel/sched.c  | 4 ++--
 kernel/signal.c | 2 +-
 kernel/sys.c    | 7 ++++---
 5 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index fac9b040af2..08895df0eab 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1141,7 +1141,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code)
 
 EXPORT_SYMBOL(complete_and_exit);
 
-asmlinkage long sys_exit(int error_code)
+SYSCALL_DEFINE1(exit, int, error_code)
 {
 	do_exit((error_code&0xff)<<8);
 }
@@ -1182,7 +1182,7 @@ do_group_exit(int exit_code)
  * wait4()-ing process will get the correct exit code - even if this
  * thread is not the thread group leader.
  */
-asmlinkage long sys_exit_group(int error_code)
+SYSCALL_DEFINE1(exit_group, int, error_code)
 {
 	do_group_exit((error_code & 0xff) << 8);
 	/* NOTREACHED */
@@ -1795,8 +1795,8 @@ asmlinkage long sys_waitid(int which, pid_t upid,
 	return ret;
 }
 
-asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr,
-			  int options, struct rusage __user *ru)
+SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
+		int, options, struct rusage __user *, ru)
 {
 	struct pid *pid = NULL;
 	enum pid_type type;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 3fb855ad6aa..8a6d7b08864 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -934,9 +934,8 @@ struct kimage *kexec_crash_image;
 
 static DEFINE_MUTEX(kexec_mutex);
 
-asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
-				struct kexec_segment __user *segments,
-				unsigned long flags)
+SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
+		struct kexec_segment __user *, segments, unsigned long, flags)
 {
 	struct kimage **dest_image, *image;
 	int result;
diff --git a/kernel/sched.c b/kernel/sched.c
index 1a0fdfa5ddf..65c02037b05 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5869,8 +5869,8 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
  * this syscall writes the default timeslice value of a given process
  * into the user-space timespec buffer. A value of '0' means infinity.
  */
-asmlinkage
-long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
+SYSCALL_DEFINE4(sched_rr_get_interval, pid_t, pid,
+		struct timespec __user *, interval)
 {
 	struct task_struct *p;
 	unsigned int time_slice;
diff --git a/kernel/signal.c b/kernel/signal.c
index 3fe08eaa5de..41f32e08615 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1961,7 +1961,7 @@ EXPORT_SYMBOL(unblock_all_signals);
  * System call entry points.
  */
 
-asmlinkage long sys_restart_syscall(void)
+SYSCALL_DEFINE0(restart_syscall)
 {
 	struct restart_block *restart = &current_thread_info()->restart_block;
 	return restart->fn(restart);
diff --git a/kernel/sys.c b/kernel/sys.c
index cbe4502c28a..39b192b4003 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -143,7 +143,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_setpriority(int which, int who, int niceval)
+SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
 {
 	struct task_struct *g, *p;
 	struct user_struct *user;
@@ -208,7 +208,7 @@ out:
  * has been offset by 20 (ie it returns 40..1 instead of -20..19)
  * to stay compatible.
  */
-asmlinkage long sys_getpriority(int which, int who)
+SYSCALL_DEFINE2(getpriority, int, which, int, who)
 {
 	struct task_struct *g, *p;
 	struct user_struct *user;
@@ -355,7 +355,8 @@ EXPORT_SYMBOL_GPL(kernel_power_off);
  *
  * reboot doesn't sync: do that yourself before calling this.
  */
-asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user * arg)
+SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
+		void __user *, arg)
 {
 	char buffer[256];
 
-- 
cgit v1.2.3


From 17da2bd90abf428523de0fb98f7075e00e3ed42e Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:10 +0100
Subject: [CVE-2009-0029] System call wrappers part 08

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/exit.c   |  7 +++----
 kernel/fork.c   |  2 +-
 kernel/futex.c  |  6 +++---
 kernel/module.c | 10 ++++------
 kernel/sched.c  |  2 +-
 kernel/signal.c | 18 +++++++-----------
 6 files changed, 19 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 08895df0eab..f80dec3f187 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1754,9 +1754,8 @@ end:
 	return retval;
 }
 
-asmlinkage long sys_waitid(int which, pid_t upid,
-			   struct siginfo __user *infop, int options,
-			   struct rusage __user *ru)
+SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
+		infop, int, options, struct rusage __user *, ru)
 {
 	struct pid *pid = NULL;
 	enum pid_type type;
@@ -1833,7 +1832,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
  * sys_waitpid() remains for compatibility. waitpid() should be
  * implemented by calling sys_wait4() from libc.a.
  */
-asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options)
+SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
 {
 	return sys_wait4(pid, stat_addr, options, NULL);
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 1d68f1255dd..8eb37d38c6a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -901,7 +901,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
 	clear_freeze_flag(p);
 }
 
-asmlinkage long sys_set_tid_address(int __user *tidptr)
+SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
 {
 	current->clear_child_tid = tidptr;
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 002aa189eb0..e86931d8d4e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1978,9 +1978,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 }
 
 
-asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
-			  struct timespec __user *utime, u32 __user *uaddr2,
-			  u32 val3)
+SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+		struct timespec __user *, utime, u32 __user *, uaddr2,
+		u32, val3)
 {
 	struct timespec ts;
 	ktime_t t, *tp = NULL;
diff --git a/kernel/module.c b/kernel/module.c
index c9332c90d5a..e8b51d41dd7 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -743,8 +743,8 @@ static void wait_for_zero_refcount(struct module *mod)
 	mutex_lock(&module_mutex);
 }
 
-asmlinkage long
-sys_delete_module(const char __user *name_user, unsigned int flags)
+SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
+		unsigned int, flags)
 {
 	struct module *mod;
 	char name[MODULE_NAME_LEN];
@@ -2296,10 +2296,8 @@ static noinline struct module *load_module(void __user *umod,
 }
 
 /* This is where the real work happens */
-asmlinkage long
-sys_init_module(void __user *umod,
-		unsigned long len,
-		const char __user *uargs)
+SYSCALL_DEFINE3(init_module, void __user *, umod,
+		unsigned long, len, const char __user *, uargs)
 {
 	struct module *mod;
 	int ret = 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 65c02037b05..eb1931eef58 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5869,7 +5869,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
  * this syscall writes the default timeslice value of a given process
  * into the user-space timespec buffer. A value of '0' means infinity.
  */
-SYSCALL_DEFINE4(sched_rr_get_interval, pid_t, pid,
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 		struct timespec __user *, interval)
 {
 	struct task_struct *p;
diff --git a/kernel/signal.c b/kernel/signal.c
index 41f32e08615..278cc8737f1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2014,8 +2014,8 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
 	return error;
 }
 
-asmlinkage long
-sys_rt_sigprocmask(int how, sigset_t __user *set, sigset_t __user *oset, size_t sigsetsize)
+SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set,
+		sigset_t __user *, oset, size_t, sigsetsize)
 {
 	int error = -EINVAL;
 	sigset_t old_set, new_set;
@@ -2074,8 +2074,7 @@ out:
 	return error;
 }	
 
-asmlinkage long
-sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize)
+SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize)
 {
 	return do_sigpending(set, sigsetsize);
 }
@@ -2146,11 +2145,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
 
 #endif
 
-asmlinkage long
-sys_rt_sigtimedwait(const sigset_t __user *uthese,
-		    siginfo_t __user *uinfo,
-		    const struct timespec __user *uts,
-		    size_t sigsetsize)
+SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
+		siginfo_t __user *, uinfo, const struct timespec __user *, uts,
+		size_t, sigsetsize)
 {
 	int ret, sig;
 	sigset_t these;
@@ -2223,8 +2220,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
 	return ret;
 }
 
-asmlinkage long
-sys_kill(pid_t pid, int sig)
+SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
 {
 	struct siginfo info;
 
-- 
cgit v1.2.3


From a5f8fa9e9ba5ef3305e147f41ad6e1e84ac1f0bd Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:11 +0100
Subject: [CVE-2009-0029] System call wrappers part 09

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/signal.c | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 278cc8737f1..e2333929611 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2279,7 +2279,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
  *  exists but it's not belonging to the target process anymore. This
  *  method solves the problem of threads exiting and PIDs getting reused.
  */
-asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig)
+SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
 {
 	/* This is only valid for single tasks */
 	if (pid <= 0 || tgid <= 0)
@@ -2291,8 +2291,7 @@ asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig)
 /*
  *  Send a signal to only one task, even if it's a CLONE_THREAD task.
  */
-asmlinkage long
-sys_tkill(pid_t pid, int sig)
+SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
 {
 	/* This is only valid for single tasks */
 	if (pid <= 0)
@@ -2301,8 +2300,8 @@ sys_tkill(pid_t pid, int sig)
 	return do_tkill(0, pid, sig);
 }
 
-asmlinkage long
-sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo)
+SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
+		siginfo_t __user *, uinfo)
 {
 	siginfo_t info;
 
@@ -2526,15 +2525,13 @@ out:
 /*
  * For backwards compatibility.  Functionality superseded by sigprocmask.
  */
-asmlinkage long
-sys_sgetmask(void)
+SYSCALL_DEFINE0(sgetmask)
 {
 	/* SMP safe */
 	return current->blocked.sig[0];
 }
 
-asmlinkage long
-sys_ssetmask(int newmask)
+SYSCALL_DEFINE1(ssetmask, int, newmask)
 {
 	int old;
 
@@ -2554,8 +2551,7 @@ sys_ssetmask(int newmask)
 /*
  * For backwards compatibility.  Functionality superseded by sigaction.
  */
-asmlinkage long
-sys_signal(int sig, __sighandler_t handler)
+SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
 {
 	struct k_sigaction new_sa, old_sa;
 	int ret;
@@ -2572,8 +2568,7 @@ sys_signal(int sig, __sighandler_t handler)
 
 #ifdef __ARCH_WANT_SYS_PAUSE
 
-asmlinkage long
-sys_pause(void)
+SYSCALL_DEFINE0(pause)
 {
 	current->state = TASK_INTERRUPTIBLE;
 	schedule();
-- 
cgit v1.2.3


From ca013e945b1ba5828b151ee646946f1297b67a4c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:19 +0100
Subject: [CVE-2009-0029] System call wrappers part 17

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/uid16.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/uid16.c b/kernel/uid16.c
index 2460c3199b5..37f48c049a2 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -17,7 +17,7 @@
 
 #include <asm/uaccess.h>
 
-asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group)
+SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
 {
 	long ret = sys_chown(filename, low2highuid(user), low2highgid(group));
 	/* avoid REGPARM breakage on x86: */
@@ -25,7 +25,7 @@ asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gi
 	return ret;
 }
 
-asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group)
+SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
 {
 	long ret = sys_lchown(filename, low2highuid(user), low2highgid(group));
 	/* avoid REGPARM breakage on x86: */
@@ -33,7 +33,7 @@ asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_g
 	return ret;
 }
 
-asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group)
+SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group)
 {
 	long ret = sys_fchown(fd, low2highuid(user), low2highgid(group));
 	/* avoid REGPARM breakage on x86: */
-- 
cgit v1.2.3


From a6b42e83f249aad723589b2bdf6d1dfb2b0997c8 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:20 +0100
Subject: [CVE-2009-0029] System call wrappers part 18

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/uid16.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/uid16.c b/kernel/uid16.c
index 37f48c049a2..221894e6e98 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -41,7 +41,7 @@ SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group)
 	return ret;
 }
 
-asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid)
+SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid)
 {
 	long ret = sys_setregid(low2highgid(rgid), low2highgid(egid));
 	/* avoid REGPARM breakage on x86: */
@@ -49,7 +49,7 @@ asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid)
 	return ret;
 }
 
-asmlinkage long sys_setgid16(old_gid_t gid)
+SYSCALL_DEFINE1(setgid16, old_gid_t, gid)
 {
 	long ret = sys_setgid(low2highgid(gid));
 	/* avoid REGPARM breakage on x86: */
@@ -57,7 +57,7 @@ asmlinkage long sys_setgid16(old_gid_t gid)
 	return ret;
 }
 
-asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid)
+SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid)
 {
 	long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid));
 	/* avoid REGPARM breakage on x86: */
@@ -65,7 +65,7 @@ asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid)
 	return ret;
 }
 
-asmlinkage long sys_setuid16(old_uid_t uid)
+SYSCALL_DEFINE1(setuid16, old_uid_t, uid)
 {
 	long ret = sys_setuid(low2highuid(uid));
 	/* avoid REGPARM breakage on x86: */
@@ -73,7 +73,7 @@ asmlinkage long sys_setuid16(old_uid_t uid)
 	return ret;
 }
 
-asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
+SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
 {
 	long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid),
 				 low2highuid(suid));
@@ -82,7 +82,7 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
 	return ret;
 }
 
-asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid)
+SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid)
 {
 	const struct cred *cred = current_cred();
 	int retval;
@@ -94,7 +94,7 @@ asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid,
 	return retval;
 }
 
-asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
+SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
 {
 	long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid),
 				 low2highgid(sgid));
@@ -103,7 +103,8 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
 	return ret;
 }
 
-asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid)
+
+SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid)
 {
 	const struct cred *cred = current_cred();
 	int retval;
@@ -115,7 +116,7 @@ asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid,
 	return retval;
 }
 
-asmlinkage long sys_setfsuid16(old_uid_t uid)
+SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid)
 {
 	long ret = sys_setfsuid(low2highuid(uid));
 	/* avoid REGPARM breakage on x86: */
@@ -123,7 +124,7 @@ asmlinkage long sys_setfsuid16(old_uid_t uid)
 	return ret;
 }
 
-asmlinkage long sys_setfsgid16(old_gid_t gid)
+SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
 {
 	long ret = sys_setfsgid(low2highgid(gid));
 	/* avoid REGPARM breakage on x86: */
-- 
cgit v1.2.3


From 003d7ab479168132a2b2c6700fe682b08f08ab0c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:21 +0100
Subject: [CVE-2009-0029] System call wrappers part 19

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/uid16.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/uid16.c b/kernel/uid16.c
index 221894e6e98..0314501688b 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -162,7 +162,7 @@ static int groups16_from_user(struct group_info *group_info,
 	return 0;
 }
 
-asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist)
+SYSCALL_DEFINE2(getgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
 {
 	const struct cred *cred = current_cred();
 	int i;
@@ -185,7 +185,7 @@ out:
 	return i;
 }
 
-asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist)
+SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
 {
 	struct group_info *group_info;
 	int retval;
@@ -210,22 +210,22 @@ asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist)
 	return retval;
 }
 
-asmlinkage long sys_getuid16(void)
+SYSCALL_DEFINE0(getuid16)
 {
 	return high2lowuid(current_uid());
 }
 
-asmlinkage long sys_geteuid16(void)
+SYSCALL_DEFINE0(geteuid16)
 {
 	return high2lowuid(current_euid());
 }
 
-asmlinkage long sys_getgid16(void)
+SYSCALL_DEFINE0(getgid16)
 {
 	return high2lowgid(current_gid());
 }
 
-asmlinkage long sys_getegid16(void)
+SYSCALL_DEFINE0(getegid16)
 {
 	return high2lowgid(current_egid());
 }
-- 
cgit v1.2.3


From 5a8a82b1d306a325d899b67715618413657efda4 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:25 +0100
Subject: [CVE-2009-0029] System call wrappers part 23

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/sys.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 39b192b4003..5292f2119da 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1406,7 +1406,7 @@ asmlinkage long sys_newuname(struct new_utsname __user * name)
 	return errno;
 }
 
-asmlinkage long sys_sethostname(char __user *name, int len)
+SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
 {
 	int errno;
 	char tmp[__NEW_UTS_LEN];
@@ -1430,7 +1430,7 @@ asmlinkage long sys_sethostname(char __user *name, int len)
 
 #ifdef __ARCH_WANT_SYS_GETHOSTNAME
 
-asmlinkage long sys_gethostname(char __user *name, int len)
+SYSCALL_DEFINE2(gethostname, char __user *, name, int, len)
 {
 	int i, errno;
 	struct new_utsname *u;
@@ -1455,7 +1455,7 @@ asmlinkage long sys_gethostname(char __user *name, int len)
  * Only setdomainname; getdomainname can be implemented by calling
  * uname()
  */
-asmlinkage long sys_setdomainname(char __user *name, int len)
+SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
 {
 	int errno;
 	char tmp[__NEW_UTS_LEN];
-- 
cgit v1.2.3


From e48fbb699f82ef1e80bd7126046394d2dc9ca7e6 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:26 +0100
Subject: [CVE-2009-0029] System call wrappers part 24

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/sys.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 5292f2119da..70ffa8408cd 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1395,7 +1395,7 @@ EXPORT_SYMBOL(in_egroup_p);
 
 DECLARE_RWSEM(uts_sem);
 
-asmlinkage long sys_newuname(struct new_utsname __user * name)
+SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
 {
 	int errno = 0;
 
@@ -1478,7 +1478,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
 	return errno;
 }
 
-asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim)
+SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
 {
 	if (resource >= RLIM_NLIMITS)
 		return -EINVAL;
@@ -1497,7 +1497,8 @@ asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim)
  *	Back compatibility for getrlimit. Needed for some apps.
  */
  
-asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim)
+SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
+		struct rlimit __user *, rlim)
 {
 	struct rlimit x;
 	if (resource >= RLIM_NLIMITS)
@@ -1515,7 +1516,7 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
 
 #endif
 
-asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
+SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
 {
 	struct rlimit new_rlim, *old_rlim;
 	int retval;
@@ -1688,7 +1689,7 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 	return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
 }
 
-asmlinkage long sys_getrusage(int who, struct rusage __user *ru)
+SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
 {
 	if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
 	    who != RUSAGE_THREAD)
@@ -1696,7 +1697,7 @@ asmlinkage long sys_getrusage(int who, struct rusage __user *ru)
 	return getrusage(current, who, ru);
 }
 
-asmlinkage long sys_umask(int mask)
+SYSCALL_DEFINE1(umask, int, mask)
 {
 	mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
 	return mask;
-- 
cgit v1.2.3


From c4ea37c26a691ad0b7e86aa5884aab27830e95c9 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:28 +0100
Subject: [CVE-2009-0029] System call wrappers part 26

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/sys.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 70ffa8408cd..59aadcdad6c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1703,8 +1703,8 @@ SYSCALL_DEFINE1(umask, int, mask)
 	return mask;
 }
 
-asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
-			  unsigned long arg4, unsigned long arg5)
+SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
+		unsigned long, arg4, unsigned long, arg5)
 {
 	struct task_struct *me = current;
 	unsigned char comm[sizeof(me->comm)];
-- 
cgit v1.2.3


From 1e7bfb2134dfec37ce04fb3a4ca89299e892d10c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:29 +0100
Subject: [CVE-2009-0029] System call wrappers part 27

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/printk.c | 2 +-
 kernel/ptrace.c | 2 +-
 kernel/sysctl.c | 4 ++--
 kernel/timer.c  | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index e48cf33783f..69188f226a9 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -382,7 +382,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_syslog(int type, char __user *buf, int len)
+SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
 {
 	return do_syslog(type, buf, len);
 }
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 29dc700e198..c9cf48b21f0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -574,7 +574,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
 #define arch_ptrace_attach(child)	do { } while (0)
 #endif
 
-asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
+SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
 {
 	struct task_struct *child;
 	long ret;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 89d74436318..3e38b74b612 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1688,7 +1688,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
 	return error;
 }
 
-asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
+SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
 {
 	struct __sysctl_args tmp;
 	int error;
@@ -2989,7 +2989,7 @@ int sysctl_ms_jiffies(struct ctl_table *table,
 #else /* CONFIG_SYSCTL_SYSCALL */
 
 
-asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
+SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
 {
 	struct __sysctl_args tmp;
 	int error;
diff --git a/kernel/timer.c b/kernel/timer.c
index 14a51530a4c..13dd64fe143 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1400,7 +1400,7 @@ out:
 	return 0;
 }
 
-asmlinkage long sys_sysinfo(struct sysinfo __user *info)
+SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
 {
 	struct sysinfo val;
 
-- 
cgit v1.2.3


From 6559eed8ca7db0531a207cd80be5e28cd6f213c5 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:32 +0100
Subject: [CVE-2009-0029] System call wrappers part 30

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 8eb37d38c6a..bf0cef8bbdf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1603,7 +1603,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
  * constructed. Here we are modifying the current, active,
  * task_struct.
  */
-asmlinkage long sys_unshare(unsigned long unshare_flags)
+SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 {
 	int err = 0;
 	struct fs_struct *fs, *new_fs = NULL;
-- 
cgit v1.2.3


From 836f92adf121f806e9beb5b6b88bd5c9c4ea3f24 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:33 +0100
Subject: [CVE-2009-0029] System call wrappers part 31

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/futex.c | 11 +++++------
 kernel/sys.c   |  4 ++--
 2 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index e86931d8d4e..f89d373a9c6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1733,9 +1733,8 @@ pi_faulted:
  * @head: pointer to the list-head
  * @len: length of the list-head, as userspace expects
  */
-asmlinkage long
-sys_set_robust_list(struct robust_list_head __user *head,
-		    size_t len)
+SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
+		size_t, len)
 {
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
@@ -1756,9 +1755,9 @@ sys_set_robust_list(struct robust_list_head __user *head,
  * @head_ptr: pointer to a list-head pointer, the kernel fills it in
  * @len_ptr: pointer to a length field, the kernel fills in the header size
  */
-asmlinkage long
-sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
-		    size_t __user *len_ptr)
+SYSCALL_DEFINE3(get_robust_list, int, pid,
+		struct robust_list_head __user * __user *, head_ptr,
+		size_t __user *, len_ptr)
 {
 	struct robust_list_head __user *head;
 	unsigned long ret;
diff --git a/kernel/sys.c b/kernel/sys.c
index 59aadcdad6c..e7dc0e10a48 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1817,8 +1817,8 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	return error;
 }
 
-asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
-			   struct getcpu_cache __user *unused)
+SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
+		struct getcpu_cache __user *, unused)
 {
 	int err = 0;
 	int cpu = raw_smp_processor_id();
-- 
cgit v1.2.3


From d4e82042c4cfa87a7d51710b71f568fe80132551 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:34 +0100
Subject: [CVE-2009-0029] System call wrappers part 32

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/signal.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index e2333929611..e73759783dc 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2491,11 +2491,10 @@ out:
 #endif /* __ARCH_WANT_SYS_SIGPROCMASK */
 
 #ifdef __ARCH_WANT_SYS_RT_SIGACTION
-asmlinkage long
-sys_rt_sigaction(int sig,
-		 const struct sigaction __user *act,
-		 struct sigaction __user *oact,
-		 size_t sigsetsize)
+SYSCALL_DEFINE4(rt_sigaction, int, sig,
+		const struct sigaction __user *, act,
+		struct sigaction __user *, oact,
+		size_t, sigsetsize)
 {
 	struct k_sigaction new_sa, old_sa;
 	int ret = -EINVAL;
@@ -2578,7 +2577,7 @@ SYSCALL_DEFINE0(pause)
 #endif
 
 #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
-asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
+SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
 {
 	sigset_t newset;
 
-- 
cgit v1.2.3


From 9316fcacb89c59fe556c48587ac02cd7f5d38045 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 14 Jan 2009 09:35:44 -0800
Subject: kernel/up.c: omit it if SMP=y, USE_GENERIC_SMP_HELPERS=n

Fix the sparc build - we were including `up.o' on SMP builds, when
CONFIG_USE_GENERIC_SMP_HELPERS=n.

Tested-by: Robert Reif <reif@earthlink.net>
Fixed-by: Robert Reif <reif@earthlink.net>
Cc: David Miller <davem@davemloft.net>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/Makefile | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 2aebc4cd787..170a9213c1b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -40,9 +40,8 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-ifeq ($(CONFIG_USE_GENERIC_SMP_HELPERS),y)
-obj-y += smp.o
-else
+obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
+ifneq ($(CONFIG_SMP),y)
 obj-y += up.o
 endif
 obj-$(CONFIG_SMP) += spinlock.o
-- 
cgit v1.2.3


From 98a4826b99bc4bcc34c604b2fc4fcf4d771600ec Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 14 Jan 2009 10:56:32 +0100
Subject: sched: fix bandwidth validation for UID grouping

Impact: make rt-limit tunables work again

Mark Glines reported:

> I've got an issue on x86-64 where I can't configure the system to allow
> RT tasks for a non-root user.
>
> In 2.6.26.5, I was able to do the following to set things up nicely:
> echo 450000 >/sys/kernel/uids/0/cpu_rt_runtime
> echo 450000 >/sys/kernel/uids/1000/cpu_rt_runtime
>
> Seems like every value I try to echo into the /sys files returns EINVAL.

For UID grouping we initialize the root group with infinite bandwidth
which by default is actually more than the global limit, therefore the
bandwidth check always fails.

Because the root group is a phantom group (for UID grouping) we cannot
runtime adjust it, therefore we let it reflect the global bandwidth
settings.

Reported-by: Mark Glines <mark@glines.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3b630d88266..ed62d1cee05 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9050,6 +9050,13 @@ static int tg_schedulable(struct task_group *tg, void *data)
 		runtime = d->rt_runtime;
 	}
 
+#ifdef CONFIG_USER_SCHED
+	if (tg == &root_task_group) {
+		period = global_rt_period();
+		runtime = global_rt_runtime();
+	}
+#endif
+
 	/*
 	 * Cannot have more runtime than the period.
 	 */
-- 
cgit v1.2.3


From cce7ade803699463ecc62a065ca522004f7ccb3d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 15 Jan 2009 14:53:37 +0100
Subject: sched: SCHED_IDLE weight change

Increase the SCHED_IDLE weight from 2 to 3, this gives much more stable
vruntime numbers.

time advanced in 100ms:

 weight=2

 64765.988352
 67012.881408
 88501.412352

 weight=3

 35496.181411
 34130.971298
 35497.411573

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ed62d1cee05..6acfb3c2398 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1323,8 +1323,8 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
  * slice expiry etc.
  */
 
-#define WEIGHT_IDLEPRIO		2
-#define WMULT_IDLEPRIO		(1 << 31)
+#define WEIGHT_IDLEPRIO                3
+#define WMULT_IDLEPRIO         1431655765
 
 /*
  * Nice levels are multiplicative, with a gentle 10% change for every
-- 
cgit v1.2.3


From 6bc912b71b6f33b041cfde93ca3f019cbaa852bc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 15 Jan 2009 14:53:38 +0100
Subject: sched: SCHED_OTHER vs SCHED_IDLE isolation

Stronger SCHED_IDLE isolation:

 - no SCHED_IDLE buddies
 - never let SCHED_IDLE preempt on wakeup
 - always preempt SCHED_IDLE on wakeup
 - limit SLEEPER fairness for SCHED_IDLE.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8e1352c7555..cdebd8089cb 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -677,9 +677,13 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 			unsigned long thresh = sysctl_sched_latency;
 
 			/*
-			 * convert the sleeper threshold into virtual time
+			 * Convert the sleeper threshold into virtual time.
+			 * SCHED_IDLE is a special sub-class.  We care about
+			 * fairness only relative to other SCHED_IDLE tasks,
+			 * all of which have the same weight.
 			 */
-			if (sched_feat(NORMALIZED_SLEEPER))
+			if (sched_feat(NORMALIZED_SLEEPER) &&
+					task_of(se)->policy != SCHED_IDLE)
 				thresh = calc_delta_fair(thresh, se);
 
 			vruntime -= thresh;
@@ -1340,14 +1344,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
 
 static void set_last_buddy(struct sched_entity *se)
 {
-	for_each_sched_entity(se)
-		cfs_rq_of(se)->last = se;
+	if (likely(task_of(se)->policy != SCHED_IDLE)) {
+		for_each_sched_entity(se)
+			cfs_rq_of(se)->last = se;
+	}
 }
 
 static void set_next_buddy(struct sched_entity *se)
 {
-	for_each_sched_entity(se)
-		cfs_rq_of(se)->next = se;
+	if (likely(task_of(se)->policy != SCHED_IDLE)) {
+		for_each_sched_entity(se)
+			cfs_rq_of(se)->next = se;
+	}
 }
 
 /*
@@ -1393,12 +1401,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 		return;
 
 	/*
-	 * Batch tasks do not preempt (their preemption is driven by
+	 * Batch and idle tasks do not preempt (their preemption is driven by
 	 * the tick):
 	 */
-	if (unlikely(p->policy == SCHED_BATCH))
+	if (unlikely(p->policy != SCHED_NORMAL))
 		return;
 
+	/* Idle tasks are by definition preempted by everybody. */
+	if (unlikely(curr->policy == SCHED_IDLE)) {
+		resched_task(curr);
+		return;
+	}
+
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
-- 
cgit v1.2.3


From e17036dac189dd034c092a91df56aa740db7146d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 15 Jan 2009 14:53:39 +0100
Subject: sched: fix update_min_vruntime

Impact: fix SCHED_IDLE latency problems

OK, so we have 1 running task A (which is obviously curr and the tree is
equally obviously empty).

'A' nicely chugs along, doing its thing, carrying min_vruntime along as it
goes.

Then some whacko speed freak SCHED_IDLE task gets inserted due to SMP
balancing, which is very likely far right, in that case

update_curr
  update_min_vruntime
    cfs_rq->rb_leftmost := true (the crazy task sitting in a tree)
      vruntime = se->vruntime

and voila, min_vruntime is waaay right of where it ought to be.

OK, so why did I write it like that to begin with...

Aah, yes.

Say we've just dequeued current

schedule
  deactivate_task(prev)
    dequeue_entity
      update_min_vruntime

Then we'll set

  vruntime = cfs_rq->min_vruntime;

we find !cfs_rq->curr, but do find someone in the tree. Then we _must_
do vruntime = se->vruntime, because

 vruntime = min_vruntime(vruntime := cfs_rq->min_vruntime, se->vruntime)

will not advance vruntime, and cause lags the other way around (which we
fixed with that initial patch: 1af5f730fc1bf7c62ec9fb2d307206e18bf40a69
(sched: more accurate min_vruntime accounting).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Mike Galbraith <efault@gmx.de>
Acked-by: Mike Galbraith <efault@gmx.de>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index cdebd8089cb..16b419bb8b0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -283,7 +283,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
 						   struct sched_entity,
 						   run_node);
 
-		if (vruntime == cfs_rq->min_vruntime)
+		if (!cfs_rq->curr)
 			vruntime = se->vruntime;
 		else
 			vruntime = min_vruntime(vruntime, se->vruntime);
-- 
cgit v1.2.3


From 88fc241f54459ac3d86c5e13b449730199f66061 Mon Sep 17 00:00:00 2001
From: Doug Chapman <doug.chapman@hp.com>
Date: Thu, 15 Jan 2009 10:38:56 -0800
Subject: [IA64] dump stack on kernel unaligned warnings

Often the cause of kernel unaligned access warnings is not
obvious from just the ip displayed in the warning.  This adds
the option via proc to dump the stack in addition to the warning.
The default is off (just display the 1 line warning).  To enable
the stack to be shown: echo 1 > /proc/sys/kernel/unaligned-dump-stack

Signed-off-by: Doug Chapman <doug.chapman@hp.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 kernel/sysctl.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3e38b74b612..368d1638ee7 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -144,6 +144,7 @@ extern int acct_parm[];
 
 #ifdef CONFIG_IA64
 extern int no_unaligned_warning;
+extern int unaligned_dump_stack;
 #endif
 
 #ifdef CONFIG_RT_MUTEXES
@@ -781,6 +782,14 @@ static struct ctl_table kern_table[] = {
 	 	.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "unaligned-dump-stack",
+		.data		= &unaligned_dump_stack,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 	{
-- 
cgit v1.2.3


From 6272d68cc6a5f90c6b1a2228cf0f67b895305d17 Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Thu, 15 Jan 2009 17:17:15 +0100
Subject: sched: sched_slice() fixlet

Mike's change: 0a582440f "sched: fix sched_slice())" broke group
scheduling by forgetting to reload cfs_rq on each loop.

This patch fixes aim7 regression and specjbb2005 regression becomes
less than 1.5% on 8-core stokley.

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Jayson King <dev@jaysonking.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 16b419bb8b0..5cc1c162044 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -429,7 +429,10 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
 
 	for_each_sched_entity(se) {
-		struct load_weight *load = &cfs_rq->load;
+		struct load_weight *load;
+
+		cfs_rq = cfs_rq_of(se);
+		load = &cfs_rq->load;
 
 		if (unlikely(!se->on_rq)) {
 			struct load_weight lw = cfs_rq->load;
-- 
cgit v1.2.3


From 45ce80fb6b6f9594d1396d44dd7e7c02d596fef8 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 15 Jan 2009 13:50:59 -0800
Subject: cgroups: consolidate cgroup documents

Move Documentation/cpusets.txt and Documentation/controllers/* to
Documentation/cgroups/

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 647c77a88fc..a85678865c5 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -568,7 +568,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
  * load balancing domains (sched domains) as specified by that partial
  * partition.
  *
- * See "What is sched_load_balance" in Documentation/cpusets.txt
+ * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
  * for a background explanation of this.
  *
  * Does not return errors, on the theory that the callers of this
-- 
cgit v1.2.3


From 6ae301e85c9c58d2f430a8a7057ce488b7ff76df Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 15 Jan 2009 13:51:01 -0800
Subject: resources: fix parameter name and kernel-doc

Fix __request_region() parameter kernel-doc notation and parameter name:

Warning(linux-2.6.28-git10//kernel/resource.c:627): No description found for parameter 'flags'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/resource.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index ca6a1536b20..fd5d7d574bb 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -620,6 +620,7 @@ resource_size_t resource_alignment(struct resource *res)
  * @start: resource start address
  * @n: resource region size
  * @name: reserving caller's ID string
+ * @flags: IO resource flags
  */
 struct resource * __request_region(struct resource *parent,
 				   resource_size_t start, resource_size_t n,
-- 
cgit v1.2.3