From e03d13e985d48ac4885382c9e3b1510c78bd047f Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 19 Oct 2005 22:21:23 -0700
Subject: [PATCH] Fix cpu timers exit deadlock and races

Oleg Nesterov reported an SMP deadlock.  If there is a running timer
tracking a different process's CPU time clock when the process owning
the timer exits, we deadlock on tasklist_lock in posix_cpu_timer_del via
exit_itimers.

That code was using tasklist_lock to check for a race with __exit_signal
being called on the timer-target task and clearing its ->signal.
However, there is actually no such race.  __exit_signal will have called
posix_cpu_timers_exit and posix_cpu_timers_exit_group before it does
that.  Those will clear those k_itimer's association with the dying
task, so posix_cpu_timer_del will return early and never reach the code
in question.

In addition, posix_cpu_timer_del called from exit_itimers during execve
or directly from timer_delete in the process owning the timer can race
with an exiting timer-target task to cause a double put on timer-target
task struct.  Make sure we always access cpu_timers lists with sighand
lock held.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Chris Wright <chrisw@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-cpu-timers.c | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 7a51a5597c3..b3f3edc475d 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -387,25 +387,19 @@ int posix_cpu_timer_del(struct k_itimer *timer)
 	if (unlikely(p == NULL))
 		return 0;
 
+	spin_lock(&p->sighand->siglock);
 	if (!list_empty(&timer->it.cpu.entry)) {
-		read_lock(&tasklist_lock);
-		if (unlikely(p->signal == NULL)) {
-			/*
-			 * We raced with the reaping of the task.
-			 * The deletion should have cleared us off the list.
-			 */
-			BUG_ON(!list_empty(&timer->it.cpu.entry));
-		} else {
-			/*
-			 * Take us off the task's timer list.
-			 */
-			spin_lock(&p->sighand->siglock);
-			list_del(&timer->it.cpu.entry);
-			spin_unlock(&p->sighand->siglock);
-		}
-		read_unlock(&tasklist_lock);
+		/*
+		 * Take us off the task's timer list.  We don't need to
+		 * take tasklist_lock and check for the task being reaped.
+		 * If it was reaped, it already called posix_cpu_timers_exit
+		 * and posix_cpu_timers_exit_group to clear all the timers
+		 * that pointed to it.
+		 */
+		list_del(&timer->it.cpu.entry);
+		put_task_struct(p);
 	}
-	put_task_struct(p);
+	spin_unlock(&p->sighand->siglock);
 
 	return 0;
 }
-- 
cgit v1.2.3


From d1209d049bbc3df66650f8417637be4f7b57b604 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Wed, 19 Oct 2005 21:23:51 -0700
Subject: [PATCH] Threads shouldn't inherit PF_NOFREEZE

The PF_NOFREEZE process flag should not be inherited when a thread is
forked.  This patch (as585) removes the flag from the child.

This problem is starting to show up more and more as drivers turn to the
kthread API instead of using kernel_thread().  As a result, their kernel
threads are now children of the kthread worker instead of modprobe, and
they inherit the PF_NOFREEZE flag.  This can cause problems during system
suspend; the kernel threads are not getting frozen as they ought to be.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 533ce27f4b2..280bd44ac44 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -848,7 +848,7 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long new_flags = p->flags;
 
-	new_flags &= ~PF_SUPERPRIV;
+	new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE);
 	new_flags |= PF_FORKNOEXEC;
 	if (!(clone_flags & CLONE_PTRACE))
 		p->ptrace = 0;
-- 
cgit v1.2.3


From 9465bee863bc4c6cf1566c12d6f92a8133e3da5c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Fri, 21 Oct 2005 15:36:00 -0700
Subject: Revert "Fix cpu timers exit deadlock and races"

Revert commit e03d13e985d48ac4885382c9e3b1510c78bd047f, to be replaced
by a much nicer fix from Roland.
---
 kernel/posix-cpu-timers.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index b3f3edc475d..7a51a5597c3 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -387,19 +387,25 @@ int posix_cpu_timer_del(struct k_itimer *timer)
 	if (unlikely(p == NULL))
 		return 0;
 
-	spin_lock(&p->sighand->siglock);
 	if (!list_empty(&timer->it.cpu.entry)) {
-		/*
-		 * Take us off the task's timer list.  We don't need to
-		 * take tasklist_lock and check for the task being reaped.
-		 * If it was reaped, it already called posix_cpu_timers_exit
-		 * and posix_cpu_timers_exit_group to clear all the timers
-		 * that pointed to it.
-		 */
-		list_del(&timer->it.cpu.entry);
-		put_task_struct(p);
+		read_lock(&tasklist_lock);
+		if (unlikely(p->signal == NULL)) {
+			/*
+			 * We raced with the reaping of the task.
+			 * The deletion should have cleared us off the list.
+			 */
+			BUG_ON(!list_empty(&timer->it.cpu.entry));
+		} else {
+			/*
+			 * Take us off the task's timer list.
+			 */
+			spin_lock(&p->sighand->siglock);
+			list_del(&timer->it.cpu.entry);
+			spin_unlock(&p->sighand->siglock);
+		}
+		read_unlock(&tasklist_lock);
 	}
-	spin_unlock(&p->sighand->siglock);
+	put_task_struct(p);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 25f407f0b668f5e4ebd5d13e1fb4306ba6427ead Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 21 Oct 2005 15:03:29 -0700
Subject: [PATCH] Call exit_itimers from do_exit, not __exit_signal

When I originally moved exit_itimers into __exit_signal, that was the only
place where we could reliably know it was the last thread in the group
dying, without races.  Since then we've gotten the signal_struct.live
counter, and do_exit can reliably do group-wide cleanup work.

This patch moves the call to do_exit, where it's made without locks.  This
avoids the deadlock issues that the old __exit_signal code's comment talks
about, and the one that Oleg found recently with process CPU timers.

[ This replaces e03d13e985d48ac4885382c9e3b1510c78bd047f, which is why
  it was just reverted. ]

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c         |  1 +
 kernel/posix-timers.c |  2 +-
 kernel/signal.c       | 14 +-------------
 3 files changed, 3 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 43077732619..3b25b182d2b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -843,6 +843,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
  		del_timer_sync(&tsk->signal->real_timer);
+		exit_itimers(tsk->signal);
 		acct_process(code);
 	}
 	exit_mm(tsk);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index b7b532acd9f..dda3cda73c7 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -1157,7 +1157,7 @@ retry_delete:
 }
 
 /*
- * This is called by __exit_signal, only when there are no more
+ * This is called by do_exit or de_thread, only when there are no more
  * references to the shared signal_struct.
  */
 void exit_itimers(struct signal_struct *sig)
diff --git a/kernel/signal.c b/kernel/signal.c
index 50c99264377..f2b96b08fb4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -397,20 +397,8 @@ void __exit_signal(struct task_struct *tsk)
 	flush_sigqueue(&tsk->pending);
 	if (sig) {
 		/*
-		 * We are cleaning up the signal_struct here.  We delayed
-		 * calling exit_itimers until after flush_sigqueue, just in
-		 * case our thread-local pending queue contained a queued
-		 * timer signal that would have been cleared in
-		 * exit_itimers.  When that called sigqueue_free, it would
-		 * attempt to re-take the tasklist_lock and deadlock.  This
-		 * can never happen if we ensure that all queues the
-		 * timer's signal might be queued on have been flushed
-		 * first.  The shared_pending queue, and our own pending
-		 * queue are the only queues the timer could be on, since
-		 * there are no other threads left in the group and timer
-		 * signals are constrained to threads inside the group.
+		 * We are cleaning up the signal_struct here.
 		 */
-		exit_itimers(sig);
 		exit_thread_group_keys(sig);
 		kmem_cache_free(signal_cachep, sig);
 	}
-- 
cgit v1.2.3


From e80eda94d3eaf1d12cfc97878eff77cd679dabc9 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Sun, 23 Oct 2005 10:02:50 -0700
Subject: Posix timers: limit number of timers firing at once

Bursty timers aren't good for anybody, very much including latency for
other programs when we trigger lots of timers in interrupt context.  So
set a random limit, after which we'll handle the rest on the next timer
tick.

Noted by Oleg Nesterov <oleg@tv-sign.ru>

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-cpu-timers.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 7a51a5597c3..d30b304a338 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -961,14 +961,16 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 static void check_thread_timers(struct task_struct *tsk,
 				struct list_head *firing)
 {
+	int maxfire;
 	struct list_head *timers = tsk->cpu_timers;
 
+	maxfire = 20;
 	tsk->it_prof_expires = cputime_zero;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_entry(timers->next,
 						      struct cpu_timer_list,
 						      entry);
-		if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
+		if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
 			tsk->it_prof_expires = t->expires.cpu;
 			break;
 		}
@@ -977,12 +979,13 @@ static void check_thread_timers(struct task_struct *tsk,
 	}
 
 	++timers;
+	maxfire = 20;
 	tsk->it_virt_expires = cputime_zero;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_entry(timers->next,
 						      struct cpu_timer_list,
 						      entry);
-		if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
+		if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
 			tsk->it_virt_expires = t->expires.cpu;
 			break;
 		}
@@ -991,12 +994,13 @@ static void check_thread_timers(struct task_struct *tsk,
 	}
 
 	++timers;
+	maxfire = 20;
 	tsk->it_sched_expires = 0;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_entry(timers->next,
 						      struct cpu_timer_list,
 						      entry);
-		if (tsk->sched_time < t->expires.sched) {
+		if (!--maxfire || tsk->sched_time < t->expires.sched) {
 			tsk->it_sched_expires = t->expires.sched;
 			break;
 		}
@@ -1013,6 +1017,7 @@ static void check_thread_timers(struct task_struct *tsk,
 static void check_process_timers(struct task_struct *tsk,
 				 struct list_head *firing)
 {
+	int maxfire;
 	struct signal_struct *const sig = tsk->signal;
 	cputime_t utime, stime, ptime, virt_expires, prof_expires;
 	unsigned long long sched_time, sched_expires;
@@ -1045,12 +1050,13 @@ static void check_process_timers(struct task_struct *tsk,
 	} while (t != tsk);
 	ptime = cputime_add(utime, stime);
 
+	maxfire = 20;
 	prof_expires = cputime_zero;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_entry(timers->next,
 						      struct cpu_timer_list,
 						      entry);
-		if (cputime_lt(ptime, t->expires.cpu)) {
+		if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) {
 			prof_expires = t->expires.cpu;
 			break;
 		}
@@ -1059,12 +1065,13 @@ static void check_process_timers(struct task_struct *tsk,
 	}
 
 	++timers;
+	maxfire = 20;
 	virt_expires = cputime_zero;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_entry(timers->next,
 						      struct cpu_timer_list,
 						      entry);
-		if (cputime_lt(utime, t->expires.cpu)) {
+		if (!--maxfire || cputime_lt(utime, t->expires.cpu)) {
 			virt_expires = t->expires.cpu;
 			break;
 		}
@@ -1073,12 +1080,13 @@ static void check_process_timers(struct task_struct *tsk,
 	}
 
 	++timers;
+	maxfire = 20;
 	sched_expires = 0;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_entry(timers->next,
 						      struct cpu_timer_list,
 						      entry);
-		if (sched_time < t->expires.sched) {
+		if (!--maxfire || sched_time < t->expires.sched) {
 			sched_expires = t->expires.sched;
 			break;
 		}
-- 
cgit v1.2.3