From 9796fdd829da626374458e8706daedcc0e432ddd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:22:03 -0400 Subject: [PATCH] gfp_t: kernel/* Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/audit.c | 6 +++--- kernel/auditsc.c | 2 +- kernel/kexec.c | 7 +++---- kernel/power/swsusp.c | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index aefa73a8a58..0c56320d38d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -133,7 +133,7 @@ struct audit_buffer { struct list_head list; struct sk_buff *skb; /* formatted skb ready to send */ struct audit_context *ctx; /* NULL or associated context */ - int gfp_mask; + gfp_t gfp_mask; }; static void audit_set_pid(struct audit_buffer *ab, pid_t pid) @@ -647,7 +647,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, * will be written at syscall exit. If there is no associated task, tsk * should be NULL. */ -struct audit_buffer *audit_log_start(struct audit_context *ctx, int gfp_mask, +struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab = NULL; @@ -879,7 +879,7 @@ void audit_log_end(struct audit_buffer *ab) /* Log an audit record. This is a convenience function that calls * audit_log_start, audit_log_vformat, and audit_log_end. It may be * called in any context. */ -void audit_log(struct audit_context *ctx, int gfp_mask, int type, +void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) { struct audit_buffer *ab; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 88696f639aa..d8a68509e72 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -803,7 +803,7 @@ static void audit_log_task_info(struct audit_buffer *ab) up_read(&mm->mmap_sem); } -static void audit_log_exit(struct audit_context *context, unsigned int gfp_mask) +static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) { int i; struct audit_buffer *ab; diff --git a/kernel/kexec.c b/kernel/kexec.c index cdd4dcd8fb6..36c5d9cd4cc 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -90,7 +90,7 @@ int kexec_should_crash(struct task_struct *p) static int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); static struct page *kimage_alloc_page(struct kimage *image, - unsigned int gfp_mask, + gfp_t gfp_mask, unsigned long dest); static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, @@ -326,8 +326,7 @@ static int kimage_is_destination_range(struct kimage *image, return 0; } -static struct page *kimage_alloc_pages(unsigned int gfp_mask, - unsigned int order) +static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; @@ -654,7 +653,7 @@ static kimage_entry_t *kimage_dst_used(struct kimage *image, } static struct page *kimage_alloc_page(struct kimage *image, - unsigned int gfp_mask, + gfp_t gfp_mask, unsigned long destination) { /* diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 2d5c4567644..10bc5ec496d 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -1095,7 +1095,7 @@ static inline void eat_page(void *page) *eaten_memory = c; } -unsigned long get_usable_page(unsigned gfp_mask) +unsigned long get_usable_page(gfp_t gfp_mask) { unsigned long m; -- cgit v1.2.3 From 8d027de54c77d38eedc9b331c7a2a39807d34691 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 29 Oct 2005 19:37:40 +0400 Subject: [PATCH] fix ->signal->live leak in copy_process() 
exit_signal() (called from copy_process's error path) should decrement ->signal->live, otherwise forking process will miss 'group_dead' in do_exit(). Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f2b96b08fb4..6904bbbfe11 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -406,6 +406,8 @@ void __exit_signal(struct task_struct *tsk) void exit_signal(struct task_struct *tsk) { + atomic_dec(&tsk->signal->live); + write_lock_irq(&tasklist_lock); __exit_signal(tsk); write_unlock_irq(&tasklist_lock); -- cgit v1.2.3 From 943eae03143790c71cf42fe13529f1b74ceb0266 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 29 Oct 2005 07:32:07 +0100 Subject: [PATCH] missing exports of do_settimeofday() variants frv, sh64, ia64 and sparc64 do not have do_settimeofday() exported (the last two are using variant in kernel/time.c). Exports added to match the rest of architectures. Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/time.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index 40c2410ac99..a3c2100470e 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -532,6 +532,7 @@ int do_settimeofday (struct timespec *tv) clock_was_set(); return 0; } +EXPORT_SYMBOL(do_settimeofday); void do_gettimeofday (struct timeval *tv) { -- cgit v1.2.3 From 4b8f573b5db02a3017afbba49026a6aef480174f Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Sat, 29 Oct 2005 18:15:42 -0700 Subject: [PATCH] TIMERS: add missing compensation for HZ == 250 Add missing compensation for (HZ == 250) != (1 << SHIFT_HZ) in second_overflow(). Signed-off-by: YOSHIFUJI Hideaki Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 3ba10fa35b6..6a2e5f8dc72 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -752,6 +752,15 @@ static void second_overflow(void) else time_adj += (time_adj >> 2) + (time_adj >> 5); #endif +#if HZ == 250 + /* Compensate for (HZ==250) != (1 << SHIFT_HZ). + * Add 1.5625% and 0.78125% to get 255.85938; => only 0.05% error (p. 14) + */ + if (time_adj < 0) + time_adj -= (-time_adj >> 6) + (-time_adj >> 7); + else + time_adj += (time_adj >> 6) + (time_adj >> 7); +#endif #if HZ == 1000 /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) -- cgit v1.2.3 From ab50b8ed818016cfecd747d6d4bb9139986bc029 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:15:56 -0700 Subject: [PATCH] mm: vm_stat_account unshackled The original vm_stat_account has fallen into disuse, with only one user, and only one user of vm_stat_unaccount. It's easier to keep track if we convert them all to __vm_stat_account, then free it from its __shackles. 
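As an aside on the HZ == 250 hunk in second_overflow() above: the two added shifts are a fixed-point approximation of the 256/250 ratio between 1 << SHIFT_HZ and the real tick rate. The following standalone userspace check (plain C, not kernel code; the sample time_adj value is arbitrary) reproduces the arithmetic in the patch comment:

#include <stdio.h>

int main(void)
{
	long time_adj = 1 << 20;	/* arbitrary positive sample value */
	/* The shifts added by the patch: 1/64 + 1/128 of time_adj. */
	long comp = (time_adj >> 6) + (time_adj >> 7);
	double factor = 1.0 + 1.0 / 64 + 1.0 / 128;	/* 1.0234375 */

	printf("compensation   = %ld\n", comp);
	printf("250 * factor   = %.5f (target: 256 = 1 << SHIFT_HZ)\n", 250.0 * factor);
	printf("residual error = %.3f%%\n", (256.0 - 250.0 * factor) / 256.0 * 100.0);
	return 0;
}

Running it prints 255.85938 for 250 * factor, matching the value in the patch comment, with a residual error of about 0.055%.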
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 280bd44ac44..e2ff11f8c1b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -212,7 +212,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) if (mpnt->vm_flags & VM_DONTCOPY) { long pages = vma_pages(mpnt); mm->total_vm -= pages; - __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, + vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, -pages); continue; } -- cgit v1.2.3 From 404351e67a9facb475abf1492245374a28d13e90 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:04 -0700 Subject: [PATCH] mm: mm_init set_mm_counters How is anon_rss initialized? In dup_mmap, and by mm_alloc's memset; but that's not so good if an mm_counter_t is a special type. And how is rss initialized? By set_mm_counter, all over the place. Come on, we just need to initialize them both at once by set_mm_counter in mm_init (which follows the memcpy when forking). Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e2ff11f8c1b..25caa02e2ea 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -198,8 +198,6 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) mm->free_area_cache = oldmm->mmap_base; mm->cached_hole_size = ~0UL; mm->map_count = 0; - set_mm_counter(mm, rss, 0); - set_mm_counter(mm, anon_rss, 0); cpus_clear(mm->cpu_vm_mask); mm->mm_rb = RB_ROOT; rb_link = &mm->mm_rb.rb_node; @@ -323,6 +321,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm) INIT_LIST_HEAD(&mm->mmlist); mm->core_waiters = 0; mm->nr_ptes = 0; + set_mm_counter(mm, rss, 0); + set_mm_counter(mm, anon_rss, 0); spin_lock_init(&mm->page_table_lock); rwlock_init(&mm->ioctx_list_lock); mm->ioctx_list = NULL; -- cgit v1.2.3 From 4294621f41a85497019fae64341aa5351a1921b7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:05 -0700 Subject: [PATCH] mm: rss = file_rss + anon_rss I was lazy when we added anon_rss, and chose to change as few places as possible. So currently each anonymous page has to be counted twice, in rss and in anon_rss. Which won't be so good if those are atomic counts in some configurations. Change that around: keep file_rss and anon_rss separately, and add them together (with get_mm_rss macro) when the total is needed - reading two atomics is much cheaper than updating two atomics. And update anon_rss upfront, typically in memory.c, not tucked away in page_add_anon_rmap. 
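A minimal userspace sketch of that accounting scheme (an illustration only, not the kernel implementation: the real counters are fields of struct mm_struct and may be atomic, and the structure here is purely a stand-in). Each mapped page bumps exactly one of the two counters, and the total is computed only when a reader asks for it:

#include <stdio.h>

/* Toy model: file-backed and anonymous pages are counted separately. */
struct mm_model {
	long file_rss;
	long anon_rss;
};

/* Mirrors the idea of the get_mm_rss() macro: total = file_rss + anon_rss. */
static long get_mm_rss(const struct mm_model *mm)
{
	return mm->file_rss + mm->anon_rss;
}

int main(void)
{
	struct mm_model mm = { .file_rss = 0, .anon_rss = 0 };

	mm.file_rss += 3;	/* e.g. three file-backed pages mapped in */
	mm.anon_rss += 2;	/* e.g. two anonymous pages faulted in */

	printf("file_rss=%ld anon_rss=%ld rss=%ld\n",
	       mm.file_rss, mm.anon_rss, get_mm_rss(&mm));
	return 0;
}

The hot paths touch a single counter each; the addition only happens on the comparatively rare read side, which is the cheapness argument made above.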
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 2 +- kernel/fork.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index b756f527497..2e3f4a47e7d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -553,7 +553,7 @@ void acct_update_integrals(struct task_struct *tsk) if (delta == 0) return; tsk->acct_stimexpd = tsk->stime; - tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss); + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; } } diff --git a/kernel/fork.c b/kernel/fork.c index 25caa02e2ea..2048ed7b587 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -321,7 +321,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm) INIT_LIST_HEAD(&mm->mmlist); mm->core_waiters = 0; mm->nr_ptes = 0; - set_mm_counter(mm, rss, 0); + set_mm_counter(mm, file_rss, 0); set_mm_counter(mm, anon_rss, 0); spin_lock_init(&mm->page_table_lock); rwlock_init(&mm->ioctx_list_lock); @@ -499,7 +499,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) if (retval) goto free_pt; - mm->hiwater_rss = get_mm_counter(mm,rss); + mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; good_mm: -- cgit v1.2.3 From fd3e42fcc888a773572282575d2fdbf5cfd6216e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:06 -0700 Subject: [PATCH] mm: dup_mmap use oldmm more Use the parent's oldmm throughout dup_mmap, instead of perversely going back to current->mm. (Can you hear the sigh of relief from those mpnts? Usually I squash them, but not today.) Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 2048ed7b587..0e7fe4a8a8d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -182,16 +182,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) } #ifdef CONFIG_MMU -static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) +static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - struct vm_area_struct * mpnt, *tmp, **pprev; + struct vm_area_struct *mpnt, *tmp, **pprev; struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; struct mempolicy *pol; down_write(&oldmm->mmap_sem); - flush_cache_mm(current->mm); + flush_cache_mm(oldmm); mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; @@ -204,7 +204,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) rb_parent = NULL; pprev = &mm->mmap; - for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { + for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { @@ -265,7 +265,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) rb_parent = &tmp->vm_rb; mm->map_count++; - retval = copy_page_range(mm, current->mm, tmp); + retval = copy_page_range(mm, oldmm, tmp); spin_unlock(&mm->page_table_lock); if (tmp->vm_ops && tmp->vm_ops->open) @@ -277,7 +277,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) retval = 0; out: - flush_tlb_mm(current->mm); + flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); return retval; fail_nomem_policy: -- cgit v1.2.3 From 7ee78232501ea9de2b6c8f10d32c9a0fee541357 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 
18:16:08 -0700 Subject: [PATCH] mm: dup_mmap down new mmap_sem One anomaly remains from when Andrea rationalized the responsibilities of mmap_sem and page_table_lock: in dup_mmap we add vmas to the child holding its page_table_lock, but not the mmap_sem which normally guards the vma list and rbtree. Which could be an issue for unuse_mm: though since it just walks down the list (today with page_table_lock, tomorrow not), it's probably okay. Will need a memory barrier? Oh, keep it simple, Nick and I agreed, no harm in taking child's mmap_sem here. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 0e7fe4a8a8d..2a587b3224e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -192,6 +192,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) down_write(&oldmm->mmap_sem); flush_cache_mm(oldmm); + down_write(&mm->mmap_sem); + mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; @@ -251,10 +253,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) } /* - * Link in the new vma and copy the page table entries: - * link in first so that swapoff can see swap entries. - * Note that, exceptionally, here the vma is inserted - * without holding mm->mmap_sem. + * Link in the new vma and copy the page table entries. */ spin_lock(&mm->page_table_lock); *pprev = tmp; @@ -275,8 +274,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) goto out; } retval = 0; - out: + up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); return retval; -- cgit v1.2.3 From b5810039a54e5babf428e9a1e89fc1940fabff11 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 29 Oct 2005 18:16:12 -0700 Subject: [PATCH] core remove PageReserved Remove PageReserved() calls from core code by tightening VM_RESERVED handling in mm/ to cover PageReserved functionality. PageReserved special casing is removed from get_page and put_page. All setting and clearing of PageReserved is retained, and it is now flagged in the page_alloc checks to help ensure we don't introduce any refcount based freeing of Reserved pages. MAP_PRIVATE, PROT_WRITE of VM_RESERVED regions is tentatively being deprecated. We never completely handled it correctly anyway, and it is to be reintroduced in future if required (Hugh has a proof of concept). Once PageReserved() calls are removed from kernel/power/swsusp.c, and all arch/ and driver code, the Set and Clear calls, and the PG_reserved bit can be trivially removed. Last real user of PageReserved is swsusp, which uses PageReserved to determine whether a struct page points to valid memory or not. This still needs to be addressed (a generic page_is_ram() should work). A last caveat: the ZERO_PAGE is now refcounted and managed with rmap (and thus mapcounted and counted towards shared rss). These writes to the struct page could cause excessive cacheline bouncing on big systems. There are a number of ways this could be addressed if it is an issue.
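A userspace toy (not the mm code) of the refcounting point above: once get_page/put_page stop special-casing PageReserved, pages that must never be freed have to be kept off the freeing path by other means. In this toy that is modelled, purely as an assumption for illustration, by a permanent extra reference; in the kernel itself, as the message says, the page_alloc checks are what flag any refcount-based freeing of reserved pages.

#include <stdio.h>

/* Toy refcounting with no PageReserved special case in get/put. */
struct toy_page {
	int count;
	const char *name;
};

static void get_page(struct toy_page *p)
{
	p->count++;
}

static void put_page(struct toy_page *p)
{
	if (--p->count == 0)
		printf("%s: last reference dropped, would be freed\n", p->name);
	else
		printf("%s: count now %d\n", p->name, p->count);
}

int main(void)
{
	/* A never-freeable page (think ZERO_PAGE) holds a permanent ref. */
	struct toy_page zero_page = { .count = 1, .name = "zero_page" };
	struct toy_page user_page = { .count = 0, .name = "user_page" };

	get_page(&user_page);	/* mapped once */
	get_page(&zero_page);	/* mapped once, on top of the permanent ref */

	put_page(&user_page);	/* count hits zero: normal freeing path */
	put_page(&zero_page);	/* falls back to the permanent ref, never freed */
	return 0;
}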
Signed-off-by: Nick Piggin Refcount bug fix for filemap_xip.c Signed-off-by: Carsten Otte Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swsusp.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 10bc5ec496d..016504ccfcc 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -578,15 +578,23 @@ static int save_highmem_zone(struct zone *zone) continue; page = pfn_to_page(pfn); /* - * This condition results from rvmalloc() sans vmalloc_32() - * and architectural memory reservations. This should be - * corrected eventually when the cases giving rise to this - * are better understood. + * PageReserved results from rvmalloc() sans vmalloc_32() + * and architectural memory reservations. + * + * rvmalloc should not cause this, because all implementations + * appear to always be using vmalloc_32 on architectures with + * highmem. This is a good thing, because we would like to save + * rvmalloc pages. + * + * It appears to be triggered by pages which do not point to + * valid memory (see arch/i386/mm/init.c:one_highpage_init(), + * which sets PageReserved if the page does not point to valid + * RAM. + * + * XXX: must remove usage of PageReserved! */ - if (PageReserved(page)) { - printk("highmem reserved page?!\n"); + if (PageReserved(page)) continue; - } BUG_ON(PageNosave(page)); if (PageNosaveFree(page)) continue; @@ -672,10 +680,9 @@ static int saveable(struct zone * zone, unsigned long * zone_pfn) return 0; page = pfn_to_page(pfn); - BUG_ON(PageReserved(page) && PageNosave(page)); if (PageNosave(page)) return 0; - if (PageReserved(page) && pfn_is_nosave(pfn)) { + if (pfn_is_nosave(pfn)) { pr_debug("[nosave pfn 0x%lx]", pfn); return 0; } -- cgit v1.2.3 From 365e9c87a982c03d0af3886e29d877f581b59611 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:18 -0700 Subject: [PATCH] mm: update_hiwaters just in time update_mem_hiwater has attracted various criticisms, in particular from those concerned with mm scalability. Originally it was called whenever rss or total_vm got raised. Then many of those callsites were replaced by a timer tick call from account_system_time. Now Frank van Maarseveen reports that to be found inadequate. How about this? Works for Frank. Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros update_hiwater_rss and update_hiwater_vm. Don't attempt to keep mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually by 1): those are hot paths. Do the opposite, update only when about to lower rss (usually by many), or just before final accounting in do_exit. Handle mm->hiwater_vm in the same way, though it's much less of an issue. Demand that whoever collects these hiwater statistics do the work of taking the maximum with rss or total_vm. And there has been no collector of these hiwater statistics in the tree. The new convention needs an example, so match Frank's usage by adding a VmPeak line above VmSize to /proc//status, and also a VmHWM line above VmRSS (High-Water-Mark or High-Water-Memory). There was a particular anomaly during mremap move, that hiwater_vm might be captured too high. A fleeting such anomaly remains, but it's quickly corrected now, whereas before it would stick. What locking? None: if the app is racy then these statistics will be racy, it's not worth any overhead to make them exact. 
But whenever it suits, hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under page_table_lock (for now) or with preemption disabled (later on): without going to any trouble, minimize the time between reading current values and updating, to minimize those occasions when a racing thread bumps a count up and back down in between. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 5 ++++- kernel/sched.c | 2 -- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 3b25b182d2b..79f52b85d6e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -839,7 +839,10 @@ fastcall NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - update_mem_hiwater(tsk); + if (tsk->mm) { + update_hiwater_rss(tsk->mm); + update_hiwater_vm(tsk->mm); + } group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { del_timer_sync(&tsk->signal->real_timer); diff --git a/kernel/sched.c b/kernel/sched.c index 1e5cafdf4e2..4f26c544d02 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2511,8 +2511,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cpustat->idle = cputime64_add(cpustat->idle, tmp); /* Account for system time used */ acct_update_integrals(p); - /* Update rss highwater mark */ - update_mem_hiwater(p); } /* -- cgit v1.2.3 From c74df32c724a1652ad8399b4891bb02c9d43743a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:23 -0700 Subject: [PATCH] mm: ptd_alloc take ptlock Second step in pushing down the page_table_lock. Remove the temporary bridging hack from __pud_alloc, __pmd_alloc, __pte_alloc: expect callers not to hold page_table_lock, whether it's on init_mm or a user mm; take page_table_lock internally to check if a racing task already allocated. Convert their callers from common code. But avoid coming back to change them again later: instead of moving the spin_lock(&mm->page_table_lock) down, switch over to new macros pte_alloc_map_lock and pte_unmap_unlock, which encapsulate the mapping+locking and unlocking+unmapping together, and in the end may use alternatives to the mm page_table_lock itself. These callers all hold mmap_sem (some exclusively, some not), so at no level can a page table be whipped away from beneath them; and pte_alloc uses the "atomic" pmd_present to test whether it needs to allocate. It appears that on all arches we can safely descend without page_table_lock. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 2a587b3224e..8a069612eac 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -255,7 +255,6 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) /* * Link in the new vma and copy the page table entries. */ - spin_lock(&mm->page_table_lock); *pprev = tmp; pprev = &tmp->vm_next; @@ -265,7 +264,6 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->map_count++; retval = copy_page_range(mm, oldmm, tmp); - spin_unlock(&mm->page_table_lock); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); -- cgit v1.2.3 From deceb6cd17e6dfafe4c4f81b1b4153bc41b2cb70 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:33 -0700 Subject: [PATCH] mm: follow_page with inner ptlock Final step in pushing down common core's page_table_lock. 
follow_page no longer wants caller to hold page_table_lock, uses pte_offset_map_lock itself; and so no page_table_lock is taken in get_user_pages itself. But get_user_pages (and get_futex_key) do then need follow_page to pin the page for them: take Daniel's suggestion of bitflags to follow_page. Need one for WRITE, another for TOUCH (it was the accessed flag before: vanished along with check_user_page_readable, but surely get_numa_maps is wrong to mark every page it finds as accessed), another for GET. And another, ANON to dispose of untouched_anonymous_page: it seems silly for that to descend a second time, let follow_page observe if there was no page table and return ZERO_PAGE if so. Fix minor bug in that: check VM_LOCKED - make_pages_present ought to make readonly anonymous present. Give get_numa_maps a cond_resched while we're there. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index ca05fe6a70b..3b4d5ad44cc 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -205,15 +205,13 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) /* * Do a quick atomic lookup first - this is the fastpath. */ - spin_lock(¤t->mm->page_table_lock); - page = follow_page(mm, uaddr, 0); + page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET); if (likely(page != NULL)) { key->shared.pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - spin_unlock(¤t->mm->page_table_lock); + put_page(page); return 0; } - spin_unlock(¤t->mm->page_table_lock); /* * Do it the general way. -- cgit v1.2.3 From 4c21e2f2441dc5fbb957b030333f5a3f2d02dea7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:40 -0700 Subject: [PATCH] mm: split page table lock Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with a many-threaded application which concurrently initializes different parts of a large anonymous area. This patch corrects that, by using a separate spinlock per page table page, to guard the page table entries in that page, instead of using the mm's single page_table_lock. (But even then, page_table_lock is still used to guard page table allocation, and anon_vma allocation.) In this implementation, the spinlock is tucked inside the struct page of the page table page: with a BUILD_BUG_ON in case it overflows - which it would in the case of 32-bit PA-RISC with spinlock debugging enabled. Splitting the lock is not quite for free: another cacheline access. Ideally, I suppose we would use split ptlock only for multi-threaded processes on multi-cpu machines; but deciding that dynamically would have its own costs. So for now enable it by config, at some number of cpus - since the Kconfig language doesn't support inequalities, let preprocessor compare that with NR_CPUS. But I don't think it's worth being user-configurable: for good testing of both split and unsplit configs, split now at 4 cpus, and perhaps change that to 8 later. There is a benefit even for singly threaded processes: kswapd can be attacking one part of the mm while another part is busy faulting. 
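A toy illustration of the lock-selection idea, using pthread mutexes in userspace rather than kernel spinlocks; the names only loosely mirror the kernel's, and the config macro is a stand-in for the CPU-count cutoff described above:

#include <pthread.h>
#include <stdio.h>

/* Stand-in for the split-ptlock config deciding the locking scheme. */
#define USE_SPLIT_PTLOCKS 1

/* Toy "struct page": the split scheme tucks a lock into the struct page
 * that describes each page-table page. */
struct toy_page {
	pthread_mutex_t ptl;
};

struct toy_mm {
	pthread_mutex_t page_table_lock;	/* the old single lock */
};

/* Pick the lock that guards the ptes in one page-table page. */
static pthread_mutex_t *pte_lockptr(struct toy_mm *mm, struct toy_page *pt_page)
{
#if USE_SPLIT_PTLOCKS
	(void)mm;
	return &pt_page->ptl;		/* one lock per page-table page */
#else
	(void)pt_page;
	return &mm->page_table_lock;	/* everything serializes on the mm */
#endif
}

int main(void)
{
	struct toy_mm mm = { .page_table_lock = PTHREAD_MUTEX_INITIALIZER };
	struct toy_page pt_a = { .ptl = PTHREAD_MUTEX_INITIALIZER };
	struct toy_page pt_b = { .ptl = PTHREAD_MUTEX_INITIALIZER };

	printf("page-table pages a and b %s a lock\n",
	       pte_lockptr(&mm, &pt_a) == pte_lockptr(&mm, &pt_b) ?
	       "share" : "do not share");
	return 0;
}

With the split configuration the two page-table pages get distinct locks, so faults touching different parts of the address space stop serializing on one mm-wide lock; flipping the macro to 0 models the old single-lock behaviour.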
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 36c5d9cd4cc..2c95848fbce 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -334,7 +334,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) if (pages) { unsigned int count, i; pages->mapping = NULL; - pages->private = order; + set_page_private(pages, order); count = 1 << order; for (i = 0; i < count; i++) SetPageReserved(pages + i); @@ -347,7 +347,7 @@ static void kimage_free_pages(struct page *page) { unsigned int order, count, i; - order = page->private; + order = page_private(page); count = 1 << order; for (i = 0; i < count; i++) ClearPageReserved(page + i); -- cgit v1.2.3 From 4276d32260662d5401a15a0a46e506fb5c8ab563 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sun, 30 Oct 2005 14:59:18 -0800 Subject: [PATCH] Remove redundant configs.o Since CONFIG_IKCONFIG_PROC already depends on CONFIG_IKCONFIG, adding configs.o again is redundant. Signed-off-by: Brian Gerst Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index ff4dc02ce17..980b5e45444 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -22,7 +22,6 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_IKCONFIG) += configs.o -obj-$(CONFIG_IKCONFIG_PROC) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o -- cgit v1.2.3 From c32b6b8e524d2c337767d312814484d9289550cf Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Sun, 30 Oct 2005 14:59:54 -0800 Subject: [PATCH] create and destroy cpufreq sysfs entries based on cpu notifiers cpufreq entries in sysfs should only be populated when CPU is online state. When we either boot with maxcpus=x and then boot the other cpus by echoing to sysfs online file, these entries should be created and destroyed when CPU_DEAD is notified. Same treatement as cache entries under sysfs. We place the processor in the lowest frequency, so hw managed P-State transitions can still work on the other threads to save power. Primary goal was to just make these directories appear/disapper dynamically. There is one in this patch i had to do, which i really dont like myself but probably best if someone handling the cpufreq infrastructure could give this code right treatment if this is not acceptable. I guess its probably good for the first cut. - Converting lock_cpu_hotplug()/unlock_cpu_hotplug() to disable/enable preempt. The locking was smack in the middle of the notification path, when the hotplug is already holding the lock. I tried another solution to avoid this so avoid taking locks if we know we are from notification path. The solution was getting very ugly and i decided this was probably good for this iteration until someone who understands cpufreq could do a better job than me. 
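A schematic sketch of the notifier shape described above (kernel-module style; the toy_* functions are hypothetical stand-ins, not the actual drivers/cpufreq code, and this only illustrates reacting to CPU_ONLINE/CPU_DEAD):

#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/notifier.h>

/* Hypothetical per-cpu setup/teardown hooks standing in for the real
 * cpufreq sysfs add/remove paths. */
static void toy_add_cpu_entries(unsigned int cpu) { }
static void toy_remove_cpu_entries(unsigned int cpu) { }

static int toy_cpu_callback(struct notifier_block *nb,
			    unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:		/* cpu came up: populate sysfs */
		toy_add_cpu_entries(cpu);
		break;
	case CPU_DEAD:			/* cpu went away: tear sysfs down */
		toy_remove_cpu_entries(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block toy_cpu_notifier = {
	.notifier_call = toy_cpu_callback,
};

static int __init toy_register(void)
{
	return register_cpu_notifier(&toy_cpu_notifier);
}
__initcall(toy_register);

The commit above wires this kind of callback into cpufreq so the per-cpu sysfs directories appear when a CPU comes online and are destroyed on CPU_DEAD.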
(akpm: export cpucontrol to GPL modules: drivers/cpufreq/cpufreq_stats.c now does lock_cpu_hotplug()) Signed-off-by: Ashok Raj Signed-off-by: Venkatesh Pallipadi Cc: Dave Jones Cc: Zwane Mwaikambo Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 53d8263ae12..3619e939182 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -17,6 +17,7 @@ /* This protects CPUs going up and down... */ DECLARE_MUTEX(cpucontrol); +EXPORT_SYMBOL_GPL(cpucontrol); static struct notifier_block *cpu_chain; -- cgit v1.2.3 From 351619baf9878731b4272fa10dda0f84f5582241 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 30 Oct 2005 14:59:55 -0800 Subject: [PATCH] swsusp: rework image freeing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following patch makes swsusp use PG_nosave and PG_nosave_free flags to mark pages that should be freed after the state of the system has been restored from the image (or in case of an error during suspend). This allows us to avoid storing metadata in swap twice and to reduce the amount of memory needed by swsusp.  Additionally, it allows us to simplify the code by removing a couple of functions that are no longer necessary. Signed-off-by: Rafael J. Wysocki Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swsusp.c | 159 ++++++++++++++++++++++++-------------------------- 1 file changed, 75 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 016504ccfcc..ae46506e213 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -708,24 +708,28 @@ static void count_data_pages(void) } } - static void copy_data_pages(void) { struct zone *zone; unsigned long zone_pfn; - struct pbe * pbe = pagedir_nosave; + struct pbe *pbe = pagedir_nosave, *p; pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); for_each_zone (zone) { if (is_highmem(zone)) continue; mark_free_pages(zone); + /* This is necessary for swsusp_free() */ + for_each_pb_page (p, pagedir_nosave) + SetPageNosaveFree(virt_to_page(p)); + for_each_pbe(p, pagedir_nosave) + SetPageNosaveFree(virt_to_page(p->address)); for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { if (saveable(zone, &zone_pfn)) { struct page * page; page = pfn_to_page(zone_pfn + zone->zone_start_pfn); BUG_ON(!pbe); - pbe->orig_address = (long) page_address(page); + pbe->orig_address = (unsigned long)page_address(page); /* copy_page is not usable for copying task structs. */ memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); pbe = pbe->next; @@ -736,15 +740,6 @@ static void copy_data_pages(void) } -/** - * calc_nr - Determine the number of pages needed for a pbe list. 
- */ - -static int calc_nr(int nr_copy) -{ - return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1); -} - /** * free_pagedir - free pages allocated with alloc_pagedir() */ @@ -755,6 +750,8 @@ static inline void free_pagedir(struct pbe *pblist) while (pblist) { pbe = (pblist + PB_PAGE_SKIP)->next; + ClearPageNosave(virt_to_page(pblist)); + ClearPageNosaveFree(virt_to_page(pblist)); free_page((unsigned long)pblist); pblist = pbe; } @@ -800,6 +797,16 @@ static void create_pbe_list(struct pbe *pblist, unsigned nr_pages) pr_debug("create_pbe_list(): initialized %d PBEs\n", num); } +static void *alloc_image_page(void) +{ + void *res = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); + if (res) { + SetPageNosave(virt_to_page(res)); + SetPageNosaveFree(virt_to_page(res)); + } + return res; +} + /** * alloc_pagedir - Allocate the page directory. * @@ -822,11 +829,11 @@ static struct pbe * alloc_pagedir(unsigned nr_pages) return NULL; pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); - pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); + pblist = (struct pbe *)alloc_image_page(); for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; pbe = pbe->next, num += PBES_PER_PAGE) { pbe += PB_PAGE_SKIP; - pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); + pbe->next = (struct pbe *)alloc_image_page(); } if (!pbe) { /* get_zeroed_page() failed */ free_pagedir(pblist); @@ -836,51 +843,29 @@ static struct pbe * alloc_pagedir(unsigned nr_pages) } /** - * free_image_pages - Free pages allocated for snapshot + * Free pages we allocated for suspend. Suspend pages are alocated + * before atomic copy, so we need to free them after resume. */ -static void free_image_pages(void) +void swsusp_free(void) { - struct pbe * p; + struct zone *zone; + unsigned long zone_pfn; - for_each_pbe (p, pagedir_save) { - if (p->address) { - ClearPageNosave(virt_to_page(p->address)); - free_page(p->address); - p->address = 0; - } + for_each_zone(zone) { + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { + struct page * page; + page = pfn_to_page(zone_pfn + zone->zone_start_pfn); + if (PageNosave(page) && PageNosaveFree(page)) { + ClearPageNosave(page); + ClearPageNosaveFree(page); + free_page((long) page_address(page)); + } + } } } -/** - * alloc_image_pages - Allocate pages for the snapshot. - */ - -static int alloc_image_pages(void) -{ - struct pbe * p; - - for_each_pbe (p, pagedir_save) { - p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); - if (!p->address) - return -ENOMEM; - SetPageNosave(virt_to_page(p->address)); - } - return 0; -} - -/* Free pages we allocated for suspend. Suspend pages are alocated - * before atomic copy, so we need to free them after resume. - */ -void swsusp_free(void) -{ - BUG_ON(PageNosave(virt_to_page(pagedir_save))); - BUG_ON(PageNosaveFree(virt_to_page(pagedir_save))); - free_image_pages(); - free_pagedir(pagedir_save); -} - - /** * enough_free_mem - Make sure we enough free memory to snapshot. 
* @@ -890,12 +875,9 @@ void swsusp_free(void) static int enough_free_mem(void) { - if (nr_free_pages() < (nr_copy_pages + PAGES_FOR_IO)) { - pr_debug("swsusp: Not enough free pages: Have %d\n", - nr_free_pages()); - return 0; - } - return 1; + pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); + return nr_free_pages() > (nr_copy_pages + PAGES_FOR_IO + + nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE)); } @@ -914,33 +896,16 @@ static int enough_swap(void) struct sysinfo i; si_swapinfo(&i); - if (i.freeswap < (nr_copy_pages + PAGES_FOR_IO)) { - pr_debug("swsusp: Not enough swap. Need %ld\n",i.freeswap); - return 0; - } - return 1; + pr_debug("swsusp: available swap: %lu pages\n", i.freeswap); + return i.freeswap > (nr_copy_pages + PAGES_FOR_IO + + nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE)); } static int swsusp_alloc(void) { - int error; + struct pbe * p; pagedir_nosave = NULL; - nr_copy_pages = calc_nr(nr_copy_pages); - nr_copy_pages_check = nr_copy_pages; - - pr_debug("suspend: (pages needed: %d + %d free: %d)\n", - nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); - - if (!enough_free_mem()) - return -ENOMEM; - - if (!enough_swap()) - return -ENOSPC; - - if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE + - !!(nr_copy_pages % PBES_PER_PAGE)) - return -ENOSPC; if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); @@ -948,10 +913,14 @@ static int swsusp_alloc(void) } create_pbe_list(pagedir_save, nr_copy_pages); pagedir_nosave = pagedir_save; - if ((error = alloc_image_pages())) { - printk(KERN_ERR "suspend: Allocating image pages failed.\n"); - swsusp_free(); - return error; + + for_each_pbe (p, pagedir_save) { + p->address = (unsigned long)alloc_image_page(); + if (!p->address) { + printk(KERN_ERR "suspend: Allocating image pages failed.\n"); + swsusp_free(); + return -ENOMEM; + } } return 0; @@ -963,7 +932,7 @@ static int suspend_prepare_image(void) pr_debug("swsusp: critical section: \n"); if (save_highmem()) { - printk(KERN_CRIT "Suspend machine: Not enough free pages for highmem\n"); + printk(KERN_CRIT "swsusp: Not enough free pages for highmem\n"); restore_highmem(); return -ENOMEM; } @@ -971,6 +940,28 @@ static int suspend_prepare_image(void) drain_local_pages(); count_data_pages(); printk("swsusp: Need to copy %u pages\n", nr_copy_pages); + nr_copy_pages_check = nr_copy_pages; + + pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n", + nr_copy_pages, + nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE), + PAGES_FOR_IO, nr_free_pages()); + + if (!enough_free_mem()) { + printk(KERN_ERR "swsusp: Not enough free memory\n"); + return -ENOMEM; + } + + if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE + + !!(nr_copy_pages % PBES_PER_PAGE)) { + printk(KERN_ERR "swsusp: Too many image pages\n"); + return -ENOSPC; + } + + if (!enough_swap()) { + printk(KERN_ERR "swsusp: Not enough free swap\n"); + return -ENOSPC; + } error = swsusp_alloc(); if (error) -- cgit v1.2.3 From 25761b6eb7b33823bcfff6bfe2a015badcd76fb8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 30 Oct 2005 14:59:56 -0800 Subject: [PATCH] swsusp: move snapshot functionality to separate file The following patch moves the functionality of swsusp related to creating and handling the snapshot of memory to a separate file, snapshot.c This should enable us to untangle the code in the future and eventually to implement some parts of swsusp.c in the user space. The patch does not change the code. 
Signed-off-by: Rafael J. Wysocki Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/Makefile | 2 +- kernel/power/power.h | 17 ++ kernel/power/snapshot.c | 464 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/power/swsusp.c | 440 +-------------------------------------------- 4 files changed, 486 insertions(+), 437 deletions(-) create mode 100644 kernel/power/snapshot.c (limited to 'kernel') diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 2f438d0eaa1..c71eb4579c0 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -4,7 +4,7 @@ EXTRA_CFLAGS += -DDEBUG endif obj-y := main.o process.o console.o pm.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o obj-$(CONFIG_SUSPEND_SMP) += smp.o diff --git a/kernel/power/power.h b/kernel/power/power.h index 6748de23e83..e54dd8435de 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -53,3 +53,20 @@ extern void thaw_processes(void); extern int pm_prepare_console(void); extern void pm_restore_console(void); + + +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + +extern unsigned int nr_copy_pages; +extern suspend_pagedir_t *pagedir_nosave; +extern suspend_pagedir_t *pagedir_save; + +extern asmlinkage int swsusp_arch_suspend(void); +extern asmlinkage int swsusp_arch_resume(void); + +extern int restore_highmem(void); +extern void free_pagedir(struct pbe *pblist); +extern struct pbe * alloc_pagedir(unsigned nr_pages); +extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); +extern int enough_swap(void); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c new file mode 100644 index 00000000000..0f0a7f306b0 --- /dev/null +++ b/kernel/power/snapshot.c @@ -0,0 +1,464 @@ +/* + * linux/kernel/power/swsusp.c + * + * This file is to realize architecture-independent + * machine suspend feature using pretty near only high-level routines + * + * Copyright (C) 1998-2005 Pavel Machek + * + * This file is released under the GPLv2, and is based on swsusp.c. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "power.h" + + + + +#ifdef CONFIG_HIGHMEM +struct highmem_page { + char *data; + struct page *page; + struct highmem_page *next; +}; + +static struct highmem_page *highmem_copy; + +static int save_highmem_zone(struct zone *zone) +{ + unsigned long zone_pfn; + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { + struct page *page; + struct highmem_page *save; + void *kaddr; + unsigned long pfn = zone_pfn + zone->zone_start_pfn; + + if (!(pfn%1000)) + printk("."); + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + /* + * This condition results from rvmalloc() sans vmalloc_32() + * and architectural memory reservations. This should be + * corrected eventually when the cases giving rise to this + * are better understood. 
+ */ + if (PageReserved(page)) { + printk("highmem reserved page?!\n"); + continue; + } + BUG_ON(PageNosave(page)); + if (PageNosaveFree(page)) + continue; + save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); + if (!save) + return -ENOMEM; + save->next = highmem_copy; + save->page = page; + save->data = (void *) get_zeroed_page(GFP_ATOMIC); + if (!save->data) { + kfree(save); + return -ENOMEM; + } + kaddr = kmap_atomic(page, KM_USER0); + memcpy(save->data, kaddr, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + highmem_copy = save; + } + return 0; +} +#endif /* CONFIG_HIGHMEM */ + + +static int save_highmem(void) +{ +#ifdef CONFIG_HIGHMEM + struct zone *zone; + int res = 0; + + pr_debug("swsusp: Saving Highmem\n"); + for_each_zone (zone) { + if (is_highmem(zone)) + res = save_highmem_zone(zone); + if (res) + return res; + } +#endif + return 0; +} + +int restore_highmem(void) +{ +#ifdef CONFIG_HIGHMEM + printk("swsusp: Restoring Highmem\n"); + while (highmem_copy) { + struct highmem_page *save = highmem_copy; + void *kaddr; + highmem_copy = save->next; + + kaddr = kmap_atomic(save->page, KM_USER0); + memcpy(kaddr, save->data, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + free_page((long) save->data); + kfree(save); + } +#endif + return 0; +} + + +static int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} + +/** + * saveable - Determine whether a page should be cloned or not. + * @pfn: The page + * + * We save a page if it's Reserved, and not in the range of pages + * statically defined as 'unsaveable', or if it isn't reserved, and + * isn't part of a free chunk of pages. + */ + +static int saveable(struct zone * zone, unsigned long * zone_pfn) +{ + unsigned long pfn = *zone_pfn + zone->zone_start_pfn; + struct page * page; + + if (!pfn_valid(pfn)) + return 0; + + page = pfn_to_page(pfn); + BUG_ON(PageReserved(page) && PageNosave(page)); + if (PageNosave(page)) + return 0; + if (PageReserved(page) && pfn_is_nosave(pfn)) { + pr_debug("[nosave pfn 0x%lx]", pfn); + return 0; + } + if (PageNosaveFree(page)) + return 0; + + return 1; +} + +static void count_data_pages(void) +{ + struct zone *zone; + unsigned long zone_pfn; + + nr_copy_pages = 0; + + for_each_zone (zone) { + if (is_highmem(zone)) + continue; + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + nr_copy_pages += saveable(zone, &zone_pfn); + } +} + +static void copy_data_pages(void) +{ + struct zone *zone; + unsigned long zone_pfn; + struct pbe *pbe = pagedir_nosave, *p; + + pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); + for_each_zone (zone) { + if (is_highmem(zone)) + continue; + mark_free_pages(zone); + /* This is necessary for swsusp_free() */ + for_each_pb_page (p, pagedir_nosave) + SetPageNosaveFree(virt_to_page(p)); + for_each_pbe(p, pagedir_nosave) + SetPageNosaveFree(virt_to_page(p->address)); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { + if (saveable(zone, &zone_pfn)) { + struct page * page; + page = pfn_to_page(zone_pfn + zone->zone_start_pfn); + BUG_ON(!pbe); + pbe->orig_address = (unsigned long)page_address(page); + /* copy_page is not usable for copying task structs. 
*/ + memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); + pbe = pbe->next; + } + } + } + BUG_ON(pbe); +} + + +/** + * free_pagedir - free pages allocated with alloc_pagedir() + */ + +void free_pagedir(struct pbe *pblist) +{ + struct pbe *pbe; + + while (pblist) { + pbe = (pblist + PB_PAGE_SKIP)->next; + ClearPageNosave(virt_to_page(pblist)); + ClearPageNosaveFree(virt_to_page(pblist)); + free_page((unsigned long)pblist); + pblist = pbe; + } +} + +/** + * fill_pb_page - Create a list of PBEs on a given memory page + */ + +static inline void fill_pb_page(struct pbe *pbpage) +{ + struct pbe *p; + + p = pbpage; + pbpage += PB_PAGE_SKIP; + do + p->next = p + 1; + while (++p < pbpage); +} + +/** + * create_pbe_list - Create a list of PBEs on top of a given chain + * of memory pages allocated with alloc_pagedir() + */ + +void create_pbe_list(struct pbe *pblist, unsigned nr_pages) +{ + struct pbe *pbpage, *p; + unsigned num = PBES_PER_PAGE; + + for_each_pb_page (pbpage, pblist) { + if (num >= nr_pages) + break; + + fill_pb_page(pbpage); + num += PBES_PER_PAGE; + } + if (pbpage) { + for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) + p->next = p + 1; + p->next = NULL; + } + pr_debug("create_pbe_list(): initialized %d PBEs\n", num); +} + +static void *alloc_image_page(void) +{ + void *res = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); + if (res) { + SetPageNosave(virt_to_page(res)); + SetPageNosaveFree(virt_to_page(res)); + } + return res; +} + +/** + * alloc_pagedir - Allocate the page directory. + * + * First, determine exactly how many pages we need and + * allocate them. + * + * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE + * struct pbe elements (pbes) and the last element in the page points + * to the next page. + * + * On each page we set up a list of struct_pbe elements. + */ + +struct pbe * alloc_pagedir(unsigned nr_pages) +{ + unsigned num; + struct pbe *pblist, *pbe; + + if (!nr_pages) + return NULL; + + pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); + pblist = (struct pbe *)alloc_image_page(); + /* FIXME: rewrite this ugly loop */ + for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; + pbe = pbe->next, num += PBES_PER_PAGE) { + pbe += PB_PAGE_SKIP; + pbe->next = (struct pbe *)alloc_image_page(); + } + if (!pbe) { /* get_zeroed_page() failed */ + free_pagedir(pblist); + pblist = NULL; + } + return pblist; +} + +/** + * Free pages we allocated for suspend. Suspend pages are alocated + * before atomic copy, so we need to free them after resume. + */ + +void swsusp_free(void) +{ + struct zone *zone; + unsigned long zone_pfn; + + for_each_zone(zone) { + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { + struct page * page; + page = pfn_to_page(zone_pfn + zone->zone_start_pfn); + if (PageNosave(page) && PageNosaveFree(page)) { + ClearPageNosave(page); + ClearPageNosaveFree(page); + free_page((long) page_address(page)); + } + } + } +} + + +/** + * enough_free_mem - Make sure we enough free memory to snapshot. + * + * Returns TRUE or FALSE after checking the number of available + * free pages. 
+ */ + +static int enough_free_mem(void) +{ + pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); + return nr_free_pages() > (nr_copy_pages + PAGES_FOR_IO + + nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE)); +} + + +static int swsusp_alloc(void) +{ + struct pbe * p; + + pagedir_nosave = NULL; + + if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE + + !!(nr_copy_pages % PBES_PER_PAGE)) + return -ENOSPC; + + if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { + printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); + return -ENOMEM; + } + create_pbe_list(pagedir_save, nr_copy_pages); + pagedir_nosave = pagedir_save; + + for_each_pbe (p, pagedir_save) { + p->address = (unsigned long)alloc_image_page(); + if (!p->address) { + printk(KERN_ERR "suspend: Allocating image pages failed.\n"); + swsusp_free(); + return -ENOMEM; + } + } + + return 0; +} + +static int suspend_prepare_image(void) +{ + int error; + + pr_debug("swsusp: critical section: \n"); + if (save_highmem()) { + printk(KERN_CRIT "swsusp: Not enough free pages for highmem\n"); + restore_highmem(); + return -ENOMEM; + } + + drain_local_pages(); + count_data_pages(); + printk("swsusp: Need to copy %u pages\n", nr_copy_pages); + + pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n", + nr_copy_pages, + nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE), + PAGES_FOR_IO, nr_free_pages()); + + if (!enough_free_mem()) { + printk(KERN_ERR "swsusp: Not enough free memory\n"); + return -ENOMEM; + } + + if (!enough_swap()) { + printk(KERN_ERR "swsusp: Not enough free swap\n"); + return -ENOSPC; + } + + error = swsusp_alloc(); + if (error) + return error; + + /* During allocating of suspend pagedir, new cold pages may appear. + * Kill them. + */ + drain_local_pages(); + copy_data_pages(); + + /* + * End of critical section. From now on, we can write to memory, + * but we should not touch disk. This specially means we must _not_ + * touch swap space! Except we must write out our image of course. + */ + + printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages ); + return 0; +} + + +asmlinkage int swsusp_save(void) +{ + return suspend_prepare_image(); +} diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index ae46506e213..fc50b5d2dd2 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -5,7 +5,7 @@ * machine suspend feature using pretty near only high-level routines * * Copyright (C) 1998-2001 Gabor Kuti - * Copyright (C) 1998,2001-2004 Pavel Machek + * Copyright (C) 1998,2001-2005 Pavel Machek * * This file is released under the GPLv2. * @@ -84,16 +84,10 @@ #define MAXKEY 32 #define MAXIV 32 -/* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; - -/* Variables to be preserved over suspend */ -static int nr_copy_pages_check; - extern char resume_file[]; /* Local variables that should not be affected by save */ -static unsigned int nr_copy_pages __nosavedata = 0; +unsigned int nr_copy_pages __nosavedata = 0; /* Suspend pagedir is allocated before final copy, therefore it must be freed after resume @@ -109,7 +103,7 @@ static unsigned int nr_copy_pages __nosavedata = 0; MMU hardware. 
*/ suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; -static suspend_pagedir_t *pagedir_save; +suspend_pagedir_t *pagedir_save; #define SWSUSP_SIG "S1SUSPEND" @@ -123,12 +117,6 @@ static struct swsusp_header { static struct swsusp_info swsusp_info; -/* - * XXX: We try to keep some more pages free so that I/O operations succeed - * without paging. Might this be more? - */ -#define PAGES_FOR_IO 512 - /* * Saving part... */ @@ -552,335 +540,6 @@ static int write_suspend_image(void) goto Done; } - -#ifdef CONFIG_HIGHMEM -struct highmem_page { - char *data; - struct page *page; - struct highmem_page *next; -}; - -static struct highmem_page *highmem_copy; - -static int save_highmem_zone(struct zone *zone) -{ - unsigned long zone_pfn; - mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { - struct page *page; - struct highmem_page *save; - void *kaddr; - unsigned long pfn = zone_pfn + zone->zone_start_pfn; - - if (!(pfn%1000)) - printk("."); - if (!pfn_valid(pfn)) - continue; - page = pfn_to_page(pfn); - /* - * PageReserved results from rvmalloc() sans vmalloc_32() - * and architectural memory reservations. - * - * rvmalloc should not cause this, because all implementations - * appear to always be using vmalloc_32 on architectures with - * highmem. This is a good thing, because we would like to save - * rvmalloc pages. - * - * It appears to be triggered by pages which do not point to - * valid memory (see arch/i386/mm/init.c:one_highpage_init(), - * which sets PageReserved if the page does not point to valid - * RAM. - * - * XXX: must remove usage of PageReserved! - */ - if (PageReserved(page)) - continue; - BUG_ON(PageNosave(page)); - if (PageNosaveFree(page)) - continue; - save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); - if (!save) - return -ENOMEM; - save->next = highmem_copy; - save->page = page; - save->data = (void *) get_zeroed_page(GFP_ATOMIC); - if (!save->data) { - kfree(save); - return -ENOMEM; - } - kaddr = kmap_atomic(page, KM_USER0); - memcpy(save->data, kaddr, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - highmem_copy = save; - } - return 0; -} -#endif /* CONFIG_HIGHMEM */ - - -static int save_highmem(void) -{ -#ifdef CONFIG_HIGHMEM - struct zone *zone; - int res = 0; - - pr_debug("swsusp: Saving Highmem\n"); - for_each_zone (zone) { - if (is_highmem(zone)) - res = save_highmem_zone(zone); - if (res) - return res; - } -#endif - return 0; -} - -static int restore_highmem(void) -{ -#ifdef CONFIG_HIGHMEM - printk("swsusp: Restoring Highmem\n"); - while (highmem_copy) { - struct highmem_page *save = highmem_copy; - void *kaddr; - highmem_copy = save->next; - - kaddr = kmap_atomic(save->page, KM_USER0); - memcpy(kaddr, save->data, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - free_page((long) save->data); - kfree(save); - } -#endif - return 0; -} - - -static int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} - -/** - * saveable - Determine whether a page should be cloned or not. - * @pfn: The page - * - * We save a page if it's Reserved, and not in the range of pages - * statically defined as 'unsaveable', or if it isn't reserved, and - * isn't part of a free chunk of pages. 
- */ - -static int saveable(struct zone * zone, unsigned long * zone_pfn) -{ - unsigned long pfn = *zone_pfn + zone->zone_start_pfn; - struct page * page; - - if (!pfn_valid(pfn)) - return 0; - - page = pfn_to_page(pfn); - if (PageNosave(page)) - return 0; - if (pfn_is_nosave(pfn)) { - pr_debug("[nosave pfn 0x%lx]", pfn); - return 0; - } - if (PageNosaveFree(page)) - return 0; - - return 1; -} - -static void count_data_pages(void) -{ - struct zone *zone; - unsigned long zone_pfn; - - nr_copy_pages = 0; - - for_each_zone (zone) { - if (is_highmem(zone)) - continue; - mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - nr_copy_pages += saveable(zone, &zone_pfn); - } -} - -static void copy_data_pages(void) -{ - struct zone *zone; - unsigned long zone_pfn; - struct pbe *pbe = pagedir_nosave, *p; - - pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); - for_each_zone (zone) { - if (is_highmem(zone)) - continue; - mark_free_pages(zone); - /* This is necessary for swsusp_free() */ - for_each_pb_page (p, pagedir_nosave) - SetPageNosaveFree(virt_to_page(p)); - for_each_pbe(p, pagedir_nosave) - SetPageNosaveFree(virt_to_page(p->address)); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { - if (saveable(zone, &zone_pfn)) { - struct page * page; - page = pfn_to_page(zone_pfn + zone->zone_start_pfn); - BUG_ON(!pbe); - pbe->orig_address = (unsigned long)page_address(page); - /* copy_page is not usable for copying task structs. */ - memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); - pbe = pbe->next; - } - } - } - BUG_ON(pbe); -} - - -/** - * free_pagedir - free pages allocated with alloc_pagedir() - */ - -static inline void free_pagedir(struct pbe *pblist) -{ - struct pbe *pbe; - - while (pblist) { - pbe = (pblist + PB_PAGE_SKIP)->next; - ClearPageNosave(virt_to_page(pblist)); - ClearPageNosaveFree(virt_to_page(pblist)); - free_page((unsigned long)pblist); - pblist = pbe; - } -} - -/** - * fill_pb_page - Create a list of PBEs on a given memory page - */ - -static inline void fill_pb_page(struct pbe *pbpage) -{ - struct pbe *p; - - p = pbpage; - pbpage += PB_PAGE_SKIP; - do - p->next = p + 1; - while (++p < pbpage); -} - -/** - * create_pbe_list - Create a list of PBEs on top of a given chain - * of memory pages allocated with alloc_pagedir() - */ - -static void create_pbe_list(struct pbe *pblist, unsigned nr_pages) -{ - struct pbe *pbpage, *p; - unsigned num = PBES_PER_PAGE; - - for_each_pb_page (pbpage, pblist) { - if (num >= nr_pages) - break; - - fill_pb_page(pbpage); - num += PBES_PER_PAGE; - } - if (pbpage) { - for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) - p->next = p + 1; - p->next = NULL; - } - pr_debug("create_pbe_list(): initialized %d PBEs\n", num); -} - -static void *alloc_image_page(void) -{ - void *res = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); - if (res) { - SetPageNosave(virt_to_page(res)); - SetPageNosaveFree(virt_to_page(res)); - } - return res; -} - -/** - * alloc_pagedir - Allocate the page directory. - * - * First, determine exactly how many pages we need and - * allocate them. - * - * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE - * struct pbe elements (pbes) and the last element in the page points - * to the next page. - * - * On each page we set up a list of struct_pbe elements. 
- */ - -static struct pbe * alloc_pagedir(unsigned nr_pages) -{ - unsigned num; - struct pbe *pblist, *pbe; - - if (!nr_pages) - return NULL; - - pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); - pblist = (struct pbe *)alloc_image_page(); - for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; - pbe = pbe->next, num += PBES_PER_PAGE) { - pbe += PB_PAGE_SKIP; - pbe->next = (struct pbe *)alloc_image_page(); - } - if (!pbe) { /* get_zeroed_page() failed */ - free_pagedir(pblist); - pblist = NULL; - } - return pblist; -} - -/** - * Free pages we allocated for suspend. Suspend pages are alocated - * before atomic copy, so we need to free them after resume. - */ - -void swsusp_free(void) -{ - struct zone *zone; - unsigned long zone_pfn; - - for_each_zone(zone) { - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { - struct page * page; - page = pfn_to_page(zone_pfn + zone->zone_start_pfn); - if (PageNosave(page) && PageNosaveFree(page)) { - ClearPageNosave(page); - ClearPageNosaveFree(page); - free_page((long) page_address(page)); - } - } - } -} - -/** - * enough_free_mem - Make sure we enough free memory to snapshot. - * - * Returns TRUE or FALSE after checking the number of available - * free pages. - */ - -static int enough_free_mem(void) -{ - pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); - return nr_free_pages() > (nr_copy_pages + PAGES_FOR_IO + - nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE)); -} - - /** * enough_swap - Make sure we have enough swap to save the image. * @@ -891,7 +550,7 @@ static int enough_free_mem(void) * We should only consider resume_device. */ -static int enough_swap(void) +int enough_swap(void) { struct sysinfo i; @@ -901,88 +560,6 @@ static int enough_swap(void) nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE)); } -static int swsusp_alloc(void) -{ - struct pbe * p; - - pagedir_nosave = NULL; - - if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { - printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); - return -ENOMEM; - } - create_pbe_list(pagedir_save, nr_copy_pages); - pagedir_nosave = pagedir_save; - - for_each_pbe (p, pagedir_save) { - p->address = (unsigned long)alloc_image_page(); - if (!p->address) { - printk(KERN_ERR "suspend: Allocating image pages failed.\n"); - swsusp_free(); - return -ENOMEM; - } - } - - return 0; -} - -static int suspend_prepare_image(void) -{ - int error; - - pr_debug("swsusp: critical section: \n"); - if (save_highmem()) { - printk(KERN_CRIT "swsusp: Not enough free pages for highmem\n"); - restore_highmem(); - return -ENOMEM; - } - - drain_local_pages(); - count_data_pages(); - printk("swsusp: Need to copy %u pages\n", nr_copy_pages); - nr_copy_pages_check = nr_copy_pages; - - pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n", - nr_copy_pages, - nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE), - PAGES_FOR_IO, nr_free_pages()); - - if (!enough_free_mem()) { - printk(KERN_ERR "swsusp: Not enough free memory\n"); - return -ENOMEM; - } - - if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE + - !!(nr_copy_pages % PBES_PER_PAGE)) { - printk(KERN_ERR "swsusp: Too many image pages\n"); - return -ENOSPC; - } - - if (!enough_swap()) { - printk(KERN_ERR "swsusp: Not enough free swap\n"); - return -ENOSPC; - } - - error = swsusp_alloc(); - if (error) - return error; - - /* During allocating of suspend pagedir, new cold pages may appear. - * Kill them. 
- */ - drain_local_pages(); - copy_data_pages(); - - /* - * End of critical section. From now on, we can write to memory, - * but we should not touch disk. This specially means we must _not_ - * touch swap space! Except we must write out our image of course. - */ - - printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages ); - return 0; -} - /* It is important _NOT_ to umount filesystems at this point. We want * them synced (in case something goes wrong) but we DO not want to mark @@ -1002,14 +579,6 @@ int swsusp_write(void) } -extern asmlinkage int swsusp_arch_suspend(void); -extern asmlinkage int swsusp_arch_resume(void); - - -asmlinkage int swsusp_save(void) -{ - return suspend_prepare_image(); -} int swsusp_suspend(void) { @@ -1041,7 +610,6 @@ int swsusp_suspend(void) printk(KERN_ERR "Error %d suspending\n", error); /* Restore control flow magically appears here */ restore_processor_state(); - BUG_ON (nr_copy_pages_check != nr_copy_pages); restore_highmem(); device_power_up(); local_irq_enable(); -- cgit v1.2.3 From a0f496517f3e28d651d0cbbcf2d4fb701ed6957e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 30 Oct 2005 14:59:57 -0800 Subject: [PATCH] swsusp: reduce the use of global variables Signed-off-by: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 2 +- kernel/power/snapshot.c | 78 ++++++++++++++++++++++++------------------------- kernel/power/swsusp.c | 6 ++-- 3 files changed, 43 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index e54dd8435de..28afcb09014 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -69,4 +69,4 @@ extern int restore_highmem(void); extern void free_pagedir(struct pbe *pblist); extern struct pbe * alloc_pagedir(unsigned nr_pages); extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); -extern int enough_swap(void); +extern int enough_swap(unsigned nr_pages); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0f0a7f306b0..03916cf3ff0 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -187,37 +187,38 @@ static int saveable(struct zone * zone, unsigned long * zone_pfn) return 1; } -static void count_data_pages(void) +static unsigned count_data_pages(void) { struct zone *zone; unsigned long zone_pfn; + unsigned n; - nr_copy_pages = 0; - + n = 0; for_each_zone (zone) { if (is_highmem(zone)) continue; mark_free_pages(zone); for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - nr_copy_pages += saveable(zone, &zone_pfn); + n += saveable(zone, &zone_pfn); } + return n; } -static void copy_data_pages(void) +static void copy_data_pages(struct pbe *pblist) { struct zone *zone; unsigned long zone_pfn; - struct pbe *pbe = pagedir_nosave, *p; + struct pbe *pbe, *p; - pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); + pbe = pblist; for_each_zone (zone) { if (is_highmem(zone)) continue; mark_free_pages(zone); /* This is necessary for swsusp_free() */ - for_each_pb_page (p, pagedir_nosave) + for_each_pb_page (p, pblist) SetPageNosaveFree(virt_to_page(p)); - for_each_pbe(p, pagedir_nosave) + for_each_pbe (p, pblist) SetPageNosaveFree(virt_to_page(p->address)); for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { if (saveable(zone, &zone_pfn)) { @@ -370,46 +371,39 @@ void swsusp_free(void) * free pages. 
*/ -static int enough_free_mem(void) +static int enough_free_mem(unsigned nr_pages) { pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); - return nr_free_pages() > (nr_copy_pages + PAGES_FOR_IO + - nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE)); + return nr_free_pages() > (nr_pages + PAGES_FOR_IO + + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); } -static int swsusp_alloc(void) +static struct pbe *swsusp_alloc(unsigned nr_pages) { - struct pbe * p; - - pagedir_nosave = NULL; - - if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE + - !!(nr_copy_pages % PBES_PER_PAGE)) - return -ENOSPC; + struct pbe *pblist, *p; - if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { + if (!(pblist = alloc_pagedir(nr_pages))) { printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); - return -ENOMEM; + return NULL; } - create_pbe_list(pagedir_save, nr_copy_pages); - pagedir_nosave = pagedir_save; + create_pbe_list(pblist, nr_pages); - for_each_pbe (p, pagedir_save) { + for_each_pbe (p, pblist) { p->address = (unsigned long)alloc_image_page(); if (!p->address) { printk(KERN_ERR "suspend: Allocating image pages failed.\n"); swsusp_free(); - return -ENOMEM; + return NULL; } } - return 0; + return pblist; } static int suspend_prepare_image(void) { - int error; + unsigned nr_pages; pr_debug("swsusp: critical section: \n"); if (save_highmem()) { @@ -419,33 +413,37 @@ static int suspend_prepare_image(void) } drain_local_pages(); - count_data_pages(); - printk("swsusp: Need to copy %u pages\n", nr_copy_pages); + nr_pages = count_data_pages(); + printk("swsusp: Need to copy %u pages\n", nr_pages); pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n", - nr_copy_pages, - nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE), + nr_pages, + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, PAGES_FOR_IO, nr_free_pages()); - if (!enough_free_mem()) { + /* This is needed because of the fixed size of swsusp_info */ + if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE) + return -ENOSPC; + + if (!enough_free_mem(nr_pages)) { printk(KERN_ERR "swsusp: Not enough free memory\n"); return -ENOMEM; } - if (!enough_swap()) { + if (!enough_swap(nr_pages)) { printk(KERN_ERR "swsusp: Not enough free swap\n"); return -ENOSPC; } - error = swsusp_alloc(); - if (error) - return error; + pagedir_nosave = swsusp_alloc(nr_pages); + if (!pagedir_nosave) + return -ENOMEM; /* During allocating of suspend pagedir, new cold pages may appear. * Kill them. */ drain_local_pages(); - copy_data_pages(); + copy_data_pages(pagedir_nosave); /* * End of critical section. From now on, we can write to memory, @@ -453,7 +451,9 @@ static int suspend_prepare_image(void) * touch swap space! Except we must write out our image of course. */ - printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages ); + nr_copy_pages = nr_pages; + + printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); return 0; } diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index fc50b5d2dd2..f6abfdb0a02 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -550,14 +550,14 @@ static int write_suspend_image(void) * We should only consider resume_device. 
*/ -int enough_swap(void) +int enough_swap(unsigned nr_pages) { struct sysinfo i; si_swapinfo(&i); pr_debug("swsusp: available swap: %lu pages\n", i.freeswap); - return i.freeswap > (nr_copy_pages + PAGES_FOR_IO + - nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE)); + return i.freeswap > (nr_pages + PAGES_FOR_IO + + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); } -- cgit v1.2.3 From 2c1b4a5ca48831595979a850f40ced8e7da026f8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 30 Oct 2005 14:59:58 -0800 Subject: [PATCH] swsusp: rework memory freeing on resume The following patch makes swsusp use the PG_nosave and PG_nosave_free flags to mark pages that should be freed in case of an error during resume. This allows us to simplify the code and to use swsusp_free() in all of the swsusp's resume error paths, which makes them actually work. Signed-off-by: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 14 +++--- kernel/power/power.h | 2 +- kernel/power/snapshot.c | 2 +- kernel/power/swsusp.c | 110 ++++++++++++++++-------------------------------- 4 files changed, 45 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 761956e813f..44ef5e799df 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -30,7 +30,6 @@ extern int swsusp_check(void); extern int swsusp_read(void); extern void swsusp_close(void); extern int swsusp_resume(void); -extern int swsusp_free(void); static int noresume = 0; @@ -252,14 +251,17 @@ static int software_resume(void) pr_debug("PM: Reading swsusp image.\n"); - if ((error = swsusp_read())) - goto Cleanup; + if ((error = swsusp_read())) { + swsusp_free(); + goto Thaw; + } pr_debug("PM: Preparing devices for restore.\n"); if ((error = device_suspend(PMSG_FREEZE))) { printk("Some devices failed to suspend\n"); - goto Free; + swsusp_free(); + goto Thaw; } mb(); @@ -268,9 +270,7 @@ static int software_resume(void) swsusp_resume(); pr_debug("PM: Restore failed, recovering.n"); device_resume(); - Free: - swsusp_free(); - Cleanup: + Thaw: unprepare_processes(); Done: /* For success case, the suspend path will release the lock */ diff --git a/kernel/power/power.h b/kernel/power/power.h index 28afcb09014..d4fd96a135a 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -66,7 +66,7 @@ extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); extern int restore_highmem(void); -extern void free_pagedir(struct pbe *pblist); extern struct pbe * alloc_pagedir(unsigned nr_pages); extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); +extern void swsusp_free(void); extern int enough_swap(unsigned nr_pages); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 03916cf3ff0..84e686bdb40 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -240,7 +240,7 @@ static void copy_data_pages(struct pbe *pblist) * free_pagedir - free pages allocated with alloc_pagedir() */ -void free_pagedir(struct pbe *pblist) +static void free_pagedir(struct pbe *pblist) { struct pbe *pbe; diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index f6abfdb0a02..50667f4f3a2 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -629,6 +629,11 @@ int swsusp_resume(void) * execution continues at place where swsusp_arch_suspend was called */ BUG_ON(!error); + /* The only reason why swsusp_arch_resume() can fail is memory being + * very tight, so we have to free it as 
soon as we can to avoid + * subsequent failures + */ + swsusp_free(); restore_processor_state(); restore_highmem(); touch_softlockup_watchdog(); @@ -644,54 +649,28 @@ int swsusp_resume(void) * * We don't know which pages are usable until we allocate them. * - * Allocated but unusable (ie eaten) memory pages are linked together - * to create a list, so that we can free them easily - * - * We could have used a type other than (void *) - * for this purpose, but ... + * Allocated but unusable (ie eaten) memory pages are marked so that + * swsusp_free() can release them */ -static void **eaten_memory = NULL; -static inline void eat_page(void *page) -{ - void **c; - - c = eaten_memory; - eaten_memory = page; - *eaten_memory = c; -} - -unsigned long get_usable_page(gfp_t gfp_mask) +unsigned long get_safe_page(gfp_t gfp_mask) { unsigned long m; - m = get_zeroed_page(gfp_mask); - while (!PageNosaveFree(virt_to_page(m))) { - eat_page((void *)m); + do { m = get_zeroed_page(gfp_mask); - if (!m) - break; + if (m && PageNosaveFree(virt_to_page(m))) + /* This is for swsusp_free() */ + SetPageNosave(virt_to_page(m)); + } while (m && PageNosaveFree(virt_to_page(m))); + if (m) { + /* This is for swsusp_free() */ + SetPageNosave(virt_to_page(m)); + SetPageNosaveFree(virt_to_page(m)); } return m; } -void free_eaten_memory(void) -{ - unsigned long m; - void **c; - int i = 0; - - c = eaten_memory; - while (c) { - m = (unsigned long)c; - c = *c; - free_page(m); - i++; - } - eaten_memory = NULL; - pr_debug("swsusp: %d unused pages freed\n", i); -} - /** * check_pagedir - We ensure here that pages that the PBEs point to * won't collide with pages where we're going to restore from the loaded @@ -709,7 +688,7 @@ static int check_pagedir(struct pbe *pblist) p->address = 0UL; for_each_pbe (p, pblist) { - p->address = get_usable_page(GFP_ATOMIC); + p->address = get_safe_page(GFP_ATOMIC); if (!p->address) return -ENOMEM; } @@ -728,7 +707,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) unsigned long zone_pfn; struct pbe *pbpage, *tail, *p; void *m; - int rel = 0, error = 0; + int rel = 0; if (!pblist) /* a sanity check */ return NULL; @@ -736,41 +715,37 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n", swsusp_info.pagedir_pages); - /* Set page flags */ + /* Clear page flags */ for_each_zone (zone) { for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - SetPageNosaveFree(pfn_to_page(zone_pfn + + if (pfn_valid(zone_pfn + zone->zone_start_pfn)) + ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); } - /* Clear orig addresses */ + /* Mark orig addresses */ for_each_pbe (p, pblist) - ClearPageNosaveFree(virt_to_page(p->orig_address)); + SetPageNosaveFree(virt_to_page(p->orig_address)); tail = pblist + PB_PAGE_SKIP; /* Relocate colliding pages */ for_each_pb_page (pbpage, pblist) { - if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) { - m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); - if (!m) { - error = -ENOMEM; - break; - } + if (PageNosaveFree(virt_to_page((unsigned long)pbpage))) { + m = (void *)get_safe_page(GFP_ATOMIC | __GFP_COLD); + if (!m) + return NULL; memcpy(m, (void *)pbpage, PAGE_SIZE); if (pbpage == pblist) pblist = (struct pbe *)m; else tail->next = (struct pbe *)m; - - eat_page((void *)pbpage); pbpage = (struct pbe *)m; /* We have to link the PBEs again */ - for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++) if (p->next) /* needed to save the end */ p->next = p + 1; @@ 
-780,15 +755,13 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) tail = pbpage + PB_PAGE_SKIP; } - if (error) { - printk("\nswsusp: Out of memory\n\n"); - free_pagedir(pblist); - free_eaten_memory(); - pblist = NULL; - /* Is this even worth handling? It should never ever happen, and we - have just lost user's state, anyway... */ - } else - printk("swsusp: Relocated %d pages\n", rel); + /* This is for swsusp_free() */ + for_each_pb_page (pbpage, pblist) { + SetPageNosave(virt_to_page(pbpage)); + SetPageNosaveFree(virt_to_page(pbpage)); + } + + printk("swsusp: Relocated %d pages\n", rel); return pblist; } @@ -1006,9 +979,7 @@ static int read_pagedir(struct pbe *pblist) break; } - if (error) - free_pagedir(pblist); - else + if (!error) BUG_ON(i != swsusp_info.pagedir_pages); return error; @@ -1051,15 +1022,6 @@ static int read_suspend_image(void) if (!error) error = data_read(pagedir_nosave); - if (error) { /* We fail cleanly */ - free_eaten_memory(); - for_each_pbe (p, pagedir_nosave) - if (p->address) { - free_page(p->address); - p->address = 0UL; - } - free_pagedir(pagedir_nosave); - } return error; } -- cgit v1.2.3 From 96bc7aec20b50761822f96130127b8e31e168af1 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Sun, 30 Oct 2005 14:59:58 -0800 Subject: [PATCH] swsusp: remove unneccessary includes Cleanup comments and remove unneccessary includes. Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 25 ++----------------------- kernel/power/swsusp.c | 9 +-------- 2 files changed, 3 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 84e686bdb40..b4a923e59bc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1,8 +1,7 @@ /* - * linux/kernel/power/swsusp.c + * linux/kernel/power/snapshot.c * - * This file is to realize architecture-independent - * machine suspend feature using pretty near only high-level routines + * This file provide system snapshot/restore functionality. * * Copyright (C) 1998-2005 Pavel Machek * @@ -15,30 +14,16 @@ #include #include #include -#include -#include -#include #include -#include #include -#include -#include -#include #include -#include #include -#include -#include #include #include -#include -#include #include #include #include #include -#include -#include #include #include @@ -46,15 +31,9 @@ #include #include -#include -#include -#include - #include "power.h" - - #ifdef CONFIG_HIGHMEM struct highmem_page { char *data; diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 50667f4f3a2..ae8425b69b4 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -1,8 +1,7 @@ /* * linux/kernel/power/swsusp.c * - * This file is to realize architecture-independent - * machine suspend feature using pretty near only high-level routines + * This file provides code to write suspend image to swap and read it back. * * Copyright (C) 1998-2001 Gabor Kuti * Copyright (C) 1998,2001-2005 Pavel Machek @@ -47,11 +46,7 @@ #include #include #include -#include #include -#include -#include -#include #include #include #include @@ -63,10 +58,8 @@ #include #include #include -#include #include #include -#include #include #include -- cgit v1.2.3 From de491861e1457c31aed6d44d96afb549365ff790 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Sun, 30 Oct 2005 14:59:59 -0800 Subject: [PATCH] swsusp: cleanups Reduce number of ifdefs somehow, and fix whitespace a bit. No real code changes. 
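The ifdef reduction uses the standard kernel pattern: the real highmem helpers are compiled only when CONFIG_HIGHMEM is set, and all other configurations get trivial stubs, so the callers on the suspend and resume paths need no conditionals of their own. Structurally it ends up shaped like this (a sketch only; the real function bodies are the ones in the snapshot.c hunks below):

	#ifdef CONFIG_HIGHMEM
	/* full implementations that walk the highmem zones, as in the hunks below */
	static int save_highmem(void);
	int restore_highmem(void);
	#else
	static int save_highmem(void) { return 0; }
	int restore_highmem(void) { return 0; }
	#endif /* CONFIG_HIGHMEM */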
Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b4a923e59bc..72787f92563 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -33,7 +33,6 @@ #include "power.h" - #ifdef CONFIG_HIGHMEM struct highmem_page { char *data; @@ -88,12 +87,10 @@ static int save_highmem_zone(struct zone *zone) } return 0; } -#endif /* CONFIG_HIGHMEM */ static int save_highmem(void) { -#ifdef CONFIG_HIGHMEM struct zone *zone; int res = 0; @@ -104,13 +101,11 @@ static int save_highmem(void) if (res) return res; } -#endif return 0; } int restore_highmem(void) { -#ifdef CONFIG_HIGHMEM printk("swsusp: Restoring Highmem\n"); while (highmem_copy) { struct highmem_page *save = highmem_copy; @@ -123,9 +118,12 @@ int restore_highmem(void) free_page((long) save->data); kfree(save); } -#endif return 0; } +#else +static int save_highmem(void) { return 0; } +int restore_highmem(void) { return 0; } +#endif /* CONFIG_HIGHMEM */ static int pfn_is_nosave(unsigned long pfn) @@ -144,10 +142,10 @@ static int pfn_is_nosave(unsigned long pfn) * isn't part of a free chunk of pages. */ -static int saveable(struct zone * zone, unsigned long * zone_pfn) +static int saveable(struct zone *zone, unsigned long *zone_pfn) { unsigned long pfn = *zone_pfn + zone->zone_start_pfn; - struct page * page; + struct page *page; if (!pfn_valid(pfn)) return 0; @@ -201,7 +199,7 @@ static void copy_data_pages(struct pbe *pblist) SetPageNosaveFree(virt_to_page(p->address)); for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { if (saveable(zone, &zone_pfn)) { - struct page * page; + struct page *page; page = pfn_to_page(zone_pfn + zone->zone_start_pfn); BUG_ON(!pbe); pbe->orig_address = (unsigned long)page_address(page); @@ -295,7 +293,7 @@ static void *alloc_image_page(void) * On each page we set up a list of struct_pbe elements. */ -struct pbe * alloc_pagedir(unsigned nr_pages) +struct pbe *alloc_pagedir(unsigned nr_pages) { unsigned num; struct pbe *pblist, *pbe; @@ -304,12 +302,12 @@ struct pbe * alloc_pagedir(unsigned nr_pages) return NULL; pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); - pblist = (struct pbe *)alloc_image_page(); + pblist = alloc_image_page(); /* FIXME: rewrite this ugly loop */ for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; pbe = pbe->next, num += PBES_PER_PAGE) { pbe += PB_PAGE_SKIP; - pbe->next = (struct pbe *)alloc_image_page(); + pbe->next = alloc_image_page(); } if (!pbe) { /* get_zeroed_page() failed */ free_pagedir(pblist); -- cgit v1.2.3 From 2e32a43efdc8175579cc91e8b620ac331376a437 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 30 Oct 2005 15:00:00 -0800 Subject: [PATCH] swsusp: get rid of unnecessary wrapper function The following patch merges two functions in a trivial way. Signed-off-by: Rafael J. 
Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 72787f92563..42a62870439 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -378,7 +378,7 @@ static struct pbe *swsusp_alloc(unsigned nr_pages) return pblist; } -static int suspend_prepare_image(void) +asmlinkage int swsusp_save(void) { unsigned nr_pages; @@ -433,9 +433,3 @@ static int suspend_prepare_image(void) printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); return 0; } - - -asmlinkage int swsusp_save(void) -{ - return suspend_prepare_image(); -} -- cgit v1.2.3 From 0245b3e787dc3267a915e1f56419e7e9c197e148 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 30 Oct 2005 15:00:01 -0800 Subject: [PATCH] swsusp: two simplifications The following patch simplifies the progress meter in disk.c:free_some_memory() and makes disk.c:pm_suspend_disk() call device_resume() explicitly in the suspend path. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 8 ++------ kernel/power/swsusp.c | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 44ef5e799df..027322a564f 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -92,10 +92,7 @@ static void free_some_memory(void) printk("Freeing memory... "); while ((tmp = shrink_all_memory(10000))) { pages += tmp; - printk("\b%c", p[i]); - i++; - if (i > 3) - i = 0; + printk("\b%c", p[i++ % 4]); } printk("\bdone (%li pages freed)\n", pages); } @@ -177,13 +174,12 @@ int pm_suspend_disk(void) goto Done; if (in_suspend) { + device_resume(); pr_debug("PM: writing image.\n"); error = swsusp_write(); if (!error) power_down(pm_disk_mode); else { - /* swsusp_write can not fail in device_resume, - no need to do second device_resume */ swsusp_free(); unprepare_processes(); return error; diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index ae8425b69b4..12db1d2ad61 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -562,7 +562,7 @@ int enough_swap(unsigned nr_pages) int swsusp_write(void) { int error; - device_resume(); + lock_swapdevices(); error = write_suspend_image(); /* This will unlock ignored swap devices since writing is finished */ -- cgit v1.2.3 From eb9289eb20df6b54214c45ac7c6bf5179a149026 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Sun, 30 Oct 2005 15:00:01 -0800 Subject: [PATCH] introduce .valid callback for pm_ops Add pm_ops.valid callback, so only the available pm states show in /sys/power/state. And this also makes an earlier states error report at enter_state before we do actual suspend/resume. 
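As an illustration of the new hook (the foo_* names are hypothetical, and the callback's signature is inferred from how enter_state() and state_show() call it in the diff below: it takes the target state and returns non-zero when that state is supported):

	static int foo_pm_valid(suspend_state_t state)
	{
		/* advertise only the states this platform can really enter */
		return state == PM_SUSPEND_STANDBY || state == PM_SUSPEND_MEM;
	}

	static struct pm_ops foo_pm_ops = {
		.valid	= foo_pm_valid,
		/* .prepare, .enter, .finish as before */
	};

Such a structure would then be registered with pm_set_ops() exactly as before; only states for which .valid returns non-zero show up in /sys/power/state or get past enter_state().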
Signed-off-by: Shaohua Li Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 22bdc93cc03..18d7d693fbb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -167,6 +167,8 @@ static int enter_state(suspend_state_t state) { int error; + if (pm_ops->valid && !pm_ops->valid(state)) + return -ENODEV; if (down_trylock(&pm_sem)) return -EBUSY; @@ -236,7 +238,8 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) char * s = buf; for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (pm_states[i]) + if (pm_states[i] && pm_ops && (!pm_ops->valid + ||(pm_ops->valid && pm_ops->valid(i)))) s += sprintf(s,"%s ",pm_states[i]); } s += sprintf(s,"\n"); -- cgit v1.2.3 From a8db2db1e6a8d323d87a67c5391d48fe2b97faf5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Oct 2005 15:01:38 -0800 Subject: [PATCH] introduce setup_timer() helper Every user of init_timer() also needs to initialize ->function and ->data fields. This patch adds a simple setup_timer() helper for that. The schedule_timeout() is patched as an example of usage. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 6a2e5f8dc72..6ed1a826e5c 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1146,12 +1146,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; - init_timer(&timer); - timer.expires = expire; - timer.data = (unsigned long) current; - timer.function = process_timeout; - - add_timer(&timer); + setup_timer(&timer, process_timeout, (unsigned long)current); + __mod_timer(&timer, expire); schedule(); del_singleshot_timer_sync(&timer); -- cgit v1.2.3 From 61e1a9ea4b425eb8c3b4965c35fe953bd881728f Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Sun, 30 Oct 2005 15:01:40 -0800 Subject: [PATCH] Add kthread_stop_sem() Enhance the kthread API by adding kthread_stop_sem, for use in stopping threads that spend their idle time waiting on a semaphore. Signed-off-by: Alan Stern Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index f50f174e92d..e75950a1092 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -164,6 +164,12 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) EXPORT_SYMBOL(kthread_bind); int kthread_stop(struct task_struct *k) +{ + return kthread_stop_sem(k, NULL); +} +EXPORT_SYMBOL(kthread_stop); + +int kthread_stop_sem(struct task_struct *k, struct semaphore *s) { int ret; @@ -178,7 +184,10 @@ int kthread_stop(struct task_struct *k) /* Now set kthread_should_stop() to true, and wake it up. */ kthread_stop_info.k = k; - wake_up_process(k); + if (s) + up(s); + else + wake_up_process(k); put_task_struct(k); /* Once it dies, reset stop ptr, gather result and we're done. 
*/ @@ -189,7 +198,7 @@ int kthread_stop(struct task_struct *k) return ret; } -EXPORT_SYMBOL(kthread_stop); +EXPORT_SYMBOL(kthread_stop_sem); static __init int helper_init(void) { -- cgit v1.2.3 From 1bb34a412750291e4e5e9f1d0fe7ae1b7e976098 Mon Sep 17 00:00:00 2001 From: john stultz Date: Sun, 30 Oct 2005 15:01:42 -0800 Subject: [PATCH] NTP shift_right cleanup Create a macro shift_right() that avoids the numerous ugly conditionals in the NTP code that look like: if(a < 0) b = -(-a >> shift); else b = a >> shift; Replacing it with: b = shift_right(a, shift); This should have zero effect on the logic, however it should probably have a bit of testing just to be sure. Also replace open-coded min/max with the macros. Signed-off-by : John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time.c | 25 ++++++------------------- kernel/timer.c | 53 +++++++++++------------------------------------------ 2 files changed, 17 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index a3c2100470e..245d595a13c 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -338,30 +338,20 @@ int do_adjtimex(struct timex *txc) if (mtemp >= MINSEC) { ltemp = (time_offset / mtemp) << (SHIFT_USEC - SHIFT_UPDATE); - if (ltemp < 0) - time_freq -= -ltemp >> SHIFT_KH; - else - time_freq += ltemp >> SHIFT_KH; + time_freq += shift_right(ltemp, SHIFT_KH); } else /* calibration interval too short (p. 12) */ result = TIME_ERROR; } else { /* PLL mode */ if (mtemp < MAXSEC) { ltemp *= mtemp; - if (ltemp < 0) - time_freq -= -ltemp >> (time_constant + - time_constant + - SHIFT_KF - SHIFT_USEC); - else - time_freq += ltemp >> (time_constant + + time_freq += shift_right(ltemp,(time_constant + time_constant + - SHIFT_KF - SHIFT_USEC); + SHIFT_KF - SHIFT_USEC)); } else /* calibration interval too long (p. 12) */ result = TIME_ERROR; } - if (time_freq > time_tolerance) - time_freq = time_tolerance; - else if (time_freq < -time_tolerance) - time_freq = -time_tolerance; + time_freq = min(time_freq, time_tolerance); + time_freq = max(time_freq, -time_tolerance); } /* STA_PLL || STA_PPSTIME */ } /* txc->modes & ADJ_OFFSET */ if (txc->modes & ADJ_TICK) { @@ -384,10 +374,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) txc->offset = save_adjust; else { - if (time_offset < 0) - txc->offset = -(-time_offset >> SHIFT_UPDATE); - else - txc->offset = time_offset >> SHIFT_UPDATE; + txc->offset = shift_right(time_offset, SHIFT_UPDATE); } txc->freq = time_freq + pps_freq; txc->maxerror = time_maxerror; diff --git a/kernel/timer.c b/kernel/timer.c index 6ed1a826e5c..6b94adb45b0 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -703,23 +703,13 @@ static void second_overflow(void) * the adjustment over not more than the number of * seconds between updates. 
*/ - if (time_offset < 0) { - ltemp = -time_offset; - if (!(time_status & STA_FLL)) - ltemp >>= SHIFT_KG + time_constant; - if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) - ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; - time_offset += ltemp; - time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - } else { ltemp = time_offset; if (!(time_status & STA_FLL)) - ltemp >>= SHIFT_KG + time_constant; - if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) - ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + ltemp = shift_right(ltemp, SHIFT_KG + time_constant); + ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); + ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); time_offset -= ltemp; time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - } /* * Compute the frequency estimate and additional phase @@ -736,39 +726,25 @@ static void second_overflow(void) STA_PPSWANDER | STA_PPSERROR); } ltemp = time_freq + pps_freq; - if (ltemp < 0) - time_adj -= -ltemp >> - (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); - else - time_adj += ltemp >> - (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); #if HZ == 100 /* Compensate for (HZ==100) != (1 << SHIFT_HZ). * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) */ - if (time_adj < 0) - time_adj -= (-time_adj >> 2) + (-time_adj >> 5); - else - time_adj += (time_adj >> 2) + (time_adj >> 5); + time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); #endif #if HZ == 250 /* Compensate for (HZ==250) != (1 << SHIFT_HZ). * Add 1.5625% and 0.78125% to get 255.85938; => only 0.05% error (p. 14) */ - if (time_adj < 0) - time_adj -= (-time_adj >> 6) + (-time_adj >> 7); - else - time_adj += (time_adj >> 6) + (time_adj >> 7); + time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); #endif #if HZ == 1000 /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) */ - if (time_adj < 0) - time_adj -= (-time_adj >> 6) + (-time_adj >> 7); - else - time_adj += (time_adj >> 6) + (time_adj >> 7); + time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); #endif } @@ -787,10 +763,8 @@ static void update_wall_time_one_tick(void) * Limit the amount of the step to be in the range * -tickadj .. +tickadj */ - if (time_adjust > tickadj) - time_adjust_step = tickadj; - else if (time_adjust < -tickadj) - time_adjust_step = -tickadj; + time_adjust_step = min(time_adjust_step, (long)tickadj); + time_adjust_step = max(time_adjust_step, (long)-tickadj); /* Reduce by this step the amount of time left */ time_adjust -= time_adjust_step; @@ -801,13 +775,8 @@ static void update_wall_time_one_tick(void) * advance the tick more. */ time_phase += time_adj; - if (time_phase <= -FINENSEC) { - long ltemp = -time_phase >> (SHIFT_SCALE - 10); - time_phase += ltemp << (SHIFT_SCALE - 10); - delta_nsec -= ltemp; - } - else if (time_phase >= FINENSEC) { - long ltemp = time_phase >> (SHIFT_SCALE - 10); + if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { + long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); time_phase -= ltemp << (SHIFT_SCALE - 10); delta_nsec += ltemp; } -- cgit v1.2.3 From a5a0d52c7305cb3629ef0cc9e2e0e106869e1907 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 30 Oct 2005 15:01:42 -0800 Subject: [PATCH] ntp whitespace cleanup Fix bizarre 4-space coding style in the NTP code. 
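For reference, the shift_right() helper relied on by the preceding "NTP shift_right cleanup" patch is defined on the include/ side of that change, which this kernel/-limited log does not show. A macro along these lines (an assumption, reconstructed from the before/after example in that patch's description) reproduces the old sign-aware conditional:

	#define shift_right(x, s) ({			\
		__typeof__(x) __x = (x);		\
		__typeof__(s) __s = (s);		\
		__x < 0 ? -(-__x >> __s) : __x >> __s;	\
	})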
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 252 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 126 insertions(+), 126 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 6b94adb45b0..cc18857601e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -632,77 +632,74 @@ long time_next_adjust; */ static void second_overflow(void) { - long ltemp; - - /* Bump the maxerror field */ - time_maxerror += time_tolerance >> SHIFT_USEC; - if ( time_maxerror > NTP_PHASE_LIMIT ) { - time_maxerror = NTP_PHASE_LIMIT; - time_status |= STA_UNSYNC; - } - - /* - * Leap second processing. If in leap-insert state at - * the end of the day, the system clock is set back one - * second; if in leap-delete state, the system clock is - * set ahead one second. The microtime() routine or - * external clock driver will insure that reported time - * is always monotonic. The ugly divides should be - * replaced. - */ - switch (time_state) { - - case TIME_OK: - if (time_status & STA_INS) - time_state = TIME_INS; - else if (time_status & STA_DEL) - time_state = TIME_DEL; - break; - - case TIME_INS: - if (xtime.tv_sec % 86400 == 0) { - xtime.tv_sec--; - wall_to_monotonic.tv_sec++; - /* The timer interpolator will make time change gradually instead - * of an immediate jump by one second. - */ - time_interpolator_update(-NSEC_PER_SEC); - time_state = TIME_OOP; - clock_was_set(); - printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); + long ltemp; + + /* Bump the maxerror field */ + time_maxerror += time_tolerance >> SHIFT_USEC; + if (time_maxerror > NTP_PHASE_LIMIT) { + time_maxerror = NTP_PHASE_LIMIT; + time_status |= STA_UNSYNC; } - break; - - case TIME_DEL: - if ((xtime.tv_sec + 1) % 86400 == 0) { - xtime.tv_sec++; - wall_to_monotonic.tv_sec--; - /* Use of time interpolator for a gradual change of time */ - time_interpolator_update(NSEC_PER_SEC); - time_state = TIME_WAIT; - clock_was_set(); - printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); + + /* + * Leap second processing. If in leap-insert state at the end of the + * day, the system clock is set back one second; if in leap-delete + * state, the system clock is set ahead one second. The microtime() + * routine or external clock driver will insure that reported time is + * always monotonic. The ugly divides should be replaced. 
+ */ + switch (time_state) { + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + case TIME_INS: + if (xtime.tv_sec % 86400 == 0) { + xtime.tv_sec--; + wall_to_monotonic.tv_sec++; + /* + * The timer interpolator will make time change + * gradually instead of an immediate jump by one second + */ + time_interpolator_update(-NSEC_PER_SEC); + time_state = TIME_OOP; + clock_was_set(); + printk(KERN_NOTICE "Clock: inserting leap second " + "23:59:60 UTC\n"); + } + break; + case TIME_DEL: + if ((xtime.tv_sec + 1) % 86400 == 0) { + xtime.tv_sec++; + wall_to_monotonic.tv_sec--; + /* + * Use of time interpolator for a gradual change of + * time + */ + time_interpolator_update(NSEC_PER_SEC); + time_state = TIME_WAIT; + clock_was_set(); + printk(KERN_NOTICE "Clock: deleting leap second " + "23:59:59 UTC\n"); + } + break; + case TIME_OOP: + time_state = TIME_WAIT; + break; + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; } - break; - - case TIME_OOP: - time_state = TIME_WAIT; - break; - - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - } - - /* - * Compute the phase adjustment for the next second. In - * PLL mode, the offset is reduced by a fixed factor - * times the time constant. In FLL mode the offset is - * used directly. In either mode, the maximum phase - * adjustment for each second is clamped so as to spread - * the adjustment over not more than the number of - * seconds between updates. - */ + + /* + * Compute the phase adjustment for the next second. In PLL mode, the + * offset is reduced by a fixed factor times the time constant. In FLL + * mode the offset is used directly. In either mode, the maximum phase + * adjustment for each second is clamped so as to spread the adjustment + * over not more than the number of seconds between updates. + */ ltemp = time_offset; if (!(time_status & STA_FLL)) ltemp = shift_right(ltemp, SHIFT_KG + time_constant); @@ -711,40 +708,42 @@ static void second_overflow(void) time_offset -= ltemp; time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - /* - * Compute the frequency estimate and additional phase - * adjustment due to frequency error for the next - * second. When the PPS signal is engaged, gnaw on the - * watchdog counter and update the frequency computed by - * the pll and the PPS signal. - */ - pps_valid++; - if (pps_valid == PPS_VALID) { /* PPS signal lost */ - pps_jitter = MAXTIME; - pps_stabil = MAXFREQ; - time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | - STA_PPSWANDER | STA_PPSERROR); - } - ltemp = time_freq + pps_freq; - time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); + /* + * Compute the frequency estimate and additional phase adjustment due + * to frequency error for the next second. When the PPS signal is + * engaged, gnaw on the watchdog counter and update the frequency + * computed by the pll and the PPS signal. + */ + pps_valid++; + if (pps_valid == PPS_VALID) { /* PPS signal lost */ + pps_jitter = MAXTIME; + pps_stabil = MAXFREQ; + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + } + ltemp = time_freq + pps_freq; + time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); #if HZ == 100 - /* Compensate for (HZ==100) != (1 << SHIFT_HZ). - * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 
14) - */ - time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); + /* + * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to + * get 128.125; => only 0.125% error (p. 14) + */ + time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); #endif #if HZ == 250 - /* Compensate for (HZ==250) != (1 << SHIFT_HZ). - * Add 1.5625% and 0.78125% to get 255.85938; => only 0.05% error (p. 14) - */ - time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); + /* + * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and + * 0.78125% to get 255.85938; => only 0.05% error (p. 14) + */ + time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); #endif #if HZ == 1000 - /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). - * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) - */ - time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); + /* + * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and + * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) + */ + time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); #endif } @@ -753,21 +752,20 @@ static void update_wall_time_one_tick(void) { long time_adjust_step, delta_nsec; - if ( (time_adjust_step = time_adjust) != 0 ) { - /* We are doing an adjtime thing. - * - * Prepare time_adjust_step to be within bounds. - * Note that a positive time_adjust means we want the clock - * to run faster. - * - * Limit the amount of the step to be in the range - * -tickadj .. +tickadj - */ - time_adjust_step = min(time_adjust_step, (long)tickadj); - time_adjust_step = max(time_adjust_step, (long)-tickadj); - - /* Reduce by this step the amount of time left */ - time_adjust -= time_adjust_step; + if ((time_adjust_step = time_adjust) != 0 ) { + /* + * We are doing an adjtime thing. Prepare time_adjust_step to + * be within bounds. Note that a positive time_adjust means we + * want the clock to run faster. + * + * Limit the amount of the step to be in the range + * -tickadj .. +tickadj + */ + time_adjust_step = min(time_adjust_step, (long)tickadj); + time_adjust_step = max(time_adjust_step, (long)-tickadj); + + /* Reduce by this step the amount of time left */ + time_adjust -= time_adjust_step; } delta_nsec = tick_nsec + time_adjust_step * 1000; /* @@ -1106,8 +1104,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout) if (timeout < 0) { printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx from %p\n", timeout, - __builtin_return_address(0)); + "value %lx from %p\n", timeout, + __builtin_return_address(0)); current->state = TASK_RUNNING; goto out; } @@ -1133,15 +1131,15 @@ EXPORT_SYMBOL(schedule_timeout); */ signed long __sched schedule_timeout_interruptible(signed long timeout) { - __set_current_state(TASK_INTERRUPTIBLE); - return schedule_timeout(timeout); + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_interruptible); signed long __sched schedule_timeout_uninterruptible(signed long timeout) { - __set_current_state(TASK_UNINTERRUPTIBLE); - return schedule_timeout(timeout); + __set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_uninterruptible); @@ -1481,16 +1479,18 @@ static void time_interpolator_update(long delta_nsec) if (!time_interpolator) return; - /* The interpolator compensates for late ticks by accumulating - * the late time in time_interpolator->offset. 
A tick earlier than - * expected will lead to a reset of the offset and a corresponding - * jump of the clock forward. Again this only works if the - * interpolator clock is running slightly slower than the regular clock - * and the tuning logic insures that. - */ + /* + * The interpolator compensates for late ticks by accumulating the late + * time in time_interpolator->offset. A tick earlier than expected will + * lead to a reset of the offset and a corresponding jump of the clock + * forward. Again this only works if the interpolator clock is running + * slightly slower than the regular clock and the tuning logic insures + * that. + */ counter = time_interpolator_get_counter(1); - offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); + offset = time_interpolator->offset + + GET_TI_NSECS(counter, time_interpolator); if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) time_interpolator->offset = offset - delta_nsec; -- cgit v1.2.3 From 89ada67917f516212452443a56b9fd3b65b74dc7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 30 Oct 2005 15:01:59 -0800 Subject: [PATCH] Use alloc_percpu to allocate workqueues locally This patch makes the workqueus use alloc_percpu instead of an array. The workqueues are placed on nodes local to each processor. The workqueue structure can grow to a significant size on a system with lots of processors if this patch is not applied. 64 bit architectures with all debugging features enabled and configured for 512 processors will not be able to boot without this patch. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 91bacb13a7e..7cee222231b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -12,6 +12,8 @@ * Andrew Morton * Kai Petzke * Theodore Ts'o + * + * Made to use alloc_percpu by Christoph Lameter . */ #include @@ -57,7 +59,7 @@ struct cpu_workqueue_struct { * per-CPU workqueues: */ struct workqueue_struct { - struct cpu_workqueue_struct cpu_wq[NR_CPUS]; + struct cpu_workqueue_struct *cpu_wq; const char *name; struct list_head list; /* Empty if single thread */ }; @@ -102,7 +104,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) if (unlikely(is_single_threaded(wq))) cpu = 0; BUG_ON(!list_empty(&work->entry)); - __queue_work(wq->cpu_wq + cpu, work); + __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); ret = 1; } put_cpu(); @@ -118,7 +120,7 @@ static void delayed_work_timer_fn(unsigned long __data) if (unlikely(is_single_threaded(wq))) cpu = 0; - __queue_work(wq->cpu_wq + cpu, work); + __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } int fastcall queue_delayed_work(struct workqueue_struct *wq, @@ -265,13 +267,13 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) if (is_single_threaded(wq)) { /* Always use cpu 0's area. 
*/ - flush_cpu_workqueue(wq->cpu_wq + 0); + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, 0)); } else { int cpu; lock_cpu_hotplug(); for_each_online_cpu(cpu) - flush_cpu_workqueue(wq->cpu_wq + cpu); + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); unlock_cpu_hotplug(); } } @@ -279,7 +281,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, int cpu) { - struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; + struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); struct task_struct *p; spin_lock_init(&cwq->lock); @@ -312,6 +314,7 @@ struct workqueue_struct *__create_workqueue(const char *name, if (!wq) return NULL; + wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); wq->name = name; /* We don't need the distraction of CPUs appearing and vanishing. */ lock_cpu_hotplug(); @@ -353,7 +356,7 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) unsigned long flags; struct task_struct *p; - cwq = wq->cpu_wq + cpu; + cwq = per_cpu_ptr(wq->cpu_wq, cpu); spin_lock_irqsave(&cwq->lock, flags); p = cwq->thread; cwq->thread = NULL; @@ -380,6 +383,7 @@ void destroy_workqueue(struct workqueue_struct *wq) spin_unlock(&workqueue_lock); } unlock_cpu_hotplug(); + free_percpu(wq->cpu_wq); kfree(wq); } @@ -458,7 +462,7 @@ int current_is_keventd(void) BUG_ON(!keventd_wq); - cwq = keventd_wq->cpu_wq + cpu; + cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); if (current == cwq->thread) ret = 1; @@ -470,7 +474,7 @@ int current_is_keventd(void) /* Take the work from this (downed) CPU. */ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) { - struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; + struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); LIST_HEAD(list); struct work_struct *work; @@ -481,7 +485,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) printk("Taking work for %s\n", wq->name); work = list_entry(list.next,struct work_struct,entry); list_del(&work->entry); - __queue_work(wq->cpu_wq + smp_processor_id(), work); + __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work); } spin_unlock_irq(&cwq->lock); } @@ -508,15 +512,18 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_ONLINE: /* Kick off worker threads. */ list_for_each_entry(wq, &workqueues, list) { - kthread_bind(wq->cpu_wq[hotcpu].thread, hotcpu); - wake_up_process(wq->cpu_wq[hotcpu].thread); + struct cpu_workqueue_struct *cwq; + + cwq = per_cpu_ptr(wq->cpu_wq, hotcpu); + kthread_bind(cwq->thread, hotcpu); + wake_up_process(cwq->thread); } break; case CPU_UP_CANCELED: list_for_each_entry(wq, &workqueues, list) { /* Unbind so it can run. */ - kthread_bind(wq->cpu_wq[hotcpu].thread, + kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, smp_processor_id()); cleanup_workqueue_thread(wq, hotcpu); } -- cgit v1.2.3 From dfc4f94d2ff95fc92127d3e512c1df7cab274fb8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 30 Oct 2005 15:02:03 -0800 Subject: [PATCH] remove timer debug field Remove timer_list.magic and associated debugging code. I originally added this when a spinlock was added to timer_list - this meant that an all-zeroes timer became illegal and init_timer() was required. That spinlock isn't even there any more, although timer.base must now be initialised. I'll keep this debugging code in -mm. 
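With the magic-based repair gone, a timer must be fully initialised before it is armed. A hypothetical caller (my_func and the zero data value are placeholders) would now rely on init_timer() or the setup_timer() helper added earlier in this series:

	static void my_func(unsigned long data)
	{
		/* runs in timer (softirq) context when the timer expires */
	}

	static struct timer_list my_timer;

	static void start_my_timer(void)
	{
		/* initialises the timer, including ->base, and fills in ->function/->data */
		setup_timer(&my_timer, my_func, 0UL);
		my_timer.expires = jiffies + HZ;	/* fire in one second */
		add_timer(&my_timer);
	}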
Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index cc18857601e..562d53eabb4 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -91,30 +91,6 @@ static inline void set_running_timer(tvec_base_t *base, #endif } -static void check_timer_failed(struct timer_list *timer) -{ - static int whine_count; - if (whine_count < 16) { - whine_count++; - printk("Uninitialised timer!\n"); - printk("This is just a warning. Your computer is OK\n"); - printk("function=0x%p, data=0x%lx\n", - timer->function, timer->data); - dump_stack(); - } - /* - * Now fix it up - */ - timer->magic = TIMER_MAGIC; -} - -static inline void check_timer(struct timer_list *timer) -{ - if (timer->magic != TIMER_MAGIC) - check_timer_failed(timer); -} - - static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) { unsigned long expires = timer->expires; @@ -177,7 +153,6 @@ void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; - timer->magic = TIMER_MAGIC; } EXPORT_SYMBOL(init_timer); @@ -230,7 +205,6 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) int ret = 0; BUG_ON(!timer->function); - check_timer(timer); base = lock_timer_base(timer, &flags); @@ -283,9 +257,6 @@ void add_timer_on(struct timer_list *timer, int cpu) unsigned long flags; BUG_ON(timer_pending(timer) || !timer->function); - - check_timer(timer); - spin_lock_irqsave(&base->t_base.lock, flags); timer->base = &base->t_base; internal_add_timer(base, timer); @@ -316,8 +287,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires) { BUG_ON(!timer->function); - check_timer(timer); - /* * This is a common optimization triggered by the * networking code - if the timer is re-modified @@ -348,8 +317,6 @@ int del_timer(struct timer_list *timer) unsigned long flags; int ret = 0; - check_timer(timer); - if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { @@ -412,8 +379,6 @@ out: */ int del_timer_sync(struct timer_list *timer) { - check_timer(timer); - for (;;) { int ret = try_to_del_timer_sync(timer); if (ret >= 0) -- cgit v1.2.3 From 19a4fcb531659f2f7d18b5d04cee039176e9540d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Oct 2005 15:02:17 -0800 Subject: [PATCH] kill sigqueue->lock This lock is used in sigqueue_free(), but it is always equal to current->sighand->siglock, so we don't need to keep it in the struct sigqueue. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 6904bbbfe11..8d6e64dfa5c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -277,7 +277,6 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, } else { INIT_LIST_HEAD(&q->list); q->flags = 0; - q->lock = NULL; q->user = get_uid(t->user); } return(q); @@ -1371,11 +1370,12 @@ void sigqueue_free(struct sigqueue *q) * pending queue. 
*/ if (unlikely(!list_empty(&q->list))) { - read_lock(&tasklist_lock); - spin_lock_irqsave(q->lock, flags); + spinlock_t *lock = ¤t->sighand->siglock; + read_lock(&tasklist_lock); + spin_lock_irqsave(lock, flags); if (!list_empty(&q->list)) list_del_init(&q->list); - spin_unlock_irqrestore(q->lock, flags); + spin_unlock_irqrestore(lock, flags); read_unlock(&tasklist_lock); } q->flags &= ~SIGQUEUE_PREALLOC; @@ -1414,7 +1414,6 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) goto out; } - q->lock = &p->sighand->siglock; list_add_tail(&q->list, &p->pending.list); sigaddset(&p->pending.signal, sig); if (!sigismember(&p->blocked, sig)) @@ -1462,7 +1461,6 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) * We always use the shared queue for process-wide signals, * to avoid several races. */ - q->lock = &p->sighand->siglock; list_add_tail(&q->list, &p->signal->shared_pending.list); sigaddset(&p->signal->shared_pending.signal, sig); -- cgit v1.2.3 From 6dd69f1061bfdeca230509b173438e0731bff767 Mon Sep 17 00:00:00 2001 From: Vadim Lobanov Date: Sun, 30 Oct 2005 15:02:18 -0800 Subject: [PATCH] Unify sys_tkill() and sys_tgkill() The majority of the sys_tkill() and sys_tgkill() function code is duplicated between the two of them. This patch pulls the duplication out into a separate function -- do_tkill() -- and lets sys_tkill() and sys_tgkill() be simple wrappers around it. This should make it easier to maintain in light of future changes. Signed-off-by: Vadim Lobanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 71 ++++++++++++++++++++------------------------------------- 1 file changed, 25 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 8d6e64dfa5c..1d905ec74bd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2283,26 +2283,13 @@ sys_kill(int pid, int sig) return kill_something_info(sig, &info, pid); } -/** - * sys_tgkill - send signal to one specific thread - * @tgid: the thread group ID of the thread - * @pid: the PID of the thread - * @sig: signal to be sent - * - * This syscall also checks the tgid and returns -ESRCH even if the PID - * exists but it's not belonging to the target process anymore. This - * method solves the problem of threads exiting and PIDs getting reused. - */ -asmlinkage long sys_tgkill(int tgid, int pid, int sig) +static int do_tkill(int tgid, int pid, int sig) { - struct siginfo info; int error; + struct siginfo info; struct task_struct *p; - /* This is only valid for single tasks */ - if (pid <= 0 || tgid <= 0) - return -EINVAL; - + error = -ESRCH; info.si_signo = sig; info.si_errno = 0; info.si_code = SI_TKILL; @@ -2311,8 +2298,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) read_lock(&tasklist_lock); p = find_task_by_pid(pid); - error = -ESRCH; - if (p && (p->tgid == tgid)) { + if (p && (tgid <= 0 || p->tgid == tgid)) { error = check_kill_permission(sig, &info, p); /* * The null signal is a permissions and process existence @@ -2326,47 +2312,40 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) } } read_unlock(&tasklist_lock); + return error; } +/** + * sys_tgkill - send signal to one specific thread + * @tgid: the thread group ID of the thread + * @pid: the PID of the thread + * @sig: signal to be sent + * + * This syscall also checks the tgid and returns -ESRCH even if the PID + * exists but it's not belonging to the target process anymore. 
This + * method solves the problem of threads exiting and PIDs getting reused. + */ +asmlinkage long sys_tgkill(int tgid, int pid, int sig) +{ + /* This is only valid for single tasks */ + if (pid <= 0 || tgid <= 0) + return -EINVAL; + + return do_tkill(tgid, pid, sig); +} + /* * Send a signal to only one task, even if it's a CLONE_THREAD task. */ asmlinkage long sys_tkill(int pid, int sig) { - struct siginfo info; - int error; - struct task_struct *p; - /* This is only valid for single tasks */ if (pid <= 0) return -EINVAL; - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_TKILL; - info.si_pid = current->tgid; - info.si_uid = current->uid; - - read_lock(&tasklist_lock); - p = find_task_by_pid(pid); - error = -ESRCH; - if (p) { - error = check_kill_permission(sig, &info, p); - /* - * The null signal is a permissions and process existence - * probe. No signal is actually delivered. - */ - if (!error && sig && p->sighand) { - spin_lock_irq(&p->sighand->siglock); - handle_stop_signal(sig, p); - error = specific_send_sig_info(sig, &info, p); - spin_unlock_irq(&p->sighand->siglock); - } - } - read_unlock(&tasklist_lock); - return error; + return do_tkill(0, pid, sig); } asmlinkage long -- cgit v1.2.3 From 4eb9af2a8a431a832830f986fead7332dab27229 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Oct 2005 15:02:21 -0800 Subject: [PATCH] posix-timers: use schedule_timeout() in common_nsleep() common_nsleep() reimplements schedule_timeout_interruptible() for unknown reason. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index dda3cda73c7..ea55c7a1cd7 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -1295,13 +1295,6 @@ sys_clock_getres(clockid_t which_clock, struct timespec __user *tp) return error; } -static void nanosleep_wake_up(unsigned long __data) -{ - struct task_struct *p = (struct task_struct *) __data; - - wake_up_process(p); -} - /* * The standard says that an absolute nanosleep call MUST wake up at * the requested time in spite of clock settings. Here is what we do: @@ -1442,7 +1435,6 @@ static int common_nsleep(clockid_t which_clock, int flags, struct timespec *tsave) { struct timespec t, dum; - struct timer_list new_timer; DECLARE_WAITQUEUE(abs_wqueue, current); u64 rq_time = (u64)0; s64 left; @@ -1451,10 +1443,6 @@ static int common_nsleep(clockid_t which_clock, &current_thread_info()->restart_block; abs_wqueue.flags = 0; - init_timer(&new_timer); - new_timer.expires = 0; - new_timer.data = (unsigned long) current; - new_timer.function = nanosleep_wake_up; abs = flags & TIMER_ABSTIME; if (restart_block->fn == clock_nanosleep_restart) { @@ -1490,13 +1478,8 @@ static int common_nsleep(clockid_t which_clock, if (left < (s64)0) break; - new_timer.expires = jiffies + left; - __set_current_state(TASK_INTERRUPTIBLE); - add_timer(&new_timer); - - schedule(); + schedule_timeout_interruptible(left); - del_timer_sync(&new_timer); left = rq_time - get_jiffies_64(); } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); -- cgit v1.2.3 From f35f31d7ed0150f9865619f21b5050c91b46c03f Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 30 Oct 2005 15:02:27 -0800 Subject: [PATCH] cpuset cleanup Remove one more useless line from cpuset_common_file_read().
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 28176d083f7..b9342f90d28 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -995,7 +995,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, goto out; } *s++ = '\n'; - *s = '\0'; retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); out: -- cgit v1.2.3 From 5aa15b5f27fc2c404530c6c8eabdb8437deb3163 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 30 Oct 2005 15:02:28 -0800 Subject: [PATCH] cpusets: remove depth counted locking hack Remove a rather hackish depth counter on cpuset locking. The depth counter was avoiding a possible double trip on the global cpuset_sem semaphore. It worked, but now an improved version of cpuset locking is available, to come in the next patch, using two global semaphores. This patch reverses "cpuset semaphore depth check deadlock fix" The kernel still works, even after this patch, except for some rare and difficult to reproduce race conditions when agressively creating and destroying cpusets marked with the notify_on_release option, on very large systems. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 105 +++++++++++++++++++++----------------------------------- 1 file changed, 40 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b9342f90d28..cd54dba2be1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -180,42 +180,6 @@ static struct super_block *cpuset_sb = NULL; */ static DECLARE_MUTEX(cpuset_sem); -static struct task_struct *cpuset_sem_owner; -static int cpuset_sem_depth; - -/* - * The global cpuset semaphore cpuset_sem can be needed by the - * memory allocator to update a tasks mems_allowed (see the calls - * to cpuset_update_current_mems_allowed()) or to walk up the - * cpuset hierarchy to find a mem_exclusive cpuset see the calls - * to cpuset_excl_nodes_overlap()). - * - * But if the memory allocation is being done by cpuset.c code, it - * usually already holds cpuset_sem. Double tripping on a kernel - * semaphore deadlocks the current task, and any other task that - * subsequently tries to obtain the lock. - * - * Run all up's and down's on cpuset_sem through the following - * wrappers, which will detect this nested locking, and avoid - * deadlocking. - */ - -static inline void cpuset_down(struct semaphore *psem) -{ - if (cpuset_sem_owner != current) { - down(psem); - cpuset_sem_owner = current; - } - cpuset_sem_depth++; -} - -static inline void cpuset_up(struct semaphore *psem) -{ - if (--cpuset_sem_depth == 0) { - cpuset_sem_owner = NULL; - up(psem); - } -} /* * A couple of forward declarations required, due to cyclic reference loop: @@ -558,10 +522,19 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * Refresh current tasks mems_allowed and mems_generation from * current tasks cpuset. Call with cpuset_sem held. * - * This routine is needed to update the per-task mems_allowed - * data, within the tasks context, when it is trying to allocate - * memory (in various mm/mempolicy.c routines) and notices - * that some other task has been modifying its cpuset. + * Be sure to call refresh_mems() on any cpuset operation which + * (1) holds cpuset_sem, and (2) might possibly alloc memory. 
+ * Call after obtaining cpuset_sem lock, before any possible + * allocation. Otherwise one risks trying to allocate memory + * while the task cpuset_mems_generation is not the same as + * the mems_generation in its cpuset, which would deadlock on + * cpuset_sem in cpuset_update_current_mems_allowed(). + * + * Since we hold cpuset_sem, once refresh_mems() is called, the + * test (current->cpuset_mems_generation != cs->mems_generation) + * in cpuset_update_current_mems_allowed() will remain false, + * until we drop cpuset_sem. Anyone else who would change our + * cpusets mems_generation needs to lock cpuset_sem first. */ static void refresh_mems(void) @@ -867,7 +840,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us } buffer[nbytes] = 0; /* nul-terminate */ - cpuset_down(&cpuset_sem); + down(&cpuset_sem); if (is_removed(cs)) { retval = -ENODEV; @@ -901,7 +874,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us if (retval == 0) retval = nbytes; out2: - cpuset_up(&cpuset_sem); + up(&cpuset_sem); cpuset_release_agent(pathbuf); out1: kfree(buffer); @@ -941,9 +914,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { cpumask_t mask; - cpuset_down(&cpuset_sem); + down(&cpuset_sem); mask = cs->cpus_allowed; - cpuset_up(&cpuset_sem); + up(&cpuset_sem); return cpulist_scnprintf(page, PAGE_SIZE, mask); } @@ -952,9 +925,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) { nodemask_t mask; - cpuset_down(&cpuset_sem); + down(&cpuset_sem); mask = cs->mems_allowed; - cpuset_up(&cpuset_sem); + up(&cpuset_sem); return nodelist_scnprintf(page, PAGE_SIZE, mask); } @@ -1351,7 +1324,8 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) if (!cs) return -ENOMEM; - cpuset_down(&cpuset_sem); + down(&cpuset_sem); + refresh_mems(); cs->flags = 0; if (notify_on_release(parent)) set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); @@ -1376,14 +1350,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) * will down() this new directory's i_sem and if we race with * another mkdir, we might deadlock. 
*/ - cpuset_up(&cpuset_sem); + up(&cpuset_sem); err = cpuset_populate_dir(cs->dentry); /* If err < 0, we have a half-filled directory - oh well ;) */ return 0; err: list_del(&cs->sibling); - cpuset_up(&cpuset_sem); + up(&cpuset_sem); kfree(cs); return err; } @@ -1405,13 +1379,14 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) /* the vfs holds both inode->i_sem already */ - cpuset_down(&cpuset_sem); + down(&cpuset_sem); + refresh_mems(); if (atomic_read(&cs->count) > 0) { - cpuset_up(&cpuset_sem); + up(&cpuset_sem); return -EBUSY; } if (!list_empty(&cs->children)) { - cpuset_up(&cpuset_sem); + up(&cpuset_sem); return -EBUSY; } parent = cs->parent; @@ -1427,7 +1402,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) spin_unlock(&d->d_lock); cpuset_d_remove_dir(d); dput(d); - cpuset_up(&cpuset_sem); + up(&cpuset_sem); cpuset_release_agent(pathbuf); return 0; } @@ -1530,10 +1505,10 @@ void cpuset_exit(struct task_struct *tsk) if (notify_on_release(cs)) { char *pathbuf = NULL; - cpuset_down(&cpuset_sem); + down(&cpuset_sem); if (atomic_dec_and_test(&cs->count)) check_for_release(cs, &pathbuf); - cpuset_up(&cpuset_sem); + up(&cpuset_sem); cpuset_release_agent(pathbuf); } else { atomic_dec(&cs->count); @@ -1554,11 +1529,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) { cpumask_t mask; - cpuset_down(&cpuset_sem); + down(&cpuset_sem); task_lock((struct task_struct *)tsk); guarantee_online_cpus(tsk->cpuset, &mask); task_unlock((struct task_struct *)tsk); - cpuset_up(&cpuset_sem); + up(&cpuset_sem); return mask; } @@ -1583,9 +1558,9 @@ void cpuset_update_current_mems_allowed(void) if (!cs) return; /* task is exiting */ if (current->cpuset_mems_generation != cs->mems_generation) { - cpuset_down(&cpuset_sem); + down(&cpuset_sem); refresh_mems(); - cpuset_up(&cpuset_sem); + up(&cpuset_sem); } } @@ -1684,14 +1659,14 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) return 0; /* Not hardwall and node outside mems_allowed: scan up cpusets */ - cpuset_down(&cpuset_sem); + down(&cpuset_sem); cs = current->cpuset; if (!cs) goto done; /* current task exiting */ cs = nearest_exclusive_ancestor(cs); allowed = node_isset(node, cs->mems_allowed); done: - cpuset_up(&cpuset_sem); + up(&cpuset_sem); return allowed; } @@ -1712,7 +1687,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ int overlap = 0; /* do cpusets overlap? */ - cpuset_down(&cpuset_sem); + down(&cpuset_sem); cs1 = current->cpuset; if (!cs1) goto done; /* current task exiting */ @@ -1723,7 +1698,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) cs2 = nearest_exclusive_ancestor(cs2); overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); done: - cpuset_up(&cpuset_sem); + up(&cpuset_sem); return overlap; } @@ -1746,7 +1721,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) return -ENOMEM; tsk = m->private; - cpuset_down(&cpuset_sem); + down(&cpuset_sem); task_lock(tsk); cs = tsk->cpuset; task_unlock(tsk); @@ -1761,7 +1736,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) seq_puts(m, buf); seq_putc(m, '\n'); out: - cpuset_up(&cpuset_sem); + up(&cpuset_sem); kfree(buf); return retval; } -- cgit v1.2.3 From 053199edf54f685e7dea765b60d4d5e9070dadec Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 30 Oct 2005 15:02:30 -0800 Subject: [PATCH] cpusets: dual semaphore locking overhaul Overhaul cpuset locking. 
Replace single semaphore with two semaphores. The suggestion to use two locks was made by Roman Zippel. Both locks are global. Code that wants to modify cpusets must first acquire the exclusive manage_sem, which allows them read-only access to cpusets, and holds off other would-be modifiers. Before making actual changes, the second semaphore, callback_sem must be acquired as well. Code that needs only to query cpusets must acquire callback_sem, which is also a global exclusive lock. The earlier problems with double tripping are avoided, because it is allowed for holders of manage_sem to nest the second callback_sem lock, and only callback_sem is needed by code called from within __alloc_pages(), where the double tripping had been possible. This is not quite the same as a normal read/write semaphore, because obtaining read-only access with intent to change must hold off other such attempts, while allowing read-only access w/o such intention. Changing cpusets involves several related checks and changes, which must be done while allowing read-only queries (to avoid the double trip), but while ensuring nothing changes (holding off other would be modifiers.) This overhaul of cpuset locking also makes careful use of task_lock() to guard access to the task->cpuset pointer, closing a couple of race conditions noticed while reading this code (thanks, Roman). I've never seen these races fail in any use or test. See further the comments in the code. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 418 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 281 insertions(+), 137 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index cd54dba2be1..7491352276b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -60,6 +60,9 @@ struct cpuset { cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ + /* + * Count is atomic so can incr (fork) or decr (exit) without a lock. + */ atomic_t count; /* count tasks using this cpuset */ /* @@ -142,44 +145,91 @@ static struct vfsmount *cpuset_mount; static struct super_block *cpuset_sb = NULL; /* - * cpuset_sem should be held by anyone who is depending on the children - * or sibling lists of any cpuset, or performing non-atomic operations - * on the flags or *_allowed values of a cpuset, such as raising the - * CS_REMOVED flag bit iff it is not already raised, or reading and - * conditionally modifying the *_allowed values. One kernel global - * cpuset semaphore should be sufficient - these things don't change - * that much. - * - * The code that modifies cpusets holds cpuset_sem across the entire - * operation, from cpuset_common_file_write() down, single threading - * all cpuset modifications (except for counter manipulations from - * fork and exit) across the system. This presumes that cpuset - * modifications are rare - better kept simple and safe, even if slow. - * - * The code that reads cpusets, such as in cpuset_common_file_read() - * and below, only holds cpuset_sem across small pieces of code, such - * as when reading out possibly multi-word cpumasks and nodemasks, as - * the risks are less, and the desire for performance a little greater. - * The proc_cpuset_show() routine needs to hold cpuset_sem to insure - * that no cs->dentry is NULL, as it walks up the cpuset tree to root. 
- * - * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't - * (usually) grab cpuset_sem. These are the two most performance - * critical pieces of code here. The exception occurs on exit(), - * when a task in a notify_on_release cpuset exits. Then cpuset_sem + * We have two global cpuset semaphores below. They can nest. + * It is ok to first take manage_sem, then nest callback_sem. We also + * require taking task_lock() when dereferencing a tasks cpuset pointer. + * See "The task_lock() exception", at the end of this comment. + * + * A task must hold both semaphores to modify cpusets. If a task + * holds manage_sem, then it blocks others wanting that semaphore, + * ensuring that it is the only task able to also acquire callback_sem + * and be able to modify cpusets. It can perform various checks on + * the cpuset structure first, knowing nothing will change. It can + * also allocate memory while just holding manage_sem. While it is + * performing these checks, various callback routines can briefly + * acquire callback_sem to query cpusets. Once it is ready to make + * the changes, it takes callback_sem, blocking everyone else. + * + * Calls to the kernel memory allocator can not be made while holding + * callback_sem, as that would risk double tripping on callback_sem + * from one of the callbacks into the cpuset code from within + * __alloc_pages(). + * + * If a task is only holding callback_sem, then it has read-only + * access to cpusets. + * + * The task_struct fields mems_allowed and mems_generation may only + * be accessed in the context of that task, so require no locks. + * + * Any task can increment and decrement the count field without lock. + * So in general, code holding manage_sem or callback_sem can't rely + * on the count field not changing. However, if the count goes to + * zero, then only attach_task(), which holds both semaphores, can + * increment it again. Because a count of zero means that no tasks + * are currently attached, therefore there is no way a task attached + * to that cpuset can fork (the other way to increment the count). + * So code holding manage_sem or callback_sem can safely assume that + * if the count is zero, it will stay zero. Similarly, if a task + * holds manage_sem or callback_sem on a cpuset with zero count, it + * knows that the cpuset won't be removed, as cpuset_rmdir() needs + * both of those semaphores. + * + * A possible optimization to improve parallelism would be to make + * callback_sem a R/W semaphore (rwsem), allowing the callback routines + * to proceed in parallel, with read access, until the holder of + * manage_sem needed to take this rwsem for exclusive write access + * and modify some cpusets. + * + * The cpuset_common_file_write handler for operations that modify + * the cpuset hierarchy holds manage_sem across the entire operation, + * single threading all such cpuset modifications across the system. + * + * The cpuset_common_file_read() handlers only hold callback_sem across + * small pieces of code, such as when reading out possibly multi-word + * cpumasks and nodemasks. + * + * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't + * (usually) take either semaphore. These are the two most performance + * critical pieces of code here. The exception occurs on cpuset_exit(), + * when a task in a notify_on_release cpuset exits. 
Then manage_sem * is taken, and if the cpuset count is zero, a usermode call made * to /sbin/cpuset_release_agent with the name of the cpuset (path * relative to the root of cpuset file system) as the argument. * - * A cpuset can only be deleted if both its 'count' of using tasks is - * zero, and its list of 'children' cpusets is empty. Since all tasks - * in the system use _some_ cpuset, and since there is always at least - * one task in the system (init, pid == 1), therefore, top_cpuset - * always has either children cpusets and/or using tasks. So no need - * for any special hack to ensure that top_cpuset cannot be deleted. + * A cpuset can only be deleted if both its 'count' of using tasks + * is zero, and its list of 'children' cpusets is empty. Since all + * tasks in the system use _some_ cpuset, and since there is always at + * least one task in the system (init, pid == 1), therefore, top_cpuset + * always has either children cpusets and/or using tasks. So we don't + * need a special hack to ensure that top_cpuset cannot be deleted. + * + * The above "Tale of Two Semaphores" would be complete, but for: + * + * The task_lock() exception + * + * The need for this exception arises from the action of attach_task(), + * which overwrites one tasks cpuset pointer with another. It does + * so using both semaphores, however there are several performance + * critical places that need to reference task->cpuset without the + * expense of grabbing a system global semaphore. Therefore except as + * noted below, when dereferencing or, as in attach_task(), modifying + * a tasks cpuset pointer we use task_lock(), which acts on a spinlock + * (task->alloc_lock) already in the task_struct routinely used for + * such matters. */ -static DECLARE_MUTEX(cpuset_sem); +static DECLARE_MUTEX(manage_sem); +static DECLARE_MUTEX(callback_sem); /* * A couple of forward declarations required, due to cyclic reference loop: @@ -354,7 +404,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) } /* - * Call with cpuset_sem held. Writes path of cpuset into buf. + * Call with manage_sem held. Writes path of cpuset into buf. * Returns 0 on success, -errno on error. */ @@ -406,10 +456,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) * status of the /sbin/cpuset_release_agent task, so no sense holding * our caller up for that. * - * The simple act of forking that task might require more memory, - * which might need cpuset_sem. So this routine must be called while - * cpuset_sem is not held, to avoid a possible deadlock. See also - * comments for check_for_release(), below. + * When we had only one cpuset semaphore, we had to call this + * without holding it, to avoid deadlock when call_usermodehelper() + * allocated memory. With two locks, we could now call this while + * holding manage_sem, but we still don't, so as to minimize + * the time manage_sem is held. */ static void cpuset_release_agent(const char *pathbuf) @@ -441,15 +492,15 @@ static void cpuset_release_agent(const char *pathbuf) * cs is notify_on_release() and now both the user count is zero and * the list of children is empty, prepare cpuset path in a kmalloc'd * buffer, to be returned via ppathbuf, so that the caller can invoke - * cpuset_release_agent() with it later on, once cpuset_sem is dropped. - * Call here with cpuset_sem held. + * cpuset_release_agent() with it later on, once manage_sem is dropped. + * Call here with manage_sem held. * * This check_for_release() routine is responsible for kmalloc'ing * pathbuf. 
The above cpuset_release_agent() is responsible for * kfree'ing pathbuf. The caller of these routines is responsible * for providing a pathbuf pointer, initialized to NULL, then - * calling check_for_release() with cpuset_sem held and the address - * of the pathbuf pointer, then dropping cpuset_sem, then calling + * calling check_for_release() with manage_sem held and the address + * of the pathbuf pointer, then dropping manage_sem, then calling * cpuset_release_agent() with pathbuf, as set by check_for_release(). */ @@ -480,7 +531,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf) * One way or another, we guarantee to return some non-empty subset * of cpu_online_map. * - * Call with cpuset_sem held. + * Call with callback_sem held. */ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) @@ -504,7 +555,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) * One way or another, we guarantee to return some non-empty subset * of node_online_map. * - * Call with cpuset_sem held. + * Call with callback_sem held. */ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) @@ -519,31 +570,44 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) } /* - * Refresh current tasks mems_allowed and mems_generation from - * current tasks cpuset. Call with cpuset_sem held. - * - * Be sure to call refresh_mems() on any cpuset operation which - * (1) holds cpuset_sem, and (2) might possibly alloc memory. - * Call after obtaining cpuset_sem lock, before any possible - * allocation. Otherwise one risks trying to allocate memory - * while the task cpuset_mems_generation is not the same as - * the mems_generation in its cpuset, which would deadlock on - * cpuset_sem in cpuset_update_current_mems_allowed(). - * - * Since we hold cpuset_sem, once refresh_mems() is called, the - * test (current->cpuset_mems_generation != cs->mems_generation) - * in cpuset_update_current_mems_allowed() will remain false, - * until we drop cpuset_sem. Anyone else who would change our - * cpusets mems_generation needs to lock cpuset_sem first. + * Refresh current tasks mems_allowed and mems_generation from current + * tasks cpuset. + * + * Call without callback_sem or task_lock() held. May be called with + * or without manage_sem held. Will acquire task_lock() and might + * acquire callback_sem during call. + * + * The task_lock() is required to dereference current->cpuset safely. + * Without it, we could pick up the pointer value of current->cpuset + * in one instruction, and then attach_task could give us a different + * cpuset, and then the cpuset we had could be removed and freed, + * and then on our next instruction, we could dereference a no longer + * valid cpuset pointer to get its mems_generation field. + * + * This routine is needed to update the per-task mems_allowed data, + * within the tasks context, when it is trying to allocate memory + * (in various mm/mempolicy.c routines) and notices that some other + * task has been modifying its cpuset. 
*/ static void refresh_mems(void) { - struct cpuset *cs = current->cpuset; + int my_cpusets_mem_gen; + + task_lock(current); + my_cpusets_mem_gen = current->cpuset->mems_generation; + task_unlock(current); - if (current->cpuset_mems_generation != cs->mems_generation) { + if (current->cpuset_mems_generation != my_cpusets_mem_gen) { + struct cpuset *cs; + + down(&callback_sem); + task_lock(current); + cs = current->cpuset; guarantee_online_mems(cs, &current->mems_allowed); current->cpuset_mems_generation = cs->mems_generation; + task_unlock(current); + up(&callback_sem); } } @@ -552,7 +616,7 @@ static void refresh_mems(void) * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. + * are only set if the other's are set. Call holding manage_sem. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -570,7 +634,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * cpuset_sem held. + * manage_sem held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -624,7 +688,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) * exclusive child cpusets * Build these two partitions by calling partition_sched_domains * - * Call with cpuset_sem held. May nest a call to the + * Call with manage_sem held. May nest a call to the * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. */ @@ -669,6 +733,10 @@ static void update_cpu_domains(struct cpuset *cur) unlock_cpu_hotplug(); } +/* + * Call with manage_sem held. May take callback_sem during call. + */ + static int update_cpumask(struct cpuset *cs, char *buf) { struct cpuset trialcs; @@ -685,12 +753,18 @@ static int update_cpumask(struct cpuset *cs, char *buf) if (retval < 0) return retval; cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); + down(&callback_sem); cs->cpus_allowed = trialcs.cpus_allowed; + up(&callback_sem); if (is_cpu_exclusive(cs) && !cpus_unchanged) update_cpu_domains(cs); return 0; } +/* + * Call with manage_sem held. May take callback_sem during call. + */ + static int update_nodemask(struct cpuset *cs, char *buf) { struct cpuset trialcs; @@ -705,9 +779,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) return -ENOSPC; retval = validate_change(cs, &trialcs); if (retval == 0) { + down(&callback_sem); cs->mems_allowed = trialcs.mems_allowed; atomic_inc(&cpuset_mems_generation); cs->mems_generation = atomic_read(&cpuset_mems_generation); + up(&callback_sem); } return retval; } @@ -718,6 +794,8 @@ static int update_nodemask(struct cpuset *cs, char *buf) * CS_NOTIFY_ON_RELEASE) * cs: the cpuset to update * buf: the buffer where we read the 0 or 1 + * + * Call with manage_sem held.
*/ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) @@ -739,16 +817,27 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) return err; cpu_exclusive_changed = (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); + down(&callback_sem); if (turning_on) set_bit(bit, &cs->flags); else clear_bit(bit, &cs->flags); + up(&callback_sem); if (cpu_exclusive_changed) update_cpu_domains(cs); return 0; } +/* + * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly + * writing the path of the old cpuset in 'ppathbuf' if it needs to be + * notified on release. + * + * Call holding manage_sem. May take callback_sem and task_lock of + * the task 'pid' during call. + */ + static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) { pid_t pid; @@ -765,7 +854,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) read_lock(&tasklist_lock); tsk = find_task_by_pid(pid); - if (!tsk) { + if (!tsk || tsk->flags & PF_EXITING) { read_unlock(&tasklist_lock); return -ESRCH; } @@ -783,10 +872,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) get_task_struct(tsk); } + down(&callback_sem); + task_lock(tsk); oldcs = tsk->cpuset; if (!oldcs) { task_unlock(tsk); + up(&callback_sem); put_task_struct(tsk); return -ESRCH; } @@ -797,6 +889,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) guarantee_online_cpus(cs, &cpus); set_cpus_allowed(tsk, cpus); + up(&callback_sem); put_task_struct(tsk); if (atomic_dec_and_test(&oldcs->count)) check_for_release(oldcs, ppathbuf); @@ -840,7 +933,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us } buffer[nbytes] = 0; /* nul-terminate */ - down(&cpuset_sem); + down(&manage_sem); if (is_removed(cs)) { retval = -ENODEV; @@ -874,7 +967,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us if (retval == 0) retval = nbytes; out2: - up(&cpuset_sem); + up(&manage_sem); cpuset_release_agent(pathbuf); out1: kfree(buffer); @@ -914,9 +1007,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { cpumask_t mask; - down(&cpuset_sem); + down(&callback_sem); mask = cs->cpus_allowed; - up(&cpuset_sem); + up(&callback_sem); return cpulist_scnprintf(page, PAGE_SIZE, mask); } @@ -925,9 +1018,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) { nodemask_t mask; - down(&cpuset_sem); + down(&callback_sem); mask = cs->mems_allowed; - up(&cpuset_sem); + up(&callback_sem); return nodelist_scnprintf(page, PAGE_SIZE, mask); } @@ -1135,7 +1228,9 @@ struct ctr_struct { /* * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. - * Return actual number of pids loaded. + * Return actual number of pids loaded. No need to task_lock(p) + * when reading out p->cpuset, as we don't really care if it changes + * on the next cycle, and we are not going to try to dereference it. */ static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) { @@ -1177,6 +1272,12 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) return cnt; } +/* + * Handle an open on 'tasks' file. Prepare a buffer listing the + * process id's of tasks currently attached to the cpuset being opened. + * + * Does not require any specific cpuset semaphores, and does not take any. 
+ */ static int cpuset_tasks_open(struct inode *unused, struct file *file) { struct cpuset *cs = __d_cs(file->f_dentry->d_parent); @@ -1324,7 +1425,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) if (!cs) return -ENOMEM; - down(&cpuset_sem); + down(&manage_sem); refresh_mems(); cs->flags = 0; if (notify_on_release(parent)) @@ -1339,25 +1440,27 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) cs->parent = parent; + down(&callback_sem); list_add(&cs->sibling, &cs->parent->children); + up(&callback_sem); err = cpuset_create_dir(cs, name, mode); if (err < 0) goto err; /* - * Release cpuset_sem before cpuset_populate_dir() because it + * Release manage_sem before cpuset_populate_dir() because it * will down() this new directory's i_sem and if we race with * another mkdir, we might deadlock. */ - up(&cpuset_sem); + up(&manage_sem); err = cpuset_populate_dir(cs->dentry); /* If err < 0, we have a half-filled directory - oh well ;) */ return 0; err: list_del(&cs->sibling); - up(&cpuset_sem); + up(&manage_sem); kfree(cs); return err; } @@ -1379,30 +1482,32 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) /* the vfs holds both inode->i_sem already */ - down(&cpuset_sem); + down(&manage_sem); refresh_mems(); if (atomic_read(&cs->count) > 0) { - up(&cpuset_sem); + up(&manage_sem); return -EBUSY; } if (!list_empty(&cs->children)) { - up(&cpuset_sem); + up(&manage_sem); return -EBUSY; } parent = cs->parent; + down(&callback_sem); set_bit(CS_REMOVED, &cs->flags); if (is_cpu_exclusive(cs)) update_cpu_domains(cs); list_del(&cs->sibling); /* delete my sibling from parent->children */ - if (list_empty(&parent->children)) - check_for_release(parent, &pathbuf); spin_lock(&cs->dentry->d_lock); d = dget(cs->dentry); cs->dentry = NULL; spin_unlock(&d->d_lock); cpuset_d_remove_dir(d); dput(d); - up(&cpuset_sem); + up(&callback_sem); + if (list_empty(&parent->children)) + check_for_release(parent, &pathbuf); + up(&manage_sem); cpuset_release_agent(pathbuf); return 0; } @@ -1462,16 +1567,26 @@ void __init cpuset_init_smp(void) * cpuset_fork - attach newly forked task to its parents cpuset. * @tsk: pointer to task_struct of forking parent process. * - * Description: By default, on fork, a task inherits its - * parent's cpuset. The pointer to the shared cpuset is - * automatically copied in fork.c by dup_task_struct(). - * This cpuset_fork() routine need only increment the usage - * counter in that cpuset. + * Description: A task inherits its parent's cpuset at fork(). + * + * A pointer to the shared cpuset was automatically copied in fork.c + * by dup_task_struct(). However, we ignore that copy, since it was + * not made under the protection of task_lock(), so might no longer be + * a valid cpuset pointer. attach_task() might have already changed + * current->cpuset, allowing the previously referenced cpuset to + * be removed and freed. Instead, we task_lock(current) and copy + * its present value of current->cpuset for our freshly forked child. + * + * At the point that cpuset_fork() is called, 'current' is the parent + * task, and the passed argument 'child' points to the child task. 
**/ -void cpuset_fork(struct task_struct *tsk) +void cpuset_fork(struct task_struct *child) { - atomic_inc(&tsk->cpuset->count); + task_lock(current); + child->cpuset = current->cpuset; + atomic_inc(&child->cpuset->count); + task_unlock(current); } /** @@ -1480,35 +1595,42 @@ void cpuset_fork(struct task_struct *tsk) * * Description: Detach cpuset from @tsk and release it. * - * Note that cpusets marked notify_on_release force every task - * in them to take the global cpuset_sem semaphore when exiting. - * This could impact scaling on very large systems. Be reluctant - * to use notify_on_release cpusets where very high task exit - * scaling is required on large systems. - * - * Don't even think about derefencing 'cs' after the cpuset use - * count goes to zero, except inside a critical section guarded - * by the cpuset_sem semaphore. If you don't hold cpuset_sem, - * then a zero cpuset use count is a license to any other task to - * nuke the cpuset immediately. + * Note that cpusets marked notify_on_release force every task in + * them to take the global manage_sem semaphore when exiting. + * This could impact scaling on very large systems. Be reluctant to + * use notify_on_release cpusets where very high task exit scaling + * is required on large systems. + * + * Don't even think about derefencing 'cs' after the cpuset use count + * goes to zero, except inside a critical section guarded by manage_sem + * or callback_sem. Otherwise a zero cpuset use count is a license to + * any other task to nuke the cpuset immediately, via cpuset_rmdir(). + * + * This routine has to take manage_sem, not callback_sem, because + * it is holding that semaphore while calling check_for_release(), + * which calls kmalloc(), so can't be called holding callback__sem(). + * + * We don't need to task_lock() this reference to tsk->cpuset, + * because tsk is already marked PF_EXITING, so attach_task() won't + * mess with it. **/ void cpuset_exit(struct task_struct *tsk) { struct cpuset *cs; - task_lock(tsk); + BUG_ON(!(tsk->flags & PF_EXITING)); + cs = tsk->cpuset; tsk->cpuset = NULL; - task_unlock(tsk); if (notify_on_release(cs)) { char *pathbuf = NULL; - down(&cpuset_sem); + down(&manage_sem); if (atomic_dec_and_test(&cs->count)) check_for_release(cs, &pathbuf); - up(&cpuset_sem); + up(&manage_sem); cpuset_release_agent(pathbuf); } else { atomic_dec(&cs->count); @@ -1529,11 +1651,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) { cpumask_t mask; - down(&cpuset_sem); + down(&callback_sem); task_lock((struct task_struct *)tsk); guarantee_online_cpus(tsk->cpuset, &mask); task_unlock((struct task_struct *)tsk); - up(&cpuset_sem); + up(&callback_sem); return mask; } @@ -1549,19 +1671,28 @@ void cpuset_init_current_mems_allowed(void) * If the current tasks cpusets mems_allowed changed behind our backs, * update current->mems_allowed and mems_generation to the new value. * Do not call this routine if in_interrupt(). + * + * Call without callback_sem or task_lock() held. May be called + * with or without manage_sem held. Unless exiting, it will acquire + * task_lock(). Also might acquire callback_sem during call to + * refresh_mems(). 
*/ void cpuset_update_current_mems_allowed(void) { - struct cpuset *cs = current->cpuset; + struct cpuset *cs; + int need_to_refresh = 0; + task_lock(current); + cs = current->cpuset; if (!cs) - return; /* task is exiting */ - if (current->cpuset_mems_generation != cs->mems_generation) { - down(&cpuset_sem); + goto done; + if (current->cpuset_mems_generation != cs->mems_generation) + need_to_refresh = 1; +done: + task_unlock(current); + if (need_to_refresh) refresh_mems(); - up(&cpuset_sem); - } } /** @@ -1595,7 +1726,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) /* * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive - * ancestor to the specified cpuset. Call while holding cpuset_sem. + * ancestor to the specified cpuset. Call holding callback_sem. * If no ancestor is mem_exclusive (an unusual configuration), then * returns the root cpuset. */ @@ -1622,12 +1753,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * GFP_KERNEL allocations are not so marked, so can escape to the * nearest mem_exclusive ancestor cpuset. * - * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() + * Scanning up parent cpusets requires callback_sem. The __alloc_pages() * routine only calls here with __GFP_HARDWALL bit _not_ set if * it's a GFP_KERNEL allocation, and all nodes in the current tasks * mems_allowed came up empty on the first pass over the zonelist. * So only GFP_KERNEL allocations, if all nodes in the cpuset are - * short of memory, might require taking the cpuset_sem semaphore. + * short of memory, might require taking the callback_sem semaphore. * * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing @@ -1659,14 +1790,16 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) return 0; /* Not hardwall and node outside mems_allowed: scan up cpusets */ - down(&cpuset_sem); - cs = current->cpuset; - if (!cs) - goto done; /* current task exiting */ - cs = nearest_exclusive_ancestor(cs); + down(&callback_sem); + + if (current->flags & PF_EXITING) /* Let dying task have memory */ + return 1; + task_lock(current); + cs = nearest_exclusive_ancestor(current->cpuset); + task_unlock(current); + allowed = node_isset(node, cs->mems_allowed); -done: - up(&cpuset_sem); + up(&callback_sem); return allowed; } @@ -1679,7 +1812,7 @@ done: * determine if task @p's memory usage might impact the memory * available to the current task. * - * Acquires cpuset_sem - not suitable for calling from a fast path. + * Acquires callback_sem - not suitable for calling from a fast path. **/ int cpuset_excl_nodes_overlap(const struct task_struct *p) @@ -1687,18 +1820,27 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ int overlap = 0; /* do cpusets overlap? 
*/ - down(&cpuset_sem); - cs1 = current->cpuset; - if (!cs1) - goto done; /* current task exiting */ - cs2 = p->cpuset; - if (!cs2) - goto done; /* task p is exiting */ - cs1 = nearest_exclusive_ancestor(cs1); - cs2 = nearest_exclusive_ancestor(cs2); + down(&callback_sem); + + task_lock(current); + if (current->flags & PF_EXITING) { + task_unlock(current); + goto done; + } + cs1 = nearest_exclusive_ancestor(current->cpuset); + task_unlock(current); + + task_lock((struct task_struct *)p); + if (p->flags & PF_EXITING) { + task_unlock((struct task_struct *)p); + goto done; + } + cs2 = nearest_exclusive_ancestor(p->cpuset); + task_unlock((struct task_struct *)p); + overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); done: - up(&cpuset_sem); + up(&callback_sem); return overlap; } @@ -1707,6 +1849,10 @@ done: * proc_cpuset_show() * - Print tasks cpuset path into seq_file. * - Used for /proc//cpuset. + * - No need to task_lock(tsk) on this tsk->cpuset reference, as it + * doesn't really matter if tsk->cpuset changes after we read it, + * and we take manage_sem, keeping attach_task() from changing it + * anyway. */ static int proc_cpuset_show(struct seq_file *m, void *v) @@ -1721,10 +1867,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v) return -ENOMEM; tsk = m->private; - down(&cpuset_sem); - task_lock(tsk); + down(&manage_sem); cs = tsk->cpuset; - task_unlock(tsk); if (!cs) { retval = -EINVAL; goto out; @@ -1736,7 +1880,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) seq_puts(m, buf); seq_putc(m, '\n'); out: - up(&cpuset_sem); + up(&manage_sem); kfree(buf); return retval; } -- cgit v1.2.3 From 18a19cb3047e454ee5ecbc35d7acf3f8e09e0466 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 30 Oct 2005 15:02:31 -0800 Subject: [PATCH] cpusets: simple rename Add support for renaming cpusets. Only allow simple rename of cpuset directories in place. Don't allow moving cpusets elsewhere in hierarchy or renaming the special cpuset files in each cpuset directory. The usefulness of this simple rename became apparent when developing task migration facilities. It allows building a second cpuset hierarchy using new names and containing new CPUs and Memory Nodes, moving tasks from the old to the new cpusets, removing the old cpusets, and then renaming the new cpusets to be just like the old names, so that any knowledge that the tasks had of their cpuset names will still be valid. Leaf node cpusets can be migrated to other CPUs or Memory Nodes by just updating their 'cpus' and 'mems' files, but because no cpuset can contain CPUs or Nodes not in its parent cpuset, one cannot do this in a cpuset hierarchy without first expanding all the non-leaf cpusets to contain the union of both the old and new CPUs and Nodes, which would obfuscate the one-to-one migration of a task from one cpuset to another required to correctly migrate the physical page frames currently allocated to that task. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7491352276b..6633f3fb641 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1113,6 +1113,21 @@ static int cpuset_file_release(struct inode *inode, struct file *file) return 0; } +/* + * cpuset_rename - Only allow simple rename of directories in place. 
+ */ +static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (!S_ISDIR(old_dentry->d_inode->i_mode)) + return -ENOTDIR; + if (new_dentry->d_inode) + return -EEXIST; + if (old_dir != new_dir) + return -EIO; + return simple_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static struct file_operations cpuset_file_operations = { .read = cpuset_file_read, .write = cpuset_file_write, @@ -1125,6 +1140,7 @@ static struct inode_operations cpuset_dir_inode_operations = { .lookup = simple_lookup, .mkdir = cpuset_mkdir, .rmdir = cpuset_rmdir, + .rename = cpuset_rename, }; static int cpuset_create_file(struct dentry *dentry, int mode) -- cgit v1.2.3 From 68860ec10bcc07ab4f89f9d940e3b77ae5ca13b3 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 30 Oct 2005 15:02:36 -0800 Subject: [PATCH] cpusets: automatic numa mempolicy rebinding This patch automatically updates a tasks NUMA mempolicy when its cpuset memory placement changes. It does so within the context of the task, without any need to support low level external mempolicy manipulation. If a system is not using cpusets, or if running on a system with just the root (all-encompassing) cpuset, then this remap is a no-op. Only when a task is moved between cpusets, or a cpusets memory placement is changed does the following apply. Otherwise, the main routine below, rebind_policy() is not even called. When mixing cpusets, scheduler affinity, and NUMA mempolicies, the essential role of cpusets is to place jobs (several related tasks) on a set of CPUs and Memory Nodes, the essential role of sched_setaffinity is to manage a jobs processor placement within its allowed cpuset, and the essential role of NUMA mempolicy (mbind, set_mempolicy) is to manage a jobs memory placement within its allowed cpuset. However, CPU affinity and NUMA memory placement are managed within the kernel using absolute system wide numbering, not cpuset relative numbering. This is ok until a job is migrated to a different cpuset, or what's the same, a jobs cpuset is moved to different CPUs and Memory Nodes. Then the CPU affinity and NUMA memory placement of the tasks in the job need to be updated, to preserve their cpuset-relative position. This can be done for CPU affinity using sched_setaffinity() from user code, as one task can modify anothers CPU affinity. This cannot be done from an external task for NUMA memory placement, as that can only be modified in the context of the task using it. However, it easy enough to remap a tasks NUMA mempolicy automatically when a task is migrated, using the existing cpuset mechanism to trigger a refresh of a tasks memory placement after its cpuset has changed. All that is needed is the old and new nodemask, and notice to the task that it needs to rebind its mempolicy. The tasks mems_allowed has the old mask, the tasks cpuset has the new mask, and the existing cpuset_update_current_mems_allowed() mechanism provides the notice. The bitmap/cpumask/nodemask remap operators provide the cpuset relative calculations. This patch leaves open a couple of issues: 1) Updating vma and shmfs/tmpfs/hugetlbfs memory policies: These mempolicies may reference nodes outside of those allowed to the current task by its cpuset. Tasks are migrated as part of jobs, which reside on what might be several cpusets in a subtree. 
When such a job is migrated, all NUMA memory policy references to nodes within that cpuset subtree should be translated, and references to any nodes outside that subtree should be left untouched. A future patch will provide the cpuset mechanism needed to mark such subtrees. With that patch, we will be able to correctly migrate these other memory policies across a job migration. 2) Updating cpuset, affinity and memory policies in user space: This is harder. Any placement state stored in user space using system-wide numbering will be invalidated across a migration. More work will be required to provide user code with a migration-safe means to manage its cpuset relative placement, while preserving the current API's that pass system wide numbers, not cpuset relative numbers across the kernel-user boundary. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6633f3fb641..5a737ed9dac 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -600,6 +601,7 @@ static void refresh_mems(void) if (current->cpuset_mems_generation != my_cpusets_mem_gen) { struct cpuset *cs; + nodemask_t oldmem = current->mems_allowed; down(&callback_sem); task_lock(current); @@ -608,6 +610,8 @@ static void refresh_mems(void) current->cpuset_mems_generation = cs->mems_generation; task_unlock(current); up(&callback_sem); + if (!nodes_equal(oldmem, current->mems_allowed)) + numa_policy_rebind(&oldmem, &current->mems_allowed); } } -- cgit v1.2.3 From 30e0fca6c1d7d26f3f2daa4dd2b12c51dadc778a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Sun, 30 Oct 2005 15:02:38 -0800 Subject: [PATCH] ptrace/coredump/exit_group deadlock I could seldom reproduce a deadlock with a task not killable in T state (TASK_STOPPED, not TASK_TRACED) by attaching a NPTL threaded program to gdb, by segfaulting the task and triggering a core dump while some other task is executing exit_group and while one task is in ptrace_attached TASK_STOPPED state (not TASK_TRACED yet). This originated from a gdb bugreport (the fact gdb was segfaulting the task wasn't a kernel bug), but I just incidentally noticed the gdb bug triggered a real kernel bug as well. Most threads hangs in exit_mm because the core_dumping is still going, the core dumping hangs because the stopped task doesn't exit, the stopped task can't wakeup because it has SIGNAL_GROUP_EXIT set, hence the deadlock. To me it seems that the problem is that the force_sig_specific(SIGKILL) in zap_threads is a noop if the task has PF_PTRACED set (like in this case because gdb is attached). The __ptrace_unlink does nothing because the signal->flags is set to SIGNAL_GROUP_EXIT|SIGNAL_STOP_DEQUEUED (verified). The above info also shows that the stopped task hit a race and got the stop signal (presumably by the ptrace_attach, only the attach, state is still TASK_STOPPED and gdb hangs waiting the core before it can set it to TASK_TRACED) after one of the thread invoked the core dump (it's the core dump that sets signal->flags to SIGNAL_GROUP_EXIT). So beside the fact nobody would wakeup the task in __ptrace_unlink (the state is _not_ TASK_TRACED), there's a secondary problem in the signal handling code, where a task should ignore the ptrace-sigstops as long as SIGNAL_GROUP_EXIT is set (or the wakeup in __ptrace_unlink path wouldn't be enough).
So I attempted to make this patch that seems to fix the problem. There were various ways to fix it, perhaps you prefer a different one, I just opted to the one that looked safer to me. I also removed the clearing of the stopped bits from the zap_other_threads (zap_other_threads was safe unlike zap_threads). I don't like useless code, this whole NPTL signal/ptrace thing is already unreadable enough and full of corner cases without confusing useless code into it to make it even less readable. And if this code is really needed, then you may want to explain why it's not being done in the other paths that sets SIGNAL_GROUP_EXIT at least. Even after this patch I still wonder who serializes the read of p->ptrace in zap_threads. Patch is called ptrace-core_dump-exit_group-deadlock-1. This was the trace I've got: test T ffff81003e8118c0 0 14305 1 14311 14309 (NOTLB) ffff810058ccdde8 0000000000000082 000001f4000037e1 ffff810000000013 00000000000000f8 ffff81003e811b00 ffff81003e8118c0 ffff810011362100 0000000000000012 ffff810017ca4180 Call Trace:{try_to_wake_up+893} {finish_stop+87} {get_signal_to_deliver+1359} {do_signal+157} {ptrace_check_attach+222} {sys_ptrace+2293} {default_wake_function+0} {sys_ioctl+73} {sysret_signal+28} {ptregscall_common+103} test D ffff810011362100 0 14309 1 14305 14312 (NOTLB) ffff810053c81cf8 0000000000000082 0000000000000286 0000000000000001 0000000000000195 ffff810011362340 ffff810011362100 ffff81002e338040 ffff810001e0ca80 0000000000000001 Call Trace:{try_to_wake_up+893} {wait_for_completion+173} {default_wake_function+0} {exit_mm+149} {do_exit+479} {do_group_exit+252} {get_signal_to_deliver+1451} {do_signal+157} {ptrace_check_attach+222} {specific_send_sig_info+2 {force_sig_info+186} {do_int3+112} {retint_signal+61} test D ffff81002e338040 0 14311 1 14716 14305 (NOTLB) ffff81005ca8dcf8 0000000000000082 0000000000000286 0000000000000001 0000000000000120 ffff81002e338280 ffff81002e338040 ffff8100481cb740 ffff810001e0ca80 0000000000000001 Call Trace:{try_to_wake_up+893} {wait_for_completion+173} {default_wake_function+0} {exit_mm+149} {do_exit+479} {__dequeue_signal+558} {do_group_exit+252} {get_signal_to_deliver+1451} {do_signal+157} {ptrace_check_attach+222} {specific_send_sig_info+208} {force_sig_info+186} {do_int3+112} {retint_signal+61} test D ffff810017ca4180 0 14312 1 14309 13882 (NOTLB) ffff81005d15fcb8 0000000000000082 ffff81005d15fc58 ffffffff80130816 0000000000000897 ffff810017ca43c0 ffff810017ca4180 ffff81003e8118c0 0000000000000082 ffffffff801317ed Call Trace:{activate_task+150} {try_to_wake_up+893} {wait_for_completion+173} {default_wake_function+0} {do_coredump+819} {thread_return+82} {get_signal_to_deliver+1444} {do_signal+157} {ptrace_check_attach+222} {specific_send_sig_info+2 {_spin_unlock_irqrestore+5} {force_sig_info+186} {do_general_protection+159} {retint_signal+61} Signed-off-by: Andrea Arcangeli Cc: Roland McGrath Cc: Ingo Molnar Cc: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 7 +++++-- kernel/signal.c | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 019e04ec065..863eee8bff4 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -56,6 +56,10 @@ void ptrace_untrace(task_t *child) signal_wake_up(child, 1); } } + if (child->signal->flags & SIGNAL_GROUP_EXIT) { + sigaddset(&child->pending.signal, SIGKILL); + signal_wake_up(child, 1); + } spin_unlock(&child->sighand->siglock); } @@ -77,8 +81,7 @@ void 
__ptrace_unlink(task_t *child) SET_LINKS(child); } - if (child->state == TASK_TRACED) - ptrace_untrace(child); + ptrace_untrace(child); } /* diff --git a/kernel/signal.c b/kernel/signal.c index 1d905ec74bd..9d1512dcf17 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1108,8 +1108,8 @@ void zap_other_threads(struct task_struct *p) if (t != p->group_leader) t->exit_signal = -1; + /* SIGKILL will be handled before any pending SIGSTOP */ sigaddset(&t->pending.signal, SIGKILL); - rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); signal_wake_up(t, 1); } } @@ -1879,9 +1879,9 @@ relock: /* Let the debugger run. */ ptrace_stop(signr, signr, info); - /* We're back. Did the debugger cancel the sig? */ + /* We're back. Did the debugger cancel the sig or group_exit? */ signr = current->exit_code; - if (signr == 0) + if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT) continue; current->exit_code = 0; -- cgit v1.2.3 From 20e1129ab831486c811d50d3905343ad48c4275f Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 30 Oct 2005 15:02:44 -0800 Subject: [PATCH] Keys: Get rid of warning in kmod.c if keys disabled The attached patch gets rid of a "statement without effect" warning when CONFIG_KEYS is disabled by making use of the return value of key_get(). The compiler will optimise all of this away when keys are disabled. Signed-Off-By: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 44166e3bb8a..51a892063aa 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -131,14 +131,14 @@ struct subprocess_info { static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; - struct key *old_session; + struct key *new_session, *old_session; int retval; /* Unblock all signals and set the session keyring. */ - key_get(sub_info->ring); + new_session = key_get(sub_info->ring); flush_signals(current); spin_lock_irq(&current->sighand->siglock); - old_session = __install_session_keyring(current, sub_info->ring); + old_session = __install_session_keyring(current, new_session); flush_signal_handlers(current, 1); sigemptyset(&current->blocked); recalc_sigpending(); -- cgit v1.2.3 From 40dc565122ed1e180a0637f88cdfca734d33db78 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sun, 30 Oct 2005 15:02:46 -0800 Subject: [PATCH] cleanup for kernel/printk.c - Removes some trailing whitespace - Breaks long lines and make other small changes to conform to CodingStyle - Add explicit printk loglevels in two places. Signed-off-by: Jesper Juhl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------ 1 file changed, 42 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 4b8f0f9230a..3cb9708209b 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -10,7 +10,7 @@ * elsewhere, in preparation for a serial line console (someday). * Ted Ts'o, 2/11/93. * Modified for sysctl support, 1/8/97, Chris Horn.
- * Fixed SMP synchronization, 08/08/99, Manfred Spraul + * Fixed SMP synchronization, 08/08/99, Manfred Spraul * manfreds@colorfullife.com * Rewrote bits to get rid of console_lock * 01Mar01 Andrew Morton @@ -148,7 +148,7 @@ static int __init console_setup(char *str) if (!strcmp(str, "ttyb")) strcpy(name, "ttyS1"); #endif - for(s = name; *s; s++) + for (s = name; *s; s++) if ((*s >= '0' && *s <= '9') || *s == ',') break; idx = simple_strtoul(s, NULL, 10); @@ -169,11 +169,11 @@ static int __init log_buf_len_setup(char *str) size = roundup_pow_of_two(size); if (size > log_buf_len) { unsigned long start, dest_idx, offset; - char * new_log_buf; + char *new_log_buf; new_log_buf = alloc_bootmem(size); if (!new_log_buf) { - printk("log_buf_len: allocation failed\n"); + printk(KERN_WARNING "log_buf_len: allocation failed\n"); goto out; } @@ -193,10 +193,9 @@ static int __init log_buf_len_setup(char *str) log_end -= offset; spin_unlock_irqrestore(&logbuf_lock, flags); - printk("log_buf_len: %d\n", log_buf_len); + printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); } out: - return 1; } @@ -217,7 +216,7 @@ __setup("log_buf_len=", log_buf_len_setup); * 9 -- Return number of unread characters in the log buffer * 10 -- Return size of the log buffer */ -int do_syslog(int type, char __user * buf, int len) +int do_syslog(int type, char __user *buf, int len) { unsigned long i, j, limit, count; int do_clear = 0; @@ -244,7 +243,8 @@ int do_syslog(int type, char __user * buf, int len) error = -EFAULT; goto out; } - error = wait_event_interruptible(log_wait, (log_start - log_end)); + error = wait_event_interruptible(log_wait, + (log_start - log_end)); if (error) goto out; i = 0; @@ -264,7 +264,7 @@ int do_syslog(int type, char __user * buf, int len) error = i; break; case 4: /* Read/clear last kernel messages */ - do_clear = 1; + do_clear = 1; /* FALL THRU */ case 3: /* Read last kernel messages */ error = -EINVAL; @@ -288,11 +288,11 @@ int do_syslog(int type, char __user * buf, int len) limit = log_end; /* * __put_user() could sleep, and while we sleep - * printk() could overwrite the messages + * printk() could overwrite the messages * we try to copy to user space. Therefore * the messages are copied in reverse. */ - for(i = 0; i < count && !error; i++) { + for (i = 0; i < count && !error; i++) { j = limit-1-i; if (j + log_buf_len < log_end) break; @@ -306,10 +306,10 @@ int do_syslog(int type, char __user * buf, int len) if (error) break; error = i; - if(i != count) { + if (i != count) { int offset = count-error; /* buffer overflow during copy, correct user buffer. */ - for(i=0;i 2) && - LOG_BUF(cur_index + 0) == '<' && - LOG_BUF(cur_index + 1) >= '0' && - LOG_BUF(cur_index + 1) <= '7' && - LOG_BUF(cur_index + 2) == '>') - { + if (msg_level < 0 && ((end - cur_index) > 2) && + LOG_BUF(cur_index + 0) == '<' && + LOG_BUF(cur_index + 1) >= '0' && + LOG_BUF(cur_index + 1) <= '7' && + LOG_BUF(cur_index + 2) == '>') { msg_level = LOG_BUF(cur_index + 1) - '0'; cur_index += 3; start_print = cur_index; } while (cur_index != end) { char c = LOG_BUF(cur_index); - cur_index++; + cur_index++; if (c == '\n') { if (msg_level < 0) { /* @@ -461,7 +459,7 @@ static void zap_locks(void) static unsigned long oops_timestamp; if (time_after_eq(jiffies, oops_timestamp) && - !time_after(jiffies, oops_timestamp + 30*HZ)) + !time_after(jiffies, oops_timestamp + 30 * HZ)) return; oops_timestamp = jiffies; @@ -495,7 +493,7 @@ __attribute__((weak)) unsigned long long printk_clock(void) /* * This is printk. 
It can be called from any context. We want it to work. - * + * * We try to grab the console_sem. If we succeed, it's easy - we log the output and * call the console drivers. If we fail to get the semaphore we place the output * into the log buffer and return. The current holder of the console_sem will @@ -639,13 +637,19 @@ EXPORT_SYMBOL(vprintk); #else -asmlinkage long sys_syslog(int type, char __user * buf, int len) +asmlinkage long sys_syslog(int type, char __user *buf, int len) { return 0; } -int do_syslog(int type, char __user * buf, int len) { return 0; } -static void call_console_drivers(unsigned long start, unsigned long end) {} +int do_syslog(int type, char __user *buf, int len) +{ + return 0; +} + +static void call_console_drivers(unsigned long start, unsigned long end) +{ +} #endif @@ -851,9 +855,9 @@ EXPORT_SYMBOL(console_start); * print any messages that were printed by the kernel before the * console driver was initialized. */ -void register_console(struct console * console) +void register_console(struct console *console) { - int i; + int i; unsigned long flags; if (preferred_console < 0) @@ -878,7 +882,8 @@ void register_console(struct console * console) * See if this console matches one we selected on * the command line. */ - for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) { + for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; + i++) { if (strcmp(console_cmdline[i].name, console->name) != 0) continue; if (console->index >= 0 && @@ -933,9 +938,9 @@ void register_console(struct console * console) } EXPORT_SYMBOL(register_console); -int unregister_console(struct console * console) +int unregister_console(struct console *console) { - struct console *a,*b; + struct console *a, *b; int res = 1; acquire_console_sem(); @@ -949,10 +954,10 @@ int unregister_console(struct console * console) b->next = a->next; res = 0; break; - } + } } } - + /* If last console is removed, we re-enable picking the first * one that gets registered. Without that, pmac early boot console * would prevent fbcon from taking over. @@ -994,7 +999,7 @@ void tty_write_message(struct tty_struct *tty, char *msg) int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) { static DEFINE_SPINLOCK(ratelimit_lock); - static unsigned long toks = 10*5*HZ; + static unsigned long toks = 10 * 5 * HZ; static unsigned long last_msg; static int missed; unsigned long flags; @@ -1007,6 +1012,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) toks = ratelimit_burst * ratelimit_jiffies; if (toks >= ratelimit_jiffies) { int lost = missed; + missed = 0; toks -= ratelimit_jiffies; spin_unlock_irqrestore(&ratelimit_lock, flags); @@ -1021,7 +1027,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) EXPORT_SYMBOL(__printk_ratelimit); /* minimum time in jiffies between messages */ -int printk_ratelimit_jiffies = 5*HZ; +int printk_ratelimit_jiffies = 5 * HZ; /* number of messages we send before ratelimiting */ int printk_ratelimit_burst = 10; -- cgit v1.2.3 From 7407251a0e2ed099e4b12b742b635503e981507c Mon Sep 17 00:00:00 2001 From: Coywolf Qi Hunt Date: Sun, 30 Oct 2005 15:02:47 -0800 Subject: [PATCH] PF_DEAD cleanup The PF_DEAD setting doesn't belong to exit_notify(), move it to a proper place. 
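In rough terms, the flag-setting moves into do_exit() itself, right before the final schedule(); a minimal sketch of the intended end state (heavily trimmed, everything except the PF_DEAD handling elided):

/* sketch only: exit_notify() no longer touches PF_DEAD; do_exit() marks
 * the task right before its last schedule(), so the scheduler can do the
 * final put_task_struct() once the task is gone.
 */
fastcall NORET_TYPE void do_exit(long code)
{
	struct task_struct *tsk = current;

	/* ... release mm, files, fs, namespace, ... */
	exit_notify(tsk);		/* no longer sets PF_DEAD */

	/* PF_DEAD causes final put_task_struct after we schedule. */
	preempt_disable();
	BUG_ON(tsk->flags & PF_DEAD);
	tsk->flags |= PF_DEAD;

	schedule();
	BUG();				/* never reached */
}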
Signed-off-by: Coywolf Qi Hunt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel')
diff --git a/kernel/exit.c b/kernel/exit.c index 79f52b85d6e..6ef8f7356a7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -783,10 +783,6 @@ static void exit_notify(struct task_struct *tsk) /* If the process is dead, release it - nobody will wait for it */ if (state == EXIT_DEAD) release_task(tsk); - - /* PF_DEAD causes final put_task_struct after we schedule. */ - preempt_disable(); - tsk->flags |= PF_DEAD; } fastcall NORET_TYPE void do_exit(long code) @@ -873,7 +869,11 @@ fastcall NORET_TYPE void do_exit(long code) tsk->mempolicy = NULL; #endif - BUG_ON(!(current->flags & PF_DEAD)); + /* PF_DEAD causes final put_task_struct after we schedule. */ + preempt_disable(); + BUG_ON(tsk->flags & PF_DEAD); + tsk->flags |= PF_DEAD; + schedule(); BUG(); /* Avoid "noreturn function does return". */ -- cgit v1.2.3
From 7f2a52555998c699a7e89f24636c909d6fc08a60 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Sun, 30 Oct 2005 15:02:50 -0800 Subject: [PATCH] wait4 PTRACE_ATTACH race fix Back about a year ago when I last fiddled heavily with the do_wait code, I was thinking too hard about the wrong thing and I now think I introduced a bug whose inverse I thought I was fixing. Apparently no one was looking too hard over my shoulder, so as to cite my bogus reasoning at the time. In the race condition when PTRACE_ATTACH is about to steal a child and then the child hits a tracing event (what my_ptrace_child checks for), the real parent does need to set its flag noting it has some eligible live children. Otherwise a spurious ECHILD error is possible, since the child in question is not yet on the ptrace_children list. Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel')
diff --git a/kernel/exit.c b/kernel/exit.c index 6ef8f7356a7..2d39ccc367e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1383,6 +1383,15 @@ repeat: switch (p->state) { case TASK_TRACED: + /* + * When we hit the race with PTRACE_ATTACH, + * we will not report this child. But the + * race means it has not yet been moved to + * our ptrace_children list, so we need to + * set the flag here to avoid a spurious ECHILD + * when the race happens with the only child. + */ + flag = 1; if (!my_ptrace_child(p)) continue; /*FALLTHROUGH*/ -- cgit v1.2.3
From ecea8d19c9f0ebd62ddaa07fc919ff4e4b820d99 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 30 Oct 2005 15:03:00 -0800 Subject: [PATCH] jiffies_64 cleanup Define jiffies_64 in kernel/timer.c rather than having 24 duplicated defines in each architecture. Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel')
diff --git a/kernel/timer.c b/kernel/timer.c index 562d53eabb4..fd74268d866 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -46,6 +46,10 @@ static void time_interpolator_update(long delta_nsec); #define time_interpolator_update(x) #endif +u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; + +EXPORT_SYMBOL(jiffies_64); + /* * per-CPU timer vector definitions: */ -- cgit v1.2.3
From a241ec65aeac3d69a08a7b153cccbdb7ea35063f Mon Sep 17 00:00:00 2001 From: "Paul E.
McKenney" Date: Sun, 30 Oct 2005 15:03:12 -0800 Subject: [PATCH] RCU torture-testing kernel module This patch is a rewrite of the one submitted on October 1st, using modules (http://marc.theaimsgroup.com/?l=linux-kernel&m=112819093522998&w=2). This rewrite adds a tristate CONFIG_RCU_TORTURE_TEST, which enables an intense torture test of the RCU infratructure. This is needed due to the continued changes to the RCU infrastructure to accommodate dynamic ticks, CPU hotplug, realtime, and so on. Most of the code is in a separate file that is compiled only if the CONFIG variable is set. Documentation on how to run the test and interpret the output is also included. This code has been tested on i386 and ppc64, and an earlier version of the code has received extensive testing on a number of architectures as part of the PREEMPT_RT patchset. Signed-off-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/rcupdate.c | 10 ++ kernel/rcutorture.c | 492 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 503 insertions(+) create mode 100644 kernel/rcutorture.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 980b5e45444..4f5a1453093 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -31,6 +31,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_SECCOMP) += seccomp.o +obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 2559d4b8f23..c4d159a21e0 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -153,6 +153,15 @@ void fastcall call_rcu_bh(struct rcu_head *head, local_irq_restore(flags); } +/* + * Return the number of RCU batches processed thus far. Useful + * for debug and statistics. + */ +long rcu_batches_completed(void) +{ + return rcu_ctrlblk.completed; +} + /* * Invoke the completed RCU callbacks. They are expected to be in * a per-cpu list. @@ -501,6 +510,7 @@ void synchronize_kernel(void) } module_param(maxbatch, int, 0); +EXPORT_SYMBOL_GPL(rcu_batches_completed); EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ EXPORT_SYMBOL_GPL(synchronize_rcu); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c new file mode 100644 index 00000000000..9b58f1eff3c --- /dev/null +++ b/kernel/rcutorture.c @@ -0,0 +1,492 @@ +/* + * Read-Copy Update /proc-based torture test facility + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2005 + * + * Authors: Paul E. 
McKenney + * + * See also: Documentation/RCU/torture.txt + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); + +static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ +static int stat_interval = 0; /* Interval between stats, in seconds. */ + /* Defaults to "only at end of test". */ +static int verbose = 0; /* Print more debug info. */ + +MODULE_PARM(nreaders, "i"); +MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); +MODULE_PARM(stat_interval, "i"); +MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); +MODULE_PARM(verbose, "i"); +MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); +#define TORTURE_FLAG "rcutorture: " +#define PRINTK_STRING(s) \ + do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) +#define VERBOSE_PRINTK_STRING(s) \ + do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) +#define VERBOSE_PRINTK_ERRSTRING(s) \ + do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) + +static char printk_buf[4096]; + +static int nrealreaders; +static struct task_struct *writer_task; +static struct task_struct **reader_tasks; +static struct task_struct *stats_task; + +#define RCU_TORTURE_PIPE_LEN 10 + +struct rcu_torture { + struct rcu_head rtort_rcu; + int rtort_pipe_count; + struct list_head rtort_free; +}; + +static int fullstop = 0; /* stop generating callbacks at test end. */ +static LIST_HEAD(rcu_torture_freelist); +static struct rcu_torture *rcu_torture_current = NULL; +static long rcu_torture_current_version = 0; +static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; +static DEFINE_SPINLOCK(rcu_torture_lock); +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = + { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = + { 0 }; +static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; +atomic_t n_rcu_torture_alloc; +atomic_t n_rcu_torture_alloc_fail; +atomic_t n_rcu_torture_free; + +/* + * Allocate an element from the rcu_tortures pool. + */ +struct rcu_torture * +rcu_torture_alloc(void) +{ + struct list_head *p; + + spin_lock(&rcu_torture_lock); + if (list_empty(&rcu_torture_freelist)) { + atomic_inc(&n_rcu_torture_alloc_fail); + spin_unlock(&rcu_torture_lock); + return NULL; + } + atomic_inc(&n_rcu_torture_alloc); + p = rcu_torture_freelist.next; + list_del_init(p); + spin_unlock(&rcu_torture_lock); + return container_of(p, struct rcu_torture, rtort_free); +} + +/* + * Free an element to the rcu_tortures pool. + */ +static void +rcu_torture_free(struct rcu_torture *p) +{ + atomic_inc(&n_rcu_torture_free); + spin_lock(&rcu_torture_lock); + list_add_tail(&p->rtort_free, &rcu_torture_freelist); + spin_unlock(&rcu_torture_lock); +} + +static void +rcu_torture_cb(struct rcu_head *p) +{ + int i; + struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); + + if (fullstop) { + /* Test is ending, just drop callbacks on the floor. */ + /* The next initialization will pick up the pieces. 
*/ + return; + } + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) + rcu_torture_free(rp); + else + call_rcu(p, rcu_torture_cb); +} + +struct rcu_random_state { + unsigned long rrs_state; + unsigned long rrs_count; +}; + +#define RCU_RANDOM_MULT 39916801 /* prime */ +#define RCU_RANDOM_ADD 479001701 /* prime */ +#define RCU_RANDOM_REFRESH 10000 + +#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } + +/* + * Crude but fast random-number generator. Uses a linear congruential + * generator, with occasional help from get_random_bytes(). + */ +static long +rcu_random(struct rcu_random_state *rrsp) +{ + long refresh; + + if (--rrsp->rrs_count < 0) { + get_random_bytes(&refresh, sizeof(refresh)); + rrsp->rrs_state += refresh; + rrsp->rrs_count = RCU_RANDOM_REFRESH; + } + rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; + return swahw32(rrsp->rrs_state); +} + +/* + * RCU torture writer kthread. Repeatedly substitutes a new structure + * for that pointed to by rcu_torture_current, freeing the old structure + * after a series of grace periods (the "pipeline"). + */ +static int +rcu_torture_writer(void *arg) +{ + int i; + long oldbatch = rcu_batches_completed(); + struct rcu_torture *rp; + struct rcu_torture *old_rp; + static DEFINE_RCU_RANDOM(rand); + + VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); + do { + schedule_timeout_uninterruptible(1); + if (rcu_batches_completed() == oldbatch) + continue; + if ((rp = rcu_torture_alloc()) == NULL) + continue; + rp->rtort_pipe_count = 0; + udelay(rcu_random(&rand) & 0x3ff); + old_rp = rcu_torture_current; + rcu_assign_pointer(rcu_torture_current, rp); + smp_wmb(); + if (old_rp != NULL) { + i = old_rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + old_rp->rtort_pipe_count++; + call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); + } + rcu_torture_current_version++; + oldbatch = rcu_batches_completed(); + } while (!kthread_should_stop() && !fullstop); + VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, + * incrementing the corresponding element of the pipeline array. The + * counter in the element should never be greater than 1, otherwise, the + * RCU implementation is broken. + */ +static int +rcu_torture_reader(void *arg) +{ + int completed; + DEFINE_RCU_RANDOM(rand); + struct rcu_torture *p; + int pipe_count; + + VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); + do { + rcu_read_lock(); + completed = rcu_batches_completed(); + p = rcu_dereference(rcu_torture_current); + if (p == NULL) { + /* Wait for rcu_torture_writer to get underway */ + rcu_read_unlock(); + schedule_timeout_interruptible(HZ); + continue; + } + udelay(rcu_random(&rand) & 0x7f); + preempt_disable(); + pipe_count = p->rtort_pipe_count; + if (pipe_count > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + pipe_count = RCU_TORTURE_PIPE_LEN; + } + ++__get_cpu_var(rcu_torture_count)[pipe_count]; + completed = rcu_batches_completed() - completed; + if (completed > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... 
*/ + completed = RCU_TORTURE_PIPE_LEN; + } + ++__get_cpu_var(rcu_torture_batch)[completed]; + preempt_enable(); + rcu_read_unlock(); + schedule(); + } while (!kthread_should_stop() && !fullstop); + VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * Create an RCU-torture statistics message in the specified buffer. + */ +static int +rcu_torture_printk(char *page) +{ + int cnt = 0; + int cpu; + int i; + long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + + for_each_cpu(cpu) { + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; + batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; + } + } + for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { + if (pipesummary[i] != 0) + break; + } + cnt += sprintf(&page[cnt], "rcutorture: "); + cnt += sprintf(&page[cnt], + "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d", + rcu_torture_current, + rcu_torture_current_version, + list_empty(&rcu_torture_freelist), + atomic_read(&n_rcu_torture_alloc), + atomic_read(&n_rcu_torture_alloc_fail), + atomic_read(&n_rcu_torture_free)); + cnt += sprintf(&page[cnt], "\nrcutorture: "); + if (i > 1) + cnt += sprintf(&page[cnt], "!!! "); + cnt += sprintf(&page[cnt], "Reader Pipe: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); + cnt += sprintf(&page[cnt], "\nrcutorture: "); + cnt += sprintf(&page[cnt], "Reader Batch: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) + cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); + cnt += sprintf(&page[cnt], "\nrcutorture: "); + cnt += sprintf(&page[cnt], "Free-Block Circulation: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + cnt += sprintf(&page[cnt], " %d", + atomic_read(&rcu_torture_wcount[i])); + } + cnt += sprintf(&page[cnt], "\n"); + return cnt; +} + +/* + * Print torture statistics. Caller must ensure that there is only + * one call to this function at a given time!!! This is normally + * accomplished by relying on the module system to only have one copy + * of the module loaded, and then by giving the rcu_torture_stats + * kthread full control (or the init/cleanup functions when rcu_torture_stats + * thread is not running). + */ +static void +rcu_torture_stats_print(void) +{ + int cnt; + + cnt = rcu_torture_printk(printk_buf); + printk(KERN_ALERT "%s", printk_buf); +} + +/* + * Periodically prints torture statistics, if periodic statistics printing + * was specified via the stat_interval module parameter. + * + * No need to worry about fullstop here, since this one doesn't reference + * volatile state or register callbacks. 
+ */ +static int +rcu_torture_stats(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + rcu_torture_stats_print(); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); + return 0; +} + +static void +rcu_torture_cleanup(void) +{ + int i; + + fullstop = 1; + if (writer_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); + kthread_stop(writer_task); + } + writer_task = NULL; + + if (reader_tasks != NULL) { + for (i = 0; i < nrealreaders; i++) { + if (reader_tasks[i] != NULL) { + VERBOSE_PRINTK_STRING( + "Stopping rcu_torture_reader task"); + kthread_stop(reader_tasks[i]); + } + reader_tasks[i] = NULL; + } + kfree(reader_tasks); + reader_tasks = NULL; + } + rcu_torture_current = NULL; + + if (stats_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); + kthread_stop(stats_task); + } + stats_task = NULL; + + /* Wait for all RCU callbacks to fire. */ + + for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) + synchronize_rcu(); + rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ + PRINTK_STRING("--- End of test"); +} + +static int +rcu_torture_init(void) +{ + int i; + int cpu; + int firsterr = 0; + + /* Process args and tell the world that the torturer is on the job. */ + + if (nreaders >= 0) + nrealreaders = nreaders; + else + nrealreaders = 2 * num_online_cpus(); + printk(KERN_ALERT TORTURE_FLAG + "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", + nrealreaders, stat_interval, verbose); + fullstop = 0; + + /* Set up the freelist. */ + + INIT_LIST_HEAD(&rcu_torture_freelist); + for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { + list_add_tail(&rcu_tortures[i].rtort_free, + &rcu_torture_freelist); + } + + /* Initialize the statistics so that each run gets its own numbers. */ + + rcu_torture_current = NULL; + rcu_torture_current_version = 0; + atomic_set(&n_rcu_torture_alloc, 0); + atomic_set(&n_rcu_torture_alloc_fail, 0); + atomic_set(&n_rcu_torture_free, 0); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + atomic_set(&rcu_torture_wcount[i], 0); + for_each_cpu(cpu) { + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + per_cpu(rcu_torture_count, cpu)[i] = 0; + per_cpu(rcu_torture_batch, cpu)[i] = 0; + } + } + + /* Start up the kthreads. 
*/ + + VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); + writer_task = kthread_run(rcu_torture_writer, NULL, + "rcu_torture_writer"); + if (IS_ERR(writer_task)) { + firsterr = PTR_ERR(writer_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); + writer_task = NULL; + goto unwind; + } + reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_PRINTK_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealreaders; i++) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); + reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, + "rcu_torture_reader"); + if (IS_ERR(reader_tasks[i])) { + firsterr = PTR_ERR(reader_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); + reader_tasks[i] = NULL; + goto unwind; + } + } + if (stat_interval > 0) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); + stats_task = kthread_run(rcu_torture_stats, NULL, + "rcu_torture_stats"); + if (IS_ERR(stats_task)) { + firsterr = PTR_ERR(stats_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); + stats_task = NULL; + goto unwind; + } + } + return 0; + +unwind: + rcu_torture_cleanup(); + return firsterr; +} + +module_init(rcu_torture_init); +module_exit(rcu_torture_cleanup); -- cgit v1.2.3 From 708f430dcc50787d1c0b5c31962a5ff0dd8e35eb Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Sun, 30 Oct 2005 15:03:13 -0800 Subject: [PATCH] posix-cpu-timers: fix overrun reporting This change corrects an omission in posix_cpu_timer_schedule, so that it correctly propagates the overrun calculation to where it will get reported to the user. Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index bf374fceb39..91a89426494 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1225,7 +1225,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) /* * The task was cleaned up already, no future firings. */ - return; + goto out; /* * Fetch the current sample and update the timer's expiry time. @@ -1235,7 +1235,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) bump_cpu_timer(timer, now); if (unlikely(p->exit_state)) { clear_dead_task(timer, now); - return; + goto out; } read_lock(&tasklist_lock); /* arm_timer needs it. */ } else { @@ -1248,8 +1248,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) put_task_struct(p); timer->it.cpu.task = p = NULL; timer->it.cpu.expires.sched = 0; - read_unlock(&tasklist_lock); - return; + goto out_unlock; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { /* * We've noticed that the thread is dead, but @@ -1257,8 +1256,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) * drop our task ref. 
*/ clear_dead_task(timer, now); - read_unlock(&tasklist_lock); - return; + goto out_unlock; } cpu_clock_sample_group(timer->it_clock, p, &now); bump_cpu_timer(timer, now); @@ -1270,7 +1268,13 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) */ arm_timer(timer, now); +out_unlock: read_unlock(&tasklist_lock); + +out: + timer->it_overrun_last = timer->it_overrun; + timer->it_overrun = -1; + ++timer->it_requeue_pending; } /* -- cgit v1.2.3
From 4098f9918e068e51fed1727f6ba80efcec372378 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 30 Oct 2005 15:03:21 -0800 Subject: [PATCH] sched: hardcode non-smp set_cpus_allowed Simplify the UP (1 CPU) implementation of set_cpus_allowed. The one CPU is hardcoded to be cpu 0 - so just test for that bit, and avoid having to pick up the cpu_online_map. Also, unexport cpu_online_map: it was only needed for set_cpus_allowed(). Signed-off-by: Paul Jackson Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel')
diff --git a/kernel/sched.c b/kernel/sched.c index 4f26c544d02..340dd238c16 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3877,7 +3877,6 @@ EXPORT_SYMBOL(cpu_present_map); #ifndef CONFIG_SMP cpumask_t cpu_online_map = CPU_MASK_ALL; -EXPORT_SYMBOL_GPL(cpu_online_map); cpumask_t cpu_possible_map = CPU_MASK_ALL; #endif -- cgit v1.2.3
From b67a1b9e4bf878aa5d4b6b44cb5a251a2f425f0d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Oct 2005 15:03:44 -0800 Subject: [PATCH] remove hardcoded SEND_SIG_xxx constants This patch replaces hardcoded SEND_SIG_xxx constants with their symbolic names. No changes in affected .o files. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 10 +++++----- kernel/signal.c | 35 ++++++++++++++++++++--------------- 2 files changed, 25 insertions(+), 20 deletions(-) (limited to 'kernel')
diff --git a/kernel/exit.c b/kernel/exit.c index 2d39ccc367e..537394b25e8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -547,7 +547,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) if (p->pdeath_signal) /* We already hold the tasklist_lock here. */ - group_send_sig_info(p->pdeath_signal, (void *) 0, p); + group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); /* Move the child from its dying parent to the new one.
*/ if (unlikely(traced)) { @@ -591,8 +591,8 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) int pgrp = process_group(p); if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { - __kill_pg_info(SIGHUP, (void *)1, pgrp); - __kill_pg_info(SIGCONT, (void *)1, pgrp); + __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); + __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); } } } @@ -727,8 +727,8 @@ static void exit_notify(struct task_struct *tsk) (t->signal->session == tsk->signal->session) && will_become_orphaned_pgrp(process_group(tsk), tsk) && has_stopped_jobs(process_group(tsk))) { - __kill_pg_info(SIGHUP, (void *)1, process_group(tsk)); - __kill_pg_info(SIGCONT, (void *)1, process_group(tsk)); + __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); + __kill_pg_info(SIGCONT, SEND_SIG_PRIV, process_group(tsk)); } /* Let father know we died diff --git a/kernel/signal.c b/kernel/signal.c index 9d1512dcf17..1f7b2aaa4a3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -651,8 +651,9 @@ static int check_kill_permission(int sig, struct siginfo *info, if (!valid_signal(sig)) return error; error = -EPERM; - if ((!info || ((unsigned long)info != 1 && - (unsigned long)info != 2 && SI_FROMUSER(info))) + if ((info == SEND_SIG_NOINFO || + (info != SEND_SIG_PRIV && info != SEND_SIG_FORCED + && SI_FROMUSER(info))) && ((sig != SIGCONT) || (current->signal->session != t->signal->session)) && (current->euid ^ t->suid) && (current->euid ^ t->uid) @@ -789,7 +790,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, * fast-pathed signals for kernel-internal things like SIGSTOP * or SIGKILL. */ - if ((unsigned long)info == 2) + if (info == SEND_SIG_FORCED) goto out_set; /* Real-time signals must be queued if sent by sigqueue, or @@ -801,19 +802,19 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, pass on the info struct. */ q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && - ((unsigned long) info < 2 || + (info < SEND_SIG_FORCED || info->si_code >= 0))); if (q) { list_add_tail(&q->list, &signals->list); switch ((unsigned long) info) { - case 0: + case (unsigned long) SEND_SIG_NOINFO: q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; q->info.si_pid = current->pid; q->info.si_uid = current->uid; break; - case 1: + case (unsigned long) SEND_SIG_PRIV: q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_KERNEL; @@ -825,14 +826,15 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, break; } } else { - if (sig >= SIGRTMIN && info && (unsigned long)info != 1 + if (sig >= SIGRTMIN + && info != SEND_SIG_NOINFO && info != SEND_SIG_PRIV && info->si_code != SI_USER) /* * Queue overflow, abort. We may abort if the signal was rt * and sent by user using something other than kill(). */ return -EAGAIN; - if (((unsigned long)info > 1) && (info->si_code == SI_TIMER)) + if ((info > SEND_SIG_PRIV) && (info->si_code == SI_TIMER)) /* * Set up a return to indicate that we dropped * the signal. @@ -858,7 +860,7 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) BUG(); assert_spin_locked(&t->sighand->siglock); - if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) + if ((info > SEND_SIG_FORCED) && (info->si_code == SI_TIMER)) /* * Set up a return to indicate that we dropped the signal. 
*/ @@ -914,7 +916,7 @@ force_sig_specific(int sig, struct task_struct *t) t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; sigdelset(&t->blocked, sig); recalc_sigpending_tsk(t); - specific_send_sig_info(sig, (void *)2, t); + specific_send_sig_info(sig, SEND_SIG_FORCED, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); } @@ -1050,7 +1052,7 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) assert_spin_locked(&p->sighand->siglock); handle_stop_signal(sig, p); - if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) + if ((info > SEND_SIG_FORCED) && (info->si_code == SI_TIMER)) /* * Set up a return to indicate that we dropped the signal. */ @@ -1285,10 +1287,13 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) return ret; } +#define __si_special(priv) \ + ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO) + int send_sig(int sig, struct task_struct *p, int priv) { - return send_sig_info(sig, (void*)(long)(priv != 0), p); + return send_sig_info(sig, __si_special(priv), p); } /* @@ -1308,7 +1313,7 @@ send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p) void force_sig(int sig, struct task_struct *p) { - force_sig_info(sig, (void*)1L, p); + force_sig_info(sig, SEND_SIG_PRIV, p); } /* @@ -1333,13 +1338,13 @@ force_sigsegv(int sig, struct task_struct *p) int kill_pg(pid_t pgrp, int sig, int priv) { - return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp); + return kill_pg_info(sig, __si_special(priv), pgrp); } int kill_proc(pid_t pid, int sig, int priv) { - return kill_proc_info(sig, (void *)(long)(priv != 0), pid); + return kill_proc_info(sig, __si_special(priv), pid); } /* -- cgit v1.2.3 From 621d31219d9a788bda924a0613048053f3f5f211 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Oct 2005 15:03:45 -0800 Subject: [PATCH] cleanup the usage of SEND_SIG_xxx constants This patch simplifies some checks for magic siginfo values. It should not change the behaviour in any way. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 1f7b2aaa4a3..27533b90c34 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -651,9 +651,7 @@ static int check_kill_permission(int sig, struct siginfo *info, if (!valid_signal(sig)) return error; error = -EPERM; - if ((info == SEND_SIG_NOINFO || - (info != SEND_SIG_PRIV && info != SEND_SIG_FORCED - && SI_FROMUSER(info))) + if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) && ((sig != SIGCONT) || (current->signal->session != t->signal->session)) && (current->euid ^ t->suid) && (current->euid ^ t->uid) @@ -802,7 +800,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, pass on the info struct. */ q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && - (info < SEND_SIG_FORCED || + (is_si_special(info) || info->si_code >= 0))); if (q) { list_add_tail(&q->list, &signals->list); @@ -825,16 +823,14 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, copy_siginfo(&q->info, info); break; } - } else { - if (sig >= SIGRTMIN - && info != SEND_SIG_NOINFO && info != SEND_SIG_PRIV - && info->si_code != SI_USER) + } else if (!is_si_special(info)) { + if (sig >= SIGRTMIN && info->si_code != SI_USER) /* * Queue overflow, abort. We may abort if the signal was rt * and sent by user using something other than kill(). 
*/ return -EAGAIN; - if ((info > SEND_SIG_PRIV) && (info->si_code == SI_TIMER)) + if (info->si_code == SI_TIMER) /* * Set up a return to indicate that we dropped * the signal. @@ -860,7 +856,7 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) BUG(); assert_spin_locked(&t->sighand->siglock); - if ((info > SEND_SIG_FORCED) && (info->si_code == SI_TIMER)) + if (!is_si_special(info) && (info->si_code == SI_TIMER)) /* * Set up a return to indicate that we dropped the signal. */ @@ -1052,7 +1048,7 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) assert_spin_locked(&p->sighand->siglock); handle_stop_signal(sig, p); - if ((info > SEND_SIG_FORCED) && (info->si_code == SI_TIMER)) + if (!is_si_special(info) && (info->si_code == SI_TIMER)) /* * Set up a return to indicate that we dropped the signal. */ -- cgit v1.2.3 From ae6866c377943de73e2c95398ff0120516f167ce Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Oct 2005 15:03:46 -0800 Subject: [PATCH] remove unneeded SI_TIMER checks This patch removes checks for ->si_code == SI_TIMER from send_signal, specific_send_sig_info, __group_send_sig_info. I think posix-timers.c used these functions some time ago, now it sends signals via send_{,group_}sigqueue, so these hooks are unneeded. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 27533b90c34..1d8f84c5c6e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -830,12 +830,6 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, * and sent by user using something other than kill(). */ return -EAGAIN; - if (info->si_code == SI_TIMER) - /* - * Set up a return to indicate that we dropped - * the signal. - */ - ret = info->si_sys_private; } out_set: @@ -856,12 +850,6 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) BUG(); assert_spin_locked(&t->sighand->siglock); - if (!is_si_special(info) && (info->si_code == SI_TIMER)) - /* - * Set up a return to indicate that we dropped the signal. - */ - ret = info->si_sys_private; - /* Short-circuit ignored signals. */ if (sig_ignored(t, sig)) goto out; @@ -1048,12 +1036,6 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) assert_spin_locked(&p->sighand->siglock); handle_stop_signal(sig, p); - if (!is_si_special(info) && (info->si_code == SI_TIMER)) - /* - * Set up a return to indicate that we dropped the signal. - */ - ret = info->si_sys_private; - /* Short-circuit ignored signals. */ if (sig_ignored(p, sig)) return ret; -- cgit v1.2.3 From b0423a0d9cc836b2c3d796623cd19236bfedfe63 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 30 Oct 2005 15:03:46 -0800 Subject: [PATCH] Remove duplicate code in signal.c Combine a bit of redundant code between force_sig_info() and force_sig_specific(). 
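In effect, force_sig_info() now does the un-ignoring and unblocking itself, so force_sig_specific() can collapse into a one-line wrapper; a minimal sketch of the consolidated result, taken from the hunks that follow:

/* sketch of the consolidated behaviour: delegate entirely to force_sig_info() */
void
force_sig_specific(int sig, struct task_struct *t)
{
	force_sig_info(sig, SEND_SIG_FORCED, t);
}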
Signed-off-by: paulmck@us.ibm.com Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 1d8f84c5c6e..1bf3c39d610 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -879,11 +879,13 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) int ret; spin_lock_irqsave(&t->sighand->siglock, flags); - if (sigismember(&t->blocked, sig) || t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { + if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; + } + if (sigismember(&t->blocked, sig)) { sigdelset(&t->blocked, sig); - recalc_sigpending_tsk(t); } + recalc_sigpending_tsk(t); ret = specific_send_sig_info(sig, info, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); @@ -893,15 +895,7 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) void force_sig_specific(int sig, struct task_struct *t) { - unsigned long int flags; - - spin_lock_irqsave(&t->sighand->siglock, flags); - if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) - t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; - sigdelset(&t->blocked, sig); - recalc_sigpending_tsk(t); - specific_send_sig_info(sig, SEND_SIG_FORCED, t); - spin_unlock_irqrestore(&t->sighand->siglock, flags); + force_sig_info(sig, SEND_SIG_FORCED, t); } /* -- cgit v1.2.3 From 4e57b6817880946a3a78d5d8cad1ace363f7e449 Mon Sep 17 00:00:00 2001 From: Tim Schmielau Date: Sun, 30 Oct 2005 15:03:48 -0800 Subject: [PATCH] fix missing includes I recently picked up my older work to remove unnecessary #includes of sched.h, starting from a patch by Dave Jones to not include sched.h from module.h. This reduces the number of indirect includes of sched.h by ~300. Another ~400 pointless direct includes can be removed after this disentangling (patch to follow later). However, quite a few indirect includes need to be fixed up for this. In order to feed the patches through -mm with as little disturbance as possible, I've split out the fixes I accumulated up to now (complete for i386 and x86_64, more archs to follow later) and post them before the real patch. This way this large part of the patch is kept simple with only adding #includes, and all hunks are independent of each other. So if any hunk rejects or gets in the way of other patches, just drop it. My scripts will pick it up again in the next round. Signed-off-by: Tim Schmielau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 1 + kernel/kprobes.c | 1 + kernel/params.c | 1 + 3 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 13bcec151b5..39277dd6bf9 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -18,6 +18,7 @@ #include #include #include +#include /* for cond_resched */ #include #include diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f3ea492ab44..ce4915dd683 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/params.c b/kernel/params.c index 1a8614bac5d..47ba6954794 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -23,6 +23,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk -- cgit v1.2.3
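The include fix-ups above all follow the same pattern; a hypothetical illustration (the file and function below are invented, assuming only that cond_resched() is declared in linux/sched.h, which module.h no longer pulls in indirectly):

#include <linux/module.h>
#include <linux/sched.h>	/* now needed explicitly, for cond_resched() */

static int example_scan(void)
{
	int i;

	for (i = 0; i < 100000; i++) {
		/* long-running loop: politely yield the CPU now and then */
		cond_resched();
	}
	return 0;
}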