From 8f72e4028a1ff968000cec4a034f45619fbd7ec4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 10 Jul 2006 04:44:01 -0700 Subject: [PATCH] fadvise: remove dead comments Cc: "Michael Kerrisk" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fadvise.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'mm') diff --git a/mm/fadvise.c b/mm/fadvise.c index 0a03357a1f8..60a5d55e51d 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -23,18 +23,6 @@ /* * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. - * - * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file - * offsets `offset' and `offset+len' inclusive. Any pages which are currently - * under writeout are skipped, whether or not they are dirty. - * - * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file - * offsets `offset' and `offset+len'. - * - * By combining these two operations the application may do several things: - * - * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. - * */ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) { -- cgit v1.2.3 From 6d46cc6b9b04dc28a9c5db62db791aeec8ab2ea5 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 10 Jul 2006 04:44:21 -0700 Subject: [PATCH] mm/bootmem.c: EXPORT_UNUSED_SYMBOL This patch marks an unused export as EXPORT_UNUSED_SYMBOL. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index d213feded10..50353e0dac1 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -29,9 +29,7 @@ unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; -EXPORT_SYMBOL(max_pfn); /* This is exported so - * dma_get_required_mask(), which uses - * it, can be an inline function */ +EXPORT_UNUSED_SYMBOL(max_pfn); /* June 2006 */ static LIST_HEAD(bdata_list); #ifdef CONFIG_CRASH_DUMP -- cgit v1.2.3 From 26fc52367af3774b123334bca409159ce37d2857 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 10 Jul 2006 04:44:22 -0700 Subject: [PATCH] mm/memory.c: EXPORT_UNUSED_SYMBOL This patch marks an unused export as EXPORT_UNUSED_SYMBOL. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index c1e14c9e67e..dc0d82cf2a1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1853,7 +1853,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) return 0; } -EXPORT_SYMBOL(vmtruncate_range); +EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */ /* * Primitive swap readahead code. We simply read an aligned block of -- cgit v1.2.3 From b0d85c5c3009d292fe195f666cbbec7da47dabf4 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 10 Jul 2006 04:44:23 -0700 Subject: [PATCH] mm/mmzone.c: EXPORT_UNUSED_SYMBOL This patch marks three unused exports as EXPORT_UNUSED_SYMBOL. 
Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmzone.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmzone.c b/mm/mmzone.c index 0959ee1a479..febea1c9816 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -14,7 +14,7 @@ struct pglist_data *first_online_pgdat(void) return NODE_DATA(first_online_node); } -EXPORT_SYMBOL(first_online_pgdat); +EXPORT_UNUSED_SYMBOL(first_online_pgdat); /* June 2006 */ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) { @@ -24,7 +24,7 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) return NULL; return NODE_DATA(nid); } -EXPORT_SYMBOL(next_online_pgdat); +EXPORT_UNUSED_SYMBOL(next_online_pgdat); /* June 2006 */ /* @@ -45,5 +45,5 @@ struct zone *next_zone(struct zone *zone) } return zone; } -EXPORT_SYMBOL(next_zone); +EXPORT_UNUSED_SYMBOL(next_zone); /* June 2006 */ -- cgit v1.2.3 From 32dd66fce3b0ad5857433433b795844cb397608e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 10 Jul 2006 04:44:31 -0700 Subject: [PATCH] vmstat: export all_vm_events() Add missing EXPORT_SYMBOL for all_vm_events(). Git commit f8891e5e1f93a128c3900f82035e8541357896a7 caused this: Building modules, stage 2. MODPOST WARNING: "all_vm_events" [arch/s390/appldata/appldata_mem.ko] undefined! CC arch/s390/appldata/appldata_mem.mod.o Cc: Christoph Lameter Cc: Gerald Schaefer Cc: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 73b83d67bab..dfdf2413390 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -81,6 +81,7 @@ void all_vm_events(unsigned long *ret) { sum_vm_events(ret, &cpu_online_map); } +EXPORT_SYMBOL_GPL(all_vm_events); #ifdef CONFIG_HOTPLUG /* -- cgit v1.2.3 From 873623dfabaa6ebbdc1ce16c1766a3c0ec5d9923 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 Jul 2006 14:44:38 +0200 Subject: [PATCH] lockdep: undo mm/slab.c annotation undo existing mm/slab.c lock-validator annotations, in preparation of a new, less intrusive annotation patch. 
Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- mm/slab.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 85c2e03098a..fd1e4c4c139 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1021,8 +1021,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, } } -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp, - int nesting) +static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) { struct slab *slabp = virt_to_slab(objp); int nodeid = slabp->nodeid; @@ -1040,7 +1039,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp, STATS_INC_NODEFREES(cachep); if (l3->alien && l3->alien[nodeid]) { alien = l3->alien[nodeid]; - spin_lock_nested(&alien->lock, nesting); + spin_lock(&alien->lock); if (unlikely(alien->avail == alien->limit)) { STATS_INC_ACOVERFLOW(cachep); __drain_alien_cache(cachep, alien, nodeid); @@ -1069,8 +1068,7 @@ static inline void free_alien_cache(struct array_cache **ac_ptr) { } -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp, - int nesting) +static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) { return 0; } @@ -1760,8 +1758,6 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) } #endif -static void __cache_free(struct kmem_cache *cachep, void *objp, int nesting); - /** * slab_destroy - destroy and release all objects in a slab * @cachep: cache pointer being destroyed @@ -1785,17 +1781,8 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) call_rcu(&slab_rcu->head, kmem_rcu_free); } else { kmem_freepages(cachep, addr); - if (OFF_SLAB(cachep)) { - unsigned long flags; - - /* - * lockdep: we may nest inside an already held - * ac->lock, so pass in a nesting flag: - */ - local_irq_save(flags); - __cache_free(cachep->slabp_cache, slabp, 1); - local_irq_restore(flags); - } + if (OFF_SLAB(cachep)) + kmem_cache_free(cachep->slabp_cache, slabp); } } @@ -3135,7 +3122,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) #endif check_irq_off(); l3 = cachep->nodelists[node]; - spin_lock_nested(&l3->list_lock, SINGLE_DEPTH_NESTING); + spin_lock(&l3->list_lock); if (l3->shared) { struct array_cache *shared_array = l3->shared; int max = shared_array->limit - shared_array->avail; @@ -3178,14 +3165,14 @@ free_done: * Release an obj back to its cache. If the obj has a constructed state, it must * be in this state _before_ it is released. Called with disabled ints. 
*/ -static void __cache_free(struct kmem_cache *cachep, void *objp, int nesting) +static inline void __cache_free(struct kmem_cache *cachep, void *objp) { struct array_cache *ac = cpu_cache_get(cachep); check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); - if (cache_free_alien(cachep, objp, nesting)) + if (cache_free_alien(cachep, objp)) return; if (likely(ac->avail < ac->limit)) { @@ -3424,7 +3411,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) BUG_ON(virt_to_cache(objp) != cachep); local_irq_save(flags); - __cache_free(cachep, objp, 0); + __cache_free(cachep, objp); local_irq_restore(flags); } EXPORT_SYMBOL(kmem_cache_free); @@ -3449,7 +3436,7 @@ void kfree(const void *objp) kfree_debugcheck(objp); c = virt_to_cache(objp); debug_check_no_locks_freed(objp, obj_size(c)); - __cache_free(c, (void *)objp, 0); + __cache_free(c, (void *)objp); local_irq_restore(flags); } EXPORT_SYMBOL(kfree); -- cgit v1.2.3 From f1aaee53f2877a7afa55e8245c241ff60a86367d Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 13 Jul 2006 14:46:03 +0200 Subject: [PATCH] lockdep: annotate mm/slab.c mm/slab.c uses nested locking when dealing with 'off-slab' caches, in that case it allocates the slab header from the (on-slab) kmalloc caches. Teach the lock validator about this by putting all on-slab caches into a separate class. this patch has no effect on non-lockdep kernels. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- mm/slab.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index fd1e4c4c139..5a57cda7490 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -674,6 +674,37 @@ static struct kmem_cache cache_cache = { #endif }; +#ifdef CONFIG_LOCKDEP + +/* + * Slab sometimes uses the kmalloc slabs to store the slab headers + * for other slabs "off slab". + * The locking for this is tricky in that it nests within the locks + * of all other slabs in a few places; to deal with this special + * locking we put on-slab caches into a separate lock-class. + */ +static struct lock_class_key on_slab_key; + +static inline void init_lock_keys(struct cache_sizes *s) +{ + int q; + + for (q = 0; q < MAX_NUMNODES; q++) { + if (!s->cs_cachep->nodelists[q] || OFF_SLAB(s->cs_cachep)) + continue; + lockdep_set_class(&s->cs_cachep->nodelists[q]->list_lock, + &on_slab_key); + } +} + +#else +static inline void init_lock_keys(struct cache_sizes *s) +{ +} +#endif + + + /* Guard access to the cache-chain. */ static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; @@ -1391,6 +1422,7 @@ void __init kmem_cache_init(void) ARCH_KMALLOC_FLAGS|SLAB_PANIC, NULL, NULL); } + init_lock_keys(sizes); sizes->cs_dmacachep = kmem_cache_create(names->name_dma, sizes->cs_size, -- cgit v1.2.3 From fc818301a8a39fedd7f0a71f878f29130c72193d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 Jul 2006 09:12:21 +0200 Subject: [PATCH] revert slab.c locking change Chandra Seetharaman reported SLAB crashes caused by the slab.c lock annotation patch. There is only one chunk of that patch that has a material effect on the slab logic - this patch undoes that chunk. This was confirmed to fix the slab problem by Chandra. 
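Note that this revert does not touch the lock-class annotation added two patches up (on_slab_key / init_lock_keys()); only the unlock/relock around slab_destroy() in free_block() goes away. The class annotation is the general lockdep recipe for locks that may legitimately nest inside another lock of the same type: move one side into its own class so the validator stops reporting false recursion while still catching real deadlocks. A rough sketch of that recipe, with hypothetical names not taken from these patches:

	/* Hypothetical child/parent queue locks that can nest: give the child
	 * locks their own lockdep class, the same way init_lock_keys() above
	 * does for the on-slab caches' l3->list_lock. */
	static struct lock_class_key child_queue_key;

	static void init_child_queue(struct my_queue *q)
	{
		spin_lock_init(&q->lock);
		lockdep_set_class(&q->lock, &child_queue_key);
	}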
Signed-off-by: Ingo Molnar Tested-by: Chandra Seetharaman Signed-off-by: Linus Torvalds --- mm/slab.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 5a57cda7490..0f20843beff 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3119,16 +3119,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, if (slabp->inuse == 0) { if (l3->free_objects > l3->free_limit) { l3->free_objects -= cachep->num; - /* - * It is safe to drop the lock. The slab is - * no longer linked to the cache. cachep - * cannot disappear - we are using it and - * all destruction of caches must be - * serialized properly by the user. - */ - spin_unlock(&l3->list_lock); slab_destroy(cachep, slabp); - spin_lock(&l3->list_lock); } else { list_add(&slabp->list, &l3->slabs_free); } -- cgit v1.2.3 From 8757d5fa6b75e8ea906baf0309d49b980e7f9bc9 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 14 Jul 2006 00:23:56 -0700 Subject: [PATCH] mm: fix oom roll-back of __vmalloc_area_node __vunmap must not rely on area->nr_pages when picking the release methode for area->pages. It may be too small when __vmalloc_area_node failed early due to lacking memory. Instead, use a flag in vmstruct to differentiate. Signed-off-by: Jan Kiszka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7b450798b45..266162d2ba2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -340,7 +340,7 @@ void __vunmap(void *addr, int deallocate_pages) __free_page(area->pages[i]); } - if (area->nr_pages > PAGE_SIZE/sizeof(struct page *)) + if (area->flags & VM_VPAGES) vfree(area->pages); else kfree(area->pages); @@ -427,9 +427,10 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. */ - if (array_size > PAGE_SIZE) + if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node); - else + area->flags |= VM_VPAGES; + } else pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node); area->pages = pages; if (!area->pages) { -- cgit v1.2.3 From c38c8db7225465c8d124f38b24d3024decc26bbd Mon Sep 17 00:00:00 2001 From: Anil Keshavamurthy Date: Fri, 14 Jul 2006 00:23:57 -0700 Subject: [PATCH] ia64: race flushing icache in COW path There is a race condition that showed up in a threaded JIT environment. The situation is that a process with a JIT code page forks, so the page is marked read-only, then some threads are created in the child. One of the threads attempts to add a new code block to the JIT page, so a copy-on-write fault is taken, and the kernel allocates a new page, copies the data, installs the new pte, and then calls lazy_mmu_prot_update() to flush caches to make sure that the icache and dcache are in sync. Unfortunately, the other thread runs right after the new pte is installed, but before the caches have been flushed. It tries to execute some old JIT code that was already in this page, but it sees some garbage in the i-cache from the previous users of the new physical page. Fix: we must make the caches consistent before installing the pte. This is an ia64 only fix because lazy_mmu_prot_update() is a no-op on all other architectures. 
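In code terms the fix is a pure ordering change in the do_wp_page() COW path, condensed here from the hunk that follows: the icache/dcache synchronization must happen before the new pte becomes visible, otherwise another thread can execute stale instructions from the freshly allocated page.

	flush_cache_page(vma, address, pte_pfn(orig_pte));
	entry = mk_pte(new_page, vma->vm_page_prot);
	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	lazy_mmu_prot_update(entry);			/* sync caches for new_page first */
	ptep_establish(vma, address, page_table, entry);	/* only now publish the pte */
	update_mmu_cache(vma, address, entry);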
Signed-off-by: Anil Keshavamurthy Signed-off-by: Tony Luck Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index dc0d82cf2a1..de8bc85dc8f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1549,9 +1549,9 @@ gotten: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); + lazy_mmu_prot_update(entry); ptep_establish(vma, address, page_table, entry); update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); lru_cache_add_active(new_page); page_add_new_anon_rmap(new_page, vma, address); -- cgit v1.2.3 From 22c4af4092fc2e037ce2e2922023fc222cf0c443 Mon Sep 17 00:00:00 2001 From: Luke Yang Date: Fri, 14 Jul 2006 00:24:09 -0700 Subject: [PATCH] nommu: export two symbols for drivers to use nommu.c needs to export two more symbols for drivers to use: remap_pfn_range and unmap_mapping_range. Signed-off-by: Luke Yang Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 5151c44a825..c576df71e3b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1070,6 +1070,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, vma->vm_start = vma->vm_pgoff << PAGE_SHIFT; return 0; } +EXPORT_SYMBOL(remap_pfn_range); void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { @@ -1090,6 +1091,7 @@ void unmap_mapping_range(struct address_space *mapping, int even_cows) { } +EXPORT_SYMBOL(unmap_mapping_range); /* * Check that a process has enough memory to allocate a new virtual -- cgit v1.2.3 From 0ff922452df86f3e9a2c6f705c4588ec62d096a7 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:37 -0700 Subject: [PATCH] per-task-delay-accounting: sync block I/O and swapin delay collection Unlike earlier iterations of the delay accounting patches, now delays are only collected for the actual I/O waits rather than try and cover the delays seen in I/O submission paths. Account separately for block I/O delays incurred as a result of swapin page faults whose frequency can be affected by the task/process' rss limit. Hence swapin delays can act as feedback for rss limit changes independent of I/O priority changes. 
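The mm side is simple bracketing, sketched here from the do_swap_page() hunk that follows: the swapin wait is wrapped in a per-task flag so the accounting code can attribute the block I/O delay to swapin rather than to ordinary I/O.

	delayacct_set_flag(DELAYACCT_PF_SWAPIN);	/* about to block on a swapin          */
	page = lookup_swap_cache(entry);
	if (!page)
		swapin_readahead(entry, address, vma);
	/* ... page is read in; error paths clear the flag too ... */
	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);	/* later delays count as ordinary I/O  */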
Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index de8bc85dc8f..109e9866237 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -1934,6 +1935,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, migration_entry_wait(mm, pmd, address); goto out; } + delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); @@ -1946,6 +1948,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) ret = VM_FAULT_OOM; + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; } @@ -1955,6 +1958,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, grab_swap_token(); } + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); mark_page_accessed(page); lock_page(page); -- cgit v1.2.3 From b83a8e64fd1aecf021111d22c062c97a3241d0c4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 29 Jul 2006 21:42:55 +0200 Subject: [PATCH] MM: Remove rogue readahead printk For some reason it triggers always with NFS root and spams the kernel logs of my nfs root boxes a lot. Signed-off-by: Andi Kleen Acked-by: Trond Myklebust Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index d087fc3d328..b9a60c43b61 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -849,8 +849,6 @@ static void shrink_readahead_size_eio(struct file *filp, return; ra->ra_pages /= 4; - printk(KERN_WARNING "Reducing readahead size to %luK\n", - ra->ra_pages << (PAGE_CACHE_SHIFT - 10)); } /** -- cgit v1.2.3 From 8c78f3075dab4be279e283f901f00e33ce44890a Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Sun, 30 Jul 2006 03:03:35 -0700 Subject: [PATCH] cpu hotplug: replace __devinit* with __cpuinit* for cpu notifications Few of the callback functions and notifier blocks that are associated with cpu notifications incorrectly have __devinit and __devinitdata. They should be __cpuinit and __cpuinitdata instead. It makes no functional difference but wastes text area when CONFIG_HOTPLUG is enabled and CONFIG_HOTPLUG_CPU is not. This patch fixes all those instances. Signed-off-by: Chandra Seetharaman Cc: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 0f20843beff..252972ff649 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1106,7 +1106,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) #endif -static int __devinit cpuup_callback(struct notifier_block *nfb, +static int __cpuinit cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; -- cgit v1.2.3 From b8008b2bc21fb13b45964e21247f18c013d6e985 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Sun, 30 Jul 2006 03:04:04 -0700 Subject: [PATCH] Fix kmem_cache_alloc() been documented twice kmem_cache_alloc() was documented twice, but kmem_cache_zalloc() never. Fix this obvious typo to get things right. 
Signed-off-by: Rolf Eike Beer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 252972ff649..21ba0603570 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3224,7 +3224,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) EXPORT_SYMBOL(kmem_cache_alloc); /** - * kmem_cache_alloc - Allocate an object. The memory is set to zero. + * kmem_cache_zalloc - Allocate an object. The memory is set to zero. * @cache: The cache to allocate from. * @flags: See kmalloc(). * -- cgit v1.2.3 From 60c371bc753495f36d3a71338b46030f7fffce3b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 5 Aug 2006 12:14:25 -0700 Subject: [PATCH] fadvise() make POSIX_FADV_NOREUSE a no-op The POSIX_FADV_NOREUSE hint means "the application will use this range of the file a single time". It seems to be intended that the implementation will use this hint to perform drop-behind of that part of the file when the application gets around to reading or writing it. However for reasons which aren't obvious (or sane?) I mapped POSIX_FADV_NOREUSE onto POSIX_FADV_WILLNEED. ie: it does readahead. That's daft. So for now, make POSIX_FADV_NOREUSE a no-op. This is a non-back-compatible change. If someone was using POSIX_FADV_NOREUSE to perform readahead, they lose. The likelihood is low. If/when we later implement POSIX_FADV_NOREUSE things will get interesting - to do it fully we'll need to maintain file offset/length ranges and peform all sorts of complex tricks, and managing the lifetime of those ranges' data structures will be interesting.. A sensible implementation would probably ignore the file range and would simply mark the entire file as needing some form of drop-behind treatment. Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fadvise.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/fadvise.c b/mm/fadvise.c index 60a5d55e51d..168c78a121b 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -73,7 +73,6 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) file->f_ra.ra_pages = bdi->ra_pages * 2; break; case POSIX_FADV_WILLNEED: - case POSIX_FADV_NOREUSE: if (!mapping->a_ops->readpage) { ret = -EINVAL; break; @@ -94,6 +93,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) if (ret > 0) ret = 0; break; + case POSIX_FADV_NOREUSE: + break; case POSIX_FADV_DONTNEED: if (!bdi_write_congested(mapping->backing_dev_info)) filemap_flush(mapping); -- cgit v1.2.3 From 6f712711dbd180aa3777efe5ae3b9b0e915b9471 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Sat, 5 Aug 2006 12:14:58 -0700 Subject: [PATCH] memory hotadd fixes: not-aligned memory hotadd handling fix ioresouce handling code in memory hotplug allows not-aligned memory hot add. But when memmap and other memory structures are initialized, parameters should be aligned. (if not aligned, initialization of mem_map will do wrong, it assumes parameters are aligned.) This patch fix it. And this patch allows ioresource collision check to handle -EEXIST. 
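The alignment part is plain arithmetic: round the hot-added pfn range outward to whole sparsemem sections and add every section it touches, treating -EEXIST from already-present sections as benign. The kernel code below does this with pfn_to_section_nr() and PFN_SECTION_SHIFT; the same rounding in standalone form, with a made-up section size:

	#include <stdio.h>

	#define PAGES_PER_SECTION (1UL << 14)	/* illustrative only: 64 MB sections, 4 KB pages */

	int main(void)
	{
		unsigned long start_pfn = 100000;	/* deliberately not section-aligned */
		unsigned long nr_pages  = 50000;
		unsigned long start_sec = start_pfn / PAGES_PER_SECTION;
		unsigned long end_sec   = (start_pfn + nr_pages - 1) / PAGES_PER_SECTION;
		unsigned long sec;

		for (sec = start_sec; sec <= end_sec; sec++)
			printf("add section %lu: pfn %lu .. %lu\n", sec,
			       sec * PAGES_PER_SECTION, (sec + 1) * PAGES_PER_SECTION - 1);
		return 0;
	}

Every pfn in [start_pfn, start_pfn + nr_pages) falls inside one of the printed sections, which is exactly the property the mem_map initialization relies on.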
Signed-off-by: KAMEZAWA Hiroyuki Cc: Keith Mannthey Cc: Yasunori Goto Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 01c9fb97c61..7d25cc12235 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -76,15 +76,22 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn, { unsigned long i; int err = 0; + int start_sec, end_sec; + /* during initialize mem_map, align hot-added range to section */ + start_sec = pfn_to_section_nr(phys_start_pfn); + end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); - for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) { - err = __add_section(zone, phys_start_pfn + i); + for (i = start_sec; i <= end_sec; i++) { + err = __add_section(zone, i << PFN_SECTION_SHIFT); - /* We want to keep adding the rest of the - * sections if the first ones already exist + /* + * EEXIST is finally dealed with by ioresource collision + * check. see add_memory() => register_memory_resource() + * Warning will be printed if there is collision. */ if (err && (err != -EEXIST)) break; + err = 0; } return err; @@ -213,10 +220,10 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } /* add this memory to iomem resource */ -static void register_memory_resource(u64 start, u64 size) +static int register_memory_resource(u64 start, u64 size) { struct resource *res; - + int ret = 0; res = kzalloc(sizeof(struct resource), GFP_KERNEL); BUG_ON(!res); @@ -228,7 +235,9 @@ static void register_memory_resource(u64 start, u64 size) printk("System RAM resource %llx - %llx cannot be added\n", (unsigned long long)res->start, (unsigned long long)res->end); kfree(res); + ret = -EEXIST; } + return ret; } @@ -269,7 +278,7 @@ int add_memory(int nid, u64 start, u64 size) } /* register this memory as resource */ - register_memory_resource(start, size); + ret = register_memory_resource(start, size); return ret; error: -- cgit v1.2.3 From 58c1b5b079071d82b2f924000b7e8fb5585ce7d8 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Sat, 5 Aug 2006 12:15:01 -0700 Subject: [PATCH] memory hotadd fixes: find_next_system_ram catch range fix find_next_system_ram() is used to find available memory resource at onlining newly added memory. This patch fixes following problem. find_next_system_ram() cannot catch this case. 
Resource: (start)-------------(end) Section : (start)-------------(end) Signed-off-by: KAMEZAWA Hiroyuki Cc: Keith Mannthey Cc: Yasunori Goto Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7d25cc12235..26f1840879d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -163,7 +163,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) res.flags = IORESOURCE_MEM; /* we just need system ram */ section_end = res.end; - while (find_next_system_ram(&res) >= 0) { + while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { start_pfn = (unsigned long)(res.start >> PAGE_SHIFT); nr_pages = (unsigned long) ((res.end + 1 - res.start) >> PAGE_SHIFT); -- cgit v1.2.3 From ebd15302dc0ba1b8761600c20854f5371e7bae1e Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Sat, 5 Aug 2006 12:15:06 -0700 Subject: [PATCH] memory hotadd fixes: enhance collision check This patch is for collision check enhancement for memory hot add. It's better to do resouce collision check before doing memory hot add, which will touch memory management structures. And add_section() should check section exists or not before calling sparse_add_one_section(). (sparse_add_one_section() will do another check anyway. but checking in memory_hotplug.c will be easy to understand.) Signed-off-by: KAMEZAWA Hiroyuki Cc: keith mannthey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 26f1840879d..c37319542b7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -52,6 +52,9 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn) int nr_pages = PAGES_PER_SECTION; int ret; + if (pfn_valid(phys_start_pfn)) + return -EEXIST; + ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); if (ret < 0) @@ -220,10 +223,9 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } /* add this memory to iomem resource */ -static int register_memory_resource(u64 start, u64 size) +static struct resource *register_memory_resource(u64 start, u64 size) { struct resource *res; - int ret = 0; res = kzalloc(sizeof(struct resource), GFP_KERNEL); BUG_ON(!res); @@ -235,9 +237,18 @@ static int register_memory_resource(u64 start, u64 size) printk("System RAM resource %llx - %llx cannot be added\n", (unsigned long long)res->start, (unsigned long long)res->end); kfree(res); - ret = -EEXIST; + res = NULL; } - return ret; + return res; +} + +static void release_memory_resource(struct resource *res) +{ + if (!res) + return; + release_resource(res); + kfree(res); + return; } @@ -246,8 +257,13 @@ int add_memory(int nid, u64 start, u64 size) { pg_data_t *pgdat = NULL; int new_pgdat = 0; + struct resource *res; int ret; + res = register_memory_resource(start, size); + if (!res) + return -EEXIST; + if (!node_online(nid)) { pgdat = hotadd_new_pgdat(nid, start); if (!pgdat) @@ -277,14 +293,13 @@ int add_memory(int nid, u64 start, u64 size) BUG_ON(ret); } - /* register this memory as resource */ - ret = register_memory_resource(start, size); - return ret; error: /* rollback pgdat allocation and others */ if (new_pgdat) rollback_node_hotadd(nid, pgdat); + if (res) + release_memory_resource(res); return ret; } -- cgit v1.2.3 From 
1d7ea7324ae7a59f8e17e4ba76a2707c1e6f24d2 Mon Sep 17 00:00:00 2001 From: Alexander Zarochentsev Date: Sun, 13 Aug 2006 23:24:27 -0700 Subject: [PATCH] fuse: fix error case in fuse_readpages Don't let fuse_readpages leave the @pages list not empty when exiting on error. [akpm@osdl.org: kernel-doc fixes] Signed-off-by: Alexander Zarochentsev Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/swap.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 8fd095c4ae5..687686a61f7 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -54,6 +54,26 @@ void put_page(struct page *page) } EXPORT_SYMBOL(put_page); +/** + * put_pages_list(): release a list of pages + * + * Release a list of pages which are strung together on page.lru. Currently + * used by read_cache_pages() and related error recovery code. + * + * @pages: list of pages threaded on page->lru + */ +void put_pages_list(struct list_head *pages) +{ + while (!list_empty(pages)) { + struct page *victim; + + victim = list_entry(pages->prev, struct page, lru); + list_del(&victim->lru); + page_cache_release(victim); + } +} +EXPORT_SYMBOL(put_pages_list); + /* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the -- cgit v1.2.3 From b6b5bce3571e496504a89ee575d32101e0a98b93 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 27 Aug 2006 01:23:25 -0700 Subject: [PATCH] swsusp: Fix swap_type_of There is a bug in mm/swapfile.c#swap_type_of() that makes swsusp only be able to use the first active swap partition as the resume device. Fix it. Signed-off-by: Rafael J. Wysocki Cc: Hugh Dickins Acked-by: Pavel Machek Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index e70d6c6d6fe..f1f5ec78378 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -442,11 +442,12 @@ int swap_type_of(dev_t device) if (!(swap_info[i].flags & SWP_WRITEOK)) continue; + if (!device) { spin_unlock(&swap_lock); return i; } - inode = swap_info->swap_file->f_dentry->d_inode; + inode = swap_info[i].swap_file->f_dentry->d_inode; if (S_ISBLK(inode->i_mode) && device == MKDEV(imajor(inode), iminor(inode))) { spin_unlock(&swap_lock); -- cgit v1.2.3 From a302eb4e4602d6444ae75a0e516fb2f2c62d6642 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 31 Aug 2006 21:27:34 -0700 Subject: [PATCH] ZVC: Overstep counters Increments and decrements are usually grouped rather than mixed. We can optimize the inc and dec functions for that case. Increment and decrement the counters by 50% more than the threshold in those cases and set the differential accordingly. This decreases the need to update the atomic counters. The idea came originally from Andrew Morton. The overstepping alone was sufficient to address the contention issue found when updating the global and the per zone counters from 160 processors. Also remove some code in dec_zone_page_state. 
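The effect of the overstep is easiest to see with concrete numbers. Below is a userspace simulation with stand-in variables (not kernel code), using the old fixed threshold of 32: because every flush leaves the per-cpu differential at -threshold/2 instead of 0, a monotonic stream of increments reaches the flush point roughly every 49 events instead of every 33, while global + differential always equals the true count.

	#include <stdio.h>

	int main(void)
	{
		const int threshold = 32;	/* stands in for the per-zone threshold      */
		long global = 0;		/* stands in for the atomic VM counter       */
		int diff = 0;			/* stands in for the per-cpu s8 differential */
		int flushes = 0;
		int i;

		for (i = 0; i < 1000; i++) {
			diff++;
			if (diff > threshold) {
				global += diff + threshold / 2;	/* overstep by 50% */
				diff = -threshold / 2;
				flushes++;
			}
		}
		printf("events=1000 tracked=%ld flushes=%d\n", global + diff, flushes);
		return 0;
	}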
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index dfdf2413390..3799a0f7543 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -190,8 +190,8 @@ static void __inc_zone_state(struct zone *zone, enum zone_stat_item item) (*p)++; if (unlikely(*p > STAT_THRESHOLD)) { - zone_page_state_add(*p, zone, item); - *p = 0; + zone_page_state_add(*p + STAT_THRESHOLD / 2, zone, item); + *p = -STAT_THRESHOLD / 2; } } @@ -209,8 +209,8 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item) (*p)--; if (unlikely(*p < -STAT_THRESHOLD)) { - zone_page_state_add(*p, zone, item); - *p = 0; + zone_page_state_add(*p - STAT_THRESHOLD / 2, zone, item); + *p = STAT_THRESHOLD /2; } } EXPORT_SYMBOL(__dec_zone_page_state); @@ -239,19 +239,9 @@ EXPORT_SYMBOL(inc_zone_page_state); void dec_zone_page_state(struct page *page, enum zone_stat_item item) { unsigned long flags; - struct zone *zone; - s8 *p; - zone = page_zone(page); local_irq_save(flags); - p = diff_pointer(zone, item); - - (*p)--; - - if (unlikely(*p < -STAT_THRESHOLD)) { - zone_page_state_add(*p, zone, item); - *p = 0; - } + __dec_zone_page_state(page, item); local_irq_restore(flags); } EXPORT_SYMBOL(dec_zone_page_state); -- cgit v1.2.3 From df9ecaba3f152d1ea79f2a5e0b87505e03f47590 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 31 Aug 2006 21:27:35 -0700 Subject: [PATCH] ZVC: Scale thresholds depending on the size of the system The ZVC counter update threshold is currently set to a fixed value of 32. This patch sets up the threshold depending on the number of processors and the sizes of the zones in the system. With the current threshold of 32, I was able to observe slight contention when more than 130-140 processors concurrently updated the counters. The contention vanished when I either increased the threshold to 64 or used Andrew's idea of overstepping the interval (see ZVC overstep patch). However, we saw contention again at 220-230 processors. So we need higher values for larger systems. But the current default is already a bit of an overkill for smaller systems. Some systems have tiny zones where precision matters. For example i386 and x86_64 have 16M DMA zones and either 900M ZONE_NORMAL or ZONE_DMA32. These are even present on SMP and NUMA systems. The patch here sets up a threshold based on the number of processors in the system and the size of the zone that these counters are used for. The threshold should grow logarithmically, so we use fls() as an easy approximation. Results of tests on a system with 1024 processors (4TB RAM) The following output is from a test allocating 1GB of memory concurrently on each processor (Forking the process. So contention on mmap_sem and the pte locks is not a factor): X MIN TYPE: CPUS WALL WALL SYS USER TOTCPU fork 1 0.552 0.552 0.540 0.012 0.552 fork 4 0.552 0.548 2.164 0.036 2.200 fork 16 0.564 0.548 8.812 0.164 8.976 fork 128 0.580 0.572 72.204 1.208 73.412 fork 256 1.300 0.660 310.400 2.160 312.560 fork 512 3.512 0.696 1526.836 4.816 1531.652 fork 1020 20.024 0.700 17243.176 6.688 17249.863 So a threshold of 32 is fine up to 128 processors. At 256 processors contention becomes a factor. 
Overstepping the counter (earlier patch) improves the numbers a bit: fork 4 0.552 0.548 2.164 0.040 2.204 fork 16 0.552 0.548 8.640 0.148 8.788 fork 128 0.556 0.548 69.676 0.956 70.632 fork 256 0.876 0.636 212.468 2.108 214.576 fork 512 2.276 0.672 997.324 4.260 1001.584 fork 1020 13.564 0.680 11586.436 6.088 11592.523 Still contention at 512 and 1020. Contention at 1020 is down by a third. 256 still has a slight bit of contention. After this patch the counter threshold will be set to 125 which reduces contention significantly: fork 128 0.560 0.548 69.776 0.932 70.708 fork 256 0.636 0.556 143.460 2.036 145.496 fork 512 0.640 0.548 284.244 4.236 288.480 fork 1020 1.500 0.588 1326.152 8.892 1335.044 [akpm@osdl.org: !SMP build fix] Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 119 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 3799a0f7543..c1b5f4106b3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -12,6 +12,7 @@ #include #include #include +#include void __get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free, struct pglist_data *pgdat) @@ -114,17 +115,72 @@ EXPORT_SYMBOL(vm_stat); #ifdef CONFIG_SMP -#define STAT_THRESHOLD 32 +static int calculate_threshold(struct zone *zone) +{ + int threshold; + int mem; /* memory in 128 MB units */ + + /* + * The threshold scales with the number of processors and the amount + * of memory per zone. More memory means that we can defer updates for + * longer, more processors could lead to more contention. + * fls() is used to have a cheap way of logarithmic scaling. + * + * Some sample thresholds: + * + * Threshold Processors (fls) Zonesize fls(mem+1) + * ------------------------------------------------------------------ + * 8 1 1 0.9-1 GB 4 + * 16 2 2 0.9-1 GB 4 + * 20 2 2 1-2 GB 5 + * 24 2 2 2-4 GB 6 + * 28 2 2 4-8 GB 7 + * 32 2 2 8-16 GB 8 + * 4 2 2 <128M 1 + * 30 4 3 2-4 GB 5 + * 48 4 3 8-16 GB 8 + * 32 8 4 1-2 GB 4 + * 32 8 4 0.9-1GB 4 + * 10 16 5 <128M 1 + * 40 16 5 900M 4 + * 70 64 7 2-4 GB 5 + * 84 64 7 4-8 GB 6 + * 108 512 9 4-8 GB 6 + * 125 1024 10 8-16 GB 8 + * 125 1024 10 16-32 GB 9 + */ + + mem = zone->present_pages >> (27 - PAGE_SHIFT); + + threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); + + /* + * Maximum threshold is 125 + */ + threshold = min(125, threshold); + + return threshold; +} /* - * Determine pointer to currently valid differential byte given a zone and - * the item number. - * - * Preemption must be off + * Refresh the thresholds for each zone. 
*/ -static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item) +static void refresh_zone_stat_thresholds(void) { - return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item]; + struct zone *zone; + int cpu; + int threshold; + + for_each_zone(zone) { + + if (!zone->present_pages) + continue; + + threshold = calculate_threshold(zone); + + for_each_online_cpu(cpu) + zone_pcp(zone, cpu)->stat_threshold = threshold; + } } /* @@ -133,17 +189,16 @@ static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item) void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) { - s8 *p; + struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + s8 *p = pcp->vm_stat_diff + item; long x; - p = diff_pointer(zone, item); x = delta + *p; - if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) { + if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { zone_page_state_add(x, zone, item); x = 0; } - *p = x; } EXPORT_SYMBOL(__mod_zone_page_state); @@ -172,10 +227,12 @@ EXPORT_SYMBOL(mod_zone_page_state); * No overflow check is necessary and therefore the differential can be * incremented or decremented in place which may allow the compilers to * generate better code. - * * The increment or decrement is known and therefore one boundary check can * be omitted. * + * NOTE: These functions are very performance sensitive. Change only + * with care. + * * Some processors have inc/dec instructions that are atomic vs an interrupt. * However, the code must first determine the differential location in a zone * based on the processor number and then inc/dec the counter. There is no @@ -185,13 +242,16 @@ EXPORT_SYMBOL(mod_zone_page_state); */ static void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - s8 *p = diff_pointer(zone, item); + struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + s8 *p = pcp->vm_stat_diff + item; (*p)++; - if (unlikely(*p > STAT_THRESHOLD)) { - zone_page_state_add(*p + STAT_THRESHOLD / 2, zone, item); - *p = -STAT_THRESHOLD / 2; + if (unlikely(*p > pcp->stat_threshold)) { + int overstep = pcp->stat_threshold / 2; + + zone_page_state_add(*p + overstep, zone, item); + *p = -overstep; } } @@ -204,13 +264,16 @@ EXPORT_SYMBOL(__inc_zone_page_state); void __dec_zone_page_state(struct page *page, enum zone_stat_item item) { struct zone *zone = page_zone(page); - s8 *p = diff_pointer(zone, item); + struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + s8 *p = pcp->vm_stat_diff + item; (*p)--; - if (unlikely(*p < -STAT_THRESHOLD)) { - zone_page_state_add(*p - STAT_THRESHOLD / 2, zone, item); - *p = STAT_THRESHOLD /2; + if (unlikely(*p < - pcp->stat_threshold)) { + int overstep = pcp->stat_threshold / 2; + + zone_page_state_add(*p - overstep, zone, item); + *p = overstep; } } EXPORT_SYMBOL(__dec_zone_page_state); @@ -515,6 +578,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg) pageset->pcp[j].high, pageset->pcp[j].batch); } +#ifdef CONFIG_SMP + seq_printf(m, "\n vm stats threshold: %d", + pageset->stat_threshold); +#endif } seq_printf(m, "\n all_unreclaimable: %u" @@ -603,3 +670,35 @@ struct seq_operations vmstat_op = { #endif /* CONFIG_PROC_FS */ +#ifdef CONFIG_SMP +/* + * Use the cpu notifier to insure that the thresholds are recalculated + * when necessary. 
+ */ +static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_CANCELED: + case CPU_DEAD: + refresh_zone_stat_thresholds(); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata vmstat_notifier = + { &vmstat_cpuup_callback, NULL, 0 }; + +int __init setup_vmstat(void) +{ + refresh_zone_stat_thresholds(); + register_cpu_notifier(&vmstat_notifier); + return 0; +} +module_init(setup_vmstat) +#endif -- cgit v1.2.3 From 0b1d647a02c5a1b67d45287eeb6cb3b2219c41c3 Mon Sep 17 00:00:00 2001 From: Pavel Mironchik Date: Thu, 31 Aug 2006 21:27:47 -0700 Subject: [PATCH] dm: work around mempool_alloc, bio_alloc_bioset deadlocks This patch works around a complex dm-related deadlock/livelock down in the mempool allocator. Alasdair said: Several dm targets suffer from this. Mempools are not yet used correctly everywhere in device-mapper: they can get shared when devices are stacked, and some targets share them across multiple instances. I made fixing this one of the prerequisites for this patch: md-dm-reduce-stack-usage-with-stacked-block-devices.patch which in some cases makes people more likely to hit the problem. There's been some progress on this recently with (unfinished) dm-crypt patches at: http://www.kernel.org/pub/linux/kernel/people/agk/patches/2.6/editing/ (dm-crypt-move-io-to-workqueue.patch plus dependencies) and: I've no problems with a temporary workaround like that, but Milan Broz (a new Redhat developer in the Czech Republic) has started reviewing all the mempool usage in device-mapper so I'm expecting we'll soon have a proper fix for this associated problems. [He's back from holiday at the start of next week.] For now, this sad-but-safe little patch will allow the machine to recover. [akpm@osdl.org: rewrote changelog] Cc: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mempool.c b/mm/mempool.c index fe6e05289cc..ccd8cb8cd41 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -238,8 +238,13 @@ repeat_alloc: init_wait(&wait); prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); smp_mb(); - if (!pool->curr_nr) - io_schedule(); + if (!pool->curr_nr) { + /* + * FIXME: this should be io_schedule(). The timeout is there + * as a workaround for some DM problems in 2.6.18. + */ + io_schedule_timeout(5*HZ); + } finish_wait(&pool->wait, &wait); goto repeat_alloc; -- cgit v1.2.3 From 3b98b087fc2daab67518d2baa8aef19a6ad82723 Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Thu, 31 Aug 2006 21:27:53 -0700 Subject: [PATCH] fix NUMA interleaving for huge pages Since vma->vm_pgoff is in units of smallpages, VMAs for huge pages have the lower HPAGE_SHIFT - PAGE_SHIFT bits always cleared, which results in badd offsets to the interleave functions. Take this difference from small pages into account when calculating the offset. This does add a 0-bit shift into the small-page path (via alloc_page_vma()), but I think that is negligible. Also add a BUG_ON to prevent the offset from growing due to a negative right-shift, which probably shouldn't be allowed anyways. Tested on an 8-memory node ppc64 NUMA box and got the interleaving I expected. 
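The arithmetic is easy to check by hand. With 4 KiB base pages and 16 MiB huge pages (shift = 24), vm_pgoff counts small pages, so consecutive huge pages in one mapping differ by 4096 in vm_pgoff and the low 12 bits are always zero; using vm_pgoff directly as the interleave offset therefore keeps landing on the same node. Shifting off those always-zero bits restores consecutive indices. Illustrative numbers only, with a hypothetical four-node round-robin and the (addr - vm_start) term from the real code omitted:

	#include <stdio.h>

	int main(void)
	{
		const int page_shift = 12, hpage_shift = 24, nnodes = 4;
		unsigned long hp;

		for (hp = 0; hp < 3; hp++) {	/* three consecutive huge pages */
			unsigned long vm_pgoff = hp << (hpage_shift - page_shift);
			unsigned long bad  = vm_pgoff;					/* pre-patch  */
			unsigned long good = vm_pgoff >> (hpage_shift - page_shift);	/* post-patch */
			printf("hugepage %lu: old off=%lu -> node %lu, new off=%lu -> node %lu\n",
			       hp, bad, bad % nnodes, good, good % nnodes);
		}
		return 0;
	}

With the old offset every huge page lands on node 0 (4096 is a multiple of 4); with the shifted offset they rotate through nodes 0, 1, 2 as intended.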
Signed-off-by: Nishanth Aravamudan Signed-off-by: Adam Litke Cc: Andi Kleen Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e07e27e846a..a9963ceddd6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1176,7 +1176,15 @@ static inline unsigned interleave_nid(struct mempolicy *pol, if (vma) { unsigned long off; - off = vma->vm_pgoff; + /* + * for small pages, there is no difference between + * shift and PAGE_SHIFT, so the bit-shift is safe. + * for huge pages, since vm_pgoff is in units of small + * pages, we need to shift off the always 0 bits to get + * a useful offset. + */ + BUG_ON(shift < PAGE_SHIFT); + off = vma->vm_pgoff >> (shift - PAGE_SHIFT); off += (addr - vma->vm_start) >> shift; return offset_il_node(pol, vma, off); } else -- cgit v1.2.3 From 3a459756810912d2c2bf188cef566af255936b4d Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Thu, 7 Sep 2006 14:17:04 +0400 Subject: [PATCH] IA64,sparc: local DoS with corrupted ELFs This prevents cross-region mappings on IA64 and SPARC which could lead to system crash. They were correctly trapped for normal mmap() calls, but not for the kernel internal calls generated by executable loading. This code just moves the architecture-specific cross-region checks into an arch-specific "arch_mmap_check()" macro, and defines that for the architectures that needed it (ia64, sparc and sparc64). Architectures that don't have any special requirements can just ignore the new cross-region check, since the mmap() code will just notice on its own when the macro isn't defined. Signed-off-by: Pavel Emelianov Signed-off-by: Kirill Korotaev Acked-by: David Miller Signed-off-by: Greg Kroah-Hartman [ Cleaned up to not affect architectures that don't need it ] Signed-off-by: Linus Torvalds --- mm/mmap.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index c1868ecdbc5..e66a0b524af 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -30,6 +30,10 @@ #include #include +#ifndef arch_mmap_check +#define arch_mmap_check(addr, len, flags) (0) +#endif + static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); @@ -913,6 +917,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, if (!len) return -EINVAL; + error = arch_mmap_check(addr, len, flags); + if (error) + return error; + /* Careful about overflows.. */ len = PAGE_ALIGN(len); if (!len || len > TASK_SIZE) @@ -1859,6 +1867,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) unsigned long flags; struct rb_node ** rb_link, * rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; + int error; len = PAGE_ALIGN(len); if (!len) @@ -1867,6 +1876,12 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if ((addr + len) > TASK_SIZE || (addr + len) < addr) return -EINVAL; + flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + + error = arch_mmap_check(addr, len, flags); + if (error) + return error; + /* * mlock MCL_FUTURE? */ @@ -1907,8 +1922,6 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if (security_vm_enough_memory(len >> PAGE_SHIFT)) return -ENOMEM; - flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - /* Can we just expand an old private anonymous mapping? 
*/ if (vma_merge(mm, prev, addr, addr + len, flags, NULL, NULL, pgoff, NULL)) -- cgit v1.2.3 From 016eb4a0ed06a3677d67a584da901f0e9a63c666 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 8 Sep 2006 09:48:38 -0700 Subject: [PATCH] invalidate_complete_page() race fix If a CPU faults this page into pagetables after invalidate_mapping_pages() checked page_mapped(), invalidate_complete_page() will still proceed to remove the page from pagecache. This leaves the page-faulting process with a detached page. If it was MAP_SHARED then file data loss will ensue. Fix that up by checking the page's refcount after taking tree_lock. Cc: Nick Piggin Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index cf1b015df4a..c6ab55ec688 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -68,10 +68,10 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) return 0; write_lock_irq(&mapping->tree_lock); - if (PageDirty(page)) { - write_unlock_irq(&mapping->tree_lock); - return 0; - } + if (PageDirty(page)) + goto failed; + if (page_count(page) != 2) /* caller's ref + pagecache ref */ + goto failed; BUG_ON(PagePrivate(page)); __remove_from_page_cache(page); @@ -79,6 +79,9 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) ClearPageUptodate(page); page_cache_release(page); /* pagecache ref */ return 1; +failed: + write_unlock_irq(&mapping->tree_lock); + return 0; } /** -- cgit v1.2.3
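The pattern in this last fix is worth spelling out: a lockless precondition check (page_mapped() in invalidate_mapping_pages()) must be revalidated under the lock that serializes the state change, here by insisting on exactly two references, the caller's and the page cache's, after tree_lock is taken. A userspace sketch of the same discipline with a plain mutex and an ordinary counter (names are illustrative, not kernel API):

	#include <pthread.h>
	#include <stdio.h>

	struct cached_page {
		pthread_mutex_t lock;	/* plays the role of mapping->tree_lock */
		int refcount;		/* 2 == caller's ref + cache's ref      */
		int in_cache;
	};

	/* Return 1 if the page was dropped from the cache, 0 if another user
	 * raced in after whatever lockless checks the caller did earlier. */
	static int try_invalidate(struct cached_page *p)
	{
		int ret = 0;

		pthread_mutex_lock(&p->lock);
		if (p->refcount == 2) {		/* revalidate under the lock  */
			p->in_cache = 0;
			p->refcount--;		/* drop the cache's reference */
			ret = 1;
		}
		pthread_mutex_unlock(&p->lock);
		return ret;
	}

	int main(void)
	{
		struct cached_page page = { PTHREAD_MUTEX_INITIALIZER, 3, 1 };

		printf("extra ref held:    removed=%d\n", try_invalidate(&page));	/* 0 */
		page.refcount = 2;
		printf("cache+caller only: removed=%d\n", try_invalidate(&page));	/* 1 */
		return 0;
	}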