From f481891fdc49d3d1b8a9674a1825d183069a805f Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 19 Nov 2008 15:36:30 -0800
Subject: cpuset: update top cpuset's mems after adding a node

After adding a node into the machine, top cpuset's mems isn't updated.

By reviewing the code, we found that the update function

  cpuset_track_online_nodes()

was invoked after node_states[N_ONLINE] changes.  It is wrong because
N_ONLINE just means node has pgdat, and if node has/added memory, we use
N_HIGH_MEMORY.  So, We should invoke the update function after
node_states[N_HIGH_MEMORY] changes, just like its commit says.

This patch fixes it.  And we use notifier of memory hotplug instead of
direct calling of cpuset_track_online_nodes().

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Acked-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Paul Menage <menage@google.com
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory_hotplug.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6837a101437..b5b2b15085a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -22,7 +22,6 @@
 #include <linux/highmem.h>
 #include <linux/vmalloc.h>
 #include <linux/ioport.h>
-#include <linux/cpuset.h>
 #include <linux/delay.h>
 #include <linux/migrate.h>
 #include <linux/page-isolation.h>
@@ -498,8 +497,6 @@ int add_memory(int nid, u64 start, u64 size)
 	/* we online node here. we can't roll back from here. */
 	node_set_online(nid);
 
-	cpuset_track_online_nodes();
-
 	if (new_pgdat) {
 		ret = register_one_node(nid);
 		/*
-- 
cgit v1.2.3


From f011c2dae6cffc50ef67d9bd937b488ba5db8913 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 19 Nov 2008 15:36:32 -0800
Subject: mm: vmalloc allocator off by one

Fix off by one bug in the KVA allocator that can leave gaps in the address
space.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmalloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ba6b0f5f7fa..46aab4dbf61 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -362,7 +362,7 @@ retry:
 				goto found;
 		}
 
-		while (addr + size >= first->va_start && addr + size <= vend) {
+		while (addr + size > first->va_start && addr + size <= vend) {
 			addr = ALIGN(first->va_end + PAGE_SIZE, align);
 
 			n = rb_next(&first->rb_node);
-- 
cgit v1.2.3


From 496850e5f5a372029ceb2b35c811770a9bb073b6 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 19 Nov 2008 15:36:33 -0800
Subject: mm: vmalloc failure flush fix

An initial vmalloc failure should start off a synchronous flush of lazy
areas, in case someone is in progress flushing them already, which could
cause us to return an allocation failure even if there is plenty of KVA
free.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmalloc.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 46aab4dbf61..04f5e320e74 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -521,6 +521,17 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
 	spin_unlock(&purge_lock);
 }
 
+/*
+ * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
+ * is already purging.
+ */
+static void try_purge_vmap_area_lazy(void)
+{
+	unsigned long start = ULONG_MAX, end = 0;
+
+	__purge_vmap_area_lazy(&start, &end, 0, 0);
+}
+
 /*
  * Kick off a purge of the outstanding lazy areas.
  */
@@ -528,7 +539,7 @@ static void purge_vmap_area_lazy(void)
 {
 	unsigned long start = ULONG_MAX, end = 0;
 
-	__purge_vmap_area_lazy(&start, &end, 0, 0);
+	__purge_vmap_area_lazy(&start, &end, 1, 0);
 }
 
 /*
@@ -539,7 +550,7 @@ static void free_unmap_vmap_area(struct vmap_area *va)
 	va->flags |= VM_LAZY_FREE;
 	atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
 	if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
-		purge_vmap_area_lazy();
+		try_purge_vmap_area_lazy();
 }
 
 static struct vmap_area *find_vmap_area(unsigned long addr)
-- 
cgit v1.2.3


From 0ae15132a4f5c758a6ffcde74495641dc3f62ba1 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Wed, 19 Nov 2008 15:36:33 -0800
Subject: mm: vmalloc search restart fix

Current vmalloc restart search for a free area in case we can't find one.
The reason is there are areas which are lazily freed, and could be
possibly freed now.  However, current implementation start searching the
tree from the last failing address, which is pretty much by definition at
the end of address space.  So, we fail.

The proposal of this patch is to restart the search from the beginning of
the requested vstart address.  This fixes the regression in running KVM
virtual machines for me, described in http://lkml.org/lkml/2008/10/28/349,
caused by commit db64fe02258f1507e13fe5212a989922323685ce.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmalloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 04f5e320e74..30f826d484f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -324,14 +324,14 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 
 	BUG_ON(size & ~PAGE_MASK);
 
-	addr = ALIGN(vstart, align);
-
 	va = kmalloc_node(sizeof(struct vmap_area),
 			gfp_mask & GFP_RECLAIM_MASK, node);
 	if (unlikely(!va))
 		return ERR_PTR(-ENOMEM);
 
 retry:
+	addr = ALIGN(vstart, align);
+
 	spin_lock(&vmap_area_lock);
 	/* XXX: could have a last_hole cache */
 	n = vmap_area_root.rb_node;
-- 
cgit v1.2.3


From bda8550deed96687f29992d711a88ea21cff4d26 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 19 Nov 2008 15:36:36 -0800
Subject: migration: fix writepage error

Page migration's writeout() has got understandably confused by the nasty
AOP_WRITEPAGE_ACTIVATE case: as in normal success, a writepage() error has
unlocked the page, so writeout() then needs to relock it.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/migrate.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/migrate.c b/mm/migrate.c
index 385db89f0c3..1e0d6b237f4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -522,15 +522,12 @@ static int writeout(struct address_space *mapping, struct page *page)
 	remove_migration_ptes(page, page);
 
 	rc = mapping->a_ops->writepage(page, &wbc);
-	if (rc < 0)
-		/* I/O Error writing */
-		return -EIO;
 
 	if (rc != AOP_WRITEPAGE_ACTIVATE)
 		/* unlocked. Relock */
 		lock_page(page);
 
-	return -EAGAIN;
+	return (rc < 0) ? -EIO : -EAGAIN;
 }
 
 /*
-- 
cgit v1.2.3


From 63eb6b93ce725e4c5f38fc85dd703d49465b03cb Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 19 Nov 2008 15:36:37 -0800
Subject: vmscan: let GFP_NOFS go to swap again

In the past, GFP_NOFS (but of course not GFP_NOIO) was allowed to reclaim
by writing to swap.  That got partially broken in 2.6.23, when may_enter_fs
initialization was moved up before the allocation of swap, so its
PageSwapCache test was failing the first time around,

Fix it by setting may_enter_fs when add_to_swap() succeeds with
__GFP_IO.  In fact, check __GFP_IO before calling add_to_swap():
allocating swap we're not ready to use just increases disk seeking.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index c141b3e7807..f83a7ed5c6c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -623,6 +623,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * Try to allocate it some swap space here.
 		 */
 		if (PageAnon(page) && !PageSwapCache(page)) {
+			if (!(sc->gfp_mask & __GFP_IO))
+				goto keep_locked;
 			switch (try_to_munlock(page)) {
 			case SWAP_FAIL:		/* shouldn't happen */
 			case SWAP_AGAIN:
@@ -634,6 +636,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			}
 			if (!add_to_swap(page, GFP_ATOMIC))
 				goto activate_locked;
+			may_enter_fs = 1;
 		}
 #endif /* CONFIG_SWAP */
 
-- 
cgit v1.2.3


From 00d8089c54867053a5aae062b765f257ca419e27 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Wed, 19 Nov 2008 15:36:44 -0800
Subject: vmscan: fix get_scan_ratio() comment

Fix the old comment on the scan ratio calculations.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index f83a7ed5c6c..7ea1440b53d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1389,9 +1389,9 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 	file_prio = 200 - sc->swappiness;
 
 	/*
-	 *                  anon       recent_rotated[0]
-	 * %anon = 100 * ----------- / ----------------- * IO cost
-	 *               anon + file      rotate_sum
+	 * The amount of pressure on anon vs file pages is inversely
+	 * proportional to the fraction of recently scanned pages on
+	 * each list that were recently referenced and in active use.
 	 */
 	ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1);
 	ap /= zone->recent_rotated[0] + 1;
-- 
cgit v1.2.3


From 31168481c32c8a485e1003af9433124dede57f8d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Sat, 22 Nov 2008 17:33:24 +0000
Subject: meminit section warnings

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory_hotplug.c |  9 +++++----
 mm/page_cgroup.c    | 13 +++++++------
 mm/sparse.c         |  2 +-
 3 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b5b2b15085a..b1737118546 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -189,7 +189,7 @@ static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
 					pgdat->node_start_pfn;
 }
 
-static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
+static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
 	int nr_pages = PAGES_PER_SECTION;
@@ -216,7 +216,7 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 	return 0;
 }
 
-static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
+static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn)
 {
 	int nr_pages = PAGES_PER_SECTION;
 	int ret;
@@ -273,7 +273,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
  * call this function after deciding the zone to which to
  * add the new pages.
  */
-int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
+int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn,
 		 unsigned long nr_pages)
 {
 	unsigned long i;
@@ -470,7 +470,8 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
 }
 
 
-int add_memory(int nid, u64 start, u64 size)
+/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
+int __ref add_memory(int nid, u64 start, u64 size)
 {
 	pg_data_t *pgdat = NULL;
 	int new_pgdat = 0;
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 1223d927904..436c00229e7 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -21,7 +21,7 @@ static unsigned long total_usage;
 #if !defined(CONFIG_SPARSEMEM)
 
 
-void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
+void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
 {
 	pgdat->node_page_cgroup = NULL;
 }
@@ -97,7 +97,8 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
 	return section->page_cgroup + pfn;
 }
 
-int __meminit init_section_page_cgroup(unsigned long pfn)
+/* __alloc_bootmem...() is protected by !slab_available() */
+int __init_refok init_section_page_cgroup(unsigned long pfn)
 {
 	struct mem_section *section;
 	struct page_cgroup *base, *pc;
@@ -158,7 +159,7 @@ void __free_page_cgroup(unsigned long pfn)
 	}
 }
 
-int online_page_cgroup(unsigned long start_pfn,
+int __meminit online_page_cgroup(unsigned long start_pfn,
 			unsigned long nr_pages,
 			int nid)
 {
@@ -183,7 +184,7 @@ int online_page_cgroup(unsigned long start_pfn,
 	return -ENOMEM;
 }
 
-int offline_page_cgroup(unsigned long start_pfn,
+int __meminit offline_page_cgroup(unsigned long start_pfn,
 		unsigned long nr_pages, int nid)
 {
 	unsigned long start, end, pfn;
@@ -197,7 +198,7 @@ int offline_page_cgroup(unsigned long start_pfn,
 
 }
 
-static int page_cgroup_callback(struct notifier_block *self,
+static int __meminit page_cgroup_callback(struct notifier_block *self,
 			       unsigned long action, void *arg)
 {
 	struct memory_notify *mn = arg;
@@ -248,7 +249,7 @@ void __init page_cgroup_init(void)
 	" want\n");
 }
 
-void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
+void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
 {
 	return;
 }
diff --git a/mm/sparse.c b/mm/sparse.c
index 39db301b920..083f5b63e7a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -570,7 +570,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
  * set.  If this is <=0, then that means that the passed-in
  * map was not consumed and must be freed.
  */
-int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
+int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
 			   int nr_pages)
 {
 	unsigned long section_nr = pfn_to_section_nr(start_pfn);
-- 
cgit v1.2.3


From 2a1dc509747fdcfdf3a2df818a14908aed86c3d4 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@saeurebad.de>
Date: Mon, 1 Dec 2008 03:00:35 +0100
Subject: vmscan: protect zone rotation stats by lru lock

The zone's rotation statistics must not be accessed without the
corresponding LRU lock held.  Fix an unprotected write in
shrink_active_list().

Acked-by: Rik van Riel <riel@redhat.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Johannes Weiner <hannes@saeurebad.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7ea1440b53d..62e7f62fb55 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1248,6 +1248,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		list_add(&page->lru, &l_inactive);
 	}
 
+	spin_lock_irq(&zone->lru_lock);
 	/*
 	 * Count referenced pages from currently used mappings as
 	 * rotated, even though they are moved to the inactive list.
@@ -1263,7 +1264,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 
 	pgmoved = 0;
 	lru = LRU_BASE + file * LRU_FILE;
-	spin_lock_irq(&zone->lru_lock);
 	while (!list_empty(&l_inactive)) {
 		page = lru_to_page(&l_inactive);
 		prefetchw_prev_lru_page(page, &l_inactive, flags);
-- 
cgit v1.2.3


From b29acbdcf877009af3f1fc0750bcac314c51e055 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 1 Dec 2008 13:13:47 -0800
Subject: mm: vmalloc fix lazy unmapping cache aliasing

Jim Radford has reported that the vmap subsystem rewrite was sometimes
causing his VIVT ARM system to behave strangely (seemed like going into
infinite loops trying to fault in pages to userspace).

We determined that the problem was most likely due to a cache aliasing
issue.  flush_cache_vunmap was only being called at the moment the page
tables were to be taken down, however with lazy unmapping, this can happen
after the page has subsequently been freed and allocated for something
else.  The dangling alias may still have dirty data attached to it.

The fix for this problem is to do the cache flushing when the caller has
called vunmap -- it would be a bug for them to write anything else to the
mapping at that point.

That appeared to solve Jim's problems.

Reported-by: Jim Radford <radford@blackbean.org>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmalloc.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 30f826d484f..f3f6e075856 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -77,7 +77,6 @@ static void vunmap_page_range(unsigned long addr, unsigned long end)
 
 	BUG_ON(addr >= end);
 	pgd = pgd_offset_k(addr);
-	flush_cache_vunmap(addr, end);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
@@ -543,9 +542,10 @@ static void purge_vmap_area_lazy(void)
 }
 
 /*
- * Free and unmap a vmap area
+ * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
+ * called for the correct range previously.
  */
-static void free_unmap_vmap_area(struct vmap_area *va)
+static void free_unmap_vmap_area_noflush(struct vmap_area *va)
 {
 	va->flags |= VM_LAZY_FREE;
 	atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
@@ -553,6 +553,15 @@ static void free_unmap_vmap_area(struct vmap_area *va)
 		try_purge_vmap_area_lazy();
 }
 
+/*
+ * Free and unmap a vmap area
+ */
+static void free_unmap_vmap_area(struct vmap_area *va)
+{
+	flush_cache_vunmap(va->va_start, va->va_end);
+	free_unmap_vmap_area_noflush(va);
+}
+
 static struct vmap_area *find_vmap_area(unsigned long addr)
 {
 	struct vmap_area *va;
@@ -734,7 +743,7 @@ static void free_vmap_block(struct vmap_block *vb)
 	spin_unlock(&vmap_block_tree_lock);
 	BUG_ON(tmp != vb);
 
-	free_unmap_vmap_area(vb->va);
+	free_unmap_vmap_area_noflush(vb->va);
 	call_rcu(&vb->rcu_head, rcu_free_vb);
 }
 
@@ -796,6 +805,9 @@ static void vb_free(const void *addr, unsigned long size)
 
 	BUG_ON(size & ~PAGE_MASK);
 	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
+
+	flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
+
 	order = get_order(size);
 
 	offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
-- 
cgit v1.2.3


From dc19f9db38295f811d9041bd89b113beccbd763a Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Mon, 1 Dec 2008 13:13:48 -0800
Subject: memcg: memory hotplug fix for notifier callback

Fixes for memcg/memory hotplug.

While memory hotplug allocate/free memmap, page_cgroup doesn't free
page_cgroup at OFFLINE when page_cgroup is allocated via bootomem.
(Because freeing bootmem requires special care.)

Then, if page_cgroup is allocated by bootmem and memmap is freed/allocated
by memory hotplug, page_cgroup->page == page is no longer true.

But current MEM_ONLINE handler doesn't check it and update
page_cgroup->page if it's not necessary to allocate page_cgroup.  (This
was not found because memmap is not freed if SPARSEMEM_VMEMMAP is y.)

And I noticed that MEM_ONLINE can be called against "part of section".
So, freeing page_cgroup at CANCEL_ONLINE will cause trouble.  (freeing
used page_cgroup) Don't rollback at CANCEL.

One more, current memory hotplug notifier is stopped by slub because it
sets NOTIFY_STOP_MASK to return vaule.  So, page_cgroup's callback never
be called.  (low priority than slub now.)

I think this slub's behavior is not intentional(BUG). and fixes it.

Another way to be considered about page_cgroup allocation:
  - free page_cgroup at OFFLINE even if it's from bootmem
    and remove specieal handler. But it requires more changes.

Addresses http://bugzilla.kernel.org/show_bug.cgi?id=12041

Signed-off-by: KAMEZAWA Hiruyoki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Tested-by: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_cgroup.c | 43 +++++++++++++++++++++++++++++--------------
 mm/slub.c        |  6 ++++--
 2 files changed, 33 insertions(+), 16 deletions(-)

(limited to 'mm')

diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 436c00229e7..0b3cbf090a6 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -107,19 +107,29 @@ int __init_refok init_section_page_cgroup(unsigned long pfn)
 
 	section = __pfn_to_section(pfn);
 
-	if (section->page_cgroup)
-		return 0;
-
-	nid = page_to_nid(pfn_to_page(pfn));
-
-	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
-	if (slab_is_available()) {
-		base = kmalloc_node(table_size, GFP_KERNEL, nid);
-		if (!base)
-			base = vmalloc_node(table_size, nid);
-	} else {
-		base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), table_size,
+	if (!section->page_cgroup) {
+		nid = page_to_nid(pfn_to_page(pfn));
+		table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
+		if (slab_is_available()) {
+			base = kmalloc_node(table_size, GFP_KERNEL, nid);
+			if (!base)
+				base = vmalloc_node(table_size, nid);
+		} else {
+			base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
+				table_size,
 				PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+		}
+	} else {
+		/*
+ 		 * We don't have to allocate page_cgroup again, but
+		 * address of memmap may be changed. So, we have to initialize
+		 * again.
+		 */
+		base = section->page_cgroup + pfn;
+		table_size = 0;
+		/* check address of memmap is changed or not. */
+		if (base->page == pfn_to_page(pfn))
+			return 0;
 	}
 
 	if (!base) {
@@ -208,18 +218,23 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
 		ret = online_page_cgroup(mn->start_pfn,
 				   mn->nr_pages, mn->status_change_nid);
 		break;
-	case MEM_CANCEL_ONLINE:
 	case MEM_OFFLINE:
 		offline_page_cgroup(mn->start_pfn,
 				mn->nr_pages, mn->status_change_nid);
 		break;
+	case MEM_CANCEL_ONLINE:
 	case MEM_GOING_OFFLINE:
 		break;
 	case MEM_ONLINE:
 	case MEM_CANCEL_OFFLINE:
 		break;
 	}
-	ret = notifier_from_errno(ret);
+
+	if (ret)
+		ret = notifier_from_errno(ret);
+	else
+		ret = NOTIFY_OK;
+
 	return ret;
 }
 
diff --git a/mm/slub.c b/mm/slub.c
index 7ad489af956..749588a50a5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2931,8 +2931,10 @@ static int slab_memory_callback(struct notifier_block *self,
 	case MEM_CANCEL_OFFLINE:
 		break;
 	}
-
-	ret = notifier_from_errno(ret);
+	if (ret)
+		ret = notifier_from_errno(ret);
+	else
+		ret = NOTIFY_OK;
 	return ret;
 }
 
-- 
cgit v1.2.3


From f1d0b063d993527754f062c589b73f125024d216 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Tue, 2 Dec 2008 10:31:50 -0800
Subject: bdi: register sysfs bdi device only once per queue

Devices which share the same queue, like floppies and mtd devices, get
registered multiple times in the bdi interface, but bdi accounts only the
last registered device of the devices sharing one queue.

On remove, all earlier registered devices leak, stay around in sysfs, and
cause "duplicate filename" errors if the devices are re-created.

This prevents the creation of multiple bdi interfaces per queue, and the
bdi device will carry the dev_t name of the block device which is the
first one registered, of the pool of devices using the same queue.

[akpm@linux-foundation.org: add a WARN_ON so we know which drivers are misbehaving]
Tested-by: Peter Korsgaard <jacmet@sunsite.dk>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/backing-dev.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'mm')

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f2e574dbc30..2a56124dbc2 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -176,6 +176,9 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 	int ret = 0;
 	struct device *dev;
 
+	if (WARN_ON(bdi->dev))
+		goto exit;
+
 	va_start(args, fmt);
 	dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
 	va_end(args);
-- 
cgit v1.2.3


From 9ff473b9a72942c5ac0ad35607cae28d8d59ed7a Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Tue, 2 Dec 2008 10:31:52 -0800
Subject: vmscan: evict streaming IO first

Count the insertion of new pages in the statistics used to drive the
pageout scanning code.  This should help the kernel quickly evict
streaming file IO.

We count on the fact that new file pages start on the inactive file LRU
and new anonymous pages start on the active anon list.  This means
streaming file IO will increment the recent scanned file statistic, while
leaving the recent rotated file statistic alone, driving pageout scanning
to the file LRUs.

Pageout activity does its own list manipulation.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Tested-by: Gene Heskett <gene.heskett@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swap.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/swap.c b/mm/swap.c
index 2152e48a7b8..2881987603e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -445,6 +445,7 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
 	for (i = 0; i < pagevec_count(pvec); i++) {
 		struct page *page = pvec->pages[i];
 		struct zone *pagezone = page_zone(page);
+		int file;
 
 		if (pagezone != zone) {
 			if (zone)
@@ -456,8 +457,12 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
 		VM_BUG_ON(PageUnevictable(page));
 		VM_BUG_ON(PageLRU(page));
 		SetPageLRU(page);
-		if (is_active_lru(lru))
+		file = is_file_lru(lru);
+		zone->recent_scanned[file]++;
+		if (is_active_lru(lru)) {
 			SetPageActive(page);
+			zone->recent_rotated[file]++;
+		}
 		add_page_to_lru_list(zone, page, lru);
 	}
 	if (zone)
-- 
cgit v1.2.3