From 4ce072f1faf29d24df4600f53db8cdd62d400a8f Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Fri, 29 Sep 2006 01:58:42 -0700 Subject: [PATCH] mm: fix a race condition under SMC + COW The failing context is a multithreaded process, and the failing sequence is as follows. One thread T0 is doing self-modifying code on page X on processor P0, and another thread T1 is doing COW (breaking the COW setup as part of a fork() that just happened in another thread T2) on the same page X on processor P1. T0 doing SMC can end up modifying the new page Y (allocated by T1 doing COW on P1), but because of the separate I/D TLBs, the P0 ITLB will not see the new mapping until the TLB flush IPI from P1 is received. During this interval, if T0 executes the code created by SMC it can result in an app error (as the ITLB still points to the old page X, and T0 ends up executing the content in page X rather than the content in page Y). Fix this issue by first clearing the PTE and flushing it, before updating it with the new entry. Hugh sayeth: I was a bit sceptical, in the habit of thinking that Self Modifying Code must look out for such issues itself: but I guess there's nothing it can do to avoid this one. Fair enough, what you're changing it to is pretty much what powerpc and s390 were already doing, and is a more robust way of proceeding, consistent with how ptes are set everywhere else. The ptep_clear_flush is a bit heavy-handed (it's anxious to return the pte that was atomically cleared), but we'd have to wander through lots of arches to get the right minimal behaviour. It'd also be nice to eliminate ptep_establish completely, now only used to define other macros/inlines: it always seemed obfuscation to me, what you've got there now is clearer. Let's put those cleanups on a TODO list. Signed-off-by: Suresh Siddha Acked-by: "David S. Miller" Acked-by: Hugh Dickins Cc: Nick Piggin Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 601159a46ab..160f5b503ea 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1577,7 +1577,14 @@ gotten: entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); lazy_mmu_prot_update(entry); - ptep_establish(vma, address, page_table, entry); + /* + * Clear the pte entry and flush it first, before updating the + * pte with the new entry. This will avoid a race condition + * seen in the presence of one thread doing SMC and another + * thread doing COW. + */ + ptep_clear_flush(vma, address, page_table); + set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); lru_cache_add_active(new_page); page_add_new_anon_rmap(new_page, vma, address); -- cgit v1.2.3 From 79f5acf5d784492afe80723496624093079aed9c Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Fri, 29 Sep 2006 01:58:43 -0700 Subject: [PATCH] mm: make filemap_nopage use NOPAGE_SIGBUS Don't open-code NOPAGE_SIGBUS. Signed-off-by: Adam Litke Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index afcdc72b5e9..3277f3b2352 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1471,7 +1471,7 @@ outside_data_content: * accessible..
*/ if (area->vm_mm == current->mm) - return NULL; + return NOPAGE_SIGBUS; /* Fall through to the non-read-ahead case */ no_cached_page: /* @@ -1496,7 +1496,7 @@ no_cached_page: */ if (error == -ENOMEM) return NOPAGE_OOM; - return NULL; + return NOPAGE_SIGBUS; page_not_uptodate: if (!did_readaround) { @@ -1565,7 +1565,7 @@ page_not_uptodate: */ shrink_readahead_size_eio(file, ra); page_cache_release(page); - return NULL; + return NOPAGE_SIGBUS; } EXPORT_SYMBOL(filemap_nopage); -- cgit v1.2.3 From aa83aa40ed2ae113d9ee5529cdd9e8c0e5fabe61 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Fri, 29 Sep 2006 01:59:51 -0700 Subject: [PATCH] single bit flip detector In cases where we detect that a single bit has been flipped, we spew the usual slab corruption message, which users instantly think is a kernel bug. In a lot of cases, single-bit errors are down to bad memory or other hardware failure. This patch adds an extra line to the slab debug messages in those cases, in the hope that users will try memtest before they report a bug. 000: 6b 6b 6b 6b 6a 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b Single bit error detected. Possibly bad RAM. Run memtest86. [akpm@osdl.org: cleanups] Signed-off-by: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 792bfe320a8..3dbd6f4e747 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1683,10 +1683,32 @@ static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) static void dump_line(char *data, int offset, int limit) { int i; + unsigned char error = 0; + int bad_count = 0; + printk(KERN_ERR "%03x:", offset); - for (i = 0; i < limit; i++) + for (i = 0; i < limit; i++) { + if (data[offset + i] != POISON_FREE) { + error = data[offset + i]; + bad_count++; + } printk(" %02x", (unsigned char)data[offset + i]); + } printk("\n"); + + if (bad_count == 1) { + error ^= POISON_FREE; + if (!(error & (error - 1))) { + printk(KERN_ERR "Single bit error detected. Probably " + "bad RAM.\n"); +#ifdef CONFIG_X86 + printk(KERN_ERR "Run memtest86+ or a similar memory " + "test tool.\n"); +#else + printk(KERN_ERR "Run a memory test tool.\n"); +#endif + } + } } #endif -- cgit v1.2.3 From f400e198b2ed26ce55b22a1412ded0896e7516ac Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Fri, 29 Sep 2006 02:00:07 -0700 Subject: [PATCH] pidspace: is_init() This is an updated version of Eric Biederman's is_init() patch (http://lkml.org/lkml/2006/2/6/280). It applies cleanly to 2.6.18-rc3 and replaces a few more instances of ->pid == 1 with is_init(). Further, is_init() checks the pid and thus removes the dependency on Eric's other patches for now. Eric's original description: There are a lot of places in the kernel where we test for init because we give it special properties. Most significantly, init must not die. This results in code all over the kernel testing ->pid == 1. Introduce is_init() to capture this case. With multiple pid spaces, in all of the affected cases we are looking for only the first process on the system, not some other process that has pid == 1. Signed-off-by: Eric W.
Biederman Signed-off-by: Sukadev Bhattiprolu Cc: Dave Hansen Cc: Serge Hallyn Cc: Cedric Le Goater Acked-by: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index bada3d03119..f3dd79c1c36 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -255,7 +255,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) */ static void __oom_kill_task(struct task_struct *p, const char *message) { - if (p->pid == 1) { + if (is_init(p)) { WARN_ON(1); printk(KERN_WARNING "tried to kill init!\n"); return; -- cgit v1.2.3 From 55a101f8f71a3d3dbda7b5c77083ffe47552f831 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 29 Sep 2006 02:01:10 -0700 Subject: [PATCH] kill PF_DEAD flag After the previous change (->flags & PF_DEAD) <=> (->state == EXIT_DEAD), we don't need PF_DEAD any longer. Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f3dd79c1c36..202f186a753 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -226,8 +226,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || p->flags & PF_EXITING; if (releasing) { - /* PF_DEAD tasks have already released their mm */ - if (p->flags & PF_DEAD) + /* TASK_DEAD tasks have already released their mm */ + if (p->state == EXIT_DEAD) continue; if (p->flags & PF_EXITING && p == current) { chosen = p; -- cgit v1.2.3 From c394cc9fbb367f87faa2228ec2eabacd2d4701c6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 29 Sep 2006 02:01:11 -0700 Subject: [PATCH] introduce TASK_DEAD state I am not sure about this patch; I am asking Ingo to take a decision. task_struct->state == EXIT_DEAD is a very special case; to avoid confusion it makes sense to introduce a new state, TASK_DEAD, while EXIT_DEAD should live only in ->exit_state as documented in sched.h. Note that this state is not visible to user-space: get_task_state() masks off unsuitable states. Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 202f186a753..21f0a7e8e51 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -227,7 +227,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) p->flags & PF_EXITING; if (releasing) { /* TASK_DEAD tasks have already released their mm */ - if (p->state == EXIT_DEAD) + if (p->state == TASK_DEAD) continue; if (p->flags & PF_EXITING && p == current) { chosen = p; -- cgit v1.2.3 From 28324d1df646521256e83389244adcce98e89ff2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 29 Sep 2006 02:01:12 -0700 Subject: [PATCH] select_bad_process(): kill a bogus PF_DEAD/TASK_DEAD check The only usage of TASK_DEAD outside of the last-schedule path is in select_bad_process(): for_each_task(p) { if (!p->mm) continue; ... if (p->state == TASK_DEAD) continue; ... The TASK_DEAD state is set at the end of do_exit(); this means that p->mm was already set to NULL by exit_mm(), so this task was already rejected by the 'if (!p->mm)' check above. Note also that the caller holds tasklist_lock; this means that p can't pass exit_notify() and then set TASK_DEAD while p->mm != NULL.
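For reference, the ordering inside do_exit() that this argument relies on looks roughly like the following (a paraphrased sketch of the exit path, not the exact source of this era):

    /* do_exit(), heavily abridged: */
    exit_mm(tsk);            /* tsk->mm becomes NULL here */
    /* ... */
    exit_notify(tsk);        /* reparenting etc.; takes tasklist_lock */
    /* ... */
    tsk->state = TASK_DEAD;  /* set only at the very end */
    schedule();              /* final schedule, never returns */

So any task observed with state == TASK_DEAD has long since lost its ->mm, and the '!p->mm' test above has already skipped it.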
Also, remove open-coded is_init(). Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 21f0a7e8e51..423dcae323a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -206,11 +206,14 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) unsigned long points; int releasing; - /* skip kernel threads */ + /* + * skip kernel threads and tasks which have already released + * their mm. + */ if (!p->mm) continue; - /* skip the init task with pid == 1 */ - if (p->pid == 1) + /* skip the init task */ + if (is_init(p)) continue; /* @@ -226,9 +229,6 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || p->flags & PF_EXITING; if (releasing) { - /* TASK_DEAD tasks have already released their mm */ - if (p->state == TASK_DEAD) - continue; if (p->flags & PF_EXITING && p == current) { chosen = p; *ppoints = ULONG_MAX; -- cgit v1.2.3 From 972c4ea59c9dbf82647ee9665d9e945241911a51 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 29 Sep 2006 02:01:12 -0700 Subject: [PATCH] select_bad_process(): cleanup 'releasing' check No logic changes, but imho easier to read. Signed-off-by: Oleg Nesterov Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 423dcae323a..991bf0cf477 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -204,7 +204,6 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) do_posix_clock_monotonic_gettime(&uptime); do_each_thread(g, p) { unsigned long points; - int releasing; /* * skip kernel threads and tasks which have already released @@ -226,16 +225,15 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) * the process of exiting and releasing its resources. * Otherwise we could get an OOM deadlock. */ - releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || - p->flags & PF_EXITING; - if (releasing) { - if (p->flags & PF_EXITING && p == current) { - chosen = p; - *ppoints = ULONG_MAX; - break; - } - return ERR_PTR(-1UL); + if ((p->flags & PF_EXITING) && p == current) { + chosen = p; + *ppoints = ULONG_MAX; + break; } + if ((p->flags & PF_EXITING) || + test_tsk_thread_flag(p, TIF_MEMDIE)) + return ERR_PTR(-1UL); + if (p->oomkilladj == OOM_DISABLE) continue; @@ -245,6 +243,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) *ppoints = points; } } while_each_thread(g, p); + return chosen; } -- cgit v1.2.3 From 01017a227044d64face2588fab9427a1da1bdb9f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 29 Sep 2006 02:01:13 -0700 Subject: [PATCH] oom_kill_task(): cleanup ->mm checks - It is not possible to have task->mm == &init_mm. - task_lock() buys nothing for 'if (!p->mm)' check. 
Signed-off-by: Oleg Nesterov Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 991bf0cf477..a5493a3b485 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -260,14 +260,11 @@ static void __oom_kill_task(struct task_struct *p, const char *message) return; } - task_lock(p); - if (!p->mm || p->mm == &init_mm) { + if (!p->mm) { WARN_ON(1); printk(KERN_WARNING "tried to kill an mm-less task!\n"); - task_unlock(p); return; } - task_unlock(p); if (message) { printk(KERN_ERR "%s: Killed process %d (%s).\n", @@ -301,7 +298,7 @@ static int oom_kill_task(struct task_struct *p, const char *message) * However, this is of no concern to us. */ - if (mm == NULL || mm == &init_mm) + if (mm == NULL) return 1; __oom_kill_task(p, message); -- cgit v1.2.3 From b78483a4ba60d5d90930262a533a784e1d9df660 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 29 Sep 2006 02:01:14 -0700 Subject: [PATCH] oom: don't kill current when another OOM in progress A previous patch to allow an exiting task to OOM kill itself (and thereby avoid a little deadlock) introduced a problem. We don't want the PF_EXITING task, even if it is 'current', to access mem reserves if there is already a TIF_MEMDIE process in the system sucking up reserves. Also make the commenting a little bit clearer, and note that our current scheme of effectively single-threading the OOM killer is not itself perfect. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a5493a3b485..20f41b082e1 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -215,6 +215,18 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) if (is_init(p)) continue; + /* + * This task already has access to memory reserves and is + * being killed. Don't allow any other task access to the + * memory reserve. + * + * Note: this may have a chance of deadlock if it gets + * blocked waiting for another task which itself is waiting + * for memory. Is there a better alternative? + */ + if (test_tsk_thread_flag(p, TIF_MEMDIE)) + return ERR_PTR(-1UL); + /* * This is in the process of releasing memory so wait for it * to finish before killing some other task by mistake. * * However, if p is the current task, we allow the 'kill' to * go ahead if it is exiting: this will simply set TIF_MEMDIE, * which will allow it to gain access to memory reserves in * the process of exiting and releasing its resources. - * Otherwise we could get an OOM deadlock. + * Otherwise we could get an easy OOM deadlock. */ - if ((p->flags & PF_EXITING) && p == current) { + if (p->flags & PF_EXITING) { + if (p != current) + return ERR_PTR(-1UL); + chosen = p; *ppoints = ULONG_MAX; - break; } - if ((p->flags & PF_EXITING) || - test_tsk_thread_flag(p, TIF_MEMDIE)) - return ERR_PTR(-1UL); if (p->oomkilladj == OOM_DISABLE) continue; -- cgit v1.2.3 From 38837fc75acb7fa9b0e111b0241fe4fe76c5d4b3 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Fri, 29 Sep 2006 02:01:16 -0700 Subject: [PATCH] cpuset: top_cpuset tracks hotplug changes to node_online_map Change the list of memory nodes allowed to tasks in the top (root) cpuset to dynamically track what memory nodes are online, using a call to a cpuset hook from the memory hotplug code.
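For illustration, the hook amounts to refreshing the top cpuset's memory placement from node_online_map; a minimal sketch of the idea (the real body lives in kernel/cpuset.c, and the lock name here is an assumption):

    void cpuset_track_online_nodes(void)
    {
        mutex_lock(&manage_mutex);   /* cpuset modification lock (assumed name) */
        top_cpuset.mems_allowed = node_online_map;
        mutex_unlock(&manage_mutex);
    }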
Make this top 'mems' file read-only. On systems that have cpusets configured in their kernel, but that aren't actively using cpusets (for some distros, this covers the majority of systems), all tasks end up in the top cpuset. If that system does support memory hotplug, then these tasks cannot make use of memory nodes that are added after system boot, because the memory nodes are not allowed in the top cpuset. This is a surprising regression over earlier kernels that didn't have cpusets enabled. One key motivation for this change is to remain consistent with the behaviour for the top_cpuset's 'cpus', which is also read-only, and which automatically tracks the cpu_online_map. This change also has the minor benefit that it fixes a long-standing, little-noticed, minor bug in cpusets. The cpuset performance tweak to short-circuit the cpuset_zone_allowed() check on systems with just a single cpuset (see 'number_of_cpusets', in linux/cpuset.h) meant that simply changing the 'mems' of the top_cpuset had no effect, even though the change (the write system call) appeared to succeed. With the following change, that write to the 'mems' file fails with -EACCES, and the 'mems' file stubbornly refuses to be changed via user space writes. Thus no one should be misled into thinking they've changed the top_cpuset's 'mems' when in effect they haven't. In order to keep the behaviour of cpusets consistent between systems actively making use of them and systems not using them, this patch changes the behaviour of the 'mems' file in the top (root) cpuset, making it read-only, and making it automatically track the value of node_online_map. Thus tasks in the top cpuset will have automatic use of hot-plugged memory nodes allowed by their cpuset. [akpm@osdl.org: build fix] [bunk@stusta.de: build fix] Signed-off-by: Paul Jackson Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c37319542b7..9576ed920c0 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -283,6 +284,8 @@ int add_memory(int nid, u64 start, u64 size) /* we online node here. we can't roll back from here. */ node_set_online(nid); + cpuset_track_online_nodes(); + if (new_pgdat) { ret = register_one_node(nid); /* -- cgit v1.2.3 From 40c99aae23529f3d069ae08836ae46fadb3fd2bd Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Fri, 29 Sep 2006 02:01:24 -0700 Subject: [PATCH] remove static variable mm/page-writeback.c:total_pages page-writeback.c has a file-static variable "total_pages", which is the total number of pages in the system. There is a global variable "vm_total_pages", which is the total number of pages the VM controls. Both are assigned from the return value of nr_free_pagecache_pages(). This patch removes the static variable and uses the global variable in its place. One more issue with the static variable "total_pages" is that it is not updated when new pages are hot-added. Since vm_total_pages is updated when new pages are hot-added, this patch fixes that problem too.
Signed-off-by: Chandra Seetharaman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 555752907dc..efd2705e498 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -46,7 +46,6 @@ */ static long ratelimit_pages = 32; -static long total_pages; /* The total number of pages in the machine. */ static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */ /* @@ -126,7 +125,7 @@ get_dirty_limits(long *pbackground, long *pdirty, int unmapped_ratio; long background; long dirty; - unsigned long available_memory = total_pages; + unsigned long available_memory = vm_total_pages; struct task_struct *tsk; #ifdef CONFIG_HIGHMEM @@ -141,7 +140,7 @@ get_dirty_limits(long *pbackground, long *pdirty, unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) + global_page_state(NR_ANON_PAGES)) * 100) / - total_pages; + vm_total_pages; dirty_ratio = vm_dirty_ratio; if (dirty_ratio > unmapped_ratio / 2) @@ -504,7 +503,7 @@ void laptop_sync_completion(void) static void set_ratelimit(void) { - ratelimit_pages = total_pages / (num_online_cpus() * 32); + ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16; if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) @@ -533,9 +532,7 @@ void __init page_writeback_init(void) long buffer_pages = nr_free_buffer_pages(); long correction; - total_pages = nr_free_pagecache_pages(); - - correction = (100 * 4 * buffer_pages) / total_pages; + correction = (100 * 4 * buffer_pages) / vm_total_pages; if (correction < 100) { dirty_background_ratio *= correction; -- cgit v1.2.3 From 2d1d43f6a43b703587e759145f69467e7c6553a7 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Fri, 29 Sep 2006 02:01:25 -0700 Subject: [PATCH] call mm/page-writeback.c:set_ratelimit() when new pages are hot-added ratelimit_pages in page-writeback.c is recalculated (in set_ratelimit()) every time a CPU is hot-added/removed. But this value is not recalculated when new pages are hot-added. This patch fixes that problem by calling set_ratelimit() when new pages are hot-added. [akpm@osdl.org: cleanups] Signed-off-by: Chandra Seetharaman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 ++ mm/page-writeback.c | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9576ed920c0..2053bb165a2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -192,6 +193,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) if (need_zonelists_rebuild) build_all_zonelists(); vm_total_pages = nr_free_pagecache_pages(); + writeback_set_ratelimit(); return 0; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index efd2705e498..488b7088557 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -501,7 +501,7 @@ void laptop_sync_completion(void) * will write six megabyte chunks, max. 
*/ -static void set_ratelimit(void) +void writeback_set_ratelimit(void) { ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); if (ratelimit_pages < 16) @@ -513,7 +513,7 @@ static void set_ratelimit(void) static int __cpuinit ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) { - set_ratelimit(); + writeback_set_ratelimit(); return 0; } @@ -546,7 +546,7 @@ void __init page_writeback_init(void) vm_dirty_ratio = 1; } mod_timer(&wb_timer, jiffies + dirty_writeback_interval); - set_ratelimit(); + writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); } -- cgit v1.2.3 From 3f9e7949f86dfe2bd9a1ad0604f78e7683c059de Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2006 02:01:26 -0700 Subject: [PATCH] valid_swaphandles() fix akpm draws my attention to the fact that sysctl(VM_PAGE_CLUSTER) might conceivably change page_cluster to 0 while valid_swaphandles() is in the middle of using it, leading to an embarrassingly long loop: take a local snapshot of page_cluster and work with that. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index f1f5ec78378..a15def63f28 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1723,13 +1723,14 @@ get_swap_info_struct(unsigned type) */ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) { - int ret = 0, i = 1 << page_cluster; + int our_page_cluster = page_cluster; + int ret = 0, i = 1 << our_page_cluster; unsigned long toff; struct swap_info_struct *swapdev = swp_type(entry) + swap_info; - if (!page_cluster) /* no readahead */ + if (!our_page_cluster) /* no readahead */ return 0; - toff = (swp_offset(entry) >> page_cluster) << page_cluster; + toff = (swp_offset(entry) >> our_page_cluster) << our_page_cluster; if (!toff) /* first page is swap header */ toff++, i--; *offset = toff; -- cgit v1.2.3 From 39f0247d3823e4e0bf8f6838a10362864b1e1053 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 29 Sep 2006 02:01:35 -0700 Subject: [PATCH] Access Control Lists for tmpfs Add access control lists for tmpfs. 
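The new handlers hang off sb->s_xattr, and the generic xattr entry points pick a handler by matching the attribute name against each handler's prefix (for the ACL handlers the "prefix" is the full name, so the handler sees an empty remainder). A standalone C sketch of that dispatch idea, as a userspace model for illustration only, not the kernel code:

    #include <stdio.h>
    #include <string.h>

    struct xattr_handler {
        const char *prefix;
        void (*get)(const char *rest);
    };

    static void get_acl_access(const char *rest)  { printf("posix_acl_access handler, rest=\"%s\"\n", rest); }
    static void get_acl_default(const char *rest) { printf("posix_acl_default handler, rest=\"%s\"\n", rest); }
    static void get_security(const char *rest)    { printf("security handler, suffix=\"%s\"\n", rest); }

    /* NULL-terminated table, like the shmem_xattr_handlers array below */
    static const struct xattr_handler handlers[] = {
        { "system.posix_acl_access",  get_acl_access  },
        { "system.posix_acl_default", get_acl_default },
        { "security.",                get_security    },
        { NULL,                       NULL            },
    };

    static void dispatch(const char *name)
    {
        const struct xattr_handler *h;

        for (h = handlers; h->prefix; h++) {
            size_t n = strlen(h->prefix);
            if (strncmp(name, h->prefix, n) == 0) {
                h->get(name + n);   /* handler sees the name minus its prefix */
                return;
            }
        }
        printf("no handler for \"%s\"\n", name);
    }

    int main(void)
    {
        dispatch("system.posix_acl_access");
        dispatch("security.selinux");
        dispatch("user.foo");
        return 0;
    }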
Signed-off-by: Andreas Gruenbacher Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 1 + mm/shmem.c | 99 ++++++++++++++++++++++++++++- mm/shmem_acl.c | 197 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 295 insertions(+), 2 deletions(-) create mode 100644 mm/shmem_acl.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index 60c56c0b5e1..6200c6d6afd 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SHMEM) += shmem.o +obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_SLAB) += slab.o diff --git a/mm/shmem.c b/mm/shmem.c index eda907c3a86..b96de69f236 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include #include #include @@ -177,6 +179,7 @@ static const struct address_space_operations shmem_aops; static struct file_operations shmem_file_operations; static struct inode_operations shmem_inode_operations; static struct inode_operations shmem_dir_inode_operations; +static struct inode_operations shmem_special_inode_operations; static struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info __read_mostly = { @@ -637,7 +640,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) struct page *page = NULL; int error; - if (attr->ia_valid & ATTR_SIZE) { + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { if (attr->ia_size < inode->i_size) { /* * If truncating down to a partial page, then @@ -670,6 +673,10 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) error = inode_change_ok(inode, attr); if (!error) error = inode_setattr(inode, attr); +#ifdef CONFIG_TMPFS_POSIX_ACL + if (!error && (attr->ia_valid & ATTR_MODE)) + error = generic_acl_chmod(inode, &shmem_acl_ops); +#endif if (page) page_cache_release(page); return error; @@ -1362,6 +1369,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) switch (mode & S_IFMT) { default: + inode->i_op = &shmem_special_inode_operations; init_special_inode(inode, mode, dev); break; case S_IFREG: @@ -1682,7 +1690,11 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) iput(inode); return error; } - error = 0; + } + error = shmem_acl_init(inode, dir); + if (error) { + iput(inode); + return error; } if (dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; @@ -1897,6 +1909,53 @@ static struct inode_operations shmem_symlink_inode_operations = { .put_link = shmem_put_link, }; +#ifdef CONFIG_TMPFS_POSIX_ACL +/** + * Superblocks without xattr inode operations will get security.* xattr + * support from the VFS "for free". As soon as we have any other xattrs + * like ACLs, we also need to implement the security.* handlers at + * filesystem level, though. 
+ */ + +static size_t shmem_xattr_security_list(struct inode *inode, char *list, + size_t list_len, const char *name, + size_t name_len) +{ + return security_inode_listsecurity(inode, list, list_len); +} + +static int shmem_xattr_security_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return security_inode_getsecurity(inode, name, buffer, size, + -EOPNOTSUPP); +} + +static int shmem_xattr_security_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return security_inode_setsecurity(inode, name, value, size, flags); +} + +struct xattr_handler shmem_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = shmem_xattr_security_list, + .get = shmem_xattr_security_get, + .set = shmem_xattr_security_set, +}; + +static struct xattr_handler *shmem_xattr_handlers[] = { + &shmem_xattr_acl_access_handler, + &shmem_xattr_acl_default_handler, + &shmem_xattr_security_handler, + NULL +}; +#endif + static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes, int *policy, nodemask_t *policy_nodes) @@ -2094,6 +2153,10 @@ static int shmem_fill_super(struct super_block *sb, sb->s_magic = TMPFS_MAGIC; sb->s_op = &shmem_ops; sb->s_time_gran = 1; +#ifdef CONFIG_TMPFS_POSIX_ACL + sb->s_xattr = shmem_xattr_handlers; + sb->s_flags |= MS_POSIXACL; +#endif inode = shmem_get_inode(sb, S_IFDIR | mode, 0); if (!inode) @@ -2130,6 +2193,7 @@ static void shmem_destroy_inode(struct inode *inode) /* only struct inode is valid if it's an inline symlink */ mpol_free_shared_policy(&SHMEM_I(inode)->policy); } + shmem_acl_destroy_inode(inode); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } @@ -2141,6 +2205,10 @@ static void init_once(void *foo, struct kmem_cache *cachep, if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) { inode_init_once(&p->vfs_inode); +#ifdef CONFIG_TMPFS_POSIX_ACL + p->i_acl = NULL; + p->i_default_acl = NULL; +#endif } } @@ -2184,6 +2252,14 @@ static struct inode_operations shmem_inode_operations = { .truncate = shmem_truncate, .setattr = shmem_notify_change, .truncate_range = shmem_truncate_range, +#ifdef CONFIG_TMPFS_POSIX_ACL + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, + .permission = shmem_permission, +#endif + }; static struct inode_operations shmem_dir_inode_operations = { @@ -2198,6 +2274,25 @@ static struct inode_operations shmem_dir_inode_operations = { .mknod = shmem_mknod, .rename = shmem_rename, #endif +#ifdef CONFIG_TMPFS_POSIX_ACL + .setattr = shmem_notify_change, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, + .permission = shmem_permission, +#endif +}; + +static struct inode_operations shmem_special_inode_operations = { +#ifdef CONFIG_TMPFS_POSIX_ACL + .setattr = shmem_notify_change, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, + .permission = shmem_permission, +#endif }; static struct super_operations shmem_ops = { diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c new file mode 100644 index 00000000000..c946bf46871 --- /dev/null +++ b/mm/shmem_acl.c @@ -0,0 +1,197 @@ +/* + * mm/shmem_acl.c + * + * (C) 2005 Andreas Gruenbacher + * + * This file is 
released under the GPL. + */ + +#include +#include +#include +#include + +/** + * shmem_get_acl - generic_acl_operations->getacl() operation + */ +static struct posix_acl * +shmem_get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl = NULL; + + spin_lock(&inode->i_lock); + switch(type) { + case ACL_TYPE_ACCESS: + acl = posix_acl_dup(SHMEM_I(inode)->i_acl); + break; + + case ACL_TYPE_DEFAULT: + acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl); + break; + } + spin_unlock(&inode->i_lock); + + return acl; +} + +/** + * shmem_set_acl - generic_acl_operations->setacl() operation + */ +static void +shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + struct posix_acl *free = NULL; + + spin_lock(&inode->i_lock); + switch(type) { + case ACL_TYPE_ACCESS: + free = SHMEM_I(inode)->i_acl; + SHMEM_I(inode)->i_acl = posix_acl_dup(acl); + break; + + case ACL_TYPE_DEFAULT: + free = SHMEM_I(inode)->i_default_acl; + SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl); + break; + } + spin_unlock(&inode->i_lock); + posix_acl_release(free); +} + +struct generic_acl_operations shmem_acl_ops = { + .getacl = shmem_get_acl, + .setacl = shmem_set_acl, +}; + +/** + * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access, + * shmem_xattr_acl_access_handler - plumbing code to implement the + * system.posix_acl_access xattr using the generic acl functions. + */ + +static size_t +shmem_list_acl_access(struct inode *inode, char *list, size_t list_size, + const char *name, size_t name_len) +{ + return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, + list, list_size); +} + +static int +shmem_get_acl_access(struct inode *inode, const char *name, void *buffer, + size_t size) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer, + size); +} + +static int +shmem_set_acl_access(struct inode *inode, const char *name, const void *value, + size_t size, int flags) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value, + size); +} + +struct xattr_handler shmem_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .list = shmem_list_acl_access, + .get = shmem_get_acl_access, + .set = shmem_set_acl_access, +}; + +/** + * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default, + * shmem_xattr_acl_default_handler - plumbing code to implement the + * system.posix_acl_default xattr using the generic acl functions.
+ */ + +static size_t +shmem_list_acl_default(struct inode *inode, char *list, size_t list_size, + const char *name, size_t name_len) +{ + return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, + list, list_size); +} + +static int +shmem_get_acl_default(struct inode *inode, const char *name, void *buffer, + size_t size) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer, + size); +} + +static int +shmem_set_acl_default(struct inode *inode, const char *name, const void *value, + size_t size, int flags) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value, + size); +} + +struct xattr_handler shmem_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .list = shmem_list_acl_default, + .get = shmem_get_acl_default, + .set = shmem_set_acl_default, +}; + +/** + * shmem_acl_init - Initialize the acl(s) of a new inode + */ +int +shmem_acl_init(struct inode *inode, struct inode *dir) +{ + return generic_acl_init(inode, dir, &shmem_acl_ops); +} + +/** + * shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode + * + * This is done before destroying the actual inode. + */ + +void +shmem_acl_destroy_inode(struct inode *inode) +{ + if (SHMEM_I(inode)->i_acl) + posix_acl_release(SHMEM_I(inode)->i_acl); + SHMEM_I(inode)->i_acl = NULL; + if (SHMEM_I(inode)->i_default_acl) + posix_acl_release(SHMEM_I(inode)->i_default_acl); + SHMEM_I(inode)->i_default_acl = NULL; +} + +/** + * shmem_check_acl - check_acl() callback for generic_permission() + */ +static int +shmem_check_acl(struct inode *inode, int mask) +{ + struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); + + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + return -EAGAIN; +} + +/** + * shmem_permission - permission() inode operation + */ +int +shmem_permission(struct inode *inode, int mask, struct nameidata *nd) +{ + return generic_permission(inode, mask, shmem_check_acl); +} -- cgit v1.2.3 From cf9a2ae8d49948f861b56e5333530e491a9da190 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 29 Aug 2006 19:05:54 +0100 Subject: [PATCH] BLOCK: Move functions out of buffer code [try #6] Move some functions out of the buffering code that aren't strictly buffering-specific. This is a precursor to being able to disable the block layer. (*) Moved some stuff out of fs/buffer.c: (*) The file sync and general sync stuff moved to fs/sync.c. (*) The superblock sync stuff moved to fs/super.c. (*) do_invalidatepage() moved to mm/truncate.c. (*) try_to_release_page() moved to mm/filemap.c. (*) Moved some related declarations between header files: (*) declarations for do_invalidatepage() and try_to_release_page() moved to linux/mm.h. (*) __set_page_dirty_buffers() moved to linux/buffer_head.h.
Signed-Off-By: David Howells Signed-off-by: Jens Axboe --- mm/filemap.c | 30 ++++++++++++++++++++++++++++++ mm/page-writeback.c | 1 + mm/truncate.c | 24 ++++++++++++++++++++++++ 3 files changed, 55 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 3277f3b2352..d6846de0888 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2491,3 +2491,33 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, } return retval; } + +/** + * try_to_release_page() - release old fs-specific metadata on a page + * + * @page: the page which the kernel is trying to free + * @gfp_mask: memory allocation flags (and I/O mode) + * + * The address_space is to try to release any data against the page + * (presumably at page->private). If the release was successful, return `1'. + * Otherwise return zero. + * + * The @gfp_mask argument specifies whether I/O may be performed to release + * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). + * + * NOTE: @gfp_mask may go away, and this function may become non-blocking. + */ +int try_to_release_page(struct page *page, gfp_t gfp_mask) +{ + struct address_space * const mapping = page->mapping; + + BUG_ON(!PageLocked(page)); + if (PageWriteback(page)) + return 0; + + if (mapping && mapping->a_ops->releasepage) + return mapping->a_ops->releasepage(page, gfp_mask); + return try_to_free_buffers(page); +} + +EXPORT_SYMBOL(try_to_release_page); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 488b7088557..9fdcc790395 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -30,6 +30,7 @@ #include #include #include +#include /* * The maximum number of pages to writeout in a single bdflush/kupdate diff --git a/mm/truncate.c b/mm/truncate.c index a654928323d..cd3e34b816d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -17,6 +17,30 @@ do_invalidatepage */ +/** + * do_invalidatepage - invalidate part or all of a page + * @page: the page which is affected + * @offset: the index of the truncation point + * + * do_invalidatepage() is called when all or part of the page has become + * invalidated by a truncate operation. + * + * do_invalidatepage() does not have to release all buffers, but it must + * ensure that no dirty buffer is left outside @offset and that no I/O + * is underway against any of the blocks which are outside the truncation + * point, because the caller is about to free (and possibly reuse) those + * blocks on-disk. + */ +void do_invalidatepage(struct page *page, unsigned long offset) +{ + void (*invalidatepage)(struct page *, unsigned long); + invalidatepage = page->mapping->a_ops->invalidatepage; + if (!invalidatepage) + invalidatepage = block_invalidatepage; + if (invalidatepage) + (*invalidatepage)(page, offset); +} + static inline void truncate_partial_page(struct page *page, unsigned partial) { memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); -- cgit v1.2.3 From b398f6bff93a247d2a7099e92905374966e4558f Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 29 Aug 2006 19:05:58 +0100 Subject: [PATCH] BLOCK: Stop fallback_migrate_page() from using page_has_buffers() [try #6] Stop fallback_migrate_page() from using page_has_buffers() since that might not be available. Use PagePrivate() instead since that's more general.
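The two tests are equivalent wherever buffer heads exist at all; in this era linux/buffer_head.h simply defines (quoted from memory, so treat as a sketch):

    #define page_has_buffers(page)  PagePrivate(page)

PagePrivate() only needs the core page flags, so it stays available when buffer_head.h (and with it the block layer) is compiled out.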
Signed-Off-By: David Howells Signed-off-by: Jens Axboe --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 20a8c2687b1..7f50e3ff54c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -525,7 +525,7 @@ static int fallback_migrate_page(struct address_space *mapping, * Buffers may be managed in a filesystem specific way. * We must have no buffers or drop them. */ - if (page_has_buffers(page) && + if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; -- cgit v1.2.3 From 831058dec3735665fe91bd0d37b6a8cf56b91abd Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 29 Aug 2006 19:06:00 +0100 Subject: [PATCH] BLOCK: Separate the bounce buffering code from the highmem code [try #6] Move the bounce buffer code from mm/highmem.c to mm/bounce.c so that it can be more easily disabled when the block layer is disabled. !!!NOTE!!! There may be a bug in this code: Should init_emergency_pool() be contingent on CONFIG_HIGHMEM? Signed-Off-By: David Howells Signed-off-by: Jens Axboe --- mm/Makefile | 3 + mm/bounce.c | 302 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/highmem.c | 281 ------------------------------------------------------ 3 files changed, 305 insertions(+), 281 deletions(-) create mode 100644 mm/bounce.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index 6200c6d6afd..4f2166a833b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -12,6 +12,9 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o \ prio_tree.o util.o mmzone.o vmstat.o $(mmu-y) +ifeq ($(CONFIG_MMU),y) +obj-y += bounce.o +endif obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o diff --git a/mm/bounce.c b/mm/bounce.c new file mode 100644 index 00000000000..e4b62d2a402 --- /dev/null +++ b/mm/bounce.c @@ -0,0 +1,302 @@ +/* bounce buffer handling for block devices + * + * - Split from highmem.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define POOL_SIZE 64 +#define ISA_POOL_SIZE 16 + +static mempool_t *page_pool, *isa_page_pool; + +#ifdef CONFIG_HIGHMEM +static __init int init_emergency_pool(void) +{ + struct sysinfo i; + si_meminfo(&i); + si_swapinfo(&i); + + if (!i.totalhigh) + return 0; + + page_pool = mempool_create_page_pool(POOL_SIZE, 0); + BUG_ON(!page_pool); + printk("highmem bounce pool size: %d pages\n", POOL_SIZE); + + return 0; +} + +__initcall(init_emergency_pool); + +/* + * highmem version, map in to vec + */ +static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) +{ + unsigned long flags; + unsigned char *vto; + + local_irq_save(flags); + vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); + memcpy(vto + to->bv_offset, vfrom, to->bv_len); + kunmap_atomic(vto, KM_BOUNCE_READ); + local_irq_restore(flags); +} + +#else /* CONFIG_HIGHMEM */ + +#define bounce_copy_vec(to, vfrom) \ + memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) + +#endif /* CONFIG_HIGHMEM */ + +/* + * allocate pages in the DMA region for the ISA pool + */ +static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) +{ + return mempool_alloc_pages(gfp_mask | GFP_DMA, data); +} + +/* + * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA + * as the max address, so check if the pool has already been created. 
+ */ +int init_emergency_isa_pool(void) +{ + if (isa_page_pool) + return 0; + + isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, + mempool_free_pages, (void *) 0); + BUG_ON(!isa_page_pool); + + printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE); + return 0; +} + +/* + * Simple bounce buffer support for highmem pages. Depending on the + * queue gfp mask set, *to may or may not be a highmem page. kmap it + * always, it will do the Right Thing + */ +static void copy_to_high_bio_irq(struct bio *to, struct bio *from) +{ + unsigned char *vfrom; + struct bio_vec *tovec, *fromvec; + int i; + + __bio_for_each_segment(tovec, to, i, 0) { + fromvec = from->bi_io_vec + i; + + /* + * not bounced + */ + if (tovec->bv_page == fromvec->bv_page) + continue; + + /* + * fromvec->bv_offset and fromvec->bv_len might have been + * modified by the block layer, so use the original copy, + * bounce_copy_vec already uses tovec->bv_len + */ + vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; + + flush_dcache_page(tovec->bv_page); + bounce_copy_vec(tovec, vfrom); + } +} + +static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) +{ + struct bio *bio_orig = bio->bi_private; + struct bio_vec *bvec, *org_vec; + int i; + + if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) + set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); + + /* + * free up bounce indirect pages used + */ + __bio_for_each_segment(bvec, bio, i, 0) { + org_vec = bio_orig->bi_io_vec + i; + if (bvec->bv_page == org_vec->bv_page) + continue; + + dec_zone_page_state(bvec->bv_page, NR_BOUNCE); + mempool_free(bvec->bv_page, pool); + } + + bio_endio(bio_orig, bio_orig->bi_size, err); + bio_put(bio); +} + +static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err) +{ + if (bio->bi_size) + return 1; + + bounce_end_io(bio, page_pool, err); + return 0; +} + +static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err) +{ + if (bio->bi_size) + return 1; + + bounce_end_io(bio, isa_page_pool, err); + return 0; +} + +static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) +{ + struct bio *bio_orig = bio->bi_private; + + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + copy_to_high_bio_irq(bio_orig, bio); + + bounce_end_io(bio, pool, err); +} + +static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err) +{ + if (bio->bi_size) + return 1; + + __bounce_end_io_read(bio, page_pool, err); + return 0; +} + +static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err) +{ + if (bio->bi_size) + return 1; + + __bounce_end_io_read(bio, isa_page_pool, err); + return 0; +} + +static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, + mempool_t *pool) +{ + struct page *page; + struct bio *bio = NULL; + int i, rw = bio_data_dir(*bio_orig); + struct bio_vec *to, *from; + + bio_for_each_segment(from, *bio_orig, i) { + page = from->bv_page; + + /* + * is destination page below bounce pfn? 
+ */ + if (page_to_pfn(page) < q->bounce_pfn) + continue; + + /* + * irk, bounce it + */ + if (!bio) + bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt); + + to = bio->bi_io_vec + i; + + to->bv_page = mempool_alloc(pool, q->bounce_gfp); + to->bv_len = from->bv_len; + to->bv_offset = from->bv_offset; + inc_zone_page_state(to->bv_page, NR_BOUNCE); + + if (rw == WRITE) { + char *vto, *vfrom; + + flush_dcache_page(from->bv_page); + vto = page_address(to->bv_page) + to->bv_offset; + vfrom = kmap(from->bv_page) + from->bv_offset; + memcpy(vto, vfrom, to->bv_len); + kunmap(from->bv_page); + } + } + + /* + * no pages bounced + */ + if (!bio) + return; + + /* + * at least one page was bounced, fill in possible non-highmem + * pages + */ + __bio_for_each_segment(from, *bio_orig, i, 0) { + to = bio_iovec_idx(bio, i); + if (!to->bv_page) { + to->bv_page = from->bv_page; + to->bv_len = from->bv_len; + to->bv_offset = from->bv_offset; + } + } + + bio->bi_bdev = (*bio_orig)->bi_bdev; + bio->bi_flags |= (1 << BIO_BOUNCED); + bio->bi_sector = (*bio_orig)->bi_sector; + bio->bi_rw = (*bio_orig)->bi_rw; + + bio->bi_vcnt = (*bio_orig)->bi_vcnt; + bio->bi_idx = (*bio_orig)->bi_idx; + bio->bi_size = (*bio_orig)->bi_size; + + if (pool == page_pool) { + bio->bi_end_io = bounce_end_io_write; + if (rw == READ) + bio->bi_end_io = bounce_end_io_read; + } else { + bio->bi_end_io = bounce_end_io_write_isa; + if (rw == READ) + bio->bi_end_io = bounce_end_io_read_isa; + } + + bio->bi_private = *bio_orig; + *bio_orig = bio; +} + +void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) +{ + mempool_t *pool; + + /* + * for non-isa bounce case, just check if the bounce pfn is equal + * to or bigger than the highest pfn in the system -- in that case, + * don't waste time iterating over bio segments + */ + if (!(q->bounce_gfp & GFP_DMA)) { + if (q->bounce_pfn >= blk_max_pfn) + return; + pool = page_pool; + } else { + BUG_ON(!isa_page_pool); + pool = isa_page_pool; + } + + blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); + + /* + * slow path + */ + __blk_queue_bounce(q, bio_orig, pool); +} + +EXPORT_SYMBOL(blk_queue_bounce); diff --git a/mm/highmem.c b/mm/highmem.c index ee5519b176e..0206e7e5018 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -29,13 +29,6 @@ #include #include -static mempool_t *page_pool, *isa_page_pool; - -static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) -{ - return mempool_alloc_pages(gfp_mask | GFP_DMA, data); -} - /* * Virtual_count is not a pure "count". 
* 0 means that it is not mapped, and has not been mapped @@ -217,282 +210,8 @@ void fastcall kunmap_high(struct page *page) } EXPORT_SYMBOL(kunmap_high); - -#define POOL_SIZE 64 - -static __init int init_emergency_pool(void) -{ - struct sysinfo i; - si_meminfo(&i); - si_swapinfo(&i); - - if (!i.totalhigh) - return 0; - - page_pool = mempool_create_page_pool(POOL_SIZE, 0); - BUG_ON(!page_pool); - printk("highmem bounce pool size: %d pages\n", POOL_SIZE); - - return 0; -} - -__initcall(init_emergency_pool); - -/* - * highmem version, map in to vec - */ -static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) -{ - unsigned long flags; - unsigned char *vto; - - local_irq_save(flags); - vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); - memcpy(vto + to->bv_offset, vfrom, to->bv_len); - kunmap_atomic(vto, KM_BOUNCE_READ); - local_irq_restore(flags); -} - -#else /* CONFIG_HIGHMEM */ - -#define bounce_copy_vec(to, vfrom) \ - memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) - #endif -#define ISA_POOL_SIZE 16 - -/* - * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA - * as the max address, so check if the pool has already been created. - */ -int init_emergency_isa_pool(void) -{ - if (isa_page_pool) - return 0; - - isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, - mempool_free_pages, (void *) 0); - BUG_ON(!isa_page_pool); - - printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE); - return 0; -} - -/* - * Simple bounce buffer support for highmem pages. Depending on the - * queue gfp mask set, *to may or may not be a highmem page. kmap it - * always, it will do the Right Thing - */ -static void copy_to_high_bio_irq(struct bio *to, struct bio *from) -{ - unsigned char *vfrom; - struct bio_vec *tovec, *fromvec; - int i; - - __bio_for_each_segment(tovec, to, i, 0) { - fromvec = from->bi_io_vec + i; - - /* - * not bounced - */ - if (tovec->bv_page == fromvec->bv_page) - continue; - - /* - * fromvec->bv_offset and fromvec->bv_len might have been - * modified by the block layer, so use the original copy, - * bounce_copy_vec already uses tovec->bv_len - */ - vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; - - flush_dcache_page(tovec->bv_page); - bounce_copy_vec(tovec, vfrom); - } -} - -static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) -{ - struct bio *bio_orig = bio->bi_private; - struct bio_vec *bvec, *org_vec; - int i; - - if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) - set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); - - /* - * free up bounce indirect pages used - */ - __bio_for_each_segment(bvec, bio, i, 0) { - org_vec = bio_orig->bi_io_vec + i; - if (bvec->bv_page == org_vec->bv_page) - continue; - - dec_zone_page_state(bvec->bv_page, NR_BOUNCE); - mempool_free(bvec->bv_page, pool); - } - - bio_endio(bio_orig, bio_orig->bi_size, err); - bio_put(bio); -} - -static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err) -{ - if (bio->bi_size) - return 1; - - bounce_end_io(bio, page_pool, err); - return 0; -} - -static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err) -{ - if (bio->bi_size) - return 1; - - bounce_end_io(bio, isa_page_pool, err); - return 0; -} - -static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) -{ - struct bio *bio_orig = bio->bi_private; - - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) - copy_to_high_bio_irq(bio_orig, bio); - - bounce_end_io(bio, pool, err); -} - -static int bounce_end_io_read(struct bio 
*bio, unsigned int bytes_done, int err) -{ - if (bio->bi_size) - return 1; - - __bounce_end_io_read(bio, page_pool, err); - return 0; -} - -static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err) -{ - if (bio->bi_size) - return 1; - - __bounce_end_io_read(bio, isa_page_pool, err); - return 0; -} - -static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, - mempool_t *pool) -{ - struct page *page; - struct bio *bio = NULL; - int i, rw = bio_data_dir(*bio_orig); - struct bio_vec *to, *from; - - bio_for_each_segment(from, *bio_orig, i) { - page = from->bv_page; - - /* - * is destination page below bounce pfn? - */ - if (page_to_pfn(page) < q->bounce_pfn) - continue; - - /* - * irk, bounce it - */ - if (!bio) - bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt); - - to = bio->bi_io_vec + i; - - to->bv_page = mempool_alloc(pool, q->bounce_gfp); - to->bv_len = from->bv_len; - to->bv_offset = from->bv_offset; - inc_zone_page_state(to->bv_page, NR_BOUNCE); - - if (rw == WRITE) { - char *vto, *vfrom; - - flush_dcache_page(from->bv_page); - vto = page_address(to->bv_page) + to->bv_offset; - vfrom = kmap(from->bv_page) + from->bv_offset; - memcpy(vto, vfrom, to->bv_len); - kunmap(from->bv_page); - } - } - - /* - * no pages bounced - */ - if (!bio) - return; - - /* - * at least one page was bounced, fill in possible non-highmem - * pages - */ - __bio_for_each_segment(from, *bio_orig, i, 0) { - to = bio_iovec_idx(bio, i); - if (!to->bv_page) { - to->bv_page = from->bv_page; - to->bv_len = from->bv_len; - to->bv_offset = from->bv_offset; - } - } - - bio->bi_bdev = (*bio_orig)->bi_bdev; - bio->bi_flags |= (1 << BIO_BOUNCED); - bio->bi_sector = (*bio_orig)->bi_sector; - bio->bi_rw = (*bio_orig)->bi_rw; - - bio->bi_vcnt = (*bio_orig)->bi_vcnt; - bio->bi_idx = (*bio_orig)->bi_idx; - bio->bi_size = (*bio_orig)->bi_size; - - if (pool == page_pool) { - bio->bi_end_io = bounce_end_io_write; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read; - } else { - bio->bi_end_io = bounce_end_io_write_isa; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read_isa; - } - - bio->bi_private = *bio_orig; - *bio_orig = bio; -} - -void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) -{ - mempool_t *pool; - - /* - * for non-isa bounce case, just check if the bounce pfn is equal - * to or bigger than the highest pfn in the system -- in that case, - * don't waste time iterating over bio segments - */ - if (!(q->bounce_gfp & GFP_DMA)) { - if (q->bounce_pfn >= blk_max_pfn) - return; - pool = page_pool; - } else { - BUG_ON(!isa_page_pool); - pool = isa_page_pool; - } - - blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); - - /* - * slow path - */ - __blk_queue_bounce(q, bio_orig, pool); -} - -EXPORT_SYMBOL(blk_queue_bounce); - #if defined(HASHED_PAGE_VIRTUAL) #define PA_HASH_ORDER 7 -- cgit v1.2.3 From 811d736f9e8013966e1a5a930c0db09508bdbb15 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 29 Aug 2006 19:06:09 +0100 Subject: [PATCH] BLOCK: Dissociate generic_writepages() from mpage stuff [try #6] Dissociate the generic_writepages() function from the mpage stuff, moving its declaration to linux/mm.h and actually emitting a full implementation into mm/page-writeback.c. The implementation is a partial duplicate of mpage_writepages() with all BIO references removed. It is used by NFS to do writeback. 
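A filesystem that has its own ->writepage but nothing special to do for ->writepages can then wire the generic walker straight into its address_space operations; an illustrative sketch (the foo_* names are hypothetical):

    static const struct address_space_operations foo_aops = {
        .writepage  = foo_writepage,        /* filesystem-specific page writer */
        .writepages = generic_writepages,   /* generic dirty-page walk added below */
        /* ... other operations ... */
    };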
Signed-Off-By: David Howells Signed-off-by: Jens Axboe --- mm/page-writeback.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9fdcc790395..ecf27839c20 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -31,6 +31,7 @@ #include #include #include +#include /* * The maximum number of pages to writeout in a single bdflush/kupdate @@ -551,6 +552,139 @@ void __init page_writeback_init(void) register_cpu_notifier(&ratelimit_nb); } +/** + * generic_writepages - walk the list of dirty pages of the given + * address space and writepage() all of them. + * + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * + * This is a library function, which implements the writepages() + * address_space_operation. + * + * If a page is already under I/O, generic_writepages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. + * + * Derived from mpage_writepages() - if you fix this you should check that + * also! + */ +int generic_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct backing_dev_info *bdi = mapping->backing_dev_info; + int ret = 0; + int done = 0; + int (*writepage)(struct page *page, struct writeback_control *wbc); + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int range_whole = 0; + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + return 0; + } + + writepage = mapping->a_ops->writepage; + + /* deal with chardevs and other special file */ + if (!writepage) + return 0; + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + scanned = 1; + } +retry: + while (!done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + unlock_page(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + + ret = (*writepage)(page, wbc); + if (ret) { + if (ret == -ENOSPC) + set_bit(AS_ENOSPC, &mapping->flags); + else + set_bit(AS_EIO, &mapping->flags); + } + + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) + 
unlock_page(page); + if (ret || (--(wbc->nr_to_write) <= 0)) + done = 1; + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; + } + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = index; + return ret; +} + +EXPORT_SYMBOL(generic_writepages); + int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; -- cgit v1.2.3 From 9361401eb7619c033e2394e4f9f6d410d6719ac7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 30 Sep 2006 20:45:40 +0200 Subject: [PATCH] BLOCK: Make it possible to disable the block layer [try #6] Make it possible to disable the block layer. Not all embedded devices require it, some can make do with just JFFS2, NFS, ramfs, etc - none of which require the block layer to be present. This patch does the following: (*) Introduces CONFIG_BLOCK to disable the block layer, buffering and blockdev support. (*) Adds dependencies on CONFIG_BLOCK to any configuration item that controls an item that uses the block layer. This includes: (*) Block I/O tracing. (*) Disk partition code. (*) All filesystems that are block based, eg: Ext3, ReiserFS, ISOFS. (*) The SCSI layer. As far as I can tell, even SCSI chardevs use the block layer to do scheduling. Some drivers that use SCSI facilities - such as USB storage - end up disabled indirectly from this. (*) Various block-based device drivers, such as IDE and the old CDROM drivers. (*) MTD blockdev handling and FTL. (*) JFFS - which uses set_bdev_super(), something it could avoid doing by taking a leaf out of JFFS2's book. (*) Makes most of the contents of linux/blkdev.h, linux/buffer_head.h and linux/elevator.h contingent on CONFIG_BLOCK being set. sector_div() is, however, still used in places, and so is still available. (*) Also made contingent are the contents of linux/mpage.h, linux/genhd.h and parts of linux/fs.h. (*) Makes a number of files in fs/ contingent on CONFIG_BLOCK. (*) Makes mm/bounce.c (bounce buffering) contingent on CONFIG_BLOCK. (*) set_page_dirty() doesn't call __set_page_dirty_buffers() if CONFIG_BLOCK is not enabled. (*) fs/no-block.c is created to hold out-of-line stubs and things that are required when CONFIG_BLOCK is not set: (*) Default blockdev file operations (to give error ENODEV on opening). (*) Makes some /proc changes: (*) /proc/devices does not list any blockdevs. (*) /proc/diskstats and /proc/partitions are contingent on CONFIG_BLOCK. (*) Makes some compat ioctl handling contingent on CONFIG_BLOCK. (*) If CONFIG_BLOCK is not defined, makes sys_quotactl() return -ENODEV if given command other than Q_SYNC or if a special device is specified. (*) In init/do_mounts.c, no reference is made to the blockdev routines if CONFIG_BLOCK is not defined. This does not prohibit NFS roots or JFFS2. (*) The bdflush, ioprio_set and ioprio_get syscalls can now be absent (return error ENOSYS by way of cond_syscall if so). (*) The seclvl_bd_claim() and seclvl_bd_release() security calls do nothing if CONFIG_BLOCK is not set, since they can't then happen. 
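As a usage sketch for the generic_writepages() helper added above (a hedged illustration, not part of any patch in this series: "examplefs" and its writepage() are assumptions), a filesystem whose writepage() needs no special batching can simply wire the library routine into its address_space operations:

	/* Hypothetical wiring; note that do_writepages() already falls back
	 * to generic_writepages() when ->writepages is NULL, so the explicit
	 * assignment mainly documents intent. */
	static int examplefs_writepage(struct page *page,
				       struct writeback_control *wbc);

	static const struct address_space_operations examplefs_aops = {
		.writepage	= examplefs_writepage,
		.writepages	= generic_writepages,	/* dirty-page walk */
	};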
Signed-Off-By: David Howells Signed-off-by: Jens Axboe --- mm/Makefile | 2 +- mm/filemap.c | 4 ++++ mm/migrate.c | 2 ++ mm/page-writeback.c | 8 +++++--- mm/truncate.c | 2 ++ 5 files changed, 14 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index 4f2166a833b..12b3a4eee88 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -12,7 +12,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o \ prio_tree.o util.o mmzone.o vmstat.o $(mmu-y) -ifeq ($(CONFIG_MMU),y) +ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy) obj-y += bounce.o endif obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o diff --git a/mm/filemap.c b/mm/filemap.c index d6846de0888..c4fe97f5ace 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2020,6 +2020,7 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) *count = inode->i_sb->s_maxbytes - *pos; } else { +#ifdef CONFIG_BLOCK loff_t isize; if (bdev_read_only(I_BDEV(inode))) return -EPERM; @@ -2031,6 +2032,9 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i if (*pos + *count > isize) *count = isize - *pos; +#else + return -EPERM; +#endif } return 0; } diff --git a/mm/migrate.c b/mm/migrate.c index 7f50e3ff54c..ba2453f9483 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -409,6 +409,7 @@ int migrate_page(struct address_space *mapping, } EXPORT_SYMBOL(migrate_page); +#ifdef CONFIG_BLOCK /* * Migration function for pages with buffers. This function can only be used * if the underlying filesystem guarantees that no other references to "page" @@ -466,6 +467,7 @@ int buffer_migrate_page(struct address_space *mapping, return 0; } EXPORT_SYMBOL(buffer_migrate_page); +#endif /* * Writeback a page to clean the dirty state diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ecf27839c20..c0d4ce144de 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -807,9 +807,11 @@ int fastcall set_page_dirty(struct page *page) if (likely(mapping)) { int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; - if (spd) - return (*spd)(page); - return __set_page_dirty_buffers(page); +#ifdef CONFIG_BLOCK + if (!spd) + spd = __set_page_dirty_buffers; +#endif + return (*spd)(page); } if (!PageDirty(page)) { if (!TestSetPageDirty(page)) diff --git a/mm/truncate.c b/mm/truncate.c index cd3e34b816d..8fde6580657 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -35,8 +35,10 @@ void do_invalidatepage(struct page *page, unsigned long offset) { void (*invalidatepage)(struct page *, unsigned long); invalidatepage = page->mapping->a_ops->invalidatepage; +#ifdef CONFIG_BLOCK if (!invalidatepage) invalidatepage = block_invalidatepage; +#endif if (invalidatepage) (*invalidatepage)(page, offset); } -- cgit v1.2.3 From 3fcd03e07008ec0f667dfb7626171165699ea5c2 Mon Sep 17 00:00:00 2001 From: Gavin Lambert Date: Sat, 30 Sep 2006 23:27:01 -0700 Subject: [PATCH] NOMMU: don't try and give NULL to fput() Don't try and give NULL to fput() in the error handling in do_mmap_pgoff() as it'll cause an oops. 
Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 56454066219..365019599df 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -948,7 +948,8 @@ unsigned long do_mmap_pgoff(struct file *file, up_write(&nommu_vma_sem); kfree(vml); if (vma) { - fput(vma->vm_file); + if (vma->vm_file) + fput(vma->vm_file); kfree(vma); } return ret; -- cgit v1.2.3 From f28c5edc06ecd8068b38b7662ad19f4d20d741af Mon Sep 17 00:00:00 2001 From: Keith Mannthey Date: Sat, 30 Sep 2006 23:27:04 -0700 Subject: [PATCH] hot-add-mem x86_64: fixup externs Fix up externs in memory_hotplug.c. Cleanup. Signed-off-by: Keith Mannthey Cc: KAMEZAWA Hiroyuki Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2053bb165a2..63b14d4f68f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -26,8 +26,6 @@ #include -extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, - unsigned long size); static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) { struct pglist_data *pgdat = zone->zone_pgdat; @@ -47,8 +45,6 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) return 0; } -extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, - int nr_pages); static int __add_section(struct zone *zone, unsigned long phys_start_pfn) { int nr_pages = PAGES_PER_SECTION; -- cgit v1.2.3 From ec69acbb1191df671ff8e07c8e146619a5c53f70 Mon Sep 17 00:00:00 2001 From: Keith Mannthey Date: Sat, 30 Sep 2006 23:27:05 -0700 Subject: [PATCH] hot-add-mem x86_64: Kconfig changes Create Kconfig namespace for MEMORY_HOTPLUG_RESERVE and MEMORY_HOTPLUG_SPARSE. This is needed to create a distinction between the two paths. Selecting the high-level option MEMORY_HOTPLUG will get you MEMORY_HOTPLUG_SPARSE if you have sparsemem enabled, or MEMORY_HOTPLUG_RESERVE if you are on x86_64 with discontig and ACPI NUMA support. Signed-off-by: Keith Mannthey Cc: KAMEZAWA Hiroyuki Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 8f5b45615f7..5d88489ef2d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -115,12 +115,17 @@ config SPARSEMEM_EXTREME # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" - depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG + depends on SPARSEMEM || X86_64_ACPI_NUMA + depends on HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG depends on (IA64 || X86 || PPC64) comment "Memory hotplug is currently incompatible with Software Suspend" depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND +config MEMORY_HOTPLUG_SPARSE + def_bool y + depends on SPARSEMEM && MEMORY_HOTPLUG + # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS.
-- cgit v1.2.3 From 53947027ad90542ddb2bb746e3175827c270610a Mon Sep 17 00:00:00 2001 From: Keith Mannthey Date: Sat, 30 Sep 2006 23:27:08 -0700 Subject: [PATCH] hot-add-mem x86_64: use CONFIG_MEMORY_HOTPLUG_SPARSE Migrate CONFIG_MEMORY_HOTPLUG to CONFIG_MEMORY_HOTPLUG_SPARSE where needed. Signed-off-by: Keith Mannthey Cc: KAMEZAWA Hiroyuki Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 63b14d4f68f..7666dbd328d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -26,6 +26,7 @@ #include +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) { struct pglist_data *pgdat = zone->zone_pgdat; @@ -192,6 +193,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) writeback_set_ratelimit(); return 0; } +#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ static pg_data_t *hotadd_new_pgdat(int nid, u64 start) { -- cgit v1.2.3 From 45e0b78b0532f92c01e363dd4287617c5be4574f Mon Sep 17 00:00:00 2001 From: Keith Mannthey Date: Sat, 30 Sep 2006 23:27:09 -0700 Subject: [PATCH] hot-add-mem x86_64: use CONFIG_MEMORY_HOTPLUG_RESERVE The API for hot-add memory already has a construct for finding nodes based on an address, memory_add_physaddr_to_nid. This patch allows the function to do something besides return 0. It uses the nodes_add information to look up the node info for a hot-add event. Signed-off-by: Keith Mannthey Cc: KAMEZAWA Hiroyuki Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 60 ++++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7666dbd328d..fd678a662ea 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -26,6 +26,36 @@ #include +/* add this memory to iomem resource */ +static struct resource *register_memory_resource(u64 start, u64 size) +{ + struct resource *res; + res = kzalloc(sizeof(struct resource), GFP_KERNEL); + BUG_ON(!res); + + res->name = "System RAM"; + res->start = start; + res->end = start + size - 1; + res->flags = IORESOURCE_MEM; + if (request_resource(&iomem_resource, res) < 0) { + printk("System RAM resource %llx - %llx cannot be added\n", + (unsigned long long)res->start, (unsigned long long)res->end); + kfree(res); + res = NULL; + } + return res; +} + +static void release_memory_resource(struct resource *res) +{ + if (!res) + return; + release_resource(res); + kfree(res); + return; +} + + #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) { @@ -223,36 +253,6 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) return; } -/* add this memory to iomem resource */ -static struct resource *register_memory_resource(u64 start, u64 size) -{ - struct resource *res; - res = kzalloc(sizeof(struct resource), GFP_KERNEL); - BUG_ON(!res); - - res->name = "System RAM"; - res->start = start; - res->end = start + size - 1; - res->flags = IORESOURCE_MEM; - if (request_resource(&iomem_resource, res) < 0) { - printk("System RAM resource %llx - %llx cannot be added\n", - (unsigned long long)res->start, (unsigned long long)res->end); - kfree(res); - res = NULL; - } - return res; -} - -static void release_memory_resource(struct resource *res) -{ - if (!res) - return; - release_resource(res); - kfree(res); - return; -} - 
- int add_memory(int nid, u64 start, u64 size) { -- cgit v1.2.3 From 1a2f67b459bb7846d4a15924face63eb2683acc2 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 30 Sep 2006 23:27:20 -0700 Subject: [PATCH] kmemdup: introduce One of the idiomatic ways to duplicate a region of memory is dst = kmalloc(len, GFP_KERNEL); if (!dst) return -ENOMEM; memcpy(dst, src, len); which is neat code, except that the programmer needs to write the size twice, which sometimes leads to mistakes. If len passed to kmalloc is smaller than len passed to memcpy, it's a straight overwrite-beyond-end. If len passed to memcpy is smaller than len passed to kmalloc, it's either a) legit behaviour ;-), or b) the cloned buffer will contain garbage in its second half. A slight trawl of commit lists shows several duplication bugs made exactly because of diverged lengths: Linux: [CRYPTO]: Fix memcpy/memset args. [PATCH] memcpy/memset fixes OpenBSD: kerberosV/src/lib/asn1: der_copy.c:1.4 If the programmer is given only one place to play with lengths, I believe such mistakes could be avoided. With kmemdup, the snippet above will be rewritten as: dst = kmemdup(src, len, GFP_KERNEL); if (!dst) return -ENOMEM; This also leads to smaller code (kzalloc effect). A quick grep shows 200+ places where kmemdup() can be used. Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index 7368479220b..e14fa84ef39 100644 --- a/mm/util.c +++ b/mm/util.c @@ -40,6 +40,24 @@ char *kstrdup(const char *s, gfp_t gfp) } EXPORT_SYMBOL(kstrdup); +/** + * kmemdup - duplicate region of memory + * + * @src: memory region to duplicate + * @len: memory region length + * @gfp: GFP mask to use + */ +void *kmemdup(const void *src, size_t len, gfp_t gfp) +{ + void *p; + + p = ____kmalloc(len, gfp); + if (p) + memcpy(p, src, len); + return p; +} +EXPORT_SYMBOL(kmemdup); + /* * strndup_user - duplicate an existing string from user space * -- cgit v1.2.3 From 52978be636374c4bfb61220b37fa12f55a071c46 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 30 Sep 2006 23:27:21 -0700 Subject: [PATCH] kmemdup: some users Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index cf18f094255..25788b1b7fc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1324,12 +1324,11 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) atomic_set(&new->refcnt, 1); if (new->policy == MPOL_BIND) { int sz = ksize(old->v.zonelist); - new->v.zonelist = kmalloc(sz, SLAB_KERNEL); + new->v.zonelist = kmemdup(old->v.zonelist, sz, SLAB_KERNEL); if (!new->v.zonelist) { kmem_cache_free(policy_cache, new); return ERR_PTR(-ENOMEM); } - memcpy(new->v.zonelist, old->v.zonelist, sz); } return new; } -- cgit v1.2.3 From 027445c37282bc1ed26add45e573ad2d3e4860a5 Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Sat, 30 Sep 2006 23:28:46 -0700 Subject: [PATCH] Vectorize aio_read/aio_write fileop methods This patch vectorizes aio_read() and aio_write() methods to prepare for collapsing all aio & vectored operations into one interface - which is aio_read()/aio_write().
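The end state this series works toward (sketched below; "examplefs" is hypothetical, and the convention is the one the streamlining patch below spells out) is a file_operations where the vectored aio methods do the real work and the plain read/write entry points are thin synchronous wrappers:

	/* Hypothetical wiring following the convention this series
	 * establishes: do_sync_read()/do_sync_write() build a kiocb, call
	 * the corresponding ->aio_read()/->aio_write() method and wait. */
	static const struct file_operations examplefs_file_ops = {
		.read		= do_sync_read,
		.write		= do_sync_write,
		.aio_read	= generic_file_aio_read,
		.aio_write	= generic_file_aio_write,
	};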
Signed-off-by: Badari Pulavarty Signed-off-by: Christoph Hellwig Cc: Michael Holzheu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index c4fe97f5ace..f6c1d22b504 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1226,12 +1226,11 @@ out: EXPORT_SYMBOL(__generic_file_aio_read); ssize_t -generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos) +generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { - struct iovec local_iov = { .iov_base = buf, .iov_len = count }; - BUG_ON(iocb->ki_pos != pos); - return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); + return __generic_file_aio_read(iocb, iov, nr_segs, &iocb->ki_pos); } EXPORT_SYMBOL(generic_file_aio_read); @@ -2315,22 +2314,22 @@ out: current->backing_dev_info = NULL; return written ? written : err; } -EXPORT_SYMBOL(generic_file_aio_write_nolock); -ssize_t -generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) +ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, + const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t ret; - loff_t pos = *ppos; - ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos); + BUG_ON(iocb->ki_pos != pos); + + ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, + &iocb->ki_pos); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err; + ssize_t err; err = sync_page_range_nolock(inode, mapping, pos, ret); if (err < 0) @@ -2338,6 +2337,7 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, } return ret; } +EXPORT_SYMBOL(generic_file_aio_write_nolock); static ssize_t __generic_file_write_nolock(struct file *file, const struct iovec *iov, @@ -2347,8 +2347,9 @@ __generic_file_write_nolock(struct file *file, const struct iovec *iov, ssize_t ret; init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); - if (ret == -EIOCBQUEUED) + if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&kiocb); return ret; } @@ -2361,28 +2362,28 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov, ssize_t ret; init_sync_kiocb(&kiocb, file); - ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); + kiocb.ki_pos = *ppos; + ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, *ppos); if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&kiocb); + *ppos = kiocb.ki_pos; return ret; } EXPORT_SYMBOL(generic_file_write_nolock); -ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf, - size_t count, loff_t pos) +ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t ret; - struct iovec local_iov = { .iov_base = (void __user *)buf, - .iov_len = count }; BUG_ON(iocb->ki_pos != pos); mutex_lock(&inode->i_mutex); - ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, - &iocb->ki_pos); + ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, + &iocb->ki_pos); mutex_unlock(&inode->i_mutex); if (ret > 0 && ((file->f_flags & 
O_SYNC) || IS_SYNC(inode))) { -- cgit v1.2.3 From ee0b3e671baff681d69fbf0db33b47603c0a8280 Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Sat, 30 Sep 2006 23:28:47 -0700 Subject: [PATCH] Remove readv/writev methods and use aio_read/aio_write instead This patch removes readv() and writev() methods and replaces them with aio_read()/aio_write() methods. Signed-off-by: Badari Pulavarty Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 36 ------------------------------------ 1 file changed, 36 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index f6c1d22b504..48497094ae8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2421,42 +2421,6 @@ ssize_t generic_file_write(struct file *file, const char __user *buf, } EXPORT_SYMBOL(generic_file_write); -ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) -{ - struct kiocb kiocb; - ssize_t ret; - - init_sync_kiocb(&kiocb, filp); - ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); - return ret; -} -EXPORT_SYMBOL(generic_file_readv); - -ssize_t generic_file_writev(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) -{ - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret; - - mutex_lock(&inode->i_mutex); - ret = __generic_file_write_nolock(file, iov, nr_segs, ppos); - mutex_unlock(&inode->i_mutex); - - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err; - - err = sync_page_range(inode, mapping, *ppos - ret, ret); - if (err < 0) - ret = err; - } - return ret; -} -EXPORT_SYMBOL(generic_file_writev); - /* * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something * went wrong during pagecache shootdown. -- cgit v1.2.3 From 543ade1fc901db4c3dbe9fb27241fb977f1f3eea Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Sat, 30 Sep 2006 23:28:48 -0700 Subject: [PATCH] Streamline generic_file_* interfaces and filemap cleanups This patch cleans up the generic_file_*_read/write() interfaces. Christoph Hellwig gave me the idea for these cleanups. In a nutshell, all filesystems should set .aio_read/.aio_write methods and use do_sync_read()/do_sync_write() as their .read/.write methods. This allows us to clean up all variants of generic_file_* routines. Final available interfaces: generic_file_aio_read() - read handler generic_file_aio_write() - write handler generic_file_aio_write_nolock() - no lock write handler __generic_file_aio_write_nolock() - internal worker routine Signed-off-by: Badari Pulavarty Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 87 +++--------------------------------------------------------- 1 file changed, 4 insertions(+), 83 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 48497094ae8..ec469235985 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1149,13 +1149,14 @@ success: * that can use the page cache directly.
*/ ssize_t -__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) +generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { struct file *filp = iocb->ki_filp; ssize_t retval; unsigned long seg; size_t count; + loff_t *ppos = &iocb->ki_pos; count = 0; for (seg = 0; seg < nr_segs; seg++) { @@ -1179,7 +1180,7 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (filp->f_flags & O_DIRECT) { - loff_t pos = *ppos, size; + loff_t size; struct address_space *mapping; struct inode *inode; @@ -1223,32 +1224,8 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, out: return retval; } -EXPORT_SYMBOL(__generic_file_aio_read); - -ssize_t -generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - BUG_ON(iocb->ki_pos != pos); - return __generic_file_aio_read(iocb, iov, nr_segs, &iocb->ki_pos); -} EXPORT_SYMBOL(generic_file_aio_read); -ssize_t -generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) -{ - struct iovec local_iov = { .iov_base = buf, .iov_len = count }; - struct kiocb kiocb; - ssize_t ret; - - init_sync_kiocb(&kiocb, filp); - ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); - return ret; -} -EXPORT_SYMBOL(generic_file_read); - int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) { ssize_t written; @@ -2339,38 +2316,6 @@ ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, } EXPORT_SYMBOL(generic_file_aio_write_nolock); -static ssize_t -__generic_file_write_nolock(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) -{ - struct kiocb kiocb; - ssize_t ret; - - init_sync_kiocb(&kiocb, file); - kiocb.ki_pos = *ppos; - ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); - return ret; -} - -ssize_t -generic_file_write_nolock(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) -{ - struct kiocb kiocb; - ssize_t ret; - - init_sync_kiocb(&kiocb, file); - kiocb.ki_pos = *ppos; - ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, *ppos); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); - *ppos = kiocb.ki_pos; - return ret; -} -EXPORT_SYMBOL(generic_file_write_nolock); - ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { @@ -2397,30 +2342,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, } EXPORT_SYMBOL(generic_file_aio_write); -ssize_t generic_file_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret; - struct iovec local_iov = { .iov_base = (void __user *)buf, - .iov_len = count }; - - mutex_lock(&inode->i_mutex); - ret = __generic_file_write_nolock(file, &local_iov, 1, ppos); - mutex_unlock(&inode->i_mutex); - - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - ssize_t err; - - err = sync_page_range(inode, mapping, *ppos - ret, ret); - if (err < 0) - ret = err; - } - return ret; -} -EXPORT_SYMBOL(generic_file_write); - /* * Called under i_mutex for writes to S_ISREG files. 
Returns -EIO if something * went wrong during pagecache shootdown. -- cgit v1.2.3 From 9a53c3a783c2fa9b969628e65695c11c3e51e673 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 30 Sep 2006 23:29:03 -0700 Subject: [PATCH] r/o bind mounts: unlink: monitor i_nlink When a filesystem decrements i_nlink to zero, it means that a write must be performed in order to drop the inode from the filesystem. We're shortly going to have to keep filesystems from being remounted r/o between the time of this i_nlink decrement and the time that write occurs. So, add a little helper function to do the decrements. We'll tie into it in a bit to note when i_nlink hits zero. Signed-off-by: Dave Hansen Acked-by: Christoph Hellwig Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index b96de69f236..908dd947b1e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1772,7 +1772,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) dir->i_size -= BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; - inode->i_nlink--; + drop_nlink(inode); dput(dentry); /* Undo the count from "create" - this does all the work */ return 0; } @@ -1782,8 +1782,8 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry) if (!simple_empty(dentry)) return -ENOTEMPTY; - dentry->d_inode->i_nlink--; - dir->i_nlink--; + drop_nlink(dentry->d_inode); + drop_nlink(dir); return shmem_unlink(dir, dentry); } @@ -1804,9 +1804,9 @@ static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct if (new_dentry->d_inode) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) - old_dir->i_nlink--; + drop_nlink(old_dir); } else if (they_are_dirs) { - old_dir->i_nlink--; + drop_nlink(old_dir); new_dir->i_nlink++; } -- cgit v1.2.3 From d8c76e6f45c111c32a4b3e50a2adc9210737b0d8 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 30 Sep 2006 23:29:04 -0700 Subject: [PATCH] r/o bind mount prepwork: inc_nlink() helper This is mostly included for parity with dec_nlink(), where we will have some more hooks. This one should stay pretty darn straightforward for now.
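The helpers themselves land in include/linux/fs.h and so are outside this mm-only extract; a plausible minimal reconstruction (an assumption, shown only so the shmem.c conversions in these two patches read in context) is just a named wrapper around the open-coded increment/decrement, giving later patches a single hook point:

	/* Reconstructed sketch, not a hunk from these patches: */
	static inline void drop_nlink(struct inode *inode)
	{
		inode->i_nlink--;	/* future hook: note when i_nlink hits zero */
	}

	static inline void inc_nlink(struct inode *inode)
	{
		inode->i_nlink++;
	}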
Signed-off-by: Dave Hansen Acked-by: Christoph Hellwig Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 908dd947b1e..bb8ca7ef709 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1379,7 +1379,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) &sbinfo->policy_nodes); break; case S_IFDIR: - inode->i_nlink++; + inc_nlink(inode); /* Some things misbehave if size == 0 on a directory */ inode->i_size = 2 * BOGO_DIRENT_SIZE; inode->i_op = &shmem_dir_inode_operations; @@ -1715,7 +1715,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode) if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) return error; - dir->i_nlink++; + inc_nlink(dir); return 0; } @@ -1750,7 +1750,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; - inode->i_nlink++; + inc_nlink(inode); atomic_inc(&inode->i_count); /* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ d_instantiate(dentry, inode); @@ -1807,7 +1807,7 @@ static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct drop_nlink(old_dir); } else if (they_are_dirs) { drop_nlink(old_dir); - new_dir->i_nlink++; + inc_nlink(new_dir); } old_dir->i_size -= BOGO_DIRENT_SIZE; -- cgit v1.2.3 From bd4c8ce41a2e2f0c5bf54343ab54e8e09faec021 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 30 Sep 2006 23:29:29 -0700 Subject: [PATCH] invalidate_inode_pages2(): ignore page refcounts The recent fix to invalidate_inode_pages() (git commit 016eb4a) managed to unfix invalidate_inode_pages2(). The problem is that various bits of code in the kernel can take transient refs on pages: the page scanner will do this when inspecting a batch of pages, and the lru_cache_add() batching pagevecs also hold a ref. Net result is transient failures in invalidate_inode_pages2(). This affects NFS directory invalidation (observed) and presumably also block-backed direct-io (not yet reported). Fix it by reverting invalidate_inode_pages2() back to the old version which ignores the page refcounts. We may come up with something more clever later, but for now we need a 2.6.18 fix for NFS. Cc: Chuck Lever Cc: Nick Piggin Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index 8fde6580657..f4edbc179d1 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -287,9 +287,39 @@ unsigned long invalidate_inode_pages(struct address_space *mapping) { return invalidate_mapping_pages(mapping, 0, ~0UL); } - EXPORT_SYMBOL(invalidate_inode_pages); +/* + * This is like invalidate_complete_page(), except it ignores the page's + * refcount. We do this because invalidate_inode_pages2() needs stronger + * invalidation guarantees, and cannot afford to leave pages behind because + * shrink_list() has a temp ref on them, or because they're transiently sitting + * in the lru_cache_add() pagevecs. 
+ */ +static int +invalidate_complete_page2(struct address_space *mapping, struct page *page) +{ + if (page->mapping != mapping) + return 0; + + if (PagePrivate(page) && !try_to_release_page(page, 0)) + return 0; + + write_lock_irq(&mapping->tree_lock); + if (PageDirty(page)) + goto failed; + + BUG_ON(PagePrivate(page)); + __remove_from_page_cache(page); + write_unlock_irq(&mapping->tree_lock); + ClearPageUptodate(page); + page_cache_release(page); /* pagecache ref */ + return 1; +failed: + write_unlock_irq(&mapping->tree_lock); + return 0; +} + /** * invalidate_inode_pages2_range - remove range of pages from an address_space * @mapping: the address_space @@ -356,7 +386,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } } was_dirty = test_clear_page_dirty(page); - if (!invalidate_complete_page(mapping, page)) { + if (!invalidate_complete_page2(mapping, page)) { if (was_dirty) set_page_dirty(page); ret = -EIO; -- cgit v1.2.3 From 3dc907951446b9317b1887223caa4e083390de9f Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 30 Sep 2006 23:29:30 -0700 Subject: [PATCH] paravirt: remove read hazard from cow We don't want to read PTEs directly like this after they have been modified, as a lazy MMU implementation of direct page tables may not have written the updated PTE back to memory yet. Signed-off-by: Zachary Amsden Signed-off-by: Jeremy Fitzhardinge Cc: Rusty Russell Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 160f5b503ea..7707187484c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -467,7 +467,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ if (is_cow_mapping(vm_flags)) { ptep_set_wrprotect(src_mm, addr, src_pte); - pte = *src_pte; + pte = pte_wrprotect(pte); } /* -- cgit v1.2.3 From 9888a1cae3f859db38b9604e3df1c02177161bb0 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 30 Sep 2006 23:29:31 -0700 Subject: [PATCH] paravirt: pte clear not present Change pte_clear_full to a more appropriately named pte_clear_not_present, allowing optimizations when not-present mapping changes need not be reflected in the hardware TLB for protected page table modes. There is also another case that can use it in the fremap code. 
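The generic fallback belongs to the asm-generic headers and is not visible in this mm-only diffstat; a reconstruction (the exact guard macro name is an assumption) captures the idea that clearing an already-not-present PTE needs no hardware TLB work, so the default can simply reuse pte_clear() and ignore the mm-teardown hint:

	/* Reconstructed sketch of the generic fallback: */
	#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL
	#define pte_clear_not_present_full(__mm, __address, __ptep, __full)	\
	do {									\
		pte_clear((__mm), (__address), (__ptep));			\
	} while (0)
	#endif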
Signed-off-by: Zachary Amsden Signed-off-by: Jeremy Fitzhardinge Cc: Rusty Russell Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fremap.c | 2 +- mm/memory.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index aa30618ec6b..7a9d0f5d246 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -39,7 +39,7 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); - pte_clear(mm, addr, ptep); + pte_clear_not_present_full(mm, addr, ptep, 0); } return !!page; } diff --git a/mm/memory.c b/mm/memory.c index 7707187484c..2e754621d33 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -690,7 +690,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, continue; if (!pte_file(ptent)) free_swap_and_cache(pte_to_swp_entry(ptent)); - pte_clear_full(mm, addr, pte, tlb->fullmm); + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); add_mm_rss(mm, file_rss, anon_rss); -- cgit v1.2.3 From 6606c3e0da5360799e07ae24b05080cc85c68e72 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 30 Sep 2006 23:29:33 -0700 Subject: [PATCH] paravirt: lazy mmu mode hooks.patch Implement lazy MMU update hooks which are SMP safe for both direct and shadow page tables. The idea is that PTE updates and page invalidations while in lazy mode can be batched into a single hypercall. We use this in VMI for shadow page table synchronization, and it is a win. It also can be used by PPC and for direct page tables on Xen. For SMP, the enter / leave must happen under protection of the page table locks for page tables which are being modified. This is because otherwise, you end up with stale state in the batched hypercall, which other CPUs can race ahead of. Doing this under the protection of the locks guarantees the synchronization is correct, and also means that spurious faults which are generated during this window by remote CPUs are properly handled, as the page fault handler must re-check the PTE under protection of the same lock. 
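The header side of the patch is likewise outside this mm-only diffstat; on architectures without batched page table updates the hooks are expected to compile away to nothing, along these lines (a reconstruction, not an actual hunk):

	/* Reconstructed sketch: default the hooks to no-ops so the call
	 * sites added below cost nothing unless an architecture that
	 * batches PTE updates overrides them. */
	#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
	#define arch_enter_lazy_mmu_mode()	do {} while (0)
	#define arch_leave_lazy_mmu_mode()	do {} while (0)
	#endif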
Signed-off-by: Zachary Amsden Signed-off-by: Jeremy Fitzhardinge Cc: Rusty Russell Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 8 ++++++++ mm/mprotect.c | 2 ++ mm/mremap.c | 2 ++ 3 files changed, 12 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 2e754621d33..9cf3f341a28 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -506,6 +506,7 @@ again: src_pte = pte_offset_map_nested(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + arch_enter_lazy_mmu_mode(); do { /* @@ -527,6 +528,7 @@ again: progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); pte_unmap_nested(src_pte - 1); add_mm_rss(dst_mm, rss[0], rss[1]); @@ -628,6 +630,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, int anon_rss = 0; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; if (pte_none(ptent)) { @@ -694,6 +697,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); add_mm_rss(mm, file_rss, anon_rss); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); return addr; @@ -1109,6 +1113,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; + arch_enter_lazy_mmu_mode(); do { struct page *page = ZERO_PAGE(addr); pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); @@ -1118,6 +1123,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, zero_pte); } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); return 0; } @@ -1275,11 +1281,13 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; + arch_enter_lazy_mmu_mode(); do { BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); return 0; } diff --git a/mm/mprotect.c b/mm/mprotect.c index 955f9d0e38a..3b8f3c0c63f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -34,6 +34,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, spinlock_t *ptl; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); do { oldpte = *pte; if (pte_present(oldpte)) { @@ -70,6 +71,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, } } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); } diff --git a/mm/mremap.c b/mm/mremap.c index 7c15cf3373a..9c769fa29f3 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -98,6 +98,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, new_ptl = pte_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + arch_enter_lazy_mmu_mode(); for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, new_pte++, new_addr += PAGE_SIZE) { @@ -109,6 +110,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, set_pte_at(mm, new_addr, new_pte, pte); } + arch_leave_lazy_mmu_mode(); if (new_ptl != old_ptl) spin_unlock(new_ptl); pte_unmap_nested(new_pte - 1); -- cgit v1.2.3
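Taken together, the hunks above all follow one bracketing discipline: enter lazy MMU mode only after taking the page table lock, and leave it (flushing the batch) before dropping the lock. A condensed sketch of that pattern (hypothetical walker; make_entry is an assumed callback, modelled on the zeromap/remap hunks above):

	static int example_pte_walk(struct mm_struct *mm, pmd_t *pmd,
				    unsigned long addr, unsigned long end,
				    pte_t (*make_entry)(unsigned long addr))
	{
		spinlock_t *ptl;
		pte_t *pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);

		if (!pte)
			return -ENOMEM;
		arch_enter_lazy_mmu_mode();	/* start batching, under ptl */
		do {
			set_pte_at(mm, addr, pte, make_entry(addr));
		} while (pte++, addr += PAGE_SIZE, addr != end);
		arch_leave_lazy_mmu_mode();	/* flush batch before unlock */
		pte_unmap_unlock(pte - 1, ptl);
		return 0;
	}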