8 files changed, 95 insertions, 46 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 807a463fd5e..9c7334bafda 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -828,6 +828,32 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
 }
 EXPORT_SYMBOL(grab_cache_page_nowait);
 
+/*
+ * CD/DVDs are error prone. When a medium error occurs, the driver may fail
+ * a _large_ part of the i/o request. Imagine the worst scenario:
+ *
+ *      ---R__________________________________________B__________
+ *         ^ reading here                             ^ bad block(assume 4k)
+ *
+ * read(R) => miss => readahead(R...B) => media error => frustrating retries
+ * => failing the whole request => read(R) => read(R+1) =>
+ * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
+ * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
+ * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
+ *
+ * It is going insane. Fix it by quickly scaling down the readahead size.
+ */
+static void shrink_readahead_size_eio(struct file *filp,
+					struct file_ra_state *ra)
+{
+	if (ra->ra_pages) {
+		/* Quarter the window so repeated media errors back off fast. */
+		ra->ra_pages = ra->ra_pages / 4;
+		printk(KERN_WARNING "Reducing readahead size to %luK\n",
+			ra->ra_pages << (PAGE_CACHE_SHIFT - 10));
+	}
+}
+
 /**
  * do_generic_mapping_read - generic file read routine
  * @mapping:	address_space to be read
@@ -985,6 +1011,7 @@ readpage:
 				}
 				unlock_page(page);
 				error = -EIO;
+				shrink_readahead_size_eio(filp, &ra);
 				goto readpage_error;
 			}
 			unlock_page(page);
@@ -1522,6 +1549,7 @@ page_not_uptodate:
 	 * Things didn't work out. Return zero to tell the
 	 * mm layer so, possibly freeing the page cache page first.
 	 */
+	shrink_readahead_size_eio(file, ra);
 	page_cache_release(page);
 	return NULL;
 }
@@ -1892,7 +1920,7 @@ int remove_suid(struct dentry *dentry)
 EXPORT_SYMBOL(remove_suid);
 
 size_t
-__filemap_copy_from_user_iovec(char *vaddr, 
+__filemap_copy_from_user_iovec_inatomic(char *vaddr,
 			const struct iovec *iov, size_t base, size_t bytes)
 {
 	size_t copied = 0, left = 0;
@@ -1908,12 +1936,8 @@ __filemap_copy_from_user_iovec(char *vaddr,
 		vaddr += copy;
 		iov++;
 
-		if (unlikely(left)) {
-			/* zero the rest of the target like __copy_from_user */
-			if (bytes)
-				memset(vaddr, 0, bytes);
+		if (unlikely(left))
 			break;
-		}
 	}
 	return copied - left;
 }
diff --git a/mm/filemap.h b/mm/filemap.h
index 5683cde2205..536979fb4ba 100644
--- a/mm/filemap.h
+++ b/mm/filemap.h
@@ -16,15 +16,23 @@
 #include <linux/uaccess.h>
 
 size_t
-__filemap_copy_from_user_iovec(char *vaddr,
-			       const struct iovec *iov,
-			       size_t base,
-			       size_t bytes);
+__filemap_copy_from_user_iovec_inatomic(char *vaddr,
+					const struct iovec *iov,
+					size_t base,
+					size_t bytes);
 
 /*
  * Copy as much as we can into the page and return the number of bytes which
  * were sucessfully copied.  If a fault is encountered then clear the page
  * out to (offset+bytes) and return the number of bytes which were copied.
+ *
+ * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache
+ * to *NOT* zero any tail of the buffer that it failed to copy.  If it does,
+ * and if the following non-atomic copy succeeds, then there is a small window
+ * where the target page contains neither the data before the write, nor the
+ * data after the write (it contains zero).  A read at this time will see
+ * data that is inconsistent with any ordering of the read and the write.
+ * (This has been detected in practice).
  */
 static inline size_t
 filemap_copy_from_user(struct page *page, unsigned long offset,
@@ -60,13 +68,15 @@ filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
 	size_t copied;
 
 	kaddr = kmap_atomic(page, KM_USER0);
-	copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
-						base, bytes);
+	copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
+							 base, bytes);
 	kunmap_atomic(kaddr, KM_USER0);
 	if (copied != bytes) {
 		kaddr = kmap(page);
-		copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
-							base, bytes);
+		copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
+								 base, bytes);
+		if (bytes - copied)
+			memset(kaddr + offset + copied, 0, bytes - copied);
 		kunmap(page);
 	}
 	return copied;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ec4a1a950df..73e0f23b7f5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -632,6 +632,10 @@ int do_migrate_pages(struct mm_struct *mm,
 
   	down_read(&mm->mmap_sem);
 
+	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
+	if (err)
+		goto out;
+
 /*
  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
@@ -691,7 +695,7 @@ int do_migrate_pages(struct mm_struct *mm,
 		if (err < 0)
 			break;
 	}
-
+out:
 	up_read(&mm->mmap_sem);
 	if (err < 0)
 		return err;
diff --git a/mm/migrate.c b/mm/migrate.c
index 1c2a71aa05c..3f1e0c2c942 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -616,15 +616,13 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	/*
 	 * Establish migration ptes or remove ptes
 	 */
-	if (try_to_unmap(page, 1) != SWAP_FAIL) {
-		if (!page_mapped(page))
-			rc = move_to_new_page(newpage, page);
-	} else
-		/* A vma has VM_LOCKED set -> permanent failure */
-		rc = -EPERM;
+	try_to_unmap(page, 1);
+	if (!page_mapped(page))
+		rc = move_to_new_page(newpage, page);
 
 	if (rc)
 		remove_migration_ptes(page, page);
+
 unlock:
 	unlock_page(page);
 
@@ -976,3 +974,23 @@ out2:
 }
 #endif
 
+/*
+ * Call the ->migrate() method of every vma that provides one, so that
+ * vmas without an underlying page struct can prepare or perform their
+ * own migration.
+ *
+ * Returns 0, or the first non-zero value returned by a ->migrate() method.
+ */
+int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
+	const nodemask_t *from, unsigned long flags)
+{
+	struct vm_area_struct *vma;
+	int err = 0;
+
+	/* Test vma, not vma->vm_next: cover the last vma and an empty list. */
+	for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
+		if (vma->vm_ops && vma->vm_ops->migrate)
+			err = vma->vm_ops->migrate(vma, to, from, flags);
+	}
+	return err;
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 423db0db7c0..6c1174fcf52 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -957,8 +957,7 @@ restart:
 		goto got_pg;
 
 	do {
-		if (cpuset_zone_allowed(*z, gfp_mask|__GFP_HARDWALL))
-			wakeup_kswapd(*z, order);
+		wakeup_kswapd(*z, order);
 	} while (*(++z));
 
 	/*
diff --git a/mm/pdflush.c b/mm/pdflush.c
index df7e50b8f70..b02102feeb4 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -104,21 +104,20 @@ static int __pdflush(struct pdflush_work *my_work)
 		list_move(&my_work->list, &pdflush_list);
 		my_work->when_i_went_to_sleep = jiffies;
 		spin_unlock_irq(&pdflush_lock);
-
 		schedule();
-		if (try_to_freeze()) {
-			spin_lock_irq(&pdflush_lock);
-			continue;
-		}
-
+		try_to_freeze();
 		spin_lock_irq(&pdflush_lock);
 		if (!list_empty(&my_work->list)) {
-			printk("pdflush: bogus wakeup!\n");
+			/*
+			 * Someone woke us up, but without removing our control
+			 * structure from the global list.  swsusp will do this
+			 * in try_to_freeze()->refrigerator().  Handle it.
+			 */
 			my_work->fn = NULL;
 			continue;
 		}
 		if (my_work->fn == NULL) {
-			printk("pdflush: NULL work function\n");
+			printk("pdflush: bogus wakeup\n");
 			continue;
 		}
 		spin_unlock_irq(&pdflush_lock);
diff --git a/mm/readahead.c b/mm/readahead.c
index 0f142a40984..e39e416860d 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -118,8 +118,7 @@ static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
 #define list_to_page(head) (list_entry((head)->prev, struct page, lru))
 
 /**
- * read_cache_pages - populate an address space with some pages, and
- * 			start reads against them.
+ * read_cache_pages - populate an address space with some pages & start reads against them
  * @mapping: the address_space
  * @pages: The address of a list_head which contains the target pages.  These
  *   pages have their ->index populated and are otherwise uninitialised.
@@ -182,14 +181,11 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 		list_del(&page->lru);
 		if (!add_to_page_cache(page, mapping,
 					page->index, GFP_KERNEL)) {
-			ret = mapping->a_ops->readpage(filp, page);
-			if (ret != AOP_TRUNCATED_PAGE) {
-				if (!pagevec_add(&lru_pvec, page))
-					__pagevec_lru_add(&lru_pvec);
-				continue;
-			} /* else fall through to release */
-		}
-		page_cache_release(page);
+			mapping->a_ops->readpage(filp, page);
+			if (!pagevec_add(&lru_pvec, page))
+				__pagevec_lru_add(&lru_pvec);
+		} else
+			page_cache_release(page);
 	}
 	pagevec_lru_add(&lru_pvec);
 	ret = 0;
diff --git a/mm/rmap.c b/mm/rmap.c
index 882a85826bb..e76909e880c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -562,9 +562,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	 * If it's recently referenced (perhaps page_referenced
 	 * skipped over this mm) then we should reactivate it.
 	 */
-	if ((vma->vm_flags & VM_LOCKED) ||
-			(ptep_clear_flush_young(vma, address, pte)
-				&& !migration)) {
+	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
+			(ptep_clear_flush_young(vma, address, pte)))) {
 		ret = SWAP_FAIL;
 		goto out_unmap;
 	}
@@ -771,7 +770,7 @@ static int try_to_unmap_file(struct page *page, int migration)
 
 	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 						shared.vm_set.list) {
-		if (vma->vm_flags & VM_LOCKED)
+		if ((vma->vm_flags & VM_LOCKED) && !migration)
 			continue;
 		cursor = (unsigned long) vma->vm_private_data;
 		if (cursor > max_nl_cursor)
@@ -805,7 +804,7 @@ static int try_to_unmap_file(struct page *page, int migration)
 	do {
 		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 						shared.vm_set.list) {
-			if (vma->vm_flags & VM_LOCKED)
+			if ((vma->vm_flags & VM_LOCKED) && !migration)
 				continue;
 			cursor = (unsigned long) vma->vm_private_data;
 			while ( cursor < max_nl_cursor &&